1b:["$","$L29",null,{"isWhiteLabelled":false,"children":["$","$Lb",null,{"pt":{"compact":0,"expanded":3},"children":[["$","$L2a",null,{"noStar":true,"publisher":true,"task":true,"params":true,"size":"xl","product":{"id":"eyJwYXBlcklEIjoiMTQwMi4wNTYwIiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","updated":"2014-02-04T01:34:25.000Z","paperID":"1402.0560","published":"2014-02-04T01:34:25.000Z","authors":"[\"Javier Garcia\",\"Fernando Fernandez\"]","title":"Safe Exploration of State and Action Spaces in Reinforcement Learning","scoreTrending":null,"summary":"$2b","lastCheckedForCode":"2022-09-05T13:54:07.999Z","links":[{"id":"eyJ1cmwiOiJodHRwczovL3BhcGVyc3dpdGhjb2RlLmNvbS9wYXBlci9zYWZlLWV4cGxvcmF0aW9uLW9mLXN0YXRlLWFuZC1hY3Rpb24tc3BhY2VzIn0=","type":"pwc","url":"https://paperswithcode.com/paper/safe-exploration-of-state-and-action-spaces","data":null}],"reposConnection":{"edges":[]},"models":[],"tags":[{"id":"eyJuYW1lIjoic2FmZSBleHBsb3JhdGlvbiIsInR5cGUiOiJ0YXNrIn0=","name":"safe exploration","description":"Safe exploration in machine learning involves an agent learning to make decisions in an environment without causing harmful outcomes. It's used in real-world scenarios where trial-and-error learning could lead to dangerous results, such as autonomous driving or healthcare.","scoreTrending":null,"count":{"stars":307,"papers":128,"models":159},"__typename":"Tag"},{"id":"eyJuYW1lIjoiZWZmaWNpZW50IGV4cGxvcmF0aW9uIiwidHlwZSI6InRhc2sifQ==","name":"efficient exploration","description":"In efficient exploration, the input is an environment with unknown rewards, and the output is an optimal policy for action selection that maximizes rewards. 
This task is used in reinforcement learning scenarios like game playing or robotics, where the model needs to balance between exploring new actions and exploiting known rewards.","scoreTrending":0.13197289423216624,"count":{"stars":1086,"papers":462,"models":696},"__typename":"Tag"}],"summaries":[],"emailsConnection":{"edges":[{"author":null,"node":{"id":"eyJhZGRyZXNzIjoiZmpncG9sb0BpbmYudWMzbS5lcyJ9","address":"fjgpolo@inf.uc3m.es","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[],"scholar":[],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiJjZDJmMTc0YS04ZDRmLTQwOTgtOTc5OC0xMmJiMTdhYTdmYTEifQ==","name":"francisco javier garcia polo","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMTQwMi4wNTYwIiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1402.0560"},{"id":"eyJwYXBlcklEIjoiMjEwMi4wNjU4NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.06587"},{"id":"eyJwYXBlcklEIjoiMjEwMy4wNDcwNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2103.04706"},{"id":"eyJwYXBlcklEIjoiMjEwNy4wOTM1MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2107.09352"},{"id":"eyJwYXBlcklEIjoiMjExMi4wNDQ5NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2112.04494"}]}]}},{"author":"fernando fernandez","node":{"id":"eyJhZGRyZXNzIjoiZmZlcm5hbmRAaW5mLnVjM20uZXMifQ==","address":"ffernand@inf.uc3m.es","name":"Fernando 
Fernández","avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[],"scholar":[{"thirdPartyID":"XRDnkTMAAAAJ"},{"thirdPartyID":"eUqUND4AAAAJ"}],"twitter":[{"avatar":null,"username":"FernandoFerReb"}],"location":[],"owner":[{"id":"eyJ1aWQiOiJiY2YzMzNhYS1mYmFiLTQyYjgtYWJkNC1mYjBhZTBhOTU2YWEifQ==","name":"fernando fernandez","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMTQwMi4wNTYwIiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1402.0560"},{"id":"eyJwYXBlcklEIjoiMjEwMy4wNDcwNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2103.04706"},{"id":"eyJwYXBlcklEIjoiMjEwNy4wOTM1MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2107.09352"},{"id":"eyJwYXBlcklEIjoiMjExMi4wNDQ5NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2112.04494"}]}]}}]},"__typename":"paper","authorArray":["Javier Garcia","Fernando Fernandez"]}}],["$","$L18",null,{"container":true,"columns":100,"spacing":{"compact":0,"expanded":2,"large":3},"children":[["$","$L18",null,{"size":{"compact":100,"expanded":100,"large":68},"children":[["$","$7",null,{"children":["$","$L2c",null,{"publisher":"arxiv","paperID":"1402.0560","product":{"paper":"$1b:props:children:props:children:0:props:product","models":"$1b:props:children:props:children:0:props:product:models"},"isWhiteLabelled":false}]}],["$","$7",null,{"children":["$","$L2d",null,{"article":"$L2e","model":"$undefined"}]}]]}],["$","$L18",null,{"size":"grow","children":["$","$L2f",null,{}]}]]}],["$","$7",null,{"children":null}],[["$","audio",null,{"id":"tts"}],["$","$L30",null,{"paperID":"1402.0560","publisher":"arxiv","paperJSON":{"title":"Safe Exploration of State and Action Spaces in Reinforcement 
Learning","paperID":"1402.0560","avgLineHeight":13.55,"imgScale":4,"sections":[{"heading":"Abstract","paragraphs":[[{"text":"In this paper, we consider the important problem of safe exploration in reinforcement learning. While reinforcement learning is well-suited to domains with complex transition dynamics and high-dimensional state-action spaces, an additional challenge is posed by the need for safe and efficient exploration. ","element":"span"},{"text":"Traditional exploration techniques are not particularly useful for solving dangerous tasks, where the trial and error process may lead to the selection of actions whose execution in some states may result in damage to the learning system (or any other system). Consequently, when an agent begins an interaction with a dangerous and high-dimensional state-action space, an important question arises; namely, that of how to avoid (or at least minimize) damage caused by the exploration of the state-action space. We introduce the PI-SRL algorithm which safely improves suboptimal albeit robust behaviors for continuous state and action control tasks and which efficiently learns from the experience gained from the environment. We evaluate the proposed method in four complex tasks: automatic car parking, pole-balancing, helicopter hovering, and business management.","element":"span"}]]},{"heading":"1. Introduction","paragraphs":[[{"text":"Reinforcement learning (RL) (Sutton & Barto, 1998) is a type of machine learning whose main goal is that of finding a policy that moves an agent optimally in an environment, generally formulated as a ","element":"span"},{"style":{"fontWeight":"bold"},"text":"M","element":"span"},{"text":"arkov ","element":"span"},{"style":{"fontWeight":"bold"},"text":"D","element":"span"},{"text":"ecision ","element":"span"},{"style":{"fontWeight":"bold"},"text":"P","element":"span"},{"text":"rocess (MDP). 
Many RL methods are being used in important and complex tasks (e.g., robot control see Smart & Kaelbling, 2002; Hester, Quinlan, & Stone, 2011, stochastic games see Mannor, 2004; Konen & Bartz-Beielstein, 2009 and control optimization of complex dynamical systems see Salkham, Cunningham, Garg, & Cahill, 2008). While most RL tasks are focused on maximizing a long-term cumulative reward, RL researchers are paying increasing attention not only to long-term reward maximization, but also to the safety of approaches to ","element":"span"},{"style":{"fontWeight":"bold"},"text":"S","element":"span"},{"text":"equential ","element":"span"},{"style":{"fontWeight":"bold"},"text":"D","element":"span"},{"text":"ecision ","element":"span"},{"style":{"fontWeight":"bold"},"text":"P","element":"span"},{"text":"$31","element":"span"},{"style":{"height":16.4},"width":186.34,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/1-0.png","element":"img","alt":" ϵ−greedy","inline":true,"padRight":true},{"text":"may even result in constant helicopter crashes (especially where there is a high probability of random action selection). Another example can be found in portfolio theory where analysts are expected to find a portfolio that maximizes profit while avoiding risks of considerable losses (Luenberger, 1998). Since the maximization of expected returns does not necessarily prevent rare occurrences of large negative outcomes, a different criterion for safe exploration is needed. The exploration process in which new policies are evaluated must be conducted with extreme care. 
Indeed, for such environments, a method is required which not only explores the state-action space, but which does so in a safe manner.","element":"span"}],[{"text":"In this paper, we propose the ","element":"span"},{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"P","element":"span"},{"style":{"fontStyle":"italic"},"text":"olicy ","element":"span"},{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"I","element":"span"},{"style":{"fontStyle":"italic"},"text":"mprovement through ","element":"span"},{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"S","element":"span"},{"style":{"fontStyle":"italic"},"text":"afe ","element":"span"},{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"R","element":"span"},{"style":{"fontStyle":"italic"},"text":"einforcement ","element":"span"},{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"L","element":"span"},{"style":{"fontStyle":"italic"},"text":"earning ","element":"span"},{"text":"$32","element":"span"},{"text":"In the first, baseline behavior (robust albeit suboptimal) is approximated using behavioral cloning techniques (Anderson, Draper, & Peterson, 2000; Abbott, 2008). In order to achieve this goal, case-based reasoning (CBR) techniques (Aamodt & Plaza, 1994; Bartsch-Spörl, Lenz, & Hübner, 1999) were used which have been successfully applied to imitation tasks in the past (Floyd & Esfandiari, 2010; Floyd, Esfandiari, & Lam, 2008). In the second step, the PI-SRL algorithm attempts to safely explore the state-action space in order to build a more accurate policy from previously-learned behavior. Thus, the set of cases (i.e., state-action pairs) obtained in the previous phase is improved through the safe exploration of the state-action space. To perform this exploration, small amounts of Gaussian noise are randomly added to the greedy actions of the baseline policy approach. 
The exploration strategy has been used successfully in previous works (Argall, Chernova, Veloso, & Browning, 2009; Van Hasselt & Wiering, 2007).","element":"span"}],[{"text":"The novelty of the present study is in the use of two new, main components: (i) a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"risk function ","element":"span"},{"text":"to determine the degree of risk of a particular state and (ii) a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"baseline behavior ","element":"span"},{"text":"capable of producing safe actions in supposedly risky states (i.e., states that can lead to damage or injury). In addition, we present a new definition of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"risk ","element":"span"},{"text":"based on what for the agent is ","element":"span"},{"style":{"fontStyle":"italic"},"text":"unknown ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"known space","element":"span"},{"text":"$33","element":"span"}],[{"text":"Regarding the organization of the remainder of the paper, Section 2 introduces key definitions, while Section 3 describes in detail the learning approach proposed. In Section 4, the evaluation performed in the four above mentioned domains is presented. ","element":"span"},{"text":"Section 5 discusses related work and Section 6 summarizes the main conclusions of our study. 
In these sections, the term ","element":"span"},{"style":{"fontStyle":"italic"},"text":"return ","element":"span"},{"text":"is used to refer to the expected cumulative future discounted reward ","element":"span"},{"style":{"height":19.49},"width":273.42,"height":48.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/2-0.png","element":"img","alt":" R = Σ_{t=0}^{∞} γ^t r_t","inline":true},{"text":", while the term ","element":"span"},{"style":{"fontStyle":"italic"},"text":"reward ","element":"span"},{"text":"is used to refer to a single real value used to ","element":"span"},{"text":"evaluate the selection of an action in a particular state and it is denoted by ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r","element":"span"},{"text":".","element":"span"}]]},{"heading":"2. Definitions","paragraphs":[[{"text":"To illustrate the concept of safety used in our approach, a navigation problem is presented below in Figure 1. In the navigation problem presented in Figure 1, a control policy must be learned to get from a particular start state to a goal state, given a set of demonstration trajectories. In this environment, we assume the task to be difficult due to a stochastic and complex dynamic of the environment (e.g., an extremely irregular surface in the case of a robot navigation domain or wind effects in the case of the helicopter hover task). This stochasticity makes it impossible to complete the task using exactly the same trajectory every time. Additionally, the problem supposes that a set of demonstrations from a baseline controller performing the task (the continuous black lines) are also given. 
","element":"span"},{"text":"This set of demonstrations is composed of different trajectories covering a well-defined region of the state space (the region within the rectangle).","element":"span"}],[{"text":"Our approach is based on the addition of small amounts of Gaussian noise or perturbations to the baseline trajectories in order to find new and better ways of completing the task. This noise will affect the baseline trajectories in different ways, depending on the amount of noise added which, in turn, depends on the amount of risk to be taken. If no risk is desired, the noise added to the baseline trajectories will be 0 and, consequently, no new or improved behavior will be discovered (nevertheless, the robot will never fall off the cliff and the helicopter will never crash). If, however, an intermediate level of risk is desired, small amounts of noise will be added to the baseline trajectories and new trajectories (the","element":"span"}],[{"style":{"width":"81%"},"width":1401,"height":597,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/3-0.png","element":"img"}],[{"text":"Figure 1: Exploration strategy based on the addition of small amounts of noise to baseline policy behavior. Continuous lines represent the baseline behavior, while newly explored behaviors are indicated by the dotted and dashed lines.","element":"figcaption","subtype":"caption"}],[{"text":"dotted blue lines) to complete the task are discovered. In some cases, the exploration of new trajectories leads the robot to unknown regions of the state space (the dashed red lines). The robot is assumed to be able to detect such situations with a risk function and use the baseline behavior to return to safe, known states. If, instead, a very high risk is desired, large amounts of noise will be added to the baseline trajectories, leading to the discovery of new trajectories (but also to a higher probability that the robot gets damaged). 
The iteration of this process leads the robot to progressively and safely explore the state and action spaces in order to find new and improved ways to complete the task. The degree of safety in the exploration, however, will depend on the risk taken.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"2.1 Error and Non-Error States","element":"span"}],[{"text":"In this paper, we follow as far as we can the notation presented in Geibel et al. (2005) for the definition of our concept of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"risk","element":"span"},{"text":". In their study, Geibel et al. associate risk with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"error states ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"non-error states","element":"span"},{"text":", with the former understood as a state in which it is considered undesirable or dangerous to enter.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Definition 1 ","element":"span"},{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"Error and non-error states","element":"span"},{"style":{"fontStyle":"italic"},"text":". Let ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"style":{"fontStyle":"italic"},"text":"be a set of states and ","element":"span"},{"text":"Φ ","element":"span"},{"style":{"height":13.2},"width":222.4,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/3-1.png","element":"img","alt":" ⊂ S the set","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"of error states. 
A state ","element":"span"},{"style":{"height":12.8},"width":120.98,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/3-2.png","element":"img","alt":" s ∈ Φ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is an undesirable terminal state where the control of the agent ends when ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is reached with damage or injury to the agent, the learning system or any external entities. The set ","element":"span"},{"text":"Γ ","element":"span"},{"style":{"height":13.2},"width":77.69,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/3-3.png","element":"img","alt":" ⊂ S","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is considered a set of non-error terminal states with ","element":"span"},{"text":"Γ ","element":"span"},{"style":{"height":15.2},"width":150.48,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/3-4.png","element":"img","alt":" ∩ Φ = ∅","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and where the control of the agent ends normally without damage or injury.","element":"span"}],[{"text":"In terms of RL, if the agent enters an error state, the current episode ends with damage to the learning system (or other systems); whereas if it enters a non-error state, the episode ends normally and without damage. Thus, Geibel et al. 
define the risk of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s ","element":"span"},{"text":"with respect to policy ","element":"span"},{"style":{"height":17.6},"width":135.62,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/3-5.png","element":"img","alt":" π, ρπ(s","inline":true},{"text":"), as the probability that the state sequence (","element":"span"},{"style":{"height":18.22},"width":338.74,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/3-6.png","element":"img","alt":"si)i≥0 with s0 = s","inline":true},{"text":", generated by the execution of policy ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/3-7.png","element":"img","alt":" π","inline":true},{"text":", terminates in an error state ","element":"span"},{"style":{"height":10.4},"width":74.35,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/3-8.png","element":"img","alt":" s′ ∈","inline":true,"padRight":true},{"text":"Φ. By definition, ","element":"span"},{"style":{"height":17.6},"width":182.94,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/3-9.png","element":"img","alt":" ρπ(s) = 1","inline":true,"padRight":true},{"text":"if ","element":"span"},{"style":{"height":18},"width":1047.36,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/3-10.png","element":"img","alt":" s ∈ Φ. If s ∈ Γ, then ρπ(s) = 0 because Φ ∩ Γ = ∅","inline":true},{"text":". 
For states ","element":"span"},{"style":{"height":17.6},"width":160.89,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/3-11.png","element":"img","alt":" s /∈ Φ ∪","inline":true,"padRight":true},{"text":"Γ, the risk taken depends on the actions selected by the policy ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/3-12.png","element":"img","alt":" π","inline":true},{"text":". With these definitions, we have the theoretical framework with which to introduce our own definition of the risk associated with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"known ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"unknown states","element":"span"},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"2.2 Known and Unknown States in Continuous Action and State Spaces","element":"span"}],[{"text":"We assume a continuous, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n","element":"span"},{"text":"-dimensional state space ","element":"span"},{"style":{"height":13.6},"width":139.97,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/4-0.png","element":"img","alt":" S ⊂ ℜn ","inline":true,"padRight":true},{"text":"where each state ","element":"span"},{"style":{"height":17.6},"width":283.4,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/4-1.png","element":"img","alt":" s = (s1, s2, . . . 
,","inline":true},{"style":{"height":17.6},"width":142.99,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/4-2.png","element":"img","alt":"sn) ∈ S","inline":true,"padRight":true},{"text":"is a vector of real numbers and each dimension has an individual domain ","element":"span"},{"style":{"height":17.69},"width":157.38,"height":44.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/4-3.png","element":"img","alt":" Dsi ⊂ ℜ.","inline":true,"padRight":true},{"text":"Similarly, we assume a continuous and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m","element":"span"},{"text":"-dimensional action space ","element":"span"},{"style":{"height":13.6},"width":400.12,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/4-4.png","element":"img","alt":" A ⊂ ℜm where each","inline":true,"padRight":true},{"text":"action ","element":"span"},{"style":{"height":17.6},"width":489.27,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/4-5.png","element":"img","alt":" a = (a1, a2, . . . , am) ∈ A","inline":true,"padRight":true},{"text":"is a vector of real numbers and each dimension has an individual domain ","element":"span"},{"style":{"height":17.69},"width":176.25,"height":44.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/4-6.png","element":"img","alt":" Dai ⊂ ℜ.","inline":true,"padRight":true},{"text":"Additionally, the agent considered here is endowed with a memory, or case-base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B","element":"span"},{"text":", of the size ","element":"span"},{"style":{"height":12},"width":22,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/4-7.png","element":"img","alt":" η","inline":true},{"text":". 
Each memory element represents a state-action pair, or case, the agent has experienced before.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Definition 2 ","element":"span"},{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"(Case-base)","element":"span"},{"style":{"fontStyle":"italic"},"text":". A case-base is a set of cases ","element":"span"},{"style":{"height":18.62},"width":313.45,"height":46.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/4-8.png","element":"img","alt":" B = {c1 . . . , cη}","inline":true},{"style":{"fontStyle":"italic"},"text":". Every case ","element":"span"},{"style":{"height":10.62},"width":30.88,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/4-9.png","element":"img","alt":"ci","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"consists of a state-action pair ","element":"span"},{"text":"(","element":"span"},{"style":{"height":17.6},"width":106.96,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/4-10.png","element":"img","alt":"si, ai)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"the agent has experienced in the past and with an associated value ","element":"span"},{"style":{"height":17.6},"width":658.84,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/4-11.png","element":"img","alt":" V (si). 
Thus, ci =< si, ai, V (si) >","inline":true},{"style":{"fontStyle":"italic"},"text":", where the first element represents the case’s problem part and corresponds to the state ","element":"span"},{"style":{"height":10.62},"width":32.45,"height":26.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/4-12.png","element":"img","alt":" si","inline":true},{"style":{"fontStyle":"italic"},"text":", the following element ","element":"span"},{"style":{"height":10.62},"width":35.06,"height":26.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/4-13.png","element":"img","alt":" ai","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"depicts the case solution (i.e., the action expected when the agent is in the state ","element":"span"},{"style":{"height":10.62},"width":32.46,"height":26.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/4-14.png","element":"img","alt":" si","inline":true},{"style":{"fontStyle":"italic"},"text":") and the final element ","element":"span"},{"style":{"height":17.6},"width":103.1,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/4-15.png","element":"img","alt":"V (si)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is the value function associated with the state ","element":"span"},{"style":{"height":10.62},"width":32.45,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/4-16.png","element":"img","alt":" si","inline":true},{"style":{"fontStyle":"italic"},"text":". 
Each state ","element":"span"},{"style":{"height":10.62},"width":32.46,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/4-17.png","element":"img","alt":" si","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is composed of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n ","element":"span"},{"style":{"fontStyle":"italic"},"text":"continuous state variables and each action ","element":"span"},{"style":{"height":10.62},"width":35.07,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/4-18.png","element":"img","alt":" ai","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is composed of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"style":{"fontStyle":"italic"},"text":"continuous action variables.","element":"span"}],[{"text":"When the agent receives a new state ","element":"span"},{"style":{"height":13.02},"width":36.46,"height":32.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/4-19.png","element":"img","alt":" sq","inline":true},{"text":", it first retrieves the nearest neighbor of ","element":"span"},{"style":{"height":16.62},"width":91.12,"height":41.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/4-20.png","element":"img","alt":" sq in","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"according to a given similarity metric and then performs the associated action. In this paper, we use Euclidean distance as our similarity metric (Equation 1).","element":"span"}],[{"style":{"width":"65%"},"width":1131,"height":159,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/4-21.png","element":"img"}],[{"text":"The Euclidean distance metric is useful when the value function is expected to be continuous and smooth throughout the state space (Santamaría, Sutton, & Ram, 1998). 
However, since the value function is unknown a priori and the Euclidean distance metric is not particularly suitable for many problems, many researchers have begun to ask how the distance metric itself can learn or adapt in order to achieve better results (Taylor, Kulis, & Sha, 2011). While the use of distance metric learning techniques would certainly be desirable in order to induce a more powerful distance metric for a specific domain, such a consideration lies outside the scope of the present study. In this paper, therefore, we have focused only on domains in which Euclidean distance has been proven successful (i.e., it has been successfully applied to car parking (Cichosz, 1995), pole-balancing (Martin H & de Lope, 2009), helicopter hovering control (Martin H & de Lope, 2009) and SIMBA (Borrajo et al., 2010)).","element":"span"}],[{"text":"Traditionally, case-based approaches use a ","element":"span"},{"style":{"height":16.4},"width":349.32,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/4-22.png","element":"img","alt":" density threshold θ","inline":true,"padRight":true},{"text":"in order to determine when a new case should be added to the memory. When the distance of the nearest neighbor to ","element":"span"},{"style":{"height":13.02},"width":36.46,"height":32.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/4-23.png","element":"img","alt":"sq","inline":true,"padRight":true},{"text":"is greater than ","element":"span"},{"style":{"height":12.8},"width":21,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/4-24.png","element":"img","alt":" θ","inline":true},{"text":", a new case is added. 
In this sense, the parameter ","element":"span"},{"style":{"height":12.8},"width":21,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/4-25.png","element":"img","alt":" θ","inline":true,"padRight":true},{"text":"defines the size of the classification region for each case in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"(Figure 2). If a new case ","element":"span"},{"style":{"height":13.02},"width":36.46,"height":32.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/4-26.png","element":"img","alt":" sq","inline":true,"padRight":true},{"text":"is within the classification region of a case ","element":"span"},{"style":{"height":10.62},"width":30.88,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/4-27.png","element":"img","alt":" ci","inline":true},{"text":", it is considered to be a known state. Hence, the cases in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"describe a case-based policy of the agent ","element":"span"},{"style":{"height":20.31},"width":50.88,"height":50.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/4-28.png","element":"img","alt":" πθB ","inline":true,"padRight":true},{"text":"and its associated value function ","element":"span"},{"style":{"height":18.04},"width":93.89,"height":45.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/4-29.png","element":"img","alt":" V πθB.","inline":true}],[{"style":{"width":"63%"},"width":1105,"height":605,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/5-0.png","element":"img"}],[{"text":"Figure 2: Known and Unknown states.","element":"figcaption","subtype":"caption"}],[{"style":{"fontWeight":"bold"},"text":"Definition 3 ","element":"span"},{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"(Known/Unknown 
states)","element":"span"},{"style":{"fontStyle":"italic"},"text":". Given a case-base ","element":"span"},{"style":{"height":18.62},"width":493.05,"height":46.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/5-1.png","element":"img","alt":" B = {c1 . . . , cη} composed","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"of cases ","element":"span"},{"style":{"height":17.6},"width":348,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/5-2.png","element":"img","alt":" ci = (si, ai, V (si))","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and a density threshold ","element":"span"},{"style":{"height":17.42},"width":231.76,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/5-3.png","element":"img","alt":" θ, a state sq","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is considered known when ","element":"span"},{"text":"min","element":"span"},{"style":{"height":18.62},"width":336.49,"height":46.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/5-4.png","element":"img","alt":"1≤i≤η d(sq, si) ≤ θ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and unknown in all other cases. 
Formally, ","element":"span"},{"text":"Ω ","element":"span"},{"style":{"height":14.8},"width":75.06,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/5-5.png","element":"img","alt":" ⊆ S","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is the set of known states, while ","element":"span"},{"text":"Υ ","element":"span"},{"style":{"height":14.8},"width":75.06,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/5-6.png","element":"img","alt":" ⊆ S","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is the set of unknown states with ","element":"span"},{"text":"Ω ","element":"span"},{"style":{"height":15.2},"width":467.48,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/5-7.png","element":"img","alt":" ∩ Υ = ∅ and Ω ∪ Υ = S.","inline":true}],[{"text":"With Definition 3, states can be identified as known or unknown. ","element":"span"},{"text":"When the agent receives a new state ","element":"span"},{"style":{"height":10.4},"width":68.42,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/5-8.png","element":"img","alt":" s ∈","inline":true,"padRight":true},{"text":"Ω, it performs the action ","element":"span"},{"style":{"height":10.62},"width":35.07,"height":26.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/5-9.png","element":"img","alt":" ai","inline":true,"padRight":true},{"text":"of the case ","element":"span"},{"style":{"height":17.6},"width":434.37,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/5-10.png","element":"img","alt":" ci for which d(s, si) =","inline":true,"padRight":true},{"text":"min","element":"span"},{"style":{"height":18.62},"width":227.56,"height":46.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/5-11.png","element":"img","alt":"1≤j≤η d(s, sj","inline":true},{"text":"). 
However, if the agent receives a state ","element":"span"},{"style":{"height":10.4},"width":67.52,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/5-12.png","element":"img","alt":" s ∈","inline":true,"padRight":true},{"text":"Υ where, by definition, the distance to any state in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"is larger than ","element":"span"},{"style":{"height":12.8},"width":21,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/5-13.png","element":"img","alt":" θ","inline":true},{"text":", no case is retrieved. Consequently, the action to be performed from that state is unknown to the agent.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Definition 4 ","element":"span"},{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"(Case-Based risk function)","element":"span"},{"style":{"fontStyle":"italic"},"text":". Given a case base ","element":"span"},{"style":{"height":18.62},"width":489.97,"height":46.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/5-14.png","element":"img","alt":" B = {c1 . . . 
, cη} composed","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"of cases ","element":"span"},{"style":{"height":17.6},"width":336.99,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/5-15.png","element":"img","alt":" ci = (si, ai, V (si))","inline":true},{"style":{"fontStyle":"italic"},"text":", the risk for each state ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is defined as Equation 2.","element":"span"}],[{"style":{"width":"72%"},"width":1247,"height":106,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/5-16.png","element":"img"}],[{"text":"Thus, ","element":"span"},{"style":{"height":22.03},"width":685.21,"height":55.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/5-17.png","element":"img","alt":" ϱπθB(s) = 1 holds if s ∈ Υ (i.e., s","inline":true,"padRight":true},{"text":"is unknown), such that the state ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s ","element":"span"},{"text":"is not associated with any case and, hence, the action to be performed in the given situation is unknown. If ","element":"span"},{"style":{"height":22.04},"width":447.08,"height":55.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/5-18.png","element":"img","alt":" s ∈ Ω, then ϱπθB(s) = 0.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Definition 5 ","element":"span"},{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"(Safe case-based policy)","element":"span"},{"style":{"fontStyle":"italic"},"text":". 
The case-based policy ","element":"span"},{"style":{"height":20.31},"width":50.87,"height":50.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/5-19.png","element":"img","alt":" πθB ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"derived from a case- ","element":"span"},{"style":{"fontStyle":"italic"},"text":"base ","element":"span"},{"style":{"height":18.62},"width":315.91,"height":46.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/5-20.png","element":"img","alt":" B = {c1. . . . , cη}","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is safe when, from any initial known state ","element":"span"},{"style":{"height":10.62},"width":37.46,"height":26.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/5-21.png","element":"img","alt":" s0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"with respect to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B","element":"span"},{"style":{"fontStyle":"italic"},"text":", the execution of ","element":"span"},{"style":{"height":20.31},"width":50.87,"height":50.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/5-22.png","element":"img","alt":" πθB ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"always produces known non-error states with respect to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"}],[{"style":{"width":"74%"},"width":1285,"height":70,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/5-23.png","element":"img"}],[{"text":"Additionally, it is assumed here that the probability that the state sequence (","element":"span"},{"style":{"height":18.22},"width":208.79,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/5-24.png","element":"img","alt":"si)i≥0 
from","inline":true,"padRight":true},{"text":"any known state ","element":"span"},{"style":{"height":12.22},"width":86.32,"height":30.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/5-25.png","element":"img","alt":" s0 ∈","inline":true,"padRight":true},{"text":"Ω, generated by executing policy ","element":"span"},{"style":{"height":20.31},"width":50.88,"height":50.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/5-26.png","element":"img","alt":" πθB","inline":true},{"text":", terminates in an error state ","element":"span"},{"style":{"height":22.04},"width":720.77,"height":55.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/5-27.png","element":"img","alt":"s ∈ Φ is ρπθB(s0) = 0 (i.e., Ω ∩ Φ = ∅).","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Definition 6 ","element":"span"},{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"(Safe case-based coverage)","element":"span"},{"style":{"fontStyle":"italic"},"text":". The coverage of a single state ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s ","element":"span"},{"style":{"fontStyle":"italic"},"text":"with respect to a safe case-base ","element":"span"},{"style":{"height":18.62},"width":311.09,"height":46.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/6-0.png","element":"img","alt":" B = {c1. . . . 
, cη}","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is defined as the state ","element":"span"},{"style":{"height":18.62},"width":637.5,"height":46.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/6-1.png","element":"img","alt":" si for which min1≤i≤η d(s, si) ≤ θ.","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"Therefore, we assume that the safe case-base does not provide actions for the entire state space, but rather only for known states ","element":"span"},{"style":{"height":13.2},"width":118.3,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/6-2.png","element":"img","alt":" s ∈ Ω.","inline":true}],[{"text":"Figure 3 graphically represents the relationship between known/unknown and error/non-error states. The green area in the image denotes the safe case-based policy ","element":"span"},{"style":{"height":20.31},"width":253.25,"height":50.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/6-3.png","element":"img","alt":" πθB learnt, an","inline":true,"padRight":true},{"text":"area of the state space corresponding to the initial known space. An agent following the policy ","element":"span"},{"style":{"height":20.31},"width":50.87,"height":50.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/6-4.png","element":"img","alt":" πθB ","inline":true,"padRight":true},{"text":"will always be in the green area and all resulting episodes will end without ","element":"span"},{"text":"damage. Consequently, a subset of non-error states will also form part of the known space. 
Formally, let Γ","element":"span"},{"style":{"height":15.1},"width":172.65,"height":37.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/6-5.png","element":"img","alt":"Ω and ΓΥ","inline":true,"padRight":true},{"text":"be subsets of non-error states belonging to the known and unknown spaces, respectively, with Γ","element":"span"},{"style":{"height":15.1},"width":475.56,"height":37.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/6-6.png","element":"img","alt":"Ω ∪ ΓΥ = Γ. Then ΓΩ ⊂","inline":true,"padRight":true},{"text":"Ω. The yellow area in the Figure, by contrast, represents the unknown space Υ. In this space will be found all error states, as well as a subset of remaining non-error states. Formally, Γ","element":"span"},{"style":{"height":15.1},"width":355.5,"height":37.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/6-7.png","element":"img","alt":"Υ ⊂ Υ and Φ ⊂ Υ.","inline":true}],[{"text":"Understood in this way, the PI-SRL algorithm can be summed up as follows:","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"• ","element":"span"},{"text":"As a first step, learn the known space (green area) from the safe case-based policy ","element":"span"},{"style":{"height":20.31},"width":65.99,"height":50.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/6-8.png","element":"img","alt":" πθB.","inline":true}],[{"style":{"fontStyle":"italic"},"text":"• ","element":"span"},{"text":"As a second step, adjust the known space (green area) and unknown space (yellow area) in order to explore new and improved behaviors while avoiding error states (red area). 
During this process of adjusting the known space to the space used for safe and better policies, the algorithm can “forget” ineffectual known states, as will be shown in Section 4.","element":"span"}],[{"style":{"width":"57%"},"width":1002,"height":655,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/6-9.png","element":"img"}],[{"text":"Figure 3: Known/unknown and error/non-error states given the Case Base ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"B","element":"figcaption","subtype":"caption"},{"text":".","element":"figcaption","subtype":"caption"}],[{"style":{"fontWeight":"bold"},"text":"2.3 The Advantages of Using Prior Knowledge and Predetermined Exploration Policies","element":"span"}],[{"text":"In the present subsection, the advantages of using teacher knowledge in RL, namely (i) to provide initial knowledge about the task to be learned and (ii) to support the exploration process, are highlighted. Furthermore, we explain why we believe this knowledge to be indispensable in RL for tackling highly complex and realistic problems with large, continuous state and action spaces and in which a particular action may result in an undesirable consequence.","element":"span"}],[{"text":"2.3.1 Providing Initial Knowledge about the Task","element":"span"}],[{"text":"Most RL algorithms begin learning without any previous knowledge about the task to be learnt. 
In such cases, exploration strategies such as ","element":"span"},{"style":{"height":16.4},"width":202.04,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/7-0.png","element":"img","alt":" ϵ − greedy","inline":true,"padRight":true},{"text":"$34","element":"span"}],[{"text":"$35","element":"span"}],[{"text":"2.3.2 Supporting the Exploration Process","element":"span"}],[{"text":"$36","element":"span"}],[{"text":"As this paper supposes that such a teacher is available for the task to be learned, the teacher is taken as the baseline behavior. Although some studies have examined the use of robotic teachers, hand-written control policies and simulated planners, the great majority to date have made use of human teachers. This paper uses suboptimal automatic controllers as teachers, with ","element":"span"},{"style":{"height":10.3},"width":48.87,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/8-0.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"taken as the teacher’s policy.","element":"span"}],[{"style":{"height":17.6},"width":933.75,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/8-1.png","element":"img","alt":"Definition 7 (Baseline behavior). 
Policy πT","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is considered the baseline behavior about which three assumptions are made: (i) it is able to provide safe demonstrations of the task to be learnt from which prior knowledge can be extracted; (ii) it is able to support the subsequent exploration process, advising suboptimal actions in unknown states to reduce the probability of entering into error states and return the system to a known situation; and (iii) its performance is far from optimal.","element":"span"}],[{"text":"While optimal baseline behaviors are certainly ideal to behave safely, non-optimal behaviors are often easy (or easier) to implement or generate than optimal ones. The PI-SRL algorithm uses the baseline behavior ","element":"span"},{"style":{"height":10.3},"width":48.87,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/8-2.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"in two different ways. First, it uses the safe demonstrations of ","element":"span"},{"style":{"height":10.3},"width":48.87,"height":25.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/8-3.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"to provide prior knowledge about the task. In this step, the algorithm builds the initial known space of the agent derived from the safe case-based policy ","element":"span"},{"style":{"height":20.31},"width":234.68,"height":50.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/8-4.png","element":"img","alt":" πθB with the","inline":true,"padRight":true},{"text":"purpose of mimicking ","element":"span"},{"style":{"height":20.31},"width":283.99,"height":50.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/8-5.png","element":"img","alt":" πT through πθB","inline":true},{"text":". 
In the second step, PI-SRL uses ","element":"span"},{"style":{"height":10.3},"width":48.87,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/8-6.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"to support the ","element":"span"},{"text":"subsequent exploration process conducted to improve the abilities of the previously-learnt ","element":"span"},{"style":{"height":20.31},"width":50.87,"height":50.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/8-7.png","element":"img","alt":"πθB","inline":true},{"text":". As the exploration process continues, an action of ","element":"span"},{"style":{"height":10.3},"width":48.88,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/8-8.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"is requested only when required, ","element":"span"},{"text":"that is, when the agent is in an unknown state (Figure 4). In this step, ","element":"span"},{"style":{"height":10.3},"width":48.87,"height":25.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/8-9.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"acts as a backup policy in the case of an unknown state with the intention of guiding the learning away from catastrophic errors or, at least, reducing their frequency. It is important to note that the baseline behavior cannot demonstrate the correct action for every possible state. 
However, while the baseline behavior might not be able to indicate the best action in all cases, the action it supplies should, at the very least, be safer than that obtained through random exploration.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"2.4 The Risk Parameter","element":"span"}],[{"text":"In order to maximize exploration safety, it seems advisable that movement through the state space not be arbitrary, but rather that known space be expanded only gradually by starting from a known state. Such an exploration is carried out through the perturbation of the state-action trajectories generated by the policy ","element":"span"},{"style":{"height":20.31},"width":50.88,"height":50.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/8-10.png","element":"img","alt":" πθB","inline":true},{"text":". Perturbation of the trajectories ","element":"span"},{"text":"is accomplished by the addition of Gaussian random noise to the actions in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"in order to obtain new ways of completing the task. 
Thus, the Gaussian exploration takes place","element":"span"}],[{"style":{"width":"57%"},"width":997,"height":539,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/9-0.png","element":"img"}],[{"text":"Figure 4: The exploration process in PI-SRL requests actions of the baseline behavior, ","element":"figcaption","subtype":"caption"},{"style":{"height":10.8},"width":63.29,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/9-1.png","element":"img","alt":" πT ,","inline":true,"padRight":true},{"text":"when it is really required.","element":"figcaption","subtype":"caption"}],[{"text":"around the current approximation of the action ","element":"span"},{"style":{"height":10.62},"width":35.07,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/9-2.png","element":"img","alt":" ai","inline":true,"padRight":true},{"text":"for the current known state ","element":"span"},{"style":{"height":15.6},"width":232.27,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/9-3.png","element":"img","alt":" sc ∈ Ω, with","inline":true},{"style":{"height":18.62},"width":943.66,"height":46.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/9-4.png","element":"img","alt":"ci = (si, ai, V (si)) and d(sc, si) = min1≤j≤η d(s, sj","inline":true},{"text":"). The action performed is sampled from a Gaussian distribution with the mean at the action output given by the instance selected in ","element":"span"},{"style":{"height":15.02},"width":222.32,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/9-5.png","element":"img","alt":" B. 
When ai","inline":true,"padRight":true},{"text":"denotes the algorithm action output, the probability of selecting action ","element":"span"},{"style":{"height":12.89},"width":48.59,"height":32.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/9-6.png","element":"img","alt":" a′i,","inline":true},{"style":{"height":18.09},"width":122.32,"height":45.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/9-7.png","element":"img","alt":"π(s, a′i","inline":true},{"text":") is computed using Equation 4.","element":"span"}],[{"style":{"width":"73%"},"width":1267,"height":69,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/9-8.png","element":"img"}],[{"text":"The shape of the Gaussian distribution depends on parameter ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/9-9.png","element":"img","alt":" σ","inline":true,"padRight":true},{"text":"(standard deviation). In this study, ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/9-10.png","element":"img","alt":" σ","inline":true,"padRight":true},{"text":"is used as a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"width parameter","element":"span"},{"text":". 
While large ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/9-11.png","element":"img","alt":" σ","inline":true,"padRight":true},{"text":"values imply a wide bell-shaped distribution, increasing the probability of selecting actions ","element":"span"},{"style":{"height":12.89},"width":39.06,"height":32.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/9-12.png","element":"img","alt":" a′i ","inline":true,"padRight":true},{"text":"very different from ","element":"span"},{"text":"the current action ","element":"span"},{"style":{"height":15.6},"width":239.08,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/9-13.png","element":"img","alt":" ai, a small σ","inline":true,"padRight":true},{"text":"value implies a narrow bell-shaped distribution, increasing the probability of selecting actions ","element":"span"},{"style":{"height":12.89},"width":39.06,"height":32.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/9-14.png","element":"img","alt":" a′i ","inline":true,"padRight":true},{"text":"very similar to the current action ","element":"span"},{"style":{"height":17.94},"width":340.48,"height":44.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/9-15.png","element":"img","alt":" ai. When σ2 = 0,","inline":true,"padRight":true},{"text":"we assume ","element":"span"},{"style":{"height":17.6},"width":484.96,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/9-16.png","element":"img","alt":" π(s, ai) = 1. Hence, the σ","inline":true,"padRight":true},{"text":"value is directly related to the amount of perturbation added to the state-action trajectories generated by the policy ","element":"span"},{"style":{"height":20.31},"width":261.56,"height":50.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/9-17.png","element":"img","alt":" πθB. 
Higher σ","inline":true,"padRight":true},{"text":"values imply ","element":"span"},{"text":"greater perturbations (more Gaussian noise) and a greater probability of visiting unknown states.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Definition 8 ","element":"span"},{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"(Risk Parameter)","element":"span"},{"style":{"fontStyle":"italic"},"text":". The parameter ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/9-18.png","element":"img","alt":" σ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is considered a risk parameter. Large values of ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/9-19.png","element":"img","alt":" σ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"increase the probability of visiting distant unknown states and, hence, increase the probability of reaching error states.","element":"span"}],[{"text":"These exploratory actions drive the agent to the edge of the known space and force it to go slightly beyond, into the unknown space, in search of better, safer behaviors. After a period of time, the execution of these exploratory actions increases the known space and improves the abilities of the previously-learned safe case-based policy ","element":"span"},{"style":{"height":20.31},"width":262.4,"height":50.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/9-20.png","element":"img","alt":" πθB. 
The risk","inline":true,"padRight":true},{"text":"parameter ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/9-21.png","element":"img","alt":" σ","inline":true},{"text":", as well as ","element":"span"},{"style":{"height":12.8},"width":21,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/9-22.png","element":"img","alt":" θ","inline":true},{"text":", are design parameters that must be selected by the user. In Section 3.3, guidelines for this selection are offered.","element":"span"}],[{"text":"It is important to note that the approach proposed in this study is based on two logical assumptions in RL derived from the following generalization principles (Kaelbling, Littman, & Moore, 1996; Sutton & Barto, 1998):","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"(i) Nearby states have similar optimal actions","element":"span"},{"text":". In continuous state spaces, it is impossible for the agent to visit every state and store its value (or optimal action) in a table. This is why generalization techniques are needed. In large, smooth state spaces, similar states are expected to have similar values and similar optimal actions. Therefore, it is possible to use experience gathered from the environment with a limited subset of the state space and produce a reliable approximation over a much larger subset (Boyan, Moore, & Sutton, 1995; Hu, Kostiadis, Hunter, & Kalyviotis, 2001; Fernández & Borrajo, 2008). One must also note that, in the proposed domains, an optimal action is also considered to be a safe action in the sense that it never produces error states (i.e., no action is considered optimal that leads the agent to a catastrophic situation).","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"(ii) Similar actions in similar states tend to produce similar effects","element":"span"},{"text":". 
Considering a deterministic domain, the action ","element":"span"},{"style":{"height":10.62},"width":35.06,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/10-0.png","element":"img","alt":" at","inline":true,"padRight":true},{"text":"performed in state ","element":"span"},{"style":{"height":10.62},"width":32.45,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/10-1.png","element":"img","alt":" st","inline":true,"padRight":true},{"text":"always produces the same state ","element":"span"},{"style":{"height":11.82},"width":76.02,"height":29.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/10-2.png","element":"img","alt":" st+1","inline":true},{"text":". In a stochastic domain, it is understood intuitively that the execution of the action ","element":"span"},{"style":{"height":14.22},"width":236.64,"height":35.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/10-3.png","element":"img","alt":" at in state st","inline":true,"padRight":true},{"text":"will produce similar effects (i.e., it produces states ","element":"span"},{"style":{"height":20.61},"width":386.79,"height":51.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/10-4.png","element":"img","alt":" {s1t+1, s2t+1, s3t+1, . . .}","inline":true,"padRight":true},{"text":"where ","element":"span"},{"style":{"height":23.09},"width":552.88,"height":57.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/10-5.png","element":"img","alt":" ∀i, j i ≠ j dist(sit+1, sjt+1) ≈","inline":true,"padRight":true},{"text":"0). 
Additionally, the execution of the action ","element":"span"},{"style":{"height":12.32},"width":140.84,"height":30.79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/10-6.png","element":"img","alt":" a′t ∼ at","inline":true,"padRight":true},{"text":"in a state ","element":"span"},{"style":{"height":12.32},"width":134.49,"height":30.79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/10-7.png","element":"img","alt":" s′t ∼ st ","inline":true,"padRight":true},{"text":"produces states ","element":"span"},{"style":{"height":23.09},"width":1058.39,"height":57.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/10-8.png","element":"img","alt":" {s′1t+1, s′2t+1, s′3t+1, . . .} where ∀i, j dist(s′it+1, sjt+1) ≈ 0.","inline":true,"padRight":true},{"text":"As explained earlier, the present study uses Euclidean distance as a similarity metric, as it has been proven successful in the proposed domains. As a result of this assumption, approximation techniques can be used, such that actions that generate similar effects can be grouped together as one action (Jiang, 2004). In continuous action spaces, the need for generalization techniques is even greater (Kaelbling et al., 1996). In this paper, the assumption also allows us to assume that low values of ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/10-9.png","element":"img","alt":" σ","inline":true,"padRight":true},{"text":"increase the probability of visiting known states and, hence, of exploring less and taking fewer risks, while greater values of ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/10-10.png","element":"img","alt":" σ","inline":true,"padRight":true},{"text":"increase the probability of reaching error states.","element":"span"}]]},{"heading":"3. 
The PI-SRL Algorithm","paragraphs":[[{"text":"The PI-SRL algorithm is composed of two main steps described in detail below.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"3.1 First Step: Modeling Baseline Behaviors by CBR","element":"span"}],[{"text":"The first step of PI-SRL is an approach for behavioral cloning, using CBR to allow a software agent to behave in a similar manner to a teacher policy (baseline behavior) ","element":"span"},{"style":{"height":10.3},"width":48.88,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/10-11.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"(Floyd et al., 2008). Whereas LfD approaches are named differently according to what is learned (Argall et al., 2009), to prevent terminological inconsistencies here, we consider behavioral cloning (also known as imitation learning) to be an area of LfD whose goal is the reproduction/mimicking of the underlying teacher policy ","element":"span"},{"style":{"height":10.3},"width":48.88,"height":25.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/10-12.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"(Peters, Tedrake, Roy, & Morimoto, 2010; Abbott, 2008).","element":"span"}],[{"text":"When using CBR for behavioral cloning, a case can be built using the agent’s state received from the environment, as well as the corresponding action command performed by the teacher. In PI-SRL, the objective of the first step is to properly imitate the behavior of ","element":"span"},{"style":{"height":10.3},"width":48.88,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/10-13.png","element":"img","alt":"πT","inline":true,"padRight":true},{"text":"using the cases stored in a case-base. 
At this point, an important question arises; namely, how a case-base ","element":"span"},{"style":{"height":10.3},"width":50.88,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/10-14.png","element":"img","alt":" πB","inline":true,"padRight":true},{"text":"can be learnt using the sample trajectories provided by ","element":"span"},{"style":{"height":10.3},"width":48.88,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/10-15.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"such that, at the end of the learning process, the resulting policy derived from ","element":"span"},{"style":{"height":10.3},"width":50.88,"height":25.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/10-16.png","element":"img","alt":" πB","inline":true,"padRight":true},{"text":"mimics the behavior of ","element":"span"},{"style":{"height":10.3},"width":48.88,"height":25.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/10-17.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"? Baseline behavior is a function that maps states to actions ","element":"span"},{"style":{"height":15.5},"width":218.76,"height":38.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/10-18.png","element":"img","alt":" πT : S → A","inline":true,"padRight":true},{"text":"or, in other words, a function that, given a state ","element":"span"},{"style":{"height":15.02},"width":119.74,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-0.png","element":"img","alt":" si ∈ S","inline":true},{"text":", provides the corresponding action ","element":"span"},{"style":{"height":15.42},"width":200.44,"height":38.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-1.png","element":"img","alt":" ai ∈ A. 
In","inline":true,"padRight":true},{"text":"this paper, we want to build a policy ","element":"span"},{"style":{"height":10.3},"width":50.88,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-2.png","element":"img","alt":" πB","inline":true,"padRight":true},{"text":"derived from a case-base composed of cases (","element":"span"},{"style":{"height":18.22},"width":114.97,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-3.png","element":"img","alt":"sj, aj)","inline":true,"padRight":true},{"text":"such that, for a new state ","element":"span"},{"style":{"height":13.02},"width":36.46,"height":32.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-4.png","element":"img","alt":" sq","inline":true},{"text":", the case with the minimum Euclidean distance ","element":"span"},{"style":{"height":18.22},"width":247.74,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-5.png","element":"img","alt":" dist(sq, sj) is","inline":true,"padRight":true},{"text":"retrieved and the corresponding action ","element":"span"},{"style":{"height":13.02},"width":38.06,"height":32.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-6.png","element":"img","alt":" aj","inline":true,"padRight":true},{"text":"is returned. 
Intuitively, it can be assumed that ","element":"span"},{"style":{"height":10.3},"width":50.88,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-7.png","element":"img","alt":"πB","inline":true,"padRight":true},{"text":"can be built simply by storing all cases (","element":"span"},{"style":{"height":11.2},"width":88.44,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-8.png","element":"img","alt":"si, ai","inline":true},{"text":") gathered from one interaction between ","element":"span"},{"style":{"height":10.3},"width":48.88,"height":25.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-9.png","element":"img","alt":"πT","inline":true,"padRight":true},{"text":"and the environment during a limited number of episodes ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K","element":"span"},{"text":". At the end of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K ","element":"span"},{"text":"episodes, one expects the resulting ","element":"span"},{"style":{"height":10.3},"width":50.87,"height":25.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-10.png","element":"img","alt":" πB","inline":true,"padRight":true},{"text":"to be able to properly mimic the behavior of ","element":"span"},{"style":{"height":10.3},"width":48.87,"height":25.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-11.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":". However, informal experimentation in the helicopter hovering domain shows this not to be the case (Section 4.3). 
In helicopter hovering, after ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K ","element":"span"},{"text":"= 100 episodes and the prohibitive number of 600,000 cases stored, the policy derived from the case-base ","element":"span"},{"style":{"height":10.3},"width":50.88,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-12.png","element":"img","alt":" πB","inline":true,"padRight":true},{"text":"is unable to correctly imitate the baseline behavior ","element":"span"},{"style":{"height":10.3},"width":48.88,"height":25.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-13.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"and, instead, continuously crashes the helicopter. Indeed, in order for ","element":"span"},{"style":{"height":14.3},"width":318.86,"height":35.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-14.png","element":"img","alt":" πB to mimic πT","inline":true,"padRight":true},{"text":"in large continuous and stochastic domains, the approach requires a larger number of episodes and, consequently, a prohibitive number of cases. In fact, to perfectly mimic ","element":"span"},{"style":{"height":10.3},"width":48.87,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-15.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"in these domains, an infinite number of cases would be required. Figure 5 attempts to explain why we believe that this learning process does not work. 
In it, the region of the space represented by simply storing cases derived from ","element":"span"},{"style":{"height":10.3},"width":48.87,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-16.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"in the form ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"text":"= (","element":"span"},{"style":{"fontStyle":"italic"},"text":"s, a","element":"span"},{"text":") is shown. Each stored case (red circles) covers an area of the space and represents the centroid of a Voronoi region.","element":"span"}],[{"style":{"width":"66%"},"width":1147,"height":675,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-17.png","element":"img"}],[{"text":"Figure 5: Effects of storing all training cases.","element":"figcaption","subtype":"caption"}],[{"text":"If the previously-learned policy ","element":"span"},{"style":{"height":10.3},"width":50.87,"height":25.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-18.png","element":"img","alt":" πB","inline":true,"padRight":true},{"text":"is used when a new state ","element":"span"},{"style":{"height":13.02},"width":36.46,"height":32.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-19.png","element":"img","alt":" sq","inline":true,"padRight":true},{"text":"is presented, the action ","element":"span"},{"style":{"height":13.02},"width":38.07,"height":32.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-20.png","element":"img","alt":" aj","inline":true,"padRight":true},{"text":"is performed, corresponding to the case ","element":"span"},{"style":{"height":18.22},"width":210.75,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-21.png","element":"img","alt":" cj = (sj, aj","inline":true},{"text":") where the Euclidean distance 
","element":"span"},{"style":{"height":18.22},"width":184.49,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-22.png","element":"img","alt":"dist(sq, sj","inline":true},{"text":") is less than that with all other stored cases. However, if we use the policy ","element":"span"},{"style":{"height":13.5},"width":103.63,"height":33.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-23.png","element":"img","alt":" πT to","inline":true,"padRight":true},{"text":"provide an action in the situation ","element":"span"},{"style":{"height":13.02},"width":36.45,"height":32.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-24.png","element":"img","alt":" sq","inline":true},{"text":", the action ","element":"span"},{"style":{"height":10.62},"width":35.06,"height":26.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-25.png","element":"img","alt":" ai","inline":true,"padRight":true},{"text":"is provided which is different than ","element":"span"},{"style":{"height":13.02},"width":52.6,"height":32.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-26.png","element":"img","alt":" aj.","inline":true,"padRight":true},{"text":"At this point, the policy ","element":"span"},{"style":{"height":10.3},"width":50.88,"height":25.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-27.png","element":"img","alt":" πB","inline":true,"padRight":true},{"text":"can be said to classify the state ","element":"span"},{"style":{"height":17.42},"width":528.64,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-28.png","element":"img","alt":" sq as the obtained class aj,","inline":true,"padRight":true},{"text":"while the policy ","element":"span"},{"style":{"height":10.3},"width":48.88,"height":25.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-29.png","element":"img","alt":" 
πT","inline":true,"padRight":true},{"text":"can be said to classify the state ","element":"span"},{"style":{"height":17.42},"width":482.24,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-30.png","element":"img","alt":" sq as the desired class ai","inline":true,"padRight":true},{"text":"(insofar as ","element":"span"},{"style":{"height":10.3},"width":48.87,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-31.png","element":"img","alt":"πT","inline":true,"padRight":true},{"text":"is the policy to be mimicked), with ","element":"span"},{"style":{"height":18.22},"width":203.8,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-32.png","element":"img","alt":" |ai − aj| >","inline":true,"padRight":true},{"text":"0. Furthermore, ","element":"span"},{"style":{"height":18.22},"width":155.94,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-33.png","element":"img","alt":" |ai − aj|","inline":true,"padRight":true},{"text":"is understood as the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"classification error","element":"span"},{"text":". 
","element":"span"},{"text":"If the case-base stored all the possible pairs (","element":"span"},{"style":{"height":17.6},"width":275.45,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-34.png","element":"img","alt":"si, ai) that πT","inline":true,"padRight":true},{"text":"were able to generate in the domain, the actions ","element":"span"},{"style":{"height":17.42},"width":175.1,"height":43.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-35.png","element":"img","alt":" aj and ai","inline":true,"padRight":true},{"text":"would always be identical, with ","element":"span"},{"style":{"height":18.22},"width":1728.17,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/11-36.png","element":"img","alt":"dist(sq, sj) = 0 and |ai − aj| = 0. However, in a stochastic and large, continuous domain, it","inline":true,"padRight":true},{"text":"is impossible to store all such cases. The sum of all such classification errors in an episode leads to the visiting of unexplored regions of the case space (i.e., regions where the new state ","element":"span"},{"style":{"height":13.02},"width":36.45,"height":32.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/12-0.png","element":"img","alt":" sq","inline":true,"padRight":true},{"text":"received from the environment has a Euclidean distance ","element":"span"},{"style":{"height":18.22},"width":437.51,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/12-1.png","element":"img","alt":" dist(sq, sj) >> θ with","inline":true,"padRight":true},{"text":"respect to the closest case ","element":"span"},{"style":{"height":18.22},"width":332.24,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/12-2.png","element":"img","alt":" cj = (sj, aj) in B","inline":true},{"text":"). 
When these unexplored regions are visited, the difference between the obtained class derived from ","element":"span"},{"style":{"height":10.3},"width":50.87,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/12-3.png","element":"img","alt":" πB","inline":true,"padRight":true},{"text":"and the desired class derived from ","element":"span"},{"style":{"height":10.3},"width":48.88,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/12-4.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"is large (i.e., ","element":"span"},{"style":{"height":18.22},"width":232.71,"height":45.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/12-5.png","element":"img","alt":" |ai − aj| >>","inline":true,"padRight":true},{"text":"0) and the probability that error states might be visited greatly increases.","element":"span"}],[{"text":"It may be concluded, therefore, that simply storing the pairs ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"text":"= (","element":"span"},{"style":{"fontStyle":"italic"},"text":"s, a","element":"span"},{"text":") generated by ","element":"span"},{"style":{"height":10.3},"width":48.87,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/12-6.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"is not sufficient to properly mimic its behavior. 
For this reason, the algorithm in Figure 6 below has been proposed.","element":"span"}],[{"style":{"width":"66%"},"width":1142,"height":1060,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/12-7.png","element":"img"}],[{"text":"Figure 6: CBR algorithm for behavioral cloning.","element":"figcaption","subtype":"caption"}],[{"text":"In the first step of the algorithm, the state-value function ","element":"span"},{"style":{"height":22.03},"width":131.31,"height":55.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/12-8.png","element":"img","alt":" V πθB(si","inline":true},{"text":") is initialized to 0 (see line 07). The value ","element":"span"},{"style":{"height":22.03},"width":131.31,"height":55.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/12-9.png","element":"img","alt":" V πθB(si","inline":true},{"text":") for each case is computed in the second step of the algorithm in Section 3.2. Additionally, this step uses the case-based risk function (Equation 2) to determine whether a new state ","element":"span"},{"style":{"height":10.84},"width":38.45,"height":27.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/12-10.png","element":"img","alt":" sk","inline":true,"padRight":true},{"text":"should be considered risky (line 08). If the new state is not risky (i.e., it is a known state ","element":"span"},{"style":{"height":12.44},"width":85.3,"height":31.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/12-11.png","element":"img","alt":" sk ∈","inline":true,"padRight":true},{"text":"Ω), a 1-nearest neighbor strategy is followed (line 09). 
Otherwise, the algorithm performs the action ","element":"span"},{"style":{"height":10.84},"width":41.06,"height":27.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/12-12.png","element":"img","alt":" ak","inline":true,"padRight":true},{"text":"using the baseline behavior ","element":"span"},{"style":{"height":15.1},"width":171.46,"height":37.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/12-13.png","element":"img","alt":" πT and a","inline":true,"padRight":true},{"text":"new case ","element":"span"},{"style":{"height":17.6},"width":273.33,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/12-14.png","element":"img","alt":" cnew = (sk, ak,","inline":true,"padRight":true},{"text":"0) is built and added to the case-base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"(line 13). Starting with an empty case-base, the learning algorithm continuously increases its competence by storing new experiences. However, there are a number of reasons why the inflow of new cases should be limited. Large case-bases increase the time required to find the closest cases to a new example. While this may be partially solved using techniques to reduce the retrieval time (e.g., k","element":"span"},{"style":{"fontStyle":"italic"},"text":"-d trees ","element":"span"},{"text":"that have been used in this work), they nevertheless do not reduce the storage requirements. Several approaches to the removal of ineffectual cases during training exist, including Aha’s IBx algorithms (Aha, 1992) or any nearest prototype approach (Fernandez & Isasi, 2008). 
When the number of cases stored in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"exceeds a critical value ","element":"span"},{"style":{"height":17.6},"width":256.53,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/13-0.png","element":"img","alt":" ∥B∥ > η such","inline":true,"padRight":true},{"text":"that the realization of a retrieval within a certain amount of time cannot be guaranteed, the removal of some cases is inevitable. An efficient approach to such a problem is through the removal of the least-frequently-used elements of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"(line 18).","element":"span"}],[{"text":"The result of this step is a constrained case-base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"describing the safe case-based policy ","element":"span"},{"style":{"height":20.31},"width":50.87,"height":50.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/13-1.png","element":"img","alt":"πθB ","inline":true,"padRight":true},{"text":"that mimics the baseline behavior ","element":"span"},{"style":{"height":10.3},"width":48.87,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/13-2.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":", though perhaps with some deviation (line 20). 
","element":"span"},{"text":"Formally, let ","element":"span"},{"style":{"height":17.6},"width":100.4,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/13-3.png","element":"img","alt":" U(πT","inline":true,"padRight":true},{"text":") be an estimate of the utility of the baseline behavior ","element":"span"},{"style":{"height":10.3},"width":48.88,"height":25.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/13-4.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"computed by averaging the sum of rewards accumulated in each of ","element":"span"},{"style":{"height":14.7},"width":59.06,"height":36.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/13-5.png","element":"img","alt":" NT","inline":true,"padRight":true},{"text":"trials. Then, ","element":"span"},{"style":{"height":20.31},"width":312.45,"height":50.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/13-6.png","element":"img","alt":" U(πθB) ≤ U(πT ).","inline":true}],[{"style":{"fontWeight":"bold"},"text":"3.2 Second Step: Improving the Learned Baseline Behavior","element":"span"}],[{"text":"In this step of the PI-SRL algorithm, the safe case-based policy ","element":"span"},{"style":{"height":20.31},"width":50.87,"height":50.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/13-7.png","element":"img","alt":" πθB ","inline":true,"padRight":true},{"text":"learned in the previous ","element":"span"},{"text":"step is improved by the safe exploration of the state-action space. 
First, for each case ","element":"span"},{"style":{"height":12.22},"width":73.9,"height":30.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/13-8.png","element":"img","alt":" ci ∈","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"B","element":"span"},{"text":", the state-value function ","element":"span"},{"style":{"height":22.03},"width":131.31,"height":55.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/13-9.png","element":"img","alt":" V πθB(si","inline":true},{"text":") is computed following a Monte Carlo (MC) approach (Figure 7).","element":"span"}],[{"style":{"width":"62%"},"width":1074,"height":628,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/13-10.png","element":"img"}],[{"text":"Figure 7: Monte Carlo algorithm for the computation of state-value function for each case.","element":"figcaption","subtype":"caption"}],[{"text":"This algorithm is similar in spirit to a first-visit MC method for ","element":"span"},{"style":{"height":12.4},"width":55.15,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/13-11.png","element":"img","alt":" V π ","inline":true,"padRight":true},{"text":"(Sutton & Barto, 1998), adapted in this paper to work with a policy given by a case-base. 
In the algorithm shown in Figure 7, all returns for each state ","element":"span"},{"style":{"height":14.62},"width":121.32,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/13-12.png","element":"img","alt":" si ∈ B","inline":true,"padRight":true},{"text":"are accumulated and averaged, following the policy ","element":"span"},{"style":{"height":20.31},"width":50.87,"height":50.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/13-13.png","element":"img","alt":" πθB ","inline":true,"padRight":true},{"text":"derived by the case base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"(see line 09). It is important to note that in the ","element":"span"},{"text":"algorithm the term ","element":"span"},{"style":{"fontStyle":"italic"},"text":"return ","element":"span"},{"text":"following the first occurrence of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s ","element":"span"},{"text":"refers to the expected return of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s ","element":"span"},{"text":"(i.e., the expected cumulative future discounted reward starting from that state), whereas ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Returns ","element":"span"},{"text":"refers to a list composed of each ","element":"span"},{"style":{"fontStyle":"italic"},"text":"return ","element":"span"},{"text":"of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s ","element":"span"},{"text":"in different episodes. 
","element":"span"},{"text":"One of the principal reasons for using the MC method is that it allows us to quickly and easily estimate state values ","element":"span"},{"style":{"height":22.04},"width":131.31,"height":55.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/13-14.png","element":"img","alt":" V πθB(si","inline":true},{"text":") for each case ","element":"span"},{"style":{"height":14.62},"width":126.77,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/13-15.png","element":"img","alt":" ci ∈ B","inline":true},{"text":". In addition, MC methods have been shown to be successful in a wide variety of domains (Sutton & Barto, 1998). Once the state-value function ","element":"span"},{"style":{"height":22.03},"width":131.31,"height":55.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/13-16.png","element":"img","alt":" V πθB(si","inline":true},{"text":") is computed for each case ","element":"span"},{"style":{"height":14.62},"width":129.32,"height":36.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/13-17.png","element":"img","alt":" ci ∈ B","inline":true},{"text":", small amounts of Gaussian noise are randomly added to the actions of the policy ","element":"span"},{"style":{"height":20.31},"width":50.87,"height":50.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/13-18.png","element":"img","alt":" πθB ","inline":true,"padRight":true},{"text":"in order to obtain new and improved ways ","element":"span"},{"text":"to complete the task. The algorithm used to improve the baseline behavior learned in the previous step is depicted in Figure 8. The algorithm is composed of four steps performed in each episode.","element":"span"}],[{"text":"- ","element":"span"},{"style":{"fontWeight":"bold"},"text":"(a) Initialization step","element":"span"},{"text":". 
The algorithm initializes the list used to store cases occurring during an episode and sets the cumulative reward counter of the episode to 0.","element":"span"}],[{"text":"- ","element":"span"},{"style":{"fontWeight":"bold"},"text":"(b) Case Generation","element":"span"},{"text":". ","element":"span"},{"text":"The algorithm builds a case for each step of an episode. For each new state ","element":"span"},{"style":{"height":10.84},"width":38.46,"height":27.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/14-0.png","element":"img","alt":" sk","inline":true},{"text":", the closest case ","element":"span"},{"style":{"height":17.6},"width":340.39,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/14-1.png","element":"img","alt":" < s, a, V (s) >∈ B","inline":true,"padRight":true},{"text":"is computed using the Euclidean distance metric from Equation 1 (see line 09 in algorithm of Figure 8). In order to determine the perceived degree of risk of the new state ","element":"span"},{"style":{"height":10.84},"width":38.45,"height":27.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/14-2.png","element":"img","alt":" sk","inline":true},{"text":", the case-based risk function is used (line 10). If ","element":"span"},{"style":{"height":22.03},"width":449.38,"height":55.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/14-3.png","element":"img","alt":" ϱπθB(sk) = 0, then sk ∈","inline":true,"padRight":true},{"text":"Ω (known state). 
In this case, the action ","element":"span"},{"style":{"height":10.84},"width":41.07,"height":27.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/14-4.png","element":"img","alt":" ak","inline":true,"padRight":true},{"text":"performed is computed using Equation 4 and a new case ","element":"span"},{"style":{"height":17.6},"width":420.51,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/14-5.png","element":"img","alt":" cnew =< s, ak, V (s) >","inline":true,"padRight":true},{"text":"is built to be added to the list of cases having occurred in the episode (line 13). It is important to note that the new case ","element":"span"},{"style":{"height":17.6},"width":293.27,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/14-6.png","element":"img","alt":" < s, ak, V (s) >","inline":true,"padRight":true},{"text":"is built by replacing the action ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a ","element":"span"},{"text":"corresponding to the closest case in ","element":"span"},{"style":{"height":17.6},"width":361.77,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/14-7.png","element":"img","alt":" < s, a, V (s) >∈ B","inline":true},{"text":", with the new action ","element":"span"},{"style":{"height":10.84},"width":41.06,"height":27.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/14-8.png","element":"img","alt":" ak","inline":true,"padRight":true},{"text":"resulting from the application of random Gaussian noise to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a ","element":"span"},{"text":"in Equation 4. Thus, the algorithm only produces smooth changes in the cases of ","element":"span"},{"style":{"height":15.24},"width":306.26,"height":38.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/14-9.png","element":"img","alt":" B where ak ∼ a","inline":true},{"text":". 
If, however, ","element":"span"},{"style":{"height":22.03},"width":523.48,"height":55.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/14-10.png","element":"img","alt":" ϱπθB(sk) = 1, the state sk ∈","inline":true,"padRight":true},{"text":"Υ (i.e., unknown state [line 14]). In unknown states, the action ","element":"span"},{"style":{"height":10.84},"width":41.06,"height":27.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/14-11.png","element":"img","alt":" ak","inline":true,"padRight":true},{"text":"performed is suggested by the baseline behavior ","element":"span"},{"style":{"height":10.3},"width":48.88,"height":25.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/14-12.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"which defines safe behavior (line 15). A new case ","element":"span"},{"style":{"height":14.8},"width":247.63,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/14-13.png","element":"img","alt":" < sk, ak, 0 >","inline":true,"padRight":true},{"text":"is built and added to the list of cases in the episode and actions will be performed using ","element":"span"},{"style":{"height":15.1},"width":233.86,"height":37.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/14-14.png","element":"img","alt":" πT until the","inline":true,"padRight":true},{"text":"agent is back in a known state. 
Finally, the reward obtained in the episode is accumulated, where ","element":"span"},{"style":{"height":17.6},"width":139.26,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/14-15.png","element":"img","alt":" r(sk, ak","inline":true},{"text":") is the immediate reward obtained when action ","element":"span"},{"style":{"height":10.84},"width":41.06,"height":27.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/14-16.png","element":"img","alt":" ak","inline":true,"padRight":true},{"text":"is performed in state ","element":"span"},{"style":{"height":10.84},"width":38.45,"height":27.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/14-17.png","element":"img","alt":" sk","inline":true,"padRight":true},{"text":"(line 18).","element":"span"}],[{"text":"- ","element":"span"},{"style":{"fontWeight":"bold"},"text":"(c) Computing the state-value function for the unknown states","element":"span"},{"text":". In this step, the state-value function of the states considered to be unknown in the previous step is computed. In the previous step (line 17), the state-value function for these states is set at 0. The algorithm proceeds in a manner similar to the first-visit MC algorithm in Figure 7. In this case, the return for each unknown state ","element":"span"},{"style":{"height":10.62},"width":32.46,"height":26.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/14-18.png","element":"img","alt":" si","inline":true,"padRight":true},{"text":"is computed, but not averaged since only one episode is considered (line 24 and 25). 
The return for each ","element":"span"},{"style":{"height":10.62},"width":32.45,"height":26.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/14-19.png","element":"img","alt":" si","inline":true,"padRight":true},{"text":"is computed, taking into account the first visit of the state ","element":"span"},{"style":{"height":10.62},"width":32.46,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/14-20.png","element":"img","alt":" si","inline":true,"padRight":true},{"text":"in the episode (each occurrence of a state in an episode is called a visit to ","element":"span"},{"style":{"height":10.62},"width":32.46,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/14-21.png","element":"img","alt":" si","inline":true},{"text":"), although the state ","element":"span"},{"style":{"height":10.62},"width":32.46,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/14-22.png","element":"img","alt":" si","inline":true,"padRight":true},{"text":"could appear multiple times in the rest of the episode.","element":"span"}],[{"text":"- ","element":"span"},{"style":{"fontWeight":"bold"},"text":"(d) Updating the cases in ","element":"span"},{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"B ","element":"span"},{"style":{"fontWeight":"bold"},"text":"using experience gathered","element":"span"},{"text":". Updates in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"are made with the cases gathered from episodes with a cumulative reward similar to that of the best episode found to that point using the threshold Θ (line 27). 
In this way, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"good sequences ","element":"span"},{"text":"are provided for the updates since it has been shown that such sequences of experiences can cause an adaptive agent to converge to a stable and useful policy, whereas ","element":"span"},{"style":{"fontStyle":"italic"},"text":"bad sequences ","element":"span"},{"text":"may cause an agent to converge to an unstable or bad policy (Wyatt, 1997). This also prevents the degradation of the initial performance of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"as computed in the first step of the algorithm through the use of bad episodes, or episodes with errors, for updates. In this step, two types of updates appear, namely, replacements and additions of new cases. Again, the algorithm iterates for each case ","element":"span"},{"style":{"height":17.6},"width":730.19,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/14-23.png","element":"img","alt":" ci = (si, ai, V (si)) ∈ listCasesEpisode","inline":true,"padRight":true},{"text":"(line 29). If ","element":"span"},{"style":{"height":10.62},"width":32.46,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/14-24.png","element":"img","alt":" si","inline":true,"padRight":true},{"text":"is a known state (line 30), we compute the case ","element":"span"},{"style":{"height":17.6},"width":370.08,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/14-25.png","element":"img","alt":" < si, a, V (si) >∈ B","inline":true,"padRight":true},{"text":"corresponding to the state ","element":"span"},{"style":{"height":10.62},"width":32.46,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/14-26.png","element":"img","alt":" si","inline":true,"padRight":true},{"text":"(line 31). 
One should note that the case ","element":"span"},{"style":{"height":17.6},"width":730.19,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/14-27.png","element":"img","alt":" ci = (si, ai, V (si)) ∈ listCasesEpisode","inline":true,"padRight":true},{"text":"was built in line 13 of the algorithm, replacing the action ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a ","element":"span"},{"text":"corresponding to the case ","element":"span"},{"style":{"height":17.6},"width":537.29,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/14-28.png","element":"img","alt":" < si, a, V (si) >∈ B with the","inline":true,"padRight":true},{"text":"new action ","element":"span"},{"style":{"height":10.62},"width":35.07,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/14-29.png","element":"img","alt":" ai","inline":true,"padRight":true},{"text":"resulting from the application of random Gaussian noise to the action ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a","element":"span"}],[{"style":{"width":"94%"},"width":1632,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/15-0.png","element":"img"}],[{"text":"00 ","element":"span"},{"text":"Given the case-base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B","element":"span"},{"text":", and the maximum number of cases ","element":"span"},{"style":{"height":8.4},"width":17,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/15-1.png","element":"img","alt":" η","inline":true,"padRight":true},{"text":"01 ","element":"span"},{"text":"Given the baseline behavior ","element":"span"},{"style":{"height":7.74},"width":40.45,"height":19.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/15-2.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"02 ","element":"span"},{"text":"Given the update threshold Θ 03 
","element":"span"},{"text":"1. Set ","element":"span"},{"style":{"fontStyle":"italic"},"text":"maxTotalRwEpisode ","element":"span"},{"text":"= 0, the maximum cumulative reward reached in an episode 04 ","element":"span"},{"text":"2. ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Repeat ","element":"span"},{"text":"05 ","element":"span"},{"text":"(a) ","element":"span"},{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"Initialization step","element":"span"},{"text":":","element":"span"}],[{"style":{"width":"67%"},"width":1159,"height":655,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/15-3.png","element":"img"}],[{"text":"20 ","element":"span"},{"text":"Set ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"= ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"+ 1 21 ","element":"span"},{"text":"(c) ","element":"span"},{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"Computing the state-value function for the unknown states","element":"span"},{"text":": ","element":"span"},{"style":{"height":10.74},"width":447.45,"height":26.86,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/15-4.png","element":"img","alt":"22 for each instance ci in","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"listCasesEpisode ","element":"span"},{"style":{"height":17.48},"width":746.62,"height":43.69,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/15-5.png","element":"img","alt":"23 if ϱπθB (si) = 1 then // unknown state","inline":true},{"style":{"height":18.41},"width":1451.98,"height":46.02,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/15-6.png","element":"img","alt":"24 return(si) := ∑_{j=n}^{k} γ^{j−n} r(sj, aj) // n is the first occurrence of si in the 
episode","inline":true},{"style":{"height":12.8},"width":540.7,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/15-7.png","element":"img","alt":"25 V (si) := return(si)","inline":true,"padRight":true},{"text":"26 ","element":"span"},{"text":"(d) ","element":"span"},{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"Updating the cases in B using the experience gathered","element":"span"},{"text":": 27 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"if ","element":"span"},{"style":{"fontStyle":"italic"},"text":"totalRwEpisode > ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"maxTotalRwEpisode ","element":"span"},{"style":{"height":12.8},"width":161.46,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/15-8.png","element":"img","alt":" − Θ) then","inline":true,"padRight":true},{"text":"28 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"maxTotalRwEpisode ","element":"span"},{"text":":= ","element":"span"},{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"max","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"maxTotalRwEpisode, totalRwEpisode","element":"span"},{"text":") ","element":"span"},{"style":{"height":12.8},"width":733.5,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/15-9.png","element":"img","alt":"29 for each case ci =< si, ai, V (si) > in","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"listCasesEpisode ","element":"span"},{"style":{"height":17.48},"width":743.01,"height":43.69,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/15-10.png","element":"img","alt":"30 if ϱπθB (si) = 0 then // known state","inline":true},{"style":{"height":12.8},"width":1253.21,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/15-11.png","element":"img","alt":"31 Compute the case < si, a, V (si) >∈ B 
corresponding to the state si","inline":true},{"style":{"height":13.15},"width":876.62,"height":32.87,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/15-12.png","element":"img","alt":"32 Compute δ = r(si, ai) + γV (si+1) − V (si)","inline":true}],[{"style":{"width":"94%"},"width":1632,"height":469,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/15-13.png","element":"img"}],[{"text":"Figure 8: Description of step two of PI-SRL algorithm.","element":"figcaption","subtype":"caption"}],[{"text":"by the Equation 4. Then, the temporal difference (TD) error ","element":"span"},{"style":{"height":12.8},"width":20,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/15-14.png","element":"img","alt":" δ","inline":true,"padRight":true},{"text":"is computed (line 32). If ","element":"span"},{"style":{"height":15.6},"width":113.04,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/15-15.png","element":"img","alt":" δ > 0,","inline":true,"padRight":true},{"text":"performing the action ","element":"span"},{"style":{"height":10.62},"width":35.06,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/15-16.png","element":"img","alt":" ai","inline":true,"padRight":true},{"text":"results in a positive change for the value of a state. The action, in turn, could potentially lead to a higher return and, thus, to a better policy. Van Hasselt and Wiering (2007) also update the value function using only the actions that potentially lead to a higher return. 
If the TD error ","element":"span"},{"style":{"height":12.8},"width":20,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/16-0.png","element":"img","alt":" δ","inline":true,"padRight":true},{"text":"is positive, ","element":"span"},{"style":{"height":10.62},"width":35.06,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/16-1.png","element":"img","alt":" ai","inline":true,"padRight":true},{"text":"is considered to be a good selection and is reinforced. In the algorithm, this reinforcement is carried out by updating the output of the case ","element":"span"},{"style":{"height":17.6},"width":472.61,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/16-2.png","element":"img","alt":" < si, a, V (si) >∈ B at ai","inline":true,"padRight":true},{"text":"(line 34). Therefore, an update to the case-base only occurs when the TD error is positive. This is similar to a linear reward-inaction update for learning automata (Narendra & Thathachar, 1974, 1989) in which the sign of the TD error is used as a measure of success. PI-SRL only updates the case-base when actual improvements have been observed, thus avoiding slow learning when there are plateaus in the value space and TD errors are small. It has been shown empirically that this procedure can result in better policies than when step size depends on the size of the TD error (Van Hasselt & Wiering, 2007). 
It is important to note that these replacements produce smooth changes in the case-base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"since an action ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a ","element":"span"},{"text":"is replaced only if ","element":"span"},{"style":{"height":10.62},"width":35.07,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/16-3.png","element":"img","alt":" ai","inline":true,"padRight":true},{"text":"results in a higher ","element":"span"},{"style":{"height":17.6},"width":330.36,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/16-4.png","element":"img","alt":" V (si) and ai ∼ a.","inline":true,"padRight":true},{"text":"This form of updating can be understood as a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"risk-seeking ","element":"span"},{"text":"approach, overweighting only transitions to successor states that promise an above-average return (Mihatsch & Neuneier, 2002). 
Additionally, it prevents the degradation of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B","element":"span"},{"text":", ensuring that replacements are made only when an action can potentially lead to a higher ","element":"span"},{"style":{"height":17.6},"width":115.08,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/16-5.png","element":"img","alt":" V (si).","inline":true}],[{"text":"If, instead, ","element":"span"},{"style":{"height":10.62},"width":32.46,"height":26.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/16-6.png","element":"img","alt":" si","inline":true,"padRight":true},{"text":"is not a known state, the case ","element":"span"},{"style":{"height":10.62},"width":30.88,"height":26.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/16-7.png","element":"img","alt":" ci","inline":true,"padRight":true},{"text":"is added to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"(line 37). Finally, the algorithm removes cases from ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"if necessary (line 39). Complex scoring metrics to calculate which cases are to be removed for a given moment have been proposed by several authors. Forbes and Andres (2002) suggest the removal of cases that contribute least to the overall approximation, while Driessens and Ramon (2003) pursue a more error-oriented view and propose the deletion of cases that contribute most to the prediction error of other examples. The principal drawback of these more sophisticated measures is their complexity. 
","element":"span"},{"text":"The determination of the case(s) to be removed involves the computation of a score value for each ","element":"span"},{"style":{"height":14.62},"width":127.75,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/16-8.png","element":"img","alt":" ci ∈ B","inline":true},{"text":", which in turn requires at least one retrieval and regression, respectively, for each ","element":"span"},{"style":{"height":18.22},"width":249.32,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/16-9.png","element":"img","alt":" cj ∈ B (j ≠ i","inline":true},{"text":"). Such entire repeated sweeps through the case-base entail an enormous computational load. ","element":"span"},{"text":"Gabel and Riedmiller (2005) compute a different score metric for each ","element":"span"},{"style":{"height":14.62},"width":133.88,"height":36.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/16-10.png","element":"img","alt":" ci ∈ B","inline":true},{"text":", requiring the computation of the set of the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k","element":"span"},{"text":"-nearest neighbors around ","element":"span"},{"style":{"height":10.62},"width":44.41,"height":26.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/16-11.png","element":"img","alt":" ci.","inline":true,"padRight":true},{"text":"Such approaches are not well-suited to systems learning with adjusted time requirements and with a high-dimensional state space, requiring the use of larger case-bases than those proposed here. Rather, in this paper, we propose the removal of the least-frequently-used cases. 
The idea seems intuitive insofar as the least-frequently-used cases usually contain worse estimates of a corresponding state’s value; although the strategy might lead to a function approximator that “forgets” some of the valuable experience made in the past (e.g., ","element":"span"},{"style":{"fontStyle":"italic"},"text":"corner cases","element":"span"},{"text":"). Despite this, PI-SRL performs successfully in all domains proposed using the strategy, as demonstrated in Section 4. Thus, the ability to forget ineffectual known states described in Section 2 is a result of the algorithm removing ","element":"span"},{"style":{"height":17.6},"width":272.82,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/16-12.png","element":"img","alt":" ∥B∥ − η cases","inline":true,"padRight":true},{"text":"from the least-frequently-used cases of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B","element":"span"},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"3.3 Parameter Setting Design","element":"span"}],[{"text":"One of the main difficulties of applying the PI-SRL algorithm to a given problem is to decide on an appropriate set of parameter values for the threshold ","element":"span"},{"style":{"height":12.8},"width":21,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/16-13.png","element":"img","alt":" θ","inline":true},{"text":", the risk parameter ","element":"span"},{"style":{"height":10.8},"width":38.5,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/16-14.png","element":"img","alt":" σ,","inline":true,"padRight":true},{"text":"the update threshold Θ and the maximum number of cases ","element":"span"},{"style":{"height":12},"width":22,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/16-15.png","element":"img","alt":" η","inline":true},{"text":". 
An incorrect value for the parameter ","element":"span"},{"style":{"height":12.8},"width":21,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/17-0.png","element":"img","alt":" θ","inline":true,"padRight":true},{"text":"can lead to mislabeling a state as known when it is really unknown, potentially leading to damage or injury in the agent. In the case of the risk parameter ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/17-1.png","element":"img","alt":" σ","inline":true},{"text":", high values can continuously result in damage or injury; while low values are safe, but do not allow for exploration of the state-action space sufficient for reaching a near-optimal policy. Unlike ","element":"span"},{"style":{"height":12.8},"width":158.9,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/17-2.png","element":"img","alt":"θ and σ","inline":true},{"text":", the parameter Θ is not related to risk, but instead is directly related to the performance of the algorithm. Parameter Θ is used to determine how good an episode must be with respect to the best episode obtained, since only the best episodes are used to update the case-base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B","element":"span"},{"text":". If the Θ value is too large, bad episodes may be used to update ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"(influencing the convergence and performance of the algorithm). If, instead, Θ is too low, the number of updates in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"may be insufficient for improving the baseline behavior. 
Finally, a very high ","element":"span"},{"style":{"height":12},"width":22,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/17-3.png","element":"img","alt":" η","inline":true,"padRight":true},{"text":"value allows for large case-bases, increasing the computational effort during retrieval and degrading the efficiency of the system. By contrast, a very low ","element":"span"},{"style":{"height":12},"width":22,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/17-4.png","element":"img","alt":" η","inline":true,"padRight":true},{"text":"value might excessively restrict the size of the case-base and thus negatively affect the final performance of the algorithm. In this subsection, a solid perspective is given on the automatic definition of these parameters. The parameter settings proposed here are taken as a suitable set of heuristics tested successfully in a wide variety of domains (Section 4).","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"• ","element":"span"},{"style":{"height":12.8},"width":268.23,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/17-5.png","element":"img","alt":" Parameter θ","inline":true},{"text":": The parameter is domain-dependent and related to the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"average size ","element":"span"},{"text":"of the actions. In this paper, the value for this parameter has been established by computing the mean distance between states during an execution of the baseline behavior ","element":"span"},{"style":{"height":10.3},"width":48.87,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/17-6.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":". 
Expressed in another way, the execution of the policy ","element":"span"},{"style":{"height":10.3},"width":48.87,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/17-7.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"provides a state-action sequence of the form ","element":"span"},{"style":{"height":11.82},"width":594.52,"height":29.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/17-8.png","element":"img","alt":" s1 → a1 → s2 → a2 → . . . → sn","inline":true},{"text":". Thus, the value of ","element":"span"},{"style":{"height":12.8},"width":21,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/17-9.png","element":"img","alt":"θ","inline":true,"padRight":true},{"text":"is computed using Equation 5.","element":"span"}],[{"style":{"width":"67%"},"width":1159,"height":94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/17-10.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"• ","element":"span"},{"style":{"height":12.4},"width":274.9,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/17-11.png","element":"img","alt":" Parameter σ","inline":true},{"text":": Several authors agree that it is impossible to completely avoid all accidents (Moldovan & Abbeel, 2012; Geibel & Wysotzki, 2005). It is important to note that PI-SRL is completely safe only if the first step of the algorithm is executed. However, by proceeding in this way, the performance of the algorithm is heavily limited by the abilities of the baseline behavior. ","element":"span"},{"text":"The running of the subsequent exploratory process is inevitable if learner performance is to be improved beyond that of the baseline behavior. 
Since the agent operates in a state of incomplete knowledge of the domain and its dynamics, it is inevitable during the exploratory process that unknown regions of the state space will be visited where the agent may reach an error state. However, it is possible to adjust the risk parameter ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/17-12.png","element":"img","alt":" σ","inline":true,"padRight":true},{"text":"to determine the level of risk assumed during this exploratory process. In this paper, we start with low ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/17-13.png","element":"img","alt":" σ","inline":true,"padRight":true},{"text":"values (low risk) which we gradually increase. Specifically, we propose beginning with ","element":"span"},{"style":{"height":15.13},"width":259.23,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/17-14.png","element":"img","alt":"σ = 9 × 10^−7 ","inline":true,"padRight":true},{"text":"and increasing this value iteratively until either an accurate policy is obtained or the amount of damage or injury is high.","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"• ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Parameter ","element":"span"},{"text":"Θ: The value of this parameter is set relative to the best episode obtained. 
In this paper, the Θ value is set to 5% of the cumulative reward of the best episode obtained.","element":"span"}],[{"style":{"width":"95%"},"width":1650,"height":768,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/18-0.png","element":"img"}],[{"text":"Figure 9: Trajectories generated by the baseline policy ","element":"figcaption","subtype":"caption"},{"style":{"height":10.3},"width":48.87,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/18-1.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"in a deterministic, slightly stochastic and highly stochastic domain.","element":"figcaption","subtype":"caption"}],[{"style":{"fontStyle":"italic"},"text":"• ","element":"span"},{"style":{"height":16},"width":264.38,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/18-2.png","element":"img","alt":" Parameter η","inline":true},{"text":": Previously, we estimated the maximum number of cases ","element":"span"},{"style":{"height":12},"width":22,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/18-3.png","element":"img","alt":" η","inline":true,"padRight":true},{"text":"to be stored in the case-base as being the estimated maximum number of cases required to properly mimic the baseline behavior ","element":"span"},{"style":{"height":10.3},"width":48.87,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/18-4.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":". What follows is a description of how this value is computed. 
Figure 9 presents the trajectories (sequences of states) followed by the baseline policy ","element":"span"},{"style":{"height":10.3},"width":48.87,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/18-5.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"in three different domains: deterministic, slightly stochastic and highly stochastic. For each domain, different sequences of the states produced by ","element":"span"},{"style":{"height":10.7},"width":129.11,"height":26.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/18-6.png","element":"img","alt":" πT are","inline":true,"padRight":true},{"text":"represented ","element":"span"},{"style":{"height":17.6},"width":1390.34,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/18-7.png","element":"img","alt":" {s00, s01, s02, . . . , s0n}, {s00, s11, s12, . . . , s1n},. . ., {s00, sm1, sm2, . . . , smn},","inline":true,"padRight":true},{"text":"where ","element":"span"},{"style":{"height":17.42},"width":441.63,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/18-8.png","element":"img","alt":" sji is the i-th state, s00","inline":true,"padRight":true},{"text":"the initial state and ","element":"span"},{"style":{"height":13.02},"width":57.01,"height":32.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/18-9.png","element":"img","alt":" sjn","inline":true,"padRight":true},{"text":"the final state of the resulting trajectory in episode ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j","element":"span"},{"text":". 
In the deterministic domain, the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"different executions of ","element":"span"},{"style":{"height":10.3},"width":48.88,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/18-10.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"always result in the same trajectory. In this case, we set the maximum number of cases to ","element":"span"},{"style":{"height":12},"width":107.42,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/18-11.png","element":"img","alt":" η = n","inline":true,"padRight":true},{"text":"with all the cases computed in the episode being stored.","element":"span"}],[{"text":"In the slightly stochastic domain, the trajectories produced in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"different episodes are different, but only slightly so. Here, we suppose the case-base at the beginning to be empty. Additionally, we assume that all states ","element":"span"},{"style":{"height":17.6},"width":408.47,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/18-12.png","element":"img","alt":" {s00, s01, s02, . . . , s0n}","inline":true,"padRight":true},{"text":"corresponding to the first trajectory produced in the domain will be stored in the case-base. Furthermore, for each domain we execute ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"different episodes, obtaining ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"different trajectories. 
Following the execution of the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"episodes, we compute the maximum distance between the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":"-th state of the first trajectory (previously added to the case-base) and the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":"-th state produced in the trajectory ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j ","element":"span"},{"text":"such that max","element":"span"},{"style":{"height":18.22},"width":447.07,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/18-13.png","element":"img","alt":"1≤j≤m d(s0i, sji). In the","inline":true,"padRight":true},{"text":"slightly stochastic domain, this maximum distance does not exceed the threshold ","element":"span"},{"style":{"height":12.8},"width":70.95,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/18-14.png","element":"img","alt":" θ in","inline":true,"padRight":true},{"text":"any case such that max","element":"span"},{"style":{"height":18.22},"width":380.07,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/18-15.png","element":"img","alt":"1≤j≤m d(s0i, sji) < θ","inline":true},{"text":". 
At this point, we assume the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":"-th state in trajectory ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j ","element":"span"},{"text":"to have at least one neighbor with a distance less than ","element":"span"},{"style":{"height":12.8},"width":21,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/18-16.png","element":"img","alt":" θ","inline":true,"padRight":true},{"text":"(corresponding to the state ","element":"span"},{"style":{"height":10.62},"width":49.39,"height":26.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/18-17.png","element":"img","alt":" s0i","inline":true},{"text":"). Thus, the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":"-th state in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j ","element":"span"},{"text":"is not added to the case-base.","element":"span"}],[{"text":"By contrast, in a highly stochastic domain, this maximum distance greatly exceeds the threshold ","element":"span"},{"style":{"height":12.8},"width":21,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/18-18.png","element":"img","alt":" θ","inline":true,"padRight":true},{"text":"in all the cases such that max","element":"span"},{"style":{"height":18.22},"width":417.34,"height":45.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/18-19.png","element":"img","alt":"1≤j≤m d(s0i, sji) >> θ","inline":true},{"text":". In this domain, we estimate the total number of cases that will be added to the case-base in the following","element":"span"}],[{"text":"way. 
For each ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":"-th state in the sequence of the first trajectory, we estimate the number of cases to be added to the case-base as","element":"span"},{"style":{"height":32},"width":364.75,"height":80,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/19-0.png","element":"img","alt":"⌊ max1≤j≤m d(s0i, sji) / θ ⌋","inline":true},{"text":"or, in other words, we compute the number of intervals in the range [0","element":"span"},{"style":{"height":18.22},"width":383,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/19-1.png","element":"img","alt":", max1≤j≤m d(s0i, sji","inline":true},{"text":")] with a width of ","element":"span"},{"style":{"height":12.8},"width":21,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/19-2.png","element":"img","alt":"θ","inline":true,"padRight":true},{"text":"(the threshold used to decide whether a new case is to be added or not to the case-base). Consequently, the estimated number of cases added to the case-base, taking into account all states in the sequence, is computed as ","element":"span"},{"style":{"height":32},"width":650.37,"height":80,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/19-3.png","element":"img","alt":"Σ_{i=0}^{n} ⌊ max1≤j≤m d(s0i, sji) / θ ⌋. 
Finally,","inline":true,"padRight":true},{"text":"the estimated maximum number of cases is computed as shown in Equation 6.","element":"span"}],[{"style":{"width":"68%"},"width":1176,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/19-4.png","element":"img"}],[{"text":"It is important to remember that in a deterministic domain, the summation in equation 6 is equal to 0 and that, therefore, ","element":"span"},{"style":{"height":12},"width":107.42,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/19-5.png","element":"img","alt":" η = n","inline":true},{"text":". The increase of the value of this element is related to the increase of stochasticity of the environment, insofar as the greater stochasticity of the environment increases the number of cases required. Finally, if the number of cases is very large or nearly infinite, the threshold ","element":"span"},{"style":{"height":12.8},"width":21,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/19-6.png","element":"img","alt":" θ","inline":true,"padRight":true},{"text":"can be increased to make more restrictive the addition of new cases to the case-base. However, this increase may also adversely affect the final performance of the algorithm.","element":"span"}]]},{"heading":"4. Experimental Results","paragraphs":[[{"text":"This section presents the experimental results collected from the use of PI-SRL for policy learning in four different domains presented in order of increasing complexity (i.e., increasing number of variables describing states and actions): the car parking problem (Lee & Lee, 2008), pole-balancing (Sutton & Barto, 1998), helicopter hovering (Ng et al., 2003) and the business simulator SIMBA (Borrajo et al., 2010). 
","element":"span"},{"text":"$37","element":"span"},{"style":{"height":17.6},"width":83.12,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/19-7.png","element":"img","alt":" ρπ(s","inline":true},{"text":") of reaching a terminal error state (e.g., a helicopter crash ending agent control), starting at some initial state ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":". In this case, a new value function with the weighted sum of the risk probability, ","element":"span"},{"style":{"height":15.93},"width":42.56,"height":39.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/20-0.png","element":"img","alt":"ρπ","inline":true},{"text":", and value function, ","element":"span"},{"style":{"height":12.4},"width":55.15,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/20-1.png","element":"img","alt":" V π","inline":true},{"text":", is used (Equation 7).","element":"span"}],[{"style":{"width":"62%"},"width":1086,"height":51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/20-2.png","element":"img"}],[{"text":"The parameter ","element":"span"},{"style":{"height":16.4},"width":67.22,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/20-3.png","element":"img","alt":" ξ ≥","inline":true,"padRight":true},{"text":"0 determines the influence of the ","element":"span"},{"style":{"height":17.6},"width":95.71,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/20-4.png","element":"img","alt":" V π(s","inline":true},{"text":")-values compared to the ","element":"span"},{"style":{"height":17.6},"width":114.54,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/20-5.png","element":"img","alt":" ρπ(s)-","inline":true,"padRight":true},{"text":"values. 
For ","element":"span"},{"style":{"height":20.11},"width":181.25,"height":50.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/20-6.png","element":"img","alt":" ξ = 0, V πξ ","inline":true,"padRight":true},{"text":"corresponds to the computation of minimum risk policies. For large ","element":"span"},{"style":{"height":16.4},"width":20,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/20-7.png","element":"img","alt":" ξ","inline":true,"padRight":true},{"text":"values, the original value function multiplied by ","element":"span"},{"style":{"height":16.4},"width":20,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/20-8.png","element":"img","alt":" ξ","inline":true,"padRight":true},{"text":"dominates the weighted criterion. While Geibel and Wysotzki (2005) consider only finite (discretized) action sets in their study, their algorithm has been adapted here for continuous action sets. We use CBR for value and risk function approximation and a Gaussian exploration around the current action. In the experiments, for each domain, three different ","element":"span"},{"style":{"height":16.4},"width":20,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/20-9.png","element":"img","alt":" ξ","inline":true,"padRight":true},{"text":"values are used, modifying the influence of the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"V ","element":"span"},{"text":"-values compared to the ","element":"span"},{"style":{"height":12},"width":23,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/20-10.png","element":"img","alt":" ρ","inline":true},{"text":"-values. In all cases, the goal is to improve the control policy while, at the same time, minimizing the number of episodes with agent damage or injury. 
In each domain, we establish different risk levels by modifying risk parameter ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/20-11.png","element":"img","alt":" σ","inline":true,"padRight":true},{"text":"values according to the procedure described in subsection 3.3. It is important to note that one baseline behavior used to initialize the evolutionary RL approach is exactly the same as that used subsequently in the first and second steps of PI-SRL. Furthermore, the case-base in the risk-sensitive approach does not begin from scratch since it is initialized with the safe case-based policy ","element":"span"},{"style":{"height":20.31},"width":50.87,"height":50.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/20-12.png","element":"img","alt":" πθB","inline":true},{"text":". This makes the comparison of performances as fair as possible, but ","element":"span"},{"text":"taking into account that each technique makes its own use of the baseline behaviors.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"4.1 Car Parking Problem","element":"span"}],[{"text":"The car parking problem is represented in Figure 10 and originates from the RL literature (Cichosz, 1996). A car, represented as the rectangle in Figure 10, is initially located inside a bounded area, represented by the dark solid lines, referred to as the driving area. The goal for the learning agent is to navigate the car from its initial position into the garage, such that the car is entirely inside, in a minimum number of steps. The car cannot move outside of the driving area. Figure 10 (b) shows the two possible paths the car can take from the starting point to the garage with an obstacle in between in order to correctly perform the task. 
We consider the optimal policy for the domain to be that which reaches the goal state in the shortest time and which, at the same time, is free of failures.","element":"span"}],[{"text":"The state space of the domain is described by three continuous variables, namely, the coordinates of the center of the car ","element":"span"},{"style":{"height":16.4},"width":174.95,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/20-13.png","element":"img","alt":" xt and yt","inline":true,"padRight":true},{"text":"and the angle ","element":"span"},{"style":{"height":15.02},"width":32.48,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/20-14.png","element":"img","alt":" θt","inline":true,"padRight":true},{"text":"between the car’s axis and the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"X ","element":"span"},{"text":"axis of the coordinate system. While the car can be modeled essentially with two control inputs, speed ","element":"span"},{"style":{"fontStyle":"italic"},"text":"v ","element":"span"},{"text":"and steering angle ","element":"span"},{"style":{"height":16.4},"width":26,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/20-15.png","element":"img","alt":" φ","inline":true},{"text":", let us suppose here that the car is controlled only by the steering angle (i.e., it moves at a constant speed). Thus, the action space is described by one continuous variable ","element":"span"},{"style":{"height":17.6},"width":172.61,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/20-16.png","element":"img","alt":" at ∈ [−1,","inline":true,"padRight":true},{"text":"1] corresponding to the turn radius, as used in the equations below. 
The agent receives a positive reward value of ","element":"span"},{"style":{"height":18.22},"width":702.67,"height":45.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/20-17.png","element":"img","alt":" r = (1 − ς(dist(Pt, Pg))) × 10, where","inline":true},{"style":{"height":17.6},"width":209.91,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/20-18.png","element":"img","alt":"Pt = (xt, yt","inline":true},{"text":") is the center of the car, ","element":"span"},{"style":{"height":18.22},"width":223.97,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/20-19.png","element":"img","alt":" Pg = (xg, yg","inline":true},{"text":") is the center of the garage (i.e., the goal position) and ","element":"span"},{"style":{"height":10},"width":18,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/20-20.png","element":"img","alt":" ς","inline":true,"padRight":true},{"text":"is a normalizing function scaling the Euclidean distance ","element":"span"},{"style":{"height":18.22},"width":381.84,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/20-21.png","element":"img","alt":" dist(Pt, Pg) between","inline":true},{"style":{"height":17.42},"width":181.58,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/20-22.png","element":"img","alt":"Pt and Pg","inline":true,"padRight":true},{"text":"to a range [0","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"1] when the car is inside the garage (i.e., the reward value is greater if the car is parked correctly in the center of the garage). The agent receives a reward of -1 whenever it hits the wall or obstacle. All other steps receive a reward of -0.1. 
Thus, the difficulty of the problem lies not only in the reinforcement delay, but also in the fact that","element":"span"}],[{"style":{"width":"95%"},"width":1658,"height":606,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/21-0.png","element":"img"}],[{"text":"Figure 10: Car Parking Problem: (a) Model of the car parking problem. (b) Examples of trajectories generated by the agent to park the car in the garage.","element":"figcaption","subtype":"caption"}],[{"text":"punishments are much more frequent than positive rewards (i.e., it is much easier to hit a wall than park the car correctly). The motion of the car is described by the following equations (Lee & Lee, 2008)","element":"span"}],[{"style":{"width":"67%"},"width":1167,"height":322,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/21-1.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"v ","element":"span"},{"text":"is the linear velocity of the car (assumed to be a constant value), ","element":"span"},{"style":{"height":16.4},"width":156.52,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/21-2.png","element":"img","alt":" φ is the","inline":true,"padRight":true},{"text":"maximum steering angle (i.e., the car can change its position by a maximum angle of ","element":"span"},{"style":{"height":16.4},"width":26,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/21-3.png","element":"img","alt":" φ","inline":true,"padRight":true},{"text":"in both directions) and ","element":"span"},{"style":{"height":8},"width":23,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/21-4.png","element":"img","alt":" τ","inline":true,"padRight":true},{"text":"is the simulation time step. Gaussian noise was added to the actions and rewards with a standard deviation of 0.1, since noisy interactions are inevitable in most real-world applications. 
Adding this noise to the actuators and the environment, we transform the deterministic domain into a stochastic domain. It is important to note that the noise added to transform the domain into a stochastic domain is independent of the Gaussian noise with standard deviation ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/21-5.png","element":"img","alt":" σ","inline":true,"padRight":true},{"text":"(risk parameter) used to explore the state and action space in the second step of the PI-SRL algorithm. In this case, the Gaussian noise with standard deviation ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/21-6.png","element":"img","alt":" σ","inline":true,"padRight":true},{"text":"used for exploration will be added to the noise previously added to the actuators. In this paper, ","element":"span"},{"style":{"height":17.6},"width":1107.23,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/21-7.png","element":"img","alt":" l = 4 (m), v = 1.0 (m/s), φ = 0.78 (rad) and τ = 0.5 (s)","inline":true,"padRight":true},{"text":"(the driving area and obstacle dimensions are detailed in Figure 10 [a]). The initial position of the car is fixed at ","element":"span"},{"style":{"height":17.6},"width":763.57,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/21-8.png","element":"img","alt":" xs = 4.0, ys = 4.0 and θs = 0.26 (rad","inline":true},{"text":"), while the goal position is ","element":"span"},{"style":{"height":17.42},"width":448.67,"height":43.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/21-9.png","element":"img","alt":"xg = 22.5 and yg = 13.","inline":true},{"text":"5. 
For this domain, we have designed a baseline behavior ","element":"span"},{"style":{"height":15.1},"width":152.16,"height":37.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/21-10.png","element":"img","alt":" πT with","inline":true,"padRight":true},{"text":"an average cumulative reward per trial of 4.75.","element":"span"}],[{"text":"In order to perform the PI-SRL algorithm, the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"modeling baseline behavior step ","element":"span"},{"text":"is executed. The result of this step is the safe case-based policy ","element":"span"},{"style":{"height":20.31},"width":50.87,"height":50.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/21-11.png","element":"img","alt":" πθB ","inline":true,"padRight":true},{"text":"learned from demonstrations ","element":"span"},{"text":"provided by the baseline behavior ","element":"span"},{"style":{"height":10.3},"width":48.88,"height":25.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/21-12.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"(see subsection 3.1). 
","element":"span"},{"style":{"height":16.4},"width":138.57,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/21-13.png","element":"img","alt":" θ and η","inline":true,"padRight":true},{"text":"were computed following the procedure described in subsection 3.3 with resulting values of 0.01 and 207, respectively.","element":"span"}],[{"style":{"width":"96%"},"width":1663,"height":609,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/22-0.png","element":"img"}],[{"text":"Figure 11: Car Parking Task ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"Modeling Baseline Behavior ","element":"figcaption","subtype":"caption"},{"text":"Step: (a) Number of steps per","element":"figcaption","subtype":"caption"}],[{"style":{"width":"87%"},"width":1512,"height":155,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/22-1.png","element":"img"}],[{"text":"Figure 11 (a) graphically represents the execution of the modeling baseline behavior step. In it, two different learning processes are presented and, for each one, the number of steps per trial executed by the baseline behavior ","element":"span"},{"style":{"height":10.3},"width":48.87,"height":25.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/22-2.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"(continuous red lines) and the cases in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"(dashed green lines) is shown. At the beginning of the learning process with an empty case-base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B","element":"span"},{"text":", all steps are performed using the baseline behavior ","element":"span"},{"style":{"height":10.3},"width":48.87,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/22-3.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":". 
As the learning process continues, new cases are added to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"and the safe case-based policy ","element":"span"},{"style":{"height":20.31},"width":50.87,"height":50.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/22-4.png","element":"img","alt":" πθB ","inline":true,"padRight":true},{"text":"is learned. At around trials ","element":"span"},{"text":"40-50, practically all steps are performed using the cases in ","element":"span"},{"style":{"height":15.1},"width":186.14,"height":37.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/22-5.png","element":"img","alt":" B and πT","inline":true,"padRight":true},{"text":"is rarely used, which means that a safe case-based policy has been learned. In the two learning processes shown in Figure 11 (a), the modeling baseline behavior step is performed without collisions with the wall or the obstacle. In other words, the baseline behavior ","element":"span"},{"style":{"height":10.3},"width":48.88,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/22-6.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"is cloned safely without errors. 
Figure 11 (b) shows the cumulative reward for three different execution processes: the first (continuous red lines) corresponding to the performance of the baseline behavior ","element":"span"},{"style":{"height":10.3},"width":48.87,"height":25.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/22-7.png","element":"img","alt":"πT","inline":true,"padRight":true},{"text":", the second (dashed green lines) corresponding to the previously-learned safe case-based policy ","element":"span"},{"style":{"height":20.31},"width":50.87,"height":50.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/22-8.png","element":"img","alt":" πθB ","inline":true,"padRight":true},{"text":"(derived from ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B","element":"span"},{"text":") and the third (dashed blue lines) corresponding to an instance- ","element":"span"},{"text":"based learning (IBL) approach consisting of storing cases in memory. In the IBL approach, new items are classified by examining the cases stored in memory and determining the most similar case(s) given a particular similarity metric (Euclidean distance is used in this paper). The classification of that nearest neighbor (or those nearest neighbors) is taken as the classification of the new item using a 1-nearest neighbor strategy (Aha & Kibler, 1991). For each approach, two different executions are carried out. 
In the IBL approach, the training process is performed saving all training cases produced by the baseline behavior ","element":"span"},{"style":{"height":16.4},"width":189.51,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/22-9.png","element":"img","alt":" πT during","inline":true,"padRight":true},{"text":"50 trials (so we consider this approach an IB","element":"span"},{"style":{"fontStyle":"italic"},"text":"1 ","element":"span"},{"text":"algorithm in the sense that it saves every case during the training phase, see Aha & Kibler, 1991). Figure 11 (b) shows that the safe case-based policy ","element":"span"},{"style":{"height":10.3},"width":50.87,"height":25.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/22-10.png","element":"img","alt":" πB","inline":true,"padRight":true},{"text":"almost perfectly mimics the behavior of the baseline behavior ","element":"span"},{"style":{"height":14.7},"width":123.16,"height":36.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/22-11.png","element":"img","alt":" πT . In","inline":true,"padRight":true},{"text":"the domain, the performance of the IB","element":"span"},{"style":{"fontStyle":"italic"},"text":"1 ","element":"span"},{"text":"approach is also similar.","element":"span"}],[{"text":"Figure 12 (a) shows the results for different risk configurations obtained by the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"improving the learned baseline behavior step","element":"span"},{"text":". 
For each risk configuration, two different learning pro-","element":"span"}],[{"style":{"width":"96%"},"width":1661,"height":608,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/23-0.png","element":"img"}],[{"text":"Figure 12: Improving the learned baseline behavior step in car parking problem: (a) Cumulative reward per episode for different risk configurations (","element":"figcaption","subtype":"caption"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/23-1.png","element":"img","alt":"σ","inline":true},{"text":") obtained by PI-SRL. (b) Cumulative reward per episode by the evolutionary RL and risk-sensitive RL approaches. In all cases, any episode ending in failure is marked.","element":"figcaption","subtype":"caption"}],[{"text":"cesses are performed. All trials ending in failure (car hits the wall or obstacle) are marked (blue triangles). The learning processes in Figure 12 (a) demonstrate that the number of failures increases with an increase in the parameter ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/23-2.png","element":"img","alt":" σ","inline":true},{"text":". For a low level of risk (","element":"span"},{"style":{"height":19.13},"width":267.76,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/23-3.png","element":"img","alt":"σ = 9×10−4),","inline":true,"padRight":true},{"text":"although no failures are produced, the performance is nevertheless weak (around the baseline behavior ","element":"span"},{"style":{"height":10.3},"width":48.87,"height":25.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/23-4.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":") and constant throughout the whole of the learning process. 
Additional experiments have demonstrated that increasing the ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/23-5.png","element":"img","alt":" σ","inline":true,"padRight":true},{"text":"value above ","element":"span"},{"style":{"height":15.13},"width":233.93,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/23-6.png","element":"img","alt":" σ = 9×10−2 ","inline":true,"padRight":true},{"text":"increases the number of failures without improving performance. Figure 12 (b) shows the results for the evolutionary and risk-sensitive RL approaches for different ","element":"span"},{"style":{"height":16.4},"width":20,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/23-7.png","element":"img","alt":" ξ","inline":true,"padRight":true},{"text":"values. Regarding the former, the number of failures is higher than that obtained by the PI-SRL approach, while its final performance is similar. In the case of the latter, performance is higher when ","element":"span"},{"style":{"height":17.6},"width":264.61,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/23-8.png","element":"img","alt":" ξ = 1.0 (value","inline":true,"padRight":true},{"text":"maximization), yet the agent consistently crashes the car into the wall.","element":"span"}],[{"text":"Figure 13 shows the mean number of failures (i.e., car collisions) and cumulative reward for each approach over 500 trials with the red circles corresponding to the PI-SRL algorithm, the black triangles to the risk-sensitive approach and the blue square to the evolutionary RL approach. Additionally, Figure 13 shows two asymptotes. 
The horizontal asymptote is established according to the cumulative reward obtained by the highest ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/23-9.png","element":"img","alt":" σ","inline":true,"padRight":true},{"text":"value. The horizontal asymptote indicates that higher ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/23-10.png","element":"img","alt":" σ","inline":true,"padRight":true},{"text":"values increase the number of failures without improving the cumulative reward (which may, in fact, get worse). The vertical asymptote at ","element":"span"},{"style":{"height":16.4},"width":1120.78,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/23-11.png","element":"img","alt":" Failures = 0 indicates that reducing the risk parameter σ","inline":true,"padRight":true},{"text":"does not reduce the number of failures. ","element":"span"},{"text":"Figure 13 also shows the performance for two additional risk levels, a very high level of risk (","element":"span"},{"style":{"height":15.13},"width":278.98,"height":37.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/23-12.png","element":"img","alt":"σ = 9 × 10−1","inline":true},{"text":") and very low level of risk (","element":"span"},{"style":{"height":17.6},"width":483.34,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/23-13.png","element":"img","alt":"σ = 0), with respect to","inline":true,"padRight":true},{"text":"Figure 12. 
When using a very low level of risk ","element":"span"},{"style":{"height":15.6},"width":770.21,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/23-14.png","element":"img","alt":" σ = 0, no additional random Gaussian","inline":true,"padRight":true},{"text":"noise is added to the actions and the algorithm is free of failures, although performance does not improve with respect to the safe case-based policy ","element":"span"},{"style":{"height":20.31},"width":50.87,"height":50.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/23-15.png","element":"img","alt":" πθB ","inline":true,"padRight":true},{"text":"learned in the first step ","element":"span"},{"text":"of the algorithm. ","element":"span"},{"text":"PI-SRL with a medium level of risk (","element":"span"},{"style":{"height":15.13},"width":279.66,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/23-16.png","element":"img","alt":"σ = 9 × 10−4","inline":true},{"text":") also is free of failures, yet performance is also slightly improved. The PI-SRL algorithm with high level of risk (","element":"span"},{"style":{"height":15.13},"width":269.38,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/23-17.png","element":"img","alt":"σ = 9 × 10−2","inline":true},{"text":") obtains the highest cumulative reward, 3053.37, with a mean of","element":"span"}],[{"style":{"width":"52%"},"width":902,"height":645,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/24-0.png","element":"img"}],[{"text":"Figure 13: Mean number of failures (car collisions) and cumulative reward over 500 trials for each approach in car parking task. The means have been computed from 10 different executions.","element":"figcaption","subtype":"caption"}],[{"text":"78.8 failures. 
However, when using a very high level of risk (","element":"span"},{"style":{"height":15.13},"width":249.03,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/24-1.png","element":"img","alt":"σ = 9 × 10−1","inline":true},{"text":"), the number of failures greatly increases and, consequently, the cumulative reward decreases. As shown in Figure 12, PI-SRL with high risk (","element":"span"},{"style":{"height":15.13},"width":244.3,"height":37.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/24-2.png","element":"img","alt":"σ = 9 × 10−2","inline":true},{"text":") and the evolutionary RL approach obtain a similar performance, while PI-SRL demonstrates a faster convergence (thus, in Figure 13, the cumulative reward obtained by PI-SRL is higher). The Pareto comparison criterion can be used to compare the solutions in Figure 13. Using this principle, one solution ","element":"span"},{"style":{"height":16.4},"width":190.18,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/24-3.png","element":"img","alt":" y∗ strictly","inline":true,"padRight":true},{"text":"dominates (or “is preferred to”) a solution ","element":"span"},{"style":{"fontStyle":"italic"},"text":"y ","element":"span"},{"text":"if each parameter of ","element":"span"},{"style":{"height":16.33},"width":39.96,"height":40.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/24-4.png","element":"img","alt":" y∗ ","inline":true,"padRight":true},{"text":"is not strictly worse than the corresponding parameter of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"y ","element":"span"},{"text":"and at least one parameter is strictly better. 
This is written as ","element":"span"},{"style":{"height":16.33},"width":122.07,"height":40.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/24-5.png","element":"img","alt":" y∗ ≻ y","inline":true},{"text":", indicating that ","element":"span"},{"style":{"height":16.33},"width":39.96,"height":40.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/24-6.png","element":"img","alt":" y∗ ","inline":true,"padRight":true},{"text":"strictly dominates ","element":"span"},{"style":{"fontStyle":"italic"},"text":"y","element":"span"},{"text":". In accordance with the Pareto principle, we can assume the points in Figure 13 corresponding to the PI-SRL solutions, save PI-SRL with very high level of risk, to be on the Pareto frontier, since these points are not strictly dominated by any other solution (i.e., no other solution has, at the same time, a higher cumulative reward and a lower number of failures than PI-SRL). In this domain, the solution of the PI-SRL with a medium level of risk strictly dominates (or “is preferred to”) the risk-sensitive solutions (PI-SRL ","element":"span"},{"style":{"height":15.53},"width":289.42,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/24-7.png","element":"img","alt":" σ = 9 × 10−3 ≻","inline":true,"padRight":true},{"text":"risk-sensitive) and the solution PI-SRL with a high level of risk strictly dominates the solution of the evolutionary RL solution (PI-SRL ","element":"span"},{"style":{"height":15.53},"width":294.86,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/24-8.png","element":"img","alt":" σ = 9 × 10−2 ≻","inline":true,"padRight":true},{"text":"evolutionary RL).","element":"span"}],[{"text":"Nevertheless, it is important to note that any ultimate decision about which approach in Figure 13 is best depends on the criteria of the researcher. 
If, for instance, the minimization of the number of failures is deemed the most important optimization criterion (independently of the improvement obtained with respect to the baseline behavior ","element":"span"},{"style":{"height":17.6},"width":155.25,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/24-9.png","element":"img","alt":" πT ), the","inline":true,"padRight":true},{"text":"best approach will be PI-SRL with a low level of risk (","element":"span"},{"style":{"height":15.13},"width":266.88,"height":37.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/24-10.png","element":"img","alt":"σ = 9 × 10−4","inline":true},{"text":"). Similarly, if the maximization of the cumulative reward is instead judged to be the most important optimization criterion (independently of the number of failures generated), the best approach will be PI-SRL with a high level of risk (","element":"span"},{"style":{"height":19.13},"width":277.72,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/24-11.png","element":"img","alt":"σ = 9 × 10−2).","inline":true}],[{"text":"Figure 14 shows the evolution of the cases in the case-base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"(known space) in different trials for a high-risk learning process. 
Each graph presents the set of known states Ω (green","element":"span"}],[{"style":{"width":"92%"},"width":1607,"height":1256,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/25-0.png","element":"img"}],[{"text":"Figure 14: Car parking problem: Evolution of the known space for different trials ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"T ","element":"figcaption","subtype":"caption"},{"text":"= 0 (a), ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"T ","element":"figcaption","subtype":"caption"},{"text":"= 50 (b), ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"T ","element":"figcaption","subtype":"caption"},{"text":"= 100 (c) and ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"T ","element":"figcaption","subtype":"caption"},{"text":"= 200 (d) in a high-risk learning process (","element":"figcaption","subtype":"caption"},{"style":{"height":15.13},"width":255.13,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/25-1.png","element":"img","alt":"σ = 9 × 10−2","inline":true},{"text":"). Each graph corresponds to the situation of the state space in accordance with the case-base ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"B ","element":"figcaption","subtype":"caption"},{"text":"in trial ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"T","element":"figcaption","subtype":"caption"},{"text":".","element":"figcaption","subtype":"caption"}],[{"text":"area), error states Φ (red area), unknown states Υ (yellow area) and non-error states Γ","element":"span"},{"style":{"height":9.2},"width":25,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/25-2.png","element":"img","alt":"Ω","inline":true,"padRight":true},{"text":"(orange circles). 
PI-SRL adapts the known space in order to find safer and better policies to complete the task. Figure 14 (a) shows the initial situation of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"(corresponding to the previously-learned safe case-based policy ","element":"span"},{"style":{"height":20.31},"width":50.87,"height":50.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/25-3.png","element":"img","alt":" πθB","inline":true},{"text":"). It is robust in the sense that it never results in ","element":"span"},{"text":"any collisions, but suboptimal (it selects the longest parking path driving around the upper side of the obstacle). As the learning process progresses (Figure 14 (b)), PI-SRL finds a shorter path to park the car in the garage along the upper side of the obstacle (increasing the performance), but which comes closer to the obstacle than before (increasing the probability of collisions). In Figure 14 (c), PI-SRL finds a new and even shorter path, this time along the lower side of the obstacle. However, there are still cases in the case-base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"corresponding to the older path along the upper side of the obstacle (so Figure 14 (c) indicates two paths to park the car). Finally, in Figure 14 (d), the cases corresponding to the suboptimal path along the upper side of the obstacle have been removed from ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"and replaced by new cases corresponding to the safe and improved path along the lower side of the obstacle. In other words, PI-SRL adapts the known space through the exploration of the unknown space in order to find new and improved behaviors. 
During this process of adjusting the known space to safe and better policies, the algorithm “forgets” the previously-learned, yet ineffective known states.","element":"span"}],[{"text":"In the following experiment, it becomes apparent that if the domain is noisy enough, even when taking no risk at all (i.e., no further noise added to the actuator for exploration), the agent could nevertheless perform poorly and constantly produce collisions. The experiment also serves to explain why domain noise can never be sufficient for the efficient exploration of the space without action selection noise. In the experiment, we have intentionally added more noise to the actuators and have performed the second step of PI-SRL again, however this time taking no risk (i.e., ","element":"span"},{"style":{"height":17.6},"width":1244.09,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/26-0.png","element":"img","alt":" σ = 0). In this test, we have added random Gaussian noise with","inline":true,"padRight":true},{"text":"a standard deviation of 0.3, rather than the standard deviation of 0.1 used previously, to the actuators. Figure 15 shows two executions of the second step (improving the learned baseline policy) of the PI-SRL algorithm with the x-axis indicating the number of trials, the y-axis the cumulative reward per episode and failures (i.e., collisions) marked as blue triangles. In the experiments in Figure 12 (b), the case-based policy ","element":"span"},{"style":{"height":10.3},"width":50.88,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/26-1.png","element":"img","alt":" πB","inline":true,"padRight":true},{"text":"with low level of risk (","element":"span"},{"style":{"height":15.13},"width":260.14,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/26-2.png","element":"img","alt":"σ = 9 × 10−4","inline":true},{"text":") never produces failures. 
In contrast, in the experiments shown in Figure 15, the same case-based policy ","element":"span"},{"style":{"height":10.3},"width":50.88,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/26-3.png","element":"img","alt":" πB","inline":true,"padRight":true},{"text":"continually collides with the wall although the risk parameter is set to 0 (","element":"span"},{"style":{"height":17.6},"width":1224.89,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/26-4.png","element":"img","alt":"σ = 0). Furthermore, an increase in the performance can also be","inline":true,"padRight":true},{"text":"detected.","element":"span"}],[{"style":{"width":"53%"},"width":922,"height":640,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/26-5.png","element":"img"}],[{"text":"Figure 15: Improving the learned baseline behavior step of car parking task: Two learning processes for risk configuration ","element":"figcaption","subtype":"caption"},{"style":{"height":12.8},"width":819.22,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/26-6.png","element":"img","alt":" σ = 0 and an increase in the noise in the","inline":true,"padRight":true},{"text":"actuators.","element":"figcaption","subtype":"caption"}],[{"text":"The increase of noise in the actuators in the second step of the algorithm with respect to the first step (the case-based policy ","element":"span"},{"style":{"height":10.3},"width":50.88,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/26-7.png","element":"img","alt":" πB","inline":true,"padRight":true},{"text":"is learned in the first step using Gaussian random noise in the actuator with a standard deviation of 0.1, while the second step is performed using Gaussian random noise in the actuator with a standard deviation of 0.3) takes the agent beyond the known space of the case-base 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"learnt in the first step of PI-SRL and allows it to find new trajectories for parking the car in the garage. In this new situation, the exploration process is guided as follows. If a known state is reached, the agent performs the action ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a ","element":"span"},{"text":"retrieved from ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"without the addition of Gaussian noise, since the risk parameter ","element":"span"},{"style":{"height":17.6},"width":1727.98,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/26-8.png","element":"img","alt":"σ = 0 (see line 11 in Figure 8 algorithm). If an unknown state is reached, the agent performs","inline":true,"padRight":true},{"text":"the action ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a ","element":"span"},{"text":"advised by the baseline behavior ","element":"span"},{"style":{"height":10.3},"width":48.88,"height":25.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/27-0.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"(see line 15). ","element":"span"},{"text":"Using this exploration process, if a new and better trajectory is found for parking the car in the garage, the resulting cases in the episode corresponding to unknown states are added to the case-base (see line 37), slightly improving the performance in Figure 15. 
It is important to note that the replacement of cases (see line 34) does not change the actions in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B","element":"span"},{"text":", since these are replaced by the same action previously retrieved from ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"plus a certain amount of Gaussian noise with standard deviation ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/27-1.png","element":"img","alt":" σ","inline":true,"padRight":true},{"text":"(see line 11). Nevertheless, given that the risk parameter ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/27-2.png","element":"img","alt":" σ","inline":true,"padRight":true},{"text":"has been set to 0, the actions retrieved from the case-base are not replaced. This exploration process, however, with ","element":"span"},{"style":{"height":17.6},"width":1280.86,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/27-3.png","element":"img","alt":" σ = 0 (i.e., taking no risk) does not lead to optimal behavior since:","inline":true}],[{"style":{"fontStyle":"italic"},"text":"• ","element":"span"},{"text":"The actions performed in unknown situations and added to the case-base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"are performed using the baseline behavior ","element":"span"},{"style":{"height":10.3},"width":48.88,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/27-4.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"which is supposed to perform suboptimal actions (see definition of baseline behavior).","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"• ","element":"span"},{"text":"The actions in the cases of 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"are not replaced with improved actions. The Gaussian noise with standard deviation ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/27-5.png","element":"img","alt":" σ","inline":true,"padRight":true},{"text":"is used to explore different and better actions than those provided by ","element":"span"},{"style":{"height":15.1},"width":182.54,"height":37.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/27-6.png","element":"img","alt":" B and πT","inline":true,"padRight":true},{"text":"; however, in this case, the risk parameter is set to ","element":"span"},{"style":{"height":12},"width":106.7,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/27-7.png","element":"img","alt":" σ = 0","inline":true,"padRight":true},{"text":"and new and better actions are not discovered.","element":"span"}],[{"text":"Additional experiments demonstrate that PI-SRL behaves much worse when a higher value of noise is used in the actuators (with collisions in all episodes). We assume that taking no risk (i.e., ","element":"span"},{"style":{"height":17.6},"width":1480.47,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/27-8.png","element":"img","alt":" σ = 0) implies always performing the same actions while not discovering any","inline":true,"padRight":true},{"text":"newer or better actions than those provided by the learned case-base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"and the baseline behavior ","element":"span"},{"style":{"height":10.3},"width":48.88,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/27-9.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":". 
In PI-SRL, the replacements in the case-base are executed towards the more promising action which, in our case, is that which guarantees a higher return. ","element":"span"},{"text":"This is why exploration is necessary in order to obtain (near-)optimal behavior, since without exploration, new and better actions are not discovered and PI-SRL performance is limited by that of the case-based policy learned in the first step ","element":"span"},{"style":{"height":10.3},"width":50.88,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/27-10.png","element":"img","alt":" πB","inline":true,"padRight":true},{"text":"and the baseline behavior ","element":"span"},{"style":{"height":10.3},"width":48.88,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/27-11.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"which, one must remember, is intended to perform suboptimal policies.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"4.2 Pole-Balancing","element":"span"}],[{"text":"As the name suggests, the objective in the pole-balancing problem is to balance a pole vertically on top of a moving cart (Sutton & Barto, 1998). 
The state description consists of a four-dimensional vector containing the angle ","element":"span"},{"style":{"height":16.4},"width":26,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/27-12.png","element":"img","alt":" φ","inline":true},{"text":", the radial speed ","element":"span"},{"style":{"height":16.4},"width":42,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/27-13.png","element":"img","alt":" φ′","inline":true},{"text":", the cart position ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x ","element":"span"},{"text":"and the speed ","element":"span"},{"style":{"height":8.4},"width":40.94,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/27-14.png","element":"img","alt":" x′","inline":true},{"text":". The action consists of a real-valued force that is used to push the cart. In this study, the reward is computed to encourage actions that keep the pole as upright as possible on the cart and the cart as centered as possible on the track. 
Thus, the reward in step ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"is computed as ","element":"span"},{"style":{"height":17.6},"width":796.79,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/27-15.png","element":"img","alt":" rt = 1 − (ς(φt) + ρ(xt))/2, where ς and ρ","inline":true,"padRight":true},{"text":"are normalizing functions scaling the angle ","element":"span"},{"style":{"height":16.4},"width":38,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/27-16.png","element":"img","alt":" φt","inline":true,"padRight":true},{"text":"and the position ","element":"span"},{"style":{"height":10.62},"width":36.94,"height":26.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/27-17.png","element":"img","alt":" xt","inline":true,"padRight":true},{"text":"to a range [0","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"1]. An episode is composed of 10,000 steps, although it may nevertheless end prematurely if the pole becomes unbalanced (i.e., if it has an inclination of more than twelve degrees in either direction) or the cart falls off the track (i.e., if it is more than 2.4m from the center of the track), both of which being considered failures. 
As in the car parking problem, Gaussian noise was added to the actions and rewards, this time with a standard deviation of 10","element":"span"},{"style":{"height":14.73},"width":57.28,"height":36.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/27-18.png","element":"img","alt":"−4.","inline":true,"padRight":true},{"text":"The pole-balancing domain becomes stochastic through the addition of this noise to the actuators and reward function.","element":"span"}],[{"style":{"width":"96%"},"width":1665,"height":620,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/28-0.png","element":"img"}],[{"text":"Figure 16: Modeling baseline behavior step in pole-balancing task: (a) Number of steps per trial executed by case-base ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"B ","element":"figcaption","subtype":"caption"},{"text":"and baseline behavior ","element":"figcaption","subtype":"caption"},{"style":{"height":10.3},"width":48.88,"height":25.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/28-1.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":". 
(b) Cumulative reward per trial for ","element":"figcaption","subtype":"caption"},{"style":{"height":10.3},"width":48.88,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/28-2.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":", the learned safe case-based policy ","element":"figcaption","subtype":"caption"},{"style":{"height":20.31},"width":50.88,"height":50.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/28-3.png","element":"img","alt":" πθB ","inline":true,"padRight":true},{"text":"and an IBL approach.","element":"figcaption","subtype":"caption"}],[{"text":"The hand-made baseline behavior ","element":"span"},{"style":{"height":10.3},"width":48.87,"height":25.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/28-4.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"demonstrates the execution of a safe, yet suboptimal policy, with an average cumulative reward per episode/trial of 9292.","element":"span"}],[{"text":"In the modeling baseline behavior step of PI-SRL, the safe case-based policy ","element":"span"},{"style":{"height":20.31},"width":217.23,"height":50.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/28-5.png","element":"img","alt":" πθB is learnt","inline":true,"padRight":true},{"text":"from demonstrations provided by the baseline behavior ","element":"span"},{"style":{"height":16.4},"width":215.84,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/28-6.png","element":"img","alt":" πT . θ and η","inline":true,"padRight":true},{"text":"were computed following the procedure described in subsection 3.3, with values of 0.02 and 12572, respectively. Figure 16 (a) shows two different learning processes for the modeling baseline behavior step. 
For each learning process, Figure 16 (a) shows the number of steps per trial executed by baseline behavior ","element":"span"},{"style":{"height":10.3},"width":48.88,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/28-7.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"(continuous red lines) and by the case-base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"(dashed green lines). At the beginning of the learning process, the case-base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"is empty and all steps are performed using the baseline behavior ","element":"span"},{"style":{"height":10.3},"width":48.87,"height":25.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/28-8.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":". As the learning process progresses, however, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"is filled and the safe case-based policy ","element":"span"},{"style":{"height":20.31},"width":50.87,"height":50.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/28-9.png","element":"img","alt":" πθB ","inline":true,"padRight":true},{"text":"is learnt. At the end of the learning process (after around ","element":"span"},{"text":"45-50 trials), almost all steps are performed using the cases in ","element":"span"},{"style":{"height":15.1},"width":184.45,"height":37.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/28-10.png","element":"img","alt":" B and πT","inline":true,"padRight":true},{"text":"is rarely used. It is important to note that the modeling baseline behavior step has been performed without failures (i.e., pole disequilibrium or cart off the track) in each case. 
As with the previous task, Figure 16 (b) represents three independent execution processes using the previously-learned safe case-based policy ","element":"span"},{"style":{"height":20.31},"width":50.88,"height":50.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/28-11.png","element":"img","alt":" πθB ","inline":true,"padRight":true},{"text":"(derived from ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"and indicated with dashed green lines), ","element":"span"},{"text":"the baseline behavior ","element":"span"},{"style":{"height":10.3},"width":48.87,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/28-12.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"(indicated with continuous red lines) and an approach based on IBL (indicated with dashed blue lines) (Aha & Kibler, 1991). ","element":"span"},{"text":"The average cumulative reward per episode in ","element":"span"},{"style":{"height":20.31},"width":50.87,"height":50.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/28-13.png","element":"img","alt":" πθB ","inline":true,"padRight":true},{"text":"is 9230 (Figure 16 [b]). 
While ","element":"span"},{"style":{"height":20.31},"width":50.87,"height":50.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/28-14.png","element":"img","alt":" πθB ","inline":true,"padRight":true},{"text":"almost perfectly clones ","element":"span"},{"style":{"height":15.6},"width":138.84,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/28-15.png","element":"img","alt":" πT , the","inline":true,"padRight":true},{"text":"IB","element":"span"},{"style":{"fontStyle":"italic"},"text":"1 ","element":"span"},{"text":"approach which, in most cases, results in pole disequilibrium or the cart falling off the track averages a cumulative reward per episode of 8055.","element":"span"}],[{"text":"Figure 17 (a) shows the results of PI-SRL for different risk configurations. For each configuration, the learning curves are shown for two different learning processes performed. Additionally, any episode ending in failure is marked (blue triangles). While an increase in risk increases the probability of failure, the policy obtained is nevertheless better in terms of the cumulative reward. Nevertheless, much greater risk values (","element":"span"},{"style":{"height":15.14},"width":237.5,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/28-16.png","element":"img","alt":"σ = 9×10−5","inline":true},{"text":") produce more failures without an accompanying increase in the cumulative reward. 
Figure 17 (b) shows the results for the evolutionary and risk-sensitive RL approaches, the former of which being","element":"span"}],[{"style":{"width":"95%"},"width":1658,"height":618,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/29-0.png","element":"img"}],[{"text":"Figure 17: Improving the learned baseline behavior step of pole-balancing task: (a) Cumulative reward per episode for different risk configurations (","element":"figcaption","subtype":"caption"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/29-1.png","element":"img","alt":"σ","inline":true},{"text":") obtained by PI-SRL. (b) Cumulative reward per episode obtained by the evolutionary and risk-sensitive RL approaches. In all cases, any episode ending in failure is marked.","element":"figcaption","subtype":"caption"}],[{"text":"clearly the algorithm with the greatest number of failures. In the risk-sensitive approach, for ","element":"span"},{"style":{"height":16.4},"width":124.33,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/29-2.png","element":"img","alt":" ξ = 2.","inline":true},{"text":"0 (value maximization), the agent selects actions that result in a higher value, but also in a higher risk. On the contrary, for ","element":"span"},{"style":{"height":17.6},"width":818.18,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/29-3.png","element":"img","alt":" ξ = 0 (risk minimization), when the agent","inline":true,"padRight":true},{"text":"learns the risk function (at around episode 6000), it selects actions with a lower risk (and a lower number of failures), but also with considerably weak performance. 
The value ","element":"span"},{"style":{"height":16.4},"width":135.22,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/29-4.png","element":"img","alt":" ξ = 0.1","inline":true,"padRight":true},{"text":"produces an intermediate policy. Consequently, it can be concluded that PI-SRL with a high level of risk obtains better policies and fewer failures than the evolutionary or risk-sensitive RL approaches. Figure 18 reinforces the previous conclusions.","element":"span"}],[{"style":{"width":"51%"},"width":896,"height":641,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/29-5.png","element":"img"}],[{"text":"Figure 18: Mean number of failures (pole disequilibrium or cart off the track) and cumulative reward during 500 trials for each approach in the pole-balancing task. The means have been computed from 10 different executions.","element":"figcaption","subtype":"caption"}],[{"text":"In it, the mean number of failures and cumulative reward during 12,000 trials are shown, with the red circles corresponding to PI-SRL, the black triangles corresponding to the risk-sensitive approach and the blue square corresponding to the evolutionary RL approach. The figure also shows performance for two additional risk levels, a very high level of risk (","element":"span"},{"style":{"height":15.13},"width":237.26,"height":37.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/30-0.png","element":"img","alt":"σ = 9×10−4","inline":true},{"text":") and very low level of risk (","element":"span"},{"style":{"height":17.6},"width":948.14,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/30-1.png","element":"img","alt":"σ = 0), with respect to Figure 17. 
The cumulative","inline":true,"padRight":true},{"text":"reward and number of failures increase with the high level of risk (","element":"span"},{"style":{"height":19.13},"width":409.08,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/30-2.png","element":"img","alt":"σ = 9 × 10−5). This","inline":true,"padRight":true},{"text":"risk level represents an inflection point at which higher levels of risk produce more failures without an accompanying improvement in the cumulative reward. In fact, the very high level of risk (","element":"span"},{"style":{"height":15.13},"width":243.73,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/30-3.png","element":"img","alt":"σ = 9 × 10−4","inline":true},{"text":") results in a reduction in the cumulative reward when compared with the high level of risk (","element":"span"},{"style":{"height":15.13},"width":249.02,"height":37.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/30-4.png","element":"img","alt":"σ = 9 × 10−5","inline":true},{"text":"). Again, the Pareto comparison criterion may be used to compare the solutions from Figure 18. 
In this domain, the solution from PI-SRL with a low level of risk strictly dominates the risk-sensitive solutions with ","element":"span"},{"style":{"height":16.4},"width":392.09,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/30-5.png","element":"img","alt":" ξ = 0.0 and ξ = 0.1,","inline":true,"padRight":true},{"text":"such that PI-SRL ","element":"span"},{"style":{"height":15.53},"width":290.87,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/30-6.png","element":"img","alt":" σ = 9 × 10−7 ≻","inline":true,"padRight":true},{"text":"risk-sensitive with ","element":"span"},{"style":{"height":16.4},"width":345.53,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/30-7.png","element":"img","alt":" ξ = 0.0 and ξ = 0.","inline":true},{"text":"1. Additionally, the solution from PI-SRL with a high level of risk strictly dominates evolutionary RL solution, such that PI-SRL ","element":"span"},{"style":{"height":15.53},"width":294.86,"height":38.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/30-8.png","element":"img","alt":" σ = 9 × 10−5 ≻","inline":true,"padRight":true},{"text":"evolutionary RL.","element":"span"}],[{"text":"Lastly, Figure 19 shows the evolution of the known space derived from the case-base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"in different trials for a high-risk learning process. For each graph, error states Φ (red area), the set of unknown states Υ (yellow area), the set of known states Ω (green area) and the set of non-error states Γ","element":"span"},{"style":{"height":9.2},"width":25,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/30-9.png","element":"img","alt":"Ω","inline":true,"padRight":true},{"text":"(orange circles) are represented. 
The known space Ω in each graph has been computed taking cases from ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"in the trials ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T ","element":"span"},{"text":"= 0","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"3000","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"6000 and 8000. For each graph, non-error states Γ","element":"span"},{"style":{"height":9.2},"width":25,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/30-10.png","element":"img","alt":"Ω","inline":true,"padRight":true},{"text":"have been computed from 10 different executions of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"in the trial ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T ","element":"span"},{"text":"(the orange circles representing the terminal states for each of these executions). The first graph (Figure 19 [a]) presents the initial known space resulting from the modeling baseline behavior step. The evolution in Figure 19 demonstrates two different points. First, PI-SRL progressively adapts the known space in order to encounter better behavior such that the known space tends to be compressed toward the center of the coordinates. This is so due to the fact that the reward is greater if the angle ","element":"span"},{"style":{"height":16.4},"width":26,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/30-11.png","element":"img","alt":" φ","inline":true,"padRight":true},{"text":"of the pole and the cart position ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x ","element":"span"},{"text":"are 0 (i.e., the pole is as upright as possible on the cart and the cart is centered on the track). 
Second, the risk of failure in the pole-balancing domain is greater during early trials of the learning process. At the beginning of the learning process (Figure 19 [a]), ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T ","element":"span"},{"text":"= 0), some regions of the known space are close to the error space. In this situation, slight modifications of the actions consistently produce visits to the states in Φ (i.e., pole disequilibrium or cart falling off the track). As the learning process advances (Figure 19 [b], [c] and [d]), the known space is compressed toward the origin of coordinates and away from the error space. Consequently, the probability of visiting error states decreases. For example, returning to Figure 17 (a), in the high-risk learning processes, 52% of the failures (126) occur in the first 4000 trials, while the remaining 48% (117) occur in the last 8000 trials.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"4.3 Helicopter Hovering","element":"span"}],[{"text":"As suggested by its name, the objective of this domain is to make a helicopter hover as close as possible to a defined position for a duration established by an episode. The task is challenging for two main reasons. 
Firstly, both the state and action spaces are high-dimensional and continuous (more specifically, the state space is 12-dimensional and the action space","element":"span"}],[{"style":{"width":"95%"},"width":1650,"height":1314,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/31-0.png","element":"img"}],[{"text":"Figure 19: Pole-balancing task: Evolution of the known space for different trials ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"T ","element":"figcaption","subtype":"caption"},{"text":"= 0 (a),","element":"figcaption","subtype":"caption"}],[{"style":{"width":"87%"},"width":1511,"height":151,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/31-1.png","element":"img"}],[{"text":"4-dimensional). Secondly, it is a generalized domain whose behavior is modified by the wind factor. A helicopter episode is composed of 6000 steps, although it may end prematurely if the helicopter crashes. The first step of PI-SRL is performed in order to imitate the baseline behavior ","element":"span"},{"style":{"height":16.4},"width":242.19,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/31-2.png","element":"img","alt":" πT . θ and η","inline":true,"padRight":true},{"text":"were computed following the procedure described in subsection 3.3 with values of 0.3 and 49735, respectively. 
Once this step has been performed, the resulting safe case-based policy ","element":"span"},{"style":{"height":20.31},"width":50.87,"height":50.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/31-3.png","element":"img","alt":" πθB ","inline":true,"padRight":true},{"text":"is able to properly imitate the baseline behavior ","element":"span"},{"style":{"height":10.3},"width":63.29,"height":25.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/31-4.png","element":"img","alt":" πT .","inline":true}],[{"text":"Figure 20 (a) shows two learning processes of the modeling baseline behavior step. Similar to previous tasks, as the learning processes progress, the number of steps executed by the baseline behavior ","element":"span"},{"style":{"height":10.3},"width":48.88,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/31-5.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"is reduced while the number of steps using the case-base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"increases. By the end of the learning process, the case-base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"stores the safe case-based policy ","element":"span"},{"style":{"height":20.31},"width":50.87,"height":50.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/31-6.png","element":"img","alt":" πθB","inline":true},{"text":". 
Figure 20 (b) compares the performance (in terms of cumulative reward per ","element":"span"},{"text":"episode) of ","element":"span"},{"style":{"height":10.3},"width":48.87,"height":25.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/31-7.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":", the learned case-based policy ","element":"span"},{"style":{"height":20.31},"width":310.21,"height":50.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/31-8.png","element":"img","alt":" πθB and the IB1","inline":true,"padRight":true},{"text":"approach. Regarding the ","element":"span"},{"text":"first two, the average cumulative reward per episode of ","element":"span"},{"style":{"height":10.3},"width":48.87,"height":25.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/31-9.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"is -78035.93, while that obtained by ","element":"span"},{"style":{"height":20.31},"width":50.88,"height":50.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/31-10.png","element":"img","alt":" πθB ","inline":true,"padRight":true},{"text":"is -85130.11. 
Although the ","element":"span"},{"style":{"height":20.31},"width":50.87,"height":50.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/31-11.png","element":"img","alt":" πθB ","inline":true,"padRight":true},{"text":"does not perfectly mimic the baseline behavior ","element":"span"},{"style":{"height":10.8},"width":63.29,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/31-12.png","element":"img","alt":" πT ,","inline":true}],[{"style":{"width":"96%"},"width":1665,"height":618,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/32-0.png","element":"img"}],[{"text":"Figure 20: Modeling baseline behavior step of helicopter hovering task: (a) Number of steps per trial executed by case-base ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"B ","element":"figcaption","subtype":"caption"},{"text":"and baseline behavior ","element":"figcaption","subtype":"caption"},{"style":{"height":10.3},"width":48.88,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/32-1.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":". (b) Cumulative reward per trial by ","element":"figcaption","subtype":"caption"},{"style":{"height":10.3},"width":48.87,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/32-2.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":", the learned safe case-based policy ","element":"figcaption","subtype":"caption"},{"style":{"height":20.31},"width":294.81,"height":50.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/32-3.png","element":"img","alt":" πθB and an IBL","inline":true,"padRight":true},{"text":"approach.","element":"figcaption","subtype":"caption"}],[{"text":"it nevertheless performs a safe policy without crashing the helicopter. 
With regard to the training process of the IB","element":"span"},{"style":{"fontStyle":"italic"},"text":"1 ","element":"span"},{"text":"approach, every case produced during 15 episodes by the baseline behavior ","element":"span"},{"style":{"height":10.3},"width":48.87,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/32-4.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"is stored. Figure 20 (b) demonstrates that the IB","element":"span"},{"style":{"fontStyle":"italic"},"text":"1 ","element":"span"},{"text":"approach consistently results in helicopter crashes, with a performance extremely far from that of the learned safe case-based policy ","element":"span"},{"style":{"height":20.31},"width":50.88,"height":50.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/32-5.png","element":"img","alt":" πθB","inline":true},{"text":". Improvement of the policy ","element":"span"},{"style":{"height":20.31},"width":50.87,"height":50.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/32-6.png","element":"img","alt":" πθB ","inline":true,"padRight":true},{"text":"begins when the state-action space is safely ","element":"span"},{"text":"explored through the execution of step two of PI-SRL.","element":"span"}],[{"text":"Figure 21 (a) shows the results for different risk levels. While low and medium levels of risk do not produce helicopter crashes in PI-SRL, performance is nevertheless quite weak.","element":"span"}],[{"style":{"width":"94%"},"width":1633,"height":611,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/32-7.png","element":"img"}],[{"text":"Figure 21: Improving the learned baseline behavior step in helicopter hovering task: (a) Cumulative reward per episode for different risk configurations obtained by PI-SRL. (b) Cumulative reward per episode obtained by evolutionary and risk-sensitive RL approaches. 
In all cases, any episode ending in failure is marked.","element":"figcaption","subtype":"caption"}],[{"text":"Conversely, the high level of risk established produces a near-optimal policy with a low number of collisions. Extensive experimentation demonstrates that increasing the risk parameter ","element":"span"},{"style":{"height":15.13},"width":265.67,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/33-0.png","element":"img","alt":" σ = 9 × 10−3 ","inline":true,"padRight":true},{"text":"also increases the number of crashes without an accompanying improvement in the cumulative reward. Figure 21 (b) shows the results of the evolutionary RL approach which, it should be remembered, was selected winner of the RL Competition 2009 in the same domain (Mart´ın H. & Lope, 2009), as well as the risk-sensitive RL algorithm for different ","element":"span"},{"style":{"height":16.4},"width":20,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/33-1.png","element":"img","alt":" ξ","inline":true,"padRight":true},{"text":"values. A comparison of the results between the evolutionary RL approach and PI-SRL shows a similar cumulative reward, while also a significantly higher number of crashes from the former than from the latter. In the evolutionary approach, all crashes occur in the early steps of the learning process; while in PI-SRL, accidents occur at more advanced steps of the learning process. In the case of the risk-sensitive RL algorithm, for ","element":"span"},{"style":{"height":16.4},"width":320.92,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/33-2.png","element":"img","alt":"ξ = 0 and ξ = 0.","inline":true},{"text":"01 the risk function is learned at around episode 3000. At this point, the agent selects lower-risk actions and the number of crashes is considerably reduced. 
When ","element":"span"},{"style":{"height":16.4},"width":126.88,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/33-3.png","element":"img","alt":"ξ = 0.","inline":true},{"text":"4 and the agent selects actions resulting in higher values without taking risk into account, performance improves, but at the expense of an increased number of accidents. Nevertheless and whatever the ","element":"span"},{"style":{"height":16.4},"width":20,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/33-4.png","element":"img","alt":" ξ","inline":true,"padRight":true},{"text":"value, the number of crashes is higher and the performance is worse than with PI-SRL.","element":"span"}],[{"style":{"width":"50%"},"width":877,"height":627,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/33-5.png","element":"img"}],[{"text":"Figure 22: Mean number of failures (helicopter crashes) and cumulative reward during 5000 episodes for each approach to the helicopter hovering task. The means have been computed from 10 different executions.","element":"figcaption","subtype":"caption"}],[{"text":"The information from Figure 22, indicating the mean number of failures and cumulative reward over 5000 episodes for each approach, complements the conclusions made above. The data has been computed from 10 independent executions of each approach. ","element":"span"},{"text":"As in previous domains, PI-SRL is indicated by red circles, the risk-sensitive approach by the black triangles and the evolutionary RL approach by the blue square. 
Figure 22 also shows the performance for two additional risk levels, a very high level of risk (","element":"span"},{"style":{"height":19.13},"width":289.16,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/33-6.png","element":"img","alt":"σ = 9 × 10−2)","inline":true,"padRight":true},{"text":"and a very low level risk (","element":"span"},{"style":{"height":17.6},"width":697.4,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/33-7.png","element":"img","alt":"σ = 0), with respect to Figure 21.","inline":true,"padRight":true},{"text":"Figure 22 demonstrates that the evolutionary RL approach obtains the highest cumulative reward (","element":"span"},{"style":{"height":19.13},"width":259.55,"height":47.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/33-8.png","element":"img","alt":"−7.13 × 107),","inline":true,"padRight":true},{"text":"followed closely by PI-SRL (","element":"span"},{"style":{"height":15.13},"width":222.6,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/33-9.png","element":"img","alt":"−7.57 × 107","inline":true},{"text":"). The other approaches are far from these results. 
Regarding the number of failures (i.e., helicopter crashes), as PI-SRL with a very low level of risk (","element":"span"},{"style":{"height":19.13},"width":752.48,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/33-10.png","element":"img","alt":"σ = 0), a low level of risk (σ = 9 × 10−5","inline":true},{"text":") and a medium level of risk (","element":"span"},{"style":{"height":19.13},"width":261.79,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/33-11.png","element":"img","alt":"σ = 9 × 10−4)","inline":true,"padRight":true},{"text":"produces no collisions, the PI-SRL algorithm with medium risk is preferable inasmuch as the cumulative reward is higher (","element":"span"},{"style":{"height":15.13},"width":252.66,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/34-0.png","element":"img","alt":"−18.01 × 107","inline":true},{"text":"). Using the Pareto comparison criterion, the PI-SRL solution with a high level of risk strictly dominates the solutions of the risk-sensitive approach (PI-SRL ","element":"span"},{"style":{"height":15.53},"width":284.67,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/34-1.png","element":"img","alt":" σ = 9×10−3 ≻","inline":true,"padRight":true},{"text":"risk-sensitive). Moreover, PI-SRL is not strictly dominated by any other solution.","element":"span"}],[{"style":{"width":"86%"},"width":1490,"height":1277,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/34-2.png","element":"img"}],[{"text":"Figure 23: Evolution of the known space for different episodes in the helicopter hovering task. (a) Example of representation of a single known state in a radar chart. 
(b), (c), and (d) Known states in episodes ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"T ","element":"figcaption","subtype":"caption"},{"text":"= 0, ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"T ","element":"figcaption","subtype":"caption"},{"text":"= 500 and ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"T ","element":"figcaption","subtype":"caption"},{"text":"= 4000, respectively, in a high-risk learning process (","element":"figcaption","subtype":"caption"},{"style":{"height":15.13},"width":253.83,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/34-3.png","element":"img","alt":"σ = 9 × 10−3","inline":true},{"text":"). Each graph corresponds to the situation of the known space according to the case-base ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"B ","element":"figcaption","subtype":"caption"},{"text":"in episode ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"T","element":"figcaption","subtype":"caption"},{"text":".","element":"figcaption","subtype":"caption"}],[{"text":"As with the pole-balancing domain, Figure 23 shows the evolution of the known space according to the case-base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"in different episodes for a high-risk learning process. In this case, radar charts are used due to the high number of features describing the states. A radar chart is a graphical method for displaying multivariate data two-dimensionally. In the Figure, each axis represents one of the features of the state and, to preserve the simplicity of the representation, the charts are generated normalizing the absolute values of the features between 0 and 1. Figure 23 (a) is an example of a representation of a single known state. 
The value of each axis corresponds to the value of an individual feature in a state and a line is drawn connecting the feature values for each axis. While the line in Figure 23 (a) represents a single state, Figures 23 (b), (c) and (d) show the known space according to the case-base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"in episodes 0, 500 and 4000, respectively. These three charts do not represent a single state, but rather all the states in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"for the corresponding episode. Thus, for each graph, the set of known states is marked Ω (green area). A state is considered an error state if a single feature value for that state is greater than 1. The limits (marked by a red line in the graphs) have been computed taking into account that the helicopter crashes if (i) the velocity along any of the main axes exceeds 5 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m/s","element":"span"},{"text":", (ii) the position of the helicopter is off by more than 20 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m","element":"span"},{"text":", (iii) the angular rate around any of the main axes exceeds 2 ","element":"span"},{"style":{"height":17.6},"width":262.79,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/35-0.png","element":"img","alt":" × 2π rad/s or","inline":true,"padRight":true},{"text":"(iv) the orientation is more than 30 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"degrees ","element":"span"},{"text":"from the target orientation. As with previous tasks, Figure 23 indicates two different matters. First, as the learning proceeds, the known space derived from ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"is adjusted to the space used for better and safer policies. 
In the helicopter domain, the agent tries to hover the helicopter as close as possible to a target position (i.e., the origin of coordinates), since the immediate rewards are greater the closer the helicopter hovers to the origin. Thus, the known space starts to expand (Figure 23 [b]) and, progressively, is concentrated at the origin of coordinates (Figure 23 [c] and [d]). With regard to the second matter, the probability of crashing is very low since, from the very beginning, the known space already appears concentrated at the origin and far from the error space (Figure 23 [b]). In other words, from the very beginning, all features of the known space (i.e., forward, sideways and downward velocities; x, y, and z coordinates; x, y and z angular-rates; and x, y and z quaternion) are very far from error space limits, decreasing the probability of visiting an error state.","element":"span"}],[{"text":"In the previous experiments, the second step of PI-SRL has been performed using an initial case-base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"free of failures that is built into the first step of the algorithm. The following experiments show the performance of the second step of PI-SRL when different initial policies are used. Figure 24 (a) shows the performance of these policies used as initial policies. The continuous black line indicates the performance of the initial safe case-based policy ","element":"span"},{"style":{"height":10.3},"width":50.88,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/35-1.png","element":"img","alt":" πB","inline":true},{"text":", with an average cumulative reward per episode of -85,130.11, used in the previous experiments prior to the execution of step two in the algorithm. 
The remaining lines in the Figure correspond to the performance of three different initializations of the case-base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"used in the new experiments, prior to the execution of step two of the algorithm. Using a very poor initial policy (dashed green lines) with which the helicopter crashed in nearly all of the episodes, the average cumulative reward per episode was calculated at -108,548.03. Using a different poor (albeit less poor) initial policy (continuous red lines) with which the helicopter crashed occasionally, the average cumulative reward per episode was -91,723.89. Finally, a near-optimal policy (dashed blue lines) whereby helicopter hovering is free of failures yields an average cumulative reward per episode of -13,940.1.","element":"span"}],[{"text":"The Figure 24 (b) shows performance in the second step (improving the baseline behavior step) of PI-SRL, starting from a case-base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"corresponding to the very poor, poor and the near-optimal policies presented in Figure 24 (a). In Figure 24 (b), the dashed blue lines correspond to the use of a case-base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"containing the near-optimal policy, the continuous red lines correspond to the use of a case-base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"containing the poor policy and the dashed green lines correspond to the use of a case-base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"containing the very poor policy. 
All the experiments in the Figure have been conducted using a high level of risk in the domain","element":"span"}],[{"style":{"width":"96%"},"width":1660,"height":615,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/36-0.png","element":"img"}],[{"text":"Figure 24: (a) The performance of different initial policies in the helicopter hovering task. (b) The performance of different executions of the second step of PI-SRL, each starting from a case-base ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"B ","element":"figcaption","subtype":"caption"},{"text":"containing a policy of three different types: very poor, poor and near-optimal.","element":"figcaption","subtype":"caption"}],[{"text":"(","element":"span"},{"style":{"height":15.13},"width":238.34,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/36-1.png","element":"img","alt":"σ = 9×10−3","inline":true},{"text":"). The graph indicates that with the use of a near-optimal policy for an initial policy and a high level of risk, the case-base does not worsen performance which, in fact, appears to improve slightly. The second step of PI-SRL prevents the degradation of the initial performance of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B","element":"span"},{"text":", since no updates of cases in the case-base are made using bad episodes. In other words, the updates in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"$38","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"(see lines corresponding to a high level of risk, ","element":"span"},{"style":{"height":15.13},"width":253.1,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/36-2.png","element":"img","alt":" σ = 9 × 10−3","inline":true},{"text":", in Figure 21 (a)). 
Finally, the dashed green lines in Figure 24 (b) show that the use of a very poor initial policy with many failures results in decreased performance and a higher number of failures produced, even though it is nevertheless able to learn better behavior. In this case, the algorithm falls into a local minimum, probably biased by the very poor initialization. ","element":"span"},{"text":"In both cases with poor policies, the number of failures is higher at the beginning of the learning process and decreases as the learning process proceeds. While both the poor and very poor initial policies are very close to the error space, this is in stark contrast to the initial policy shown in Figure 23 which, from the very beginning, already appears concentrated at the origin, far from the error space. As the learning process proceeds, the different policies are compressed away from the error space and the number of failures decreases.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"4.4 SIMBA","element":"span"}],[{"text":"Business simulators are powerful tools for improving management decision-making processes. An example of such a tool is the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"SIMulator for Business Administration ","element":"span"},{"text":"(SIMBA) (Borrajo et al., 2010). SIMBA is a competitive simulator, since agents can compete against other agents through their management of different virtual companies. 
","element":"span"},{"text":"$39","element":"span"}],[{"style":{"width":"96%"},"width":1672,"height":619,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/37-0.png","element":"img"}],[{"text":"Figure 25: Modeling baseline behavior step in SIMBA Task: (a) Number of steps per trial executed by case-base ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"B ","element":"figcaption","subtype":"caption"},{"text":"and baseline behavior ","element":"figcaption","subtype":"caption"},{"style":{"height":10.3},"width":48.88,"height":25.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/37-1.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":". (b) Cumulative reward per trial by ","element":"figcaption","subtype":"caption"},{"style":{"height":10.3},"width":48.88,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/37-2.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":", the learned safe case-based policy ","element":"figcaption","subtype":"caption"},{"style":{"height":20.31},"width":50.87,"height":50.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/37-3.png","element":"img","alt":" πθB ","inline":true,"padRight":true},{"text":"and an IBL approach.","element":"figcaption","subtype":"caption"}],[{"text":"Figure 25 (a) shows the evolution of the number of steps executed by the baseline behavior ","element":"span"},{"style":{"height":10.3},"width":48.88,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/37-4.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"and the case-base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"during two learning processes performing the modeling baseline behavior step. 
","element":"span"},{"style":{"height":16.4},"width":143.94,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/37-5.png","element":"img","alt":" θ and η","inline":true,"padRight":true},{"text":"were computed following the procedure described in subsection 3.3 and have values of 1 ","element":"span"},{"style":{"height":15.13},"width":105.28,"height":37.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/37-6.png","element":"img","alt":" × 102 ","inline":true,"padRight":true},{"text":"and 513, respectively. In few episodes (approximately 25), the safe case-based policy ","element":"span"},{"style":{"height":20.3},"width":50.88,"height":50.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/37-7.png","element":"img","alt":" πθB ","inline":true,"padRight":true},{"text":"is learned. Figure 25 (b) shows the performance of the previously- ","element":"span"},{"text":"learned ","element":"span"},{"style":{"height":20.31},"width":132.68,"height":50.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/37-8.png","element":"img","alt":" πθB, πT","inline":true,"padRight":true},{"text":"and the IB","element":"span"},{"style":{"fontStyle":"italic"},"text":"1 ","element":"span"},{"text":"approach. In this study, the mean profits per episode of ","element":"span"},{"style":{"height":10.3},"width":48.88,"height":25.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/37-9.png","element":"img","alt":" πT","inline":true}],[{"style":{"width":"95%"},"width":1649,"height":615,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/38-0.png","element":"img"}],[{"text":"Figure 26: Improving the learned baseline behavior step in SIMBA task: (a) The mean profits per episode for different risk configurations obtained by the PI-SRL agent against five hand-coded agents. 
(b) The mean profits per episode obtained by the evolutionary and risk-sensitive RL agent against five hand-coded agents. In each case, any episode ending in failure (bankruptcy) is noted.","element":"figcaption","subtype":"caption"}],[{"text":"are 5.24 million Euros, while those obtained for ","element":"span"},{"style":{"height":20.31},"width":50.88,"height":50.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/38-1.png","element":"img","alt":" πθB ","inline":true,"padRight":true},{"text":"are 4.02 million Euros. In the IB","element":"span"},{"style":{"fontStyle":"italic"},"text":"1 ","element":"span"},{"text":"approach, all cases generated using the baseline behavior ","element":"span"},{"style":{"height":10.3},"width":48.88,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/38-2.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"during 25 episodes are stored. The experiments demonstrate that in SIMBA, in contrast with the previous domains, storing all cases is sufficient for obtaining a safe policy with a performance similar to that using the modeling baseline behavior step (with mean profits per episode of 3.98 million Euros). Once the safe case-based policy ","element":"span"},{"style":{"height":20.31},"width":50.87,"height":50.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/38-3.png","element":"img","alt":" πθB ","inline":true,"padRight":true},{"text":"is learned, we execute the improving the learned baseline ","element":"span"},{"text":"behavior step.","element":"span"}],[{"text":"Similar to the findings in earlier tasks, Figure 26 (a) indicates that while low and medium levels of risk do not produce bankruptcies, performance is nevertheless weak. The highest level of risk produces a near-optimal policy with a low number of failures. 
","element":"span"},{"text":"By contrast, Figure 26 (b) presents the results for the evolutionary and risk-sensitive RL approaches, with the former being clearly that which yields the highest number of failures. In the risk-sensitive case, the number of bankruptcies in all cases is insufficient for learning the risk function ","element":"span"},{"style":{"height":12},"width":34.56,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/38-4.png","element":"img","alt":" ρ.","inline":true,"padRight":true},{"text":"The comparative results in Figure 26 show that PI-SRL with ","element":"span"},{"style":{"height":15.13},"width":223.72,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/38-5.png","element":"img","alt":"σ = 9 × 101 ","inline":true,"padRight":true},{"text":"obtains better policies and less failures than the evolutionary or risk-sensitive RL approaches.","element":"span"}],[{"text":"Figure 27 shows a graphical representation of the different solutions in this domain. It shows the mean number of failures and cumulative reward for the different approaches over 100 episodes, with data computed from 10 independent executions of each approach. In the Figure, red circles correspond to the PI-SRL algorithm, black triangles correspond to the risk-sensitive approach and the blue square corresponds to the evolutionary RL approach. Figure 27 also shows the performance for two additional risk levels, very high (","element":"span"},{"style":{"height":19.13},"width":233.92,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/38-6.png","element":"img","alt":"σ = 9 × 102)","inline":true,"padRight":true},{"text":"and very low (","element":"span"},{"style":{"height":17.6},"width":1463.98,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/38-7.png","element":"img","alt":"σ = 0), with respect the Figure 26. 
The experiments in Figure 27 demonstrate","inline":true,"padRight":true},{"text":"that PI-SRL with a high level of risk (","element":"span"},{"style":{"height":15.14},"width":219.54,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/38-8.png","element":"img","alt":"σ = 9 × 101","inline":true},{"text":") obtains the highest cumulative reward, 6693.58. Additionally, PI-SRL with a very low level of risk (","element":"span"},{"style":{"height":17.6},"width":521.06,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/38-9.png","element":"img","alt":"σ = 0), a low level of risk","inline":true,"padRight":true},{"text":"(","element":"span"},{"style":{"height":15.13},"width":241.97,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/38-10.png","element":"img","alt":"σ = 9 × 10−1","inline":true},{"text":") and a medium level of risk (","element":"span"},{"style":{"height":15.13},"width":215.62,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/38-11.png","element":"img","alt":"σ = 9 × 100","inline":true},{"text":") are the approaches with the lowest","element":"span"}],[{"style":{"width":"51%"},"width":890,"height":637,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/39-0.png","element":"img"}],[{"text":"Figure 27: Mean number of failures (company bankruptcies) and the cumulative reward over 100 episodes for each approach to the SIMBA task. The means have been computed from 10 different executions.","element":"figcaption","subtype":"caption"}],[{"text":"mean number of failures, 0.0. However, PI-SRL with a medium level of risk is preferred inasmuch as its performance is superior in terms of cumulative reward. 
PI-SRL with a very high level of risk (","element":"span"},{"style":{"height":15.13},"width":211.8,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/39-1.png","element":"img","alt":"σ = 9×10^2","inline":true},{"text":") increases the number of failures and obtains a lower cumulative reward when compared to PI-SRL with a high level of risk. Using the Pareto comparison criterion, PI-SRL with a high level of risk strictly dominates all other solutions (PI-SRL ","element":"span"},{"style":{"height":15.53},"width":255.18,"height":38.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/39-2.png","element":"img","alt":"σ = 9×10^1 ≻","inline":true,"padRight":true},{"text":"risk-sensitive and PI-SRL ","element":"span"},{"style":{"height":15.53},"width":255.18,"height":38.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/39-3.png","element":"img","alt":" σ = 9×10^1 ≻","inline":true,"padRight":true},{"text":"evolutionary RL), while the approach is not strictly dominated by any other solution.","element":"span"}],[{"text":"Due to the difficulty of representing the high-dimensional state and action space of the SIMBA domain, no graphs are provided with the evolution of the known space.","element":"span"}]]},{"heading":"5. Related Work","paragraphs":[[{"text":"Reinforcement learning (RL) and case-based reasoning (CBR) techniques have been combined in the literature in different ways. In the work of Bianchi et al. (2009), a new approach is presented permitting the use of cases as heuristics to speed up RL algorithms. Additionally, Sharma et al. (2007) use a combination of CBR and RL (called CARL) to achieve transfer while playing against the Game AI across a variety of scenarios in MadRTS TM, a commercial Real Time Strategy game. CBR has also been used for state value function approximation in RL (Gabel & Riedmiller, 2005). 
However, the present study is, to our knowledge, the first time that CBR and RL have been used in conjunction for safe exploration in dangerous domains. In the field of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"safe reinforcement learning","element":"span"},{"text":", three principal trends can be observed: (i) approaches based on return and its variance, (ii) risk-sensitive approaches based on the definition of error states and (iii) approaches using teachers.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"5.1 Approaches Based on the Return and its Variance","element":"span"}],[{"text":"In the literature, it has long been known that the optimal policy and the optimal expected return of an MDP are quite sensitive to parameter variations (even an optimal policy may perform badly in some cases due to the stochastic nature of the problem). To mitigate this problem, the agent can try to maximize the return associated with the worst-case scenario, even though the case may be highly unlikely. Thus, in this trend, the risk refers to the worst outcomes of the return ","element":"span"},{"style":{"height":19.5},"width":284.46,"height":48.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/40-0.png","element":"img","alt":" R = ∑_{t=0}^{∞} γ^t r_t","inline":true,"padRight":true},{"text":"or its variance. An example of such an approach ","element":"span"},{"text":"is ","element":"span"},{"style":{"fontStyle":"italic"},"text":"worst-case control ","element":"span"},{"text":"where the worst possible outcome of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"R ","element":"span"},{"text":"is to be optimized (Coraluppi & Marcus, 1999; Heger, 1994). In worst case control strategies, the optimality criterion is exclusively focused on risk-avoiding policies. A policy is considered to be optimal if its worst-case return is superior. 
The approach, however, is too restrictive inasmuch as it takes very rare scenarios fully into account.","element":"span"}],[{"text":"The ","element":"span"},{"style":{"height":12.8},"width":179.41,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/40-1.png","element":"img","alt":" α−value","inline":true,"padRight":true},{"text":"of the return ˆ","element":"span"},{"style":{"height":10.62},"width":60.31,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/40-2.png","element":"img","alt":"mα","inline":true,"padRight":true},{"text":"introduced by Heger (1994) can be seen as an extension of the worst case control of MDPs. This concept establishes that the returns ","element":"span"},{"style":{"height":15.02},"width":216.07,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/40-3.png","element":"img","alt":" R < ˆmα of","inline":true,"padRight":true},{"text":"a policy that occur with a probability lower than ","element":"span"},{"style":{"height":8.4},"width":28,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/40-4.png","element":"img","alt":" α","inline":true,"padRight":true},{"text":"are neglected. The algorithm is less pessimistic than pure worst case control, given that extremely rare scenarios have no effect on the policy. In the work of Heger et al., the idea of weighting return and risk, namely the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"expected value-variance criterion","element":"span"},{"text":", is also introduced.","element":"span"}],[{"text":"In risk-sensitive control based on the use of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"exponential utility functions","element":"span"},{"text":", the return ","element":"span"},{"style":{"fontStyle":"italic"},"text":"R ","element":"span"},{"text":"is transformed to reflect a subjective measure of utility. 
Instead of maximizing the expected value of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"R","element":"span"},{"text":", the objective here is to maximize ","element":"span"},{"style":{"height":19.93},"width":547.74,"height":49.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/40-5.png","element":"img","alt":" U = β−1logE(eβR), where β","inline":true,"padRight":true},{"text":"is a parameter and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"R ","element":"span"},{"text":"is the usual return. It can be shown that, depending on the parameter ","element":"span"},{"style":{"height":16.4},"width":195.67,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/40-6.png","element":"img","alt":" β, policies","inline":true,"padRight":true},{"text":"with a high variance ","element":"span"},{"style":{"fontStyle":"italic"},"text":"V ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"R","element":"span"},{"text":") are penalized (","element":"span"},{"style":{"height":16.4},"width":77.24,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/40-7.png","element":"img","alt":"β <","inline":true,"padRight":true},{"text":"0) or enforced (","element":"span"},{"style":{"height":16.4},"width":77.24,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/40-8.png","element":"img","alt":"β >","inline":true,"padRight":true},{"text":"0). Instead, Neuneier and Mihatsch (2002) consider the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"worst-case-outcomes ","element":"span"},{"text":"of a policy, (i.e., risk related to the variability of the return). 
In the study, the authors demonstrate that the learning algorithm interpolates between ","element":"span"},{"style":{"fontStyle":"italic"},"text":"risk-neutral ","element":"span"},{"text":"and the worst-case criterion and has the same limiting behavior as exponential utility functions. It should be noted that these approaches based on the variability of the return or its worst possible outcomes are not suited for problems where a policy with a small variance can produce a large risk (Geibel & Wysotzki, 2005). Our view of risk in the present study, however, is not concerned with the variance of the return or its worst possible outcome, but instead with the fact that processes generally possess unsafe states that should be avoided. Consequently, we address a different class of problems than those dealt with by approaches focusing on the variability of the return.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"5.2 Risk-sensitive Approaches based on Error States.","element":"span"}],[{"text":"In this second trend of approaches, the concept of risk is based on the definition of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"error states ","element":"span"},{"text":"or fatal transitions. ","element":"span"},{"text":"Thus, Geibel et al. (2005) , for instance, establish the risk function as the probability of entering in an error state. Instead, Hans et al (2008) consider a transition to be fatal if the corresponding reward is less than a given threshold ","element":"span"},{"style":{"height":12.4},"width":107.8,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/40-9.png","element":"img","alt":" τ. 
In","inline":true,"padRight":true},{"text":"the first case and as demonstrated in Section 4, ","element":"span"},{"style":{"height":12},"width":23,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/40-10.png","element":"img","alt":" ρ","inline":true,"padRight":true},{"text":"is learned by TD methods which require that error states (i.e., car collisions, pole-balancing disequilibrium, helicopter crashes and company bankruptcies) be visited repeatedly in order to approximate the risk function and, subsequently, avoid dangerous situations. In the second case, the concept of risk is again joined with that of reward. Moreover, the above mentioned studies either (i) assume that the system dynamics are known, (ii) tolerate undesirable states during exploration or, in contrast with our paper, (iii) do not deal with problems with high-dimensional and continuous state-action spaces. Regarding the latter, while Geibel et al. write that their approach can also be extended to continuous action sets (e.g., by using an actor-critic method), they do not give any more details on how this may be done with entirely continuous problems. In Section 4, we present an approach that solves the problem.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"5.3 Approaches Using Teachers","element":"span"}],[{"text":"The last trend in the approaches is based on the use of teachers in three different ways: (i) to bootstrap the learning algorithm (i.e., as an initialization procedure), (ii) to derive a policy from a finite demonstration set and (iii) to guide the exploration process.","element":"span"}],[{"text":"5.3.1 Bootstrapping the Learning Algorithm","element":"span"}],[{"text":"In the work of Driessens and Džeroski (2004), a bootstrapping procedure is used for relational RL in which a finite set of demonstrations are recorded from a human expert and later presented to a regression algorithm. 
This allows the regression algorithm to build a partial Q-function which can later be used to guide further exploration of the state space using a Boltzmann exploration strategy. Smart and Kaelbling (2000) also use example training runs to bootstrap the Q-learning approach for their HEDGER algorithm. The initial knowledge bootstrapped into the Q-learning approach allows the agent to learn more effectively and helps reduce the time spent with random actions. Teacher behaviors are also used as a form of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"population seeding ","element":"span"},{"text":"$3a","element":"span"}],[{"text":"5.3.2 Deriving a Policy from a Finite Set of Demonstrations","element":"span"}],[{"text":"$3b","element":"span"}],[{"text":"5.3.3 Guiding the Exploration Process","element":"span"}],[{"text":"$3c","element":"span"}],[{"text":"In all works mentioned for this trend, no explicit definition of risk is ever given.","element":"span"}]]},{"heading":"6. Conclusions","paragraphs":[[{"text":"In this work, PI-SRL, an algorithm for policy improvement through safe reinforcement learning in high-risk tasks, is described. The main contributions of this algorithm are the definitions of a novel case-based risk function and a baseline behavior for the safe exploration of the state-action space. ","element":"span"},{"text":"The use of the case-based risk function presented is possible inasmuch as the policy is stored as a case-base. This represents a clear advantage over other approaches, e.g., evolutionary RL (Martín H. & Lope, 2009; Koppejan & Whiteson, 2011) where the extraction of knowledge about the known space by the agent is impossible using the weights of the neural-networks. Additionally, a completely different notion of risk from others found in the literature is presented. 
","element":"span"},{"text":"$3d","element":"span"}],[{"text":"This paper presents the PI-SRL algorithm in great detail and demonstrates its effectiveness in four entirely different continuous domains: the car parking problem, pole-balancing, helicopter hovering and business management (SIMBA). The experiments presented in this paper demonstrate different characteristics about the learning capabilities of the PI-SRL algorithm.","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"(i) PI-SRL obtains higher quality solutions. ","element":"span"},{"text":"The experiments in Section 4 demonstrate that, save in the helicopter hovering task, PI-SRL obtains in all cases the best cumulative reward per episode and the least number of failures. Additionally, using the Pareto comparison criterion it can be said that, save the very high risk configuration in the car parking problem, our approach is not strictly dominated by any other approach.","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"(ii) PI-SRL adjusts the initial known space to safe and better policies. ","element":"span"},{"text":"The initial known space resulting from the first step of PI-SRL, modeling baseline behavior, is adjusted and improved in the second step of the algorithm, improving the learned baseline behavior. Additionally, the experiments demonstrate that the adjustment process can compress the known space away from the error space (e.g., pole-balancing domain, subsection 4.2, and helicopter hovering domain, subsection 4.3) or, on other occasions, can require the known space to move closer to the error space (e.g., car parking problem, subsection 4.1) in the event that better policies are to be found there.","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"(iii) PI-SRL works well in domains with differently structured state-action spaces and where the value function can vary sharply. 
","element":"span"},{"text":"Although the car parking problem, the pole-balancing domain, the helicopter hovering task and the business simulator all represent very differently structured problems, experiments in the study nevertheless demonstrate that PI-SRL performs well in each. Furthermore, even in such domains as the car parking problem in which the value function varies sharply due to the presence of an obstacle, experimental results demonstrate that PI-SRL can nevertheless successfully handle this difficulty. However, it is impossible to avoid all failures if the “known space” edge is the same as the edge to error states the algorithm would often ’explore’ into error states.","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"(iv) The number of failures depends on the distance between the known space and the error space. ","element":"span"},{"text":"The experiments in the pole-balancing and helicopter hovering domains demonstrate that the number of failures depends on how close the known space is to the error space. Due to the structure of these domains, the improving the learned baseline behavior step in the algorithm tends to concentrate the known space at the origin of coordinates away from the error space. The greater the distance between the known space and the error space, the lower the number of failures. Additionally, in helicopter hovering, the known space is, from the beginning, far from the error space (consequently, the number of failures is also low from the beginning). 
Therefore, the initial distribution of the known space learned from the baseline policy ","element":"span"},{"style":{"height":10.3},"width":48.87,"height":25.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/43-0.png","element":"img","alt":" πT","inline":true,"padRight":true},{"text":"later influences the number of failures obtained by the second step of PI-SRL.","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"(v) PI-SRL is completely safe if only the first step of the algorithm is executed. ","element":"span"},{"text":"However, by proceeding only in this way, algorithm performance would be heavily limited by the capabilities of the baseline behavior. ","element":"span"},{"text":"If learner performance is to be improved beyond the performance of this baseline behavior, the subsequent exploratory process from the second step of PI-SRL must be carried out. Since complete knowledge of the domain and its dynamic is not possessed, however, it is also inevitable that, during this exploratory process, unknown regions of the state space will be visited where the agent may reach error states.","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"(vi) The risk parameter allows the user to configure the level of risk assumed. ","element":"span"},{"text":"In our algorithm, the user can gradually increase the value of the risk parameter ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/44-0.png","element":"img","alt":" σ","inline":true,"padRight":true},{"text":"in order to obtain better policies, but also assuming a greater likelihood of damage in the learning system.","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"(vii) PI-SRL performs successfully even when a poor initial policy with failures is used","element":"span"},{"text":". 
The experiments in Figure 24 from the helicopter hovering domain demonstrate that PI-SRL is able to learn a near-optimal policy despite poor initialization, just as it can when a policy free of failures is used to initialize the case-base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B","element":"span"},{"text":". However, the Figure also shows that if a very poor initial policy with many failures is used, PI-SRL decreases in performance and produces a higher number of failures, although some better behavior is still learnt. In this case, the algorithm falls into a local minimum, likely biased by the very poor initialization.","element":"span"}],[{"text":"In what follows, the applicability of the method is discussed, allowing the reader to more clearly understand the scenarios in which the proposed PI-SRL approach may be applicable. This applicability is restricted to domains having the following characteristics.","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"(i) It is mandatory that the scenario satisfy the two assumptions described in Section 2","element":"span"},{"text":". According to the first assumption, nearby states in the domain must necessarily have similar actions. According to the other, similar actions in similar states should produce similar effects. This fact that similar actions lead to similar states assumes some degree of smoothness in the dynamic behavior of the system which, in certain environments, may not hold. However, as we clearly explain in Section 2, we consider both assumptions to be logical assumptions derived from generalization principles in the RL literature (Kaelbling et al., 1996; Jiang, 2004).","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"(ii) The applicability of the method is limited by the size of the case-base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"style":{"fontStyle":"italic"},"text":"required to mimic the baseline behavior. 
","element":"span"},{"text":"It is not possible to apply the proposed approach to tasks when, in the first step of the PI-SRL algorithm, modeling baseline behavior, a prohibitively large number of cases are required to properly mimic complex baseline behaviors. In this case, the threshold ","element":"span"},{"style":{"height":12.8},"width":21,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/44-1.png","element":"img","alt":" θ","inline":true,"padRight":true},{"text":"can be increased to further restrict the addition of new cases to the case-base. However, this increase may adversely affect the final performance of the algorithm. Nevertheless, the experiments performed in Section 4 demonstrate that relatively simple baseline behaviors are mimicked almost perfectly using a manageable number of cases.","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"(iii) The PI-SRL algorithm requires the presence of a baseline behavior. ","element":"span"},{"text":"The proposed method requires the presence of a baseline behavior that safely demonstrates the task to be learned. This baseline behavior can be conducted by a human teacher or a hand-coded agent. It is important to note, nevertheless, that the presence of such a baseline behavior is not guaranteed in all domains.","element":"span"}],[{"text":"Finally, a logical continuation of the present study would take into account the automatic graduation of the risk parameter along the learning process. ","element":"span"},{"text":"For example, it would be particularly interesting to exploit the fact that the known space is far away from the error space in order to increase the risk parameter or, on the contrary, to reduce it when it is close. ","element":"span"},{"text":"Other future work aims to deploy the algorithm in real environments, inasmuch as the uncertainty of the real environments presents the biggest challenge to autonomous robots. 
","element":"span"},{"text":"Autonomous robotic controllers must deal with a large number of factors such as the robotic mechanical system and electrical characteristics, as well as environmental complexity. However, the use of the PI-SRL algorithm (or other risk-sensitive approaches) for learning processes in real environments could reduce the amount of damage incurred and, consequently, allow the lifespan of the robots to be extended. It might be worthwhile add a mechanism to the algorithm to detect when a known state can lead directly to an error state. All such problems are currently being investigated.","element":"span"}]]},{"heading":"Acknowledgments","paragraphs":[[{"text":"This study has been partially supported by Spanish MICIIN projects TIN2008-06701-C03-03, TRA2009-0080 and CCG10-UC3M/TIC-5597. We offer our gratitude and special thanks to Raquel Fuentetaja Piz´an, Assistant Professor at Universidad Carlos III de Madrid in the Planning & Learning Group (PLG), for her generous and invaluable comments during the revision of this paper. ","element":"span"},{"text":"We would also like to thank to Jos´e Antonio Mart´ın, Assistant Professor at Universidad Complutense de Madrid, for his invaluable comments regarding his evolutionary RL algorithm.","element":"span"}]]},{"heading":"References","paragraphs":[[{"text":"Aamodt, A., & Plaza, E. (1994). Case-Based Reasoning; Foundational Issues, Methodological Variations, and System Approaches. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"AI Communications","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"7","element":"span"},{"text":"(1), 39–59.","element":"span"}],[{"text":"Abbeel, P., Coates, A., Hunter, T., & Ng, A. Y. (2008). Autonomous Autorotation of an RC Helicopter. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"ISER","element":"span"},{"text":", pp. 385–394.","element":"span"}],[{"text":"Abbeel, P., Coates, A., & Ng, A. Y. (2010). 
Autonomous helicopter aerobatics through apprenticeship learning. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"I. J. Robotic Res.","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"29","element":"span"},{"text":"(13), 1608–1639.","element":"span"}],[{"text":"Abbott, R. G. (2008). Robocup 2007: Robot soccer world cup xi.. chap. Behavioral Cloning for Simulator Validation, pp. 329–336. Springer-Verlag, Berlin, Heidelberg.","element":"span"}],[{"text":"Aha, D. W. (1992). Tolerating Noisy, Irrelevant and Novel Attributes in Instance-Based Learning Algorithms. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Journal Man-Machine Studies","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"36","element":"span"},{"text":"(2), 267–287.","element":"span"}],[{"text":"Aha, D. W., & Kibler, D. (1991). Instance-based learning algorithms. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Machine Learning","element":"span"},{"text":", pp. 37–66.","element":"span"}],[{"text":"Anderson, C. W., Draper, B. A., & Peterson, D. A. (2000). Behavioral cloning of student pilots with modular neural networks. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the Seventeenth International Conference on Machine Learning","element":"span"},{"text":", pp. 25–32. Morgan Kaufmann.","element":"span"}],[{"text":"Argall, B., Chernova, S., Veloso, M., & Browning, B. (2009). A Survey of Robot Learning from Demonstration. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Robotics and Autonomous Systems","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"57","element":"span"},{"text":"(5), 469–483.","element":"span"}],[{"text":"Bartsch-Sprl, B., Lenz, M., & Hbner, A. (1999). Case-based reasoning: Survey and future directions.. In Puppe, F. 
(Ed.), ","element":"span"},{"style":{"fontStyle":"italic"},"text":"XPS","element":"span"},{"text":", Vol. 1570 of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Lecture Notes in Computer Science","element":"span"},{"text":", pp. 67–89. Springer.","element":"span"}],[{"text":"Bianchi, R., Ros, R., & de Mántaras, R. L. (2009). Improving reinforcement learning by using case-based heuristics.. Vol. 5650, pp. 75–89. Lecture Notes in Artificial Intelligence, Springer, Lecture Notes in Artificial Intelligence, Springer.","element":"span"}],[{"text":"Borrajo, F., Bueno, Y., de Pablo, I., Santos, B. n., Fernández, F., García, J., & Sagredo, I. (2010). SIMBA: A Simulator for Business Education and Research. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Decision Support Systems","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"48","element":"span"},{"text":"(3), 498–506.","element":"span"}],[{"text":"Boyan, J., Moore, A., & Sutton, R. (1995). Proceedings of the workshop on value function approximation, machine learning conference 1995... Technical Report CMU-CS-95-206.","element":"span"}],[{"text":"Chernova, S., & Veloso, M. (2007). Confidence-based policy learning from demonstration using gaussian mixture models. ","element":"span"},{"text":"In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Joint Conference on Autonomous Agents and Multi-Agent Systems","element":"span"},{"text":".","element":"span"}],[{"text":"Chernova, S., & Veloso, M. (2008). Multi-thresholded approach to demonstration selection for interactive robot learning. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 3rd ACM/IEEE international conference on Human robot interaction","element":"span"},{"text":", HRI ’08, pp. 225–232, New York, NY, USA. ACM.","element":"span"}],[{"text":"Cichosz, P. (1995). 
Truncating temporal differences: On the efficient implementation of td(lambda) for reinforcement learning. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Artificial Intelligence Research (JAIR)","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"2","element":"span"},{"text":", 287–318.","element":"span"}],[{"text":"Cichosz, P. (1996). Truncated temporal differences with function approximation: Successful examples using cmac. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the Thirteenth European Symposium on Cybernetics and Systems Research (EMCSR-96)","element":"span"},{"text":".","element":"span"}],[{"text":"Coraluppi, S. P., & Marcus, S. I. (1999). Risk-Sensitive and Minimax Control of Discrete-Time, Finite-State Markov Decision Processes. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"AUTOMATICA","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"35","element":"span"},{"text":", 301–309.","element":"span"}],[{"text":"Defourny, B., Ernst, D., & Wehenkel, L. (2008). Risk-aware decision making and dynamic programming. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"NIPS 2008 Workshop on Model Uncertainty and Risk in RL","element":"span"},{"text":".","element":"span"}],[{"text":"Driessens, K., & Ramon, J. (2003). Relational instance based regression for relational rl. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference of Machine Learning (ICML)","element":"span"},{"text":", pp. 123–130.","element":"span"}],[{"text":"Driessens, K., & Džeroski, S. (2004). Integrating guidance into relational reinforcement learning. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Machine Learning","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"57","element":"span"},{"text":"(3), 271–304.","element":"span"}],[{"text":"Fernandez, F., & Isasi, P. (2008). Local feature weighting in nearest prototype classification. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Neural Networks, IEEE Transactions on","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"19","element":"span"},{"text":"(1), 40–53.","element":"span"}],[{"text":"Fern´andez, F., & Borrajo, D. (2008). ","element":"span"},{"text":"Two steps reinforcement learning. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Journal of Intelligent Systems","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"23","element":"span"},{"text":"(2), 213–245.","element":"span"}],[{"text":"Floyd, M. W., & Esfandiari, B. (2010). Toward a domain-independent case-based reasoning approach for imitation: Three case studies in gaming. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Workshop on Case-Based Reasoning for Computer Games at the 18th International Conference on Case-Based Reasoning (ICCBR)","element":"span"},{"text":", pp. 55–64.","element":"span"}],[{"text":"Floyd, M. W., Esfandiari, B., & Lam, K. (2008). A Case-Based Reasoning Approach to Imitating Robocup Players. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 21st International Florida Artificial Intelligence Research Society Conference","element":"span"},{"text":", pp. 251–256.","element":"span"}],[{"text":"Forbes, J., & Andre, D. (2002). ","element":"span"},{"text":"Representations for learning control policies. ","element":"span"},{"text":"In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"The University of New South","element":"span"},{"text":", pp. 
7–14.","element":"span"}],[{"text":"Gabel, T., & Riedmiller, M. (2005). Cbr for state value function approximation in reinforcement learning. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 6th International Conference on Case-Based Reasoning (ICCBR 2005)","element":"span"},{"text":", pp. 206–221. Springer.","element":"span"}],[{"text":"Geibel, P. (2001). Reinforcement Learning with Bounded Risk. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 18th International Conference on Machine Learning","element":"span"},{"text":", pp. 162–169. Morgan Kaufmann.","element":"span"}],[{"text":"Geibel, P., & Wysotzki, F. (2005). Risk-sensitive Reinforcement Learning Applied to Control under Constraints. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Artificial Intelligence Research (JAIR)","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"24","element":"span"},{"text":", 81–108.","element":"span"}],[{"text":"Hans, A., Schneegass, D., Schäfer, A. M., & Udluft, S. (2008). Safe Exploration for Reinforcement Learning. ","element":"span"},{"text":"In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"European Symposium on Artificial Neural Network","element":"span"},{"text":", pp. 143–148.","element":"span"}],[{"text":"Heger, M. (1994). Consideration of Risk in Reinforcement Learning. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"11th International Conference on Machine Learning","element":"span"},{"text":", pp. 105–111.","element":"span"}],[{"text":"Hernández-Díaz, A. G., Coello, C. A. C., Perez, F., Caballero, R., Luque, J. M., & Santana-Quintero, L. V. (2008). Seeding the initial population of a multi-objective evolutionary algorithm using gradient-based information. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"IEEE Congress on Evolutionary Computation","element":"span"},{"text":", pp. 1617–1624. 
IEEE.","element":"span"}],[{"text":"Hester, T., Quinlan, M., & Stone, P. (2011). A real-time model-based reinforcement learning architecture for robot control. Tech. rep. arXiv e-Prints 1105.1749, arXiv.","element":"span"}],[{"text":"Hu, H., Kostiadis, K., Hunter, M., & Kalyviotis, N. (2001). ","element":"span"},{"text":"Essex wizards 2001 team description. In Birk, A., Coradeschi, S., & Tadokoro, S. (Eds.), ","element":"span"},{"style":{"fontStyle":"italic"},"text":"RoboCup","element":"span"},{"text":", Vol. 2377 of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Lecture Notes in Computer Science","element":"span"},{"text":", pp. 511–514. Springer.","element":"span"}],[{"text":"Jiang, A. X. (2004). Multiagent reinforcement learning in stochastic games with continuous action spaces..","element":"span"}],[{"text":"Kaelbling, L., Littman, M., & Moore, A. (1996). Reinforcement learning: A survey. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Artificial Intelligence Research (JAIR)","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"4","element":"span"},{"text":", 237–285.","element":"span"}],[{"text":"Konen, W., & Bartz-Beielstein, T. (2009). Reinforcement learning for games: failures and successes. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 11th Annual Conference Companion on Genetic and Evolutionary Computation Conference: Late Breaking Papers","element":"span"},{"text":", GECCO ’09, pp. 2641– 2648, New York, NY, USA. ACM.","element":"span"}],[{"text":"Koppejan, R., & Whiteson, S. (2009). Neuroevolutionary reinforcement learning for generalized helicopter control. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"GECCO 2009: Proceedings of the Genetic and Evolutionary Computation Conference","element":"span"},{"text":", pp. 145–152.","element":"span"}],[{"text":"Koppejan, R., & Whiteson, S. (2011). 
Neuroevolutionary reinforcement learning for generalized control of simulated helicopters. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Evolutionary Intelligence","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"4","element":"span"},{"text":", 219–241.","element":"span"}],[{"text":"Lee, J.-Y., & Lee, J.-J. (2008). ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Multiple Designs of Fuzzy Controllers for Car Parking Using Evolutionary Algorithm","element":"span"},{"text":", pp. 1–6. No. May.","element":"span"}],[{"text":"Luenberger, D. G. (1998). Investment science. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Oxford University Press","element":"span"},{"text":".","element":"span"}],[{"text":"Mannor, S. (2004). Reinforcement learning for average reward zero-sum games. In Shawe-Taylor, J., & Singer, Y. (Eds.), ","element":"span"},{"style":{"fontStyle":"italic"},"text":"COLT","element":"span"},{"text":", Vol. 3120 of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Lecture Notes in Computer Science","element":"span"},{"text":", pp. 49–63. Springer.","element":"span"}],[{"text":"Martin H, J., & de Lope, J. (2009). Exa: An effective algorithm for continuous actions reinforcement learning problems. In ","element":"span"},{"style":{"height":15.6},"width":900.09,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/48-0.png","element":"img","alt":" Industrial Electronics, 2009. IECON ’09. 35th","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"Annual Conference of IEEE","element":"span"},{"text":", pp. 2063–2068.","element":"span"}],[{"text":"Martín H., J. A., & Lope, J. (2009). Learning Autonomous Helicopter Flight with Evolutionary Reinforcement Learning. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"12th International Conference on Computer Aided Systems Theory (EUROCAST)","element":"span"},{"text":", pp. 
75–82.","element":"span"}],[{"text":"Mihatsch, O., & Neuneier, R. (2002). Risk-Sensitive reinforcement learning. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Machine Learning","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"49","element":"span"},{"text":"(2-3), 267–290.","element":"span"}],[{"text":"Moldovan, T. M., & Abbeel, P. (2012). ","element":"span"},{"text":"Safe exploration in markov decision processes. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"CoRR","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"abs/1205.4810","element":"span"},{"text":".","element":"span"}],[{"text":"Narendra, K. S., & Thathachar, M. A. L. (1974). ","element":"span"},{"text":"Learning automata - a survey. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Ieee Transactions On Systems Man And Cybernetics","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"SMC-4","element":"span"},{"text":"(4), 323–334.","element":"span"}],[{"text":"Narendra, K. S., & Thathachar, M. A. L. (1989). ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Learning automata: an introduction","element":"span"},{"text":". Prentice-Hall, Inc., Upper Saddle River, NJ, USA.","element":"span"}],[{"text":"Ng, A. Y., Kim, H. J., Jordan, M. I., & Sastry, S. (2003). Autonomous Helicopter Flight via Reinforcement Learning. In Thrun, S., Saul, L. K., & Schölkopf, B. (Eds.), ","element":"span"},{"style":{"fontStyle":"italic"},"text":"NIPS","element":"span"},{"text":". MIT Press.","element":"span"}],[{"text":"Peters, J., Tedrake, R., Roy, N., & Morimoto, J. (2010). Robot learning. In Sammut, C., & Webb, G. I. (Eds.), ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Encyclopedia of Machine Learning","element":"span"},{"text":", pp. 865–869. Springer.","element":"span"}],[{"text":"Poli, R., & Cagnoni, S. (1997). 
Genetic programming with user-driven selection: Experiments on the evolution of algorithms for image enhancement. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Genetic Programming 1997: Proceedings of the Second Annual Conference","element":"span"},{"text":", pp. 269–277. Morgan Kaufmann.","element":"span"}],[{"text":"Salkham, A., Cunningham, R., Garg, A., & Cahill, V. (2008). A collaborative reinforcement learning approach to urban traffic control optimization. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Web Intelligence ","element":"span"},{"style":{"height":17.6},"width":1621.87,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1402.0560/images/48-1.png","element":"img","alt":"and Intelligent Agent Technology, 2008. WI-IAT ’08. IEEE/WIC/ACM International","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"Conference on","element":"span"},{"text":", Vol. 2, pp. 560–566.","element":"span"}],[{"text":"Santamaría, J. C., Sutton, R. S., & Ram, A. (1998). ","element":"span"},{"text":"Experiments with reinforcement learning in problems with continuous state and action spaces. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Adaptive Behavior","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"6","element":"span"},{"text":", 163–218.","element":"span"}],[{"text":"Sharma, M., Holmes, M., Santamaria, J., Irani, A., Isbell, C., & Ram, A. (2007). Transfer learning in real-time strategy games using hybrid cbr/rl. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"In Proceedings of the Twentieth International Joint Conference on Artificial Intelligence","element":"span"},{"text":".","element":"span"}],[{"text":"Siebel, N. T., & Sommer, G. (2007). Evolutionary reinforcement learning of artificial neural networks. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Journal of Hybrid Intelligent Systems","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"4","element":"span"},{"text":", 171–183.","element":"span"}],[{"text":"Smart, W. D., & Kaelbling, L. P. (2000). Practical reinforcement learning in continuous spaces. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Artificial Intelligence","element":"span"},{"text":", pp. 903–910. Morgan Kaufmann.","element":"span"}],[{"text":"Smart, W. D., & Kaelbling, L. P. (2002). Effective reinforcement learning for mobile robots. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"ICRA","element":"span"},{"text":", pp. 3404–3410. IEEE.","element":"span"}],[{"text":"Sutton, R. S., & Barto, A. G. (1998). ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Reinforcement Learning: An Introduction","element":"span"},{"text":". The MIT Press.","element":"span"}],[{"text":"Tang, J., Singh, A., Goehausen, N., & Abbeel, P. (2010). Parameterized maneuver learning for autonomous helicopter flight. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Robotics and Automation (ICRA)","element":"span"},{"text":".","element":"span"}],[{"text":"Taylor, M. E., Kulis, B., & Sha, F. (2011). Metric learning for reinforcement learning agents. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the International Conference on Autonomous Agents and Multiagent Systems (AAMAS)","element":"span"},{"text":".","element":"span"}],[{"text":"Van Hasselt, H., & Wiering, M. A. (2007). Reinforcement Learning in Continuous Action Spaces. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Approximate Dynamic Programming and Reinforcement Learning, 2007. ADPRL 2007. IEEE International Symposium on","element":"span"},{"text":", pp. 272–279.","element":"span"}],[{"text":"Wyatt, J. (1997). 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Exploration and Inference in Learning from Reinforcement","element":"span"},{"text":". University of Edinburgh.","element":"span"}],[{"text":"Yao, X. (1999). Evolving artificial neural networks. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"PIEEE: Proceedings of the IEEE","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"87","element":"span"},{"text":", 1423–1447.","element":"span"}]]}],"_version":"3.3.4"},"paperNode":"$1b:props:children:props:children:0:props:product"}]]]}]}]