1b:["$","$L29",null,{"isWhiteLabelled":false,"children":["$","$Lb",null,{"pt":{"compact":0,"expanded":3},"children":[["$","$L2a",null,{"noStar":true,"publisher":true,"task":true,"params":true,"size":"xl","product":{"id":"eyJwYXBlcklEIjoiMTkxMS4wMTg3MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","updated":"2019-11-04T05:17:28.000Z","paperID":"1911.01871","published":"2019-11-04T05:17:28.000Z","authors":"[\"Sayak Ray Chowdhury\",\"Aditya Gopalan\"]","title":"On Online Learning in Kernelized Markov Decision Processes","scoreTrending":null,"summary":"We develop algorithms with low regret for learning episodic Markov decision\nprocesses based on kernel approximation techniques. The algorithms are based on\nboth the Upper Confidence Bound (UCB) as well as Posterior or Thompson Sampling\n(PSRL) philosophies, and work in the general setting of continuous state and\naction spaces when the true unknown transition dynamics are assumed to have\nsmoothness induced by an appropriate Reproducing Kernel Hilbert Space (RKHS).","lastCheckedForCode":"2022-09-04T22:03:05.533Z","links":[{"id":"eyJ1cmwiOiJodHRwczovL3BhcGVyc3dpdGhjb2RlLmNvbS9wYXBlci9vbi1vbmxpbmUtbGVhcm5pbmctaW4ta2VybmVsaXplZC1tYXJrb3YifQ==","type":"pwc","url":"https://paperswithcode.com/paper/on-online-learning-in-kernelized-markov","data":null}],"reposConnection":{"edges":[]},"models":[],"tags":[],"summaries":[],"emailsConnection":{"edges":[{"author":"sayak ray chowdhury","node":{"id":"eyJhZGRyZXNzIjoic2F5YWtAaWlzYy5hYy5pbiJ9","address":"sayak@iisc.ac.in","name":"Saya K.","avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[{"name":"IISc 
Bangalore"}],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[],"scholar":[{"thirdPartyID":"Q0_CaxYAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiJhYTU2ZWUzMi05NjUzLTQzMTktYWU1Yy0zOGUxMjA3NzgxNzQifQ==","name":"sayak ray chowdhury","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMjQwMy4wMDQwOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2403.00409"},{"id":"eyJwYXBlcklEIjoiMTcwNC4wNjg4MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1704.06880"},{"id":"eyJwYXBlcklEIjoiMjQwMi4xMDUwMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2402.10500"},{"id":"eyJwYXBlcklEIjoiMjIwMi4wNTU2NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2202.05567"},{"id":"eyJwYXBlcklEIjoiMjMxMC4yMDE1OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2310.20158"},{"id":"eyJwYXBlcklEIjoiMTkxMS4wMTg3MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1911.01871"},{"id":"eyJwYXBlcklEIjoiMTkwOS4wNzA0MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1909.07040"},{"id":"eyJwYXBlcklEIjoiMTkxMS4wMTAzMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1911.01032"},{"id":"eyJwYXBlcklEIjoiMjIwNi4wNTc3MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2206.05772"},{"id":"eyJwYXBlcklEIjoiMjMwMi4xMzk0NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2302.13945"},{"id":"eyJwYXBlcklEIjoiMjExMi4xMDU5OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2112.10599"},{"id":"eyJwYXBlcklEIjoiMjAxMS4wNzg4MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2011.07881"},{"id":"eyJwYXBlcklEIjoiMjIwMS4wNzMwNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2201.07306"},{"id":"eyJwYXBlcklEIjoiMjMxMC4xOTczMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv
","paperID":"2310.19733"},{"id":"eyJwYXBlcklEIjoiMjEwNy4wNTg0OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2107.05849"},{"id":"eyJwYXBlcklEIjoiMjEwOC4xMTU2MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2108.11563"},{"id":"eyJwYXBlcklEIjoiMjIwNy4wMjk5MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2207.02992"}]}]}},{"author":"aditya gopalan","node":{"id":"eyJhZGRyZXNzIjoiYWRpdHlhQGlpc2MuYWMuaW4ifQ==","address":"aditya@iisc.ac.in","name":"Aditya","avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[{"name":"IISc Bangalore"}],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[],"scholar":[{"thirdPartyID":"dM5_1NsAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiIwN2Q3NmMxNS1kOTFkLTQ0MmUtODBmNy0yMDJjNzUyNDYzNTAifQ==","name":"aditya 
gopalan","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMTMxMS4wNDY2IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1311.0466"},{"id":"eyJwYXBlcklEIjoiMTQwNi43NDk4IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1406.7498"},{"id":"eyJwYXBlcklEIjoiMTYwMi4wODg4NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1602.08886"},{"id":"eyJwYXBlcklEIjoiMTcwNC4wNjg4MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1704.06880"},{"id":"eyJwYXBlcklEIjoiMjEwMi4wODIwMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.08201"},{"id":"eyJwYXBlcklEIjoiMTkxMS4wMTg3MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1911.01871"},{"id":"eyJwYXBlcklEIjoiMTkwOS4wNzA0MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1909.07040"},{"id":"eyJwYXBlcklEIjoiMTkwMy4wMDU0MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1903.00543"},{"id":"eyJwYXBlcklEIjoiMTkxMS4wMTY5NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1911.01695"},{"id":"eyJwYXBlcklEIjoiMjExMC4xMjkxNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.12916"},{"id":"eyJwYXBlcklEIjoiMTgwOC4wNDAwOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1808.04008"},{"id":"eyJwYXBlcklEIjoiMTkxMS4wMTAzMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1911.01032"},{"id":"eyJwYXBlcklEIjoiMjAwNi4wNzU2MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.07562"},{"id":"eyJwYXBlcklEIjoiMTkxMS4wODE5NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1911.08197"},{"id":"eyJwYXBlcklEIjoiMTgxMC4xMDMyMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1810.10321"},{"id":"eyJwYXBlcklEIjoiMTYxMS4xMDI4MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1611.10283"},{"id":"eyJwYXBlcklEIjoiMjIwMS4wNzMwNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2201.07306"},{"id":"eyJwYXBlcklEIjoiMjAwNC4xMjc4Mi
IsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2004.12782"},{"id":"eyJwYXBlcklEIjoiMTkwMy4wMDU1OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1903.00558"},{"id":"eyJwYXBlcklEIjoiMjAwMi4wNzk5NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.07994"},{"id":"eyJwYXBlcklEIjoiMjAwMi4wODU4MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.08583"},{"id":"eyJwYXBlcklEIjoiMjIwNS4xMzYxNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2205.13617"},{"id":"eyJwYXBlcklEIjoiMjEwNy4xMDQ5MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2107.10492"},{"id":"eyJwYXBlcklEIjoiMTMxMS4wNDY4IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1311.0468"},{"id":"eyJwYXBlcklEIjoiMTkxMC4wODgwNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1910.08805"},{"id":"eyJwYXBlcklEIjoiMTcwNi4wNDEyNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1706.04125"},{"id":"eyJwYXBlcklEIjoiMjIwMy4xNjgxMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2203.16810"},{"id":"eyJwYXBlcklEIjoiMjQwNi4xNTY0OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2406.15648"}]}]}}]},"__typename":"paper","authorArray":["Sayak Ray Chowdhury","Aditya 
Gopalan"]}}],["$","$L18",null,{"container":true,"columns":100,"spacing":{"compact":0,"expanded":2,"large":3},"children":[["$","$L18",null,{"size":{"compact":100,"expanded":100,"large":68},"children":[["$","$7",null,{"children":["$","$L2b",null,{"publisher":"arxiv","paperID":"1911.01871","product":{"paper":"$1b:props:children:props:children:0:props:product","models":"$1b:props:children:props:children:0:props:product:models"},"isWhiteLabelled":false}]}],["$","$7",null,{"children":["$","$L2c",null,{"article":"$L2d","model":"$undefined"}]}]]}],["$","$L18",null,{"size":"grow","children":["$","$L2e",null,{}]}]]}],["$","$7",null,{"children":null}],[["$","audio",null,{"id":"tts"}],["$","$L2f",null,{"paperID":"1911.01871","publisher":"arxiv","paperJSON":{"title":"On Online Learning in Kernelized Markov Decision Processes","paperID":"1911.01871","avgLineHeight":12,"imgScale":4,"sections":[{"heading":"Abstract","paragraphs":[[{"text":"We develop algorithms with low regret for learning episodic Markov decision processes based on kernel approximation techniques. The algorithms are based on both the Upper Confidence Bound (UCB) as well as Posterior or Thompson Sampling (PSRL) philosophies, and work in the general setting of continuous state and action spaces when the true unknown transition dynamics are assumed to have smoothness induced by an appropriate Reproducing Kernel Hilbert Space (RKHS).","element":"span"}]]},{"heading":"I. INTRODUCTION","paragraphs":[[{"text":"The goal of reinforcement learning (RL) is to learn optimal behavior by repeated interaction with an unknown environment, usually modeled as a Markov Decision Process (MDP). Performance is typically measured by the amount of interaction, in terms of episodes or rounds, needed to arrive at an optimal (or near-optimal) policy; this is also known as the sample complexity of RL [1]. 
The sample complexity objective encourages efficient exploration across states and actions, but, at the same time, is indifferent to the reward earned during the learning phase.","element":"span"}],[{"text":"A related, but different, goal in RL is the online one, i.e., to learn to gather high cumulative reward, or to equivalently keep the learner’s regret (the gap between its and the optimal policy’s net reward) as low as possible. This is preferable in settings where experimentation comes at a premium and the reward earned in each round is of direct value, e.g., recommender systems (in which rewards correspond to clickthrough events and ultimately translate to revenue), dynamic pricing – in general, control of unknown dynamical systems with instantaneous costs.","element":"span"}],[{"text":"A primary challenge in RL is to learn efficiently across complex (very large or infinite) state and action spaces. In the most general tabula rasa MDP setting, the learner must explore each state-action transition before developing a reasonably clear understanding of the environment, which is prohibitive for large problems. Real-world domains, though, possess more structure: transition and reward behavior often varies smoothly over states and actions, making it possible to generalize via inductive inference – observing a state transition or reward is informative of other, similar transitions or rewards. Scaling RL to large, complex, real-world domains requires exploiting regularity structure in the environment, which has typically been carried out via the use of parametric MDP models in model-based approaches, e.g., [2].","element":"span"}],[{"text":"This paper takes a step in developing theory and algorithms for online RL in environments with smooth transition and reward structure. 
We specifically consider the episodic online learning problem in the nonparametric, kernelizable MDP setting, i.e., of minimizing regret (relative to an optimal finite-horizon policy) in MDPs with continuous state and action spaces, whose transition and reward functions exhibit smoothness over states and actions compatible with the structure of a reproducing kernel. We develop variants of the well-known UCRL and posterior sampling algorithms for MDPs with continuous state and action spaces, and show that they enjoy sublinear, finite-time regret bounds when the mean transition and reward functions are assumed to belong to the associated Reproducing Kernel Hilbert Space (RKHS) of functions.","element":"span"}],[{"text":"Our results bound the regret of the algorithms in terms of a novel generalization of the information gain of the state transition and reward function kernels, from the memoryless kernel bandit setting [3] to the state-based kernel MDP setting, and help shed light on how the choice of kernel model influences regret performance. We also leverage two different kernel approximation techniques, namely the Quadrature Fourier Features (QFF) approximation [25] and the Nyström approximation [31], to prove the results in the paper. To the best of our knowledge, these are the first concrete regret bounds for RL under kernel approximation, explicitly showing the dependence of regret on kernel structure.","element":"span"}],[{"text":"Our results represent a generalization of several streams of work. We generalize online learning in the kernelized bandit setting [4], [6] to kernelized MDPs, and tabula rasa online learning approaches for MDPs such as Upper Confidence Bound for Reinforcement Learning (UCRL) [7] and Posterior Sampling for Reinforcement Learning (PSRL) [8], [9] to MDPs with kernel structure. 
Our results can also generalize regret minimization for an episodic variant of the well-known parametric Linear Quadratic Regulator (LQR) problem [10]– [13] to its nonlinear, nonparametric, infinite-dimensional, kernelizable counterpart.","element":"span"}]]},{"heading":"II. RELATED WORK","paragraphs":[[{"text":"$30","element":"span"}]]},{"heading":"III. PROBLEM STATEMENT","paragraphs":[[{"text":"We consider the problem of learning to optimize reward in an unknown finite-horizon MDP, ","element":"span"},{"style":{"height":16},"width":414.08,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-0.png","element":"img","alt":" M⋆ = {S, A, R⋆, P⋆, H}","inline":true},{"text":", over repeated episodes of interaction. Here, ","element":"span"},{"style":{"height":11.6},"width":148.96,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-1.png","element":"img","alt":" S ⊂ Rm","inline":true,"padRight":true},{"text":"represents the state space, ","element":"span"},{"style":{"height":12.4},"width":150.56,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-2.png","element":"img","alt":" A ⊂ Rn","inline":true,"padRight":true},{"text":"the action space, ","element":"span"},{"text":"H ","element":"span"},{"text":"the episode length, ","element":"span"},{"style":{"height":16},"width":137.44,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-3.png","element":"img","alt":" R⋆(s, a)","inline":true,"padRight":true},{"text":"the reward distribution over ","element":"span"},{"text":"R","element":"span"},{"text":", and ","element":"span"},{"style":{"height":16},"width":132.64,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-4.png","element":"img","alt":"P⋆(s, a)","inline":true,"padRight":true},{"text":"the transition distribution over ","element":"span"},{"text":"S","element":"span"},{"text":". 
At each period ","element":"span"},{"text":"h ","element":"span"},{"text":"= 1","element":"span"},{"text":", ","element":"span"},{"text":"2","element":"span"},{"text":", . . ., H ","element":"span"},{"text":"within an episode, an agent observes a state ","element":"span"},{"style":{"height":13.1},"width":127.28,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-5.png","element":"img","alt":" sh ∈ S","inline":true},{"text":", takes an action ","element":"span"},{"style":{"height":13.9},"width":135.68,"height":34.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-6.png","element":"img","alt":" ah ∈ A","inline":true},{"text":", observes a reward ","element":"span"},{"style":{"height":16},"width":289.6,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-7.png","element":"img","alt":"rh ∼ R⋆(sh, ah)","inline":true},{"text":", and causes the MDP to transition to a next state ","element":"span"},{"style":{"height":16},"width":324.64,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-8.png","element":"img","alt":" sh+1 ∼ P⋆(sh, ah)","inline":true},{"text":". 
We assume that the agent, while not possessing knowledge of the reward and transition distribution ","element":"span"},{"style":{"height":14},"width":105.68,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-9.png","element":"img","alt":" R⋆, P⋆","inline":true,"padRight":true},{"text":"of the unknown MDP ","element":"span"},{"style":{"height":13.1},"width":52.88,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-10.png","element":"img","alt":" M⋆","inline":true},{"text":", knows ","element":"span"},{"text":"S","element":"span"},{"text":", ","element":"span"},{"text":"A ","element":"span"},{"text":"and ","element":"span"},{"text":"H","element":"span"},{"text":".","element":"span"}],[{"text":"A policy ","element":"span"},{"style":{"height":16},"width":468.79,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-11.png","element":"img","alt":" π : S × {1, 2, . . ., H} → A","inline":true,"padRight":true},{"text":"is defined to be a mapping from a state ","element":"span"},{"style":{"height":11.6},"width":94.16,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-12.png","element":"img","alt":" s ∈ S","inline":true,"padRight":true},{"text":"and a period ","element":"span"},{"style":{"height":13.2},"width":187.2,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-13.png","element":"img","alt":" 1 ≤ h ≤ H","inline":true,"padRight":true},{"text":"to an action ","element":"span"},{"style":{"height":12.4},"width":102.56,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-14.png","element":"img","alt":" a ∈ A","inline":true},{"text":". 
For any MDP ","element":"span"},{"style":{"height":16},"width":429.92,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-15.png","element":"img","alt":" M = {S, A, RM, PM, H}","inline":true,"padRight":true},{"text":"and policy ","element":"span"},{"style":{"height":6.8},"width":23,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-16.png","element":"img","alt":" π","inline":true},{"text":", the finite horizon, undiscounted, value function for every state ","element":"span"},{"style":{"height":11.6},"width":93.2,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-17.png","element":"img","alt":" s ∈ S","inline":true,"padRight":true},{"text":"and every period ","element":"span"},{"style":{"height":13.2},"width":185.28,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-18.png","element":"img","alt":" 1 ≤ h ≤ H","inline":true,"padRight":true},{"text":"is defined as","element":"span"}],[{"style":{"width":"74%"},"width":727,"height":124,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-19.png","element":"img"}],[{"text":"where the subscript ","element":"span"},{"style":{"height":6.8},"width":23,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-20.png","element":"img","alt":" π","inline":true,"padRight":true},{"text":"indicates the application of the learning policy ","element":"span"},{"style":{"height":6.8},"width":23,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-21.png","element":"img","alt":" π","inline":true},{"text":", i.e., ","element":"span"},{"style":{"height":16.7},"width":251.2,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-22.png","element":"img","alt":" aj = π(sj, j)","inline":true},{"text":", and the subscript ","element":"span"},{"text":"M ","element":"span"},{"text":"explicitly references the MDP 
environment ","element":"span"},{"text":"M","element":"span"},{"text":", i.e., ","element":"span"},{"style":{"height":11.5},"width":118.36,"height":28.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-23.png","element":"img","alt":" sj+1 ∼","inline":true},{"style":{"height":16.7},"width":183.52,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-24.png","element":"img","alt":"PM(sj, aj)","inline":true},{"text":", for all ","element":"span"},{"text":"j ","element":"span"},{"text":"= ","element":"span"},{"text":"h, . . . , H","element":"span"},{"text":".","element":"span"}],[{"text":"We use","element":"span"},{"style":{"height":20},"width":565.16,"height":50,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-25.png","element":"img","alt":"R̄M(s, a) = E[r | r ∼ RM(s, a)]","inline":true},{"text":"to denote the mean of the reward distribution ","element":"span"},{"style":{"height":16},"width":154.72,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-26.png","element":"img","alt":" RM(s, a)","inline":true,"padRight":true},{"text":"that corresponds to playing action ","element":"span"},{"text":"a ","element":"span"},{"text":"at state ","element":"span"},{"text":"s ","element":"span"},{"text":"in the MDP ","element":"span"},{"text":"M","element":"span"},{"text":". 
We can view a sample ","element":"span"},{"text":"r ","element":"span"},{"text":"from the reward distribution ","element":"span"},{"style":{"height":16},"width":154.24,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-27.png","element":"img","alt":" RM(s, a)","inline":true,"padRight":true},{"text":"as ","element":"span"},{"style":{"height":16},"width":336,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-28.png","element":"img","alt":"r =RM(s, a) + εR","inline":true},{"text":", where ","element":"span"},{"style":{"height":9.5},"width":42.72,"height":23.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-29.png","element":"img","alt":" εR","inline":true,"padRight":true},{"text":"denotes a sample of zero-mean, real-valued additive noise. Similarly, the transition distribution ","element":"span"},{"style":{"height":16},"width":149.92,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-30.png","element":"img","alt":" PM(s, a)","inline":true,"padRight":true},{"text":"can also be decomposed as a mean value","element":"span"},{"style":{"height":16},"width":155.68,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-31.png","element":"img","alt":"P M(s, a)","inline":true,"padRight":true},{"text":"in ","element":"span"},{"style":{"height":10.8},"width":56.8,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-32.png","element":"img","alt":" Rm","inline":true,"padRight":true},{"text":"plus a zero-mean additive noise ","element":"span"},{"style":{"height":9.5},"width":42.72,"height":23.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-33.png","element":"img","alt":" εP","inline":true,"padRight":true},{"text":"in 
","element":"span"},{"style":{"height":10.8},"width":56.8,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-34.png","element":"img","alt":" Rm","inline":true,"padRight":true},{"text":"so that ","element":"span"},{"style":{"height":16},"width":350.88,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-35.png","element":"img","alt":" s′ =P M(s, a) + εP","inline":true,"padRight":true},{"text":"lies in","element":"span"},{"text":"1 ","element":"span"},{"style":{"height":11.6},"width":152.8,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-36.png","element":"img","alt":"S ⊂ Rm","inline":true},{"text":". A policy ","element":"span"},{"style":{"height":9.1},"width":54.56,"height":22.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-37.png","element":"img","alt":" πM","inline":true,"padRight":true},{"text":"is said to be optimal for the MDP ","element":"span"},{"text":"M ","element":"span"},{"text":"if","element":"span"}],[{"style":{"width":"85%"},"width":839,"height":61,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-38.png","element":"img"}],[{"text":"At the beginning of each episode ","element":"span"},{"text":"l","element":"span"},{"text":", an RL algorithm chooses a policy ","element":"span"},{"style":{"height":9.11},"width":32.56,"height":22.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-39.png","element":"img","alt":" πl","inline":true,"padRight":true},{"text":"depending upon the observed state-action-reward sequences upto episode ","element":"span"},{"style":{"height":10.8},"width":93.92,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-40.png","element":"img","alt":" l − 1","inline":true},{"text":", denoted by the history 
","element":"span"},{"style":{"height":16.71},"width":780.12,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-41.png","element":"img","alt":" Hl−1 := {sj,k, aj,k, rj,k, sj,k+1}1≤j≤l−1,1≤k≤H","inline":true},{"text":", and executes it for the entire duration of the episode. In other words, at each period ","element":"span"},{"text":"h ","element":"span"},{"text":"of the ","element":"span"},{"text":"l","element":"span"},{"text":"-th episode, the learning algorithm chooses action ","element":"span"},{"style":{"height":16.71},"width":281.92,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-42.png","element":"img","alt":" al,h = πl(sl,h, h)","inline":true},{"text":", receives reward ","element":"span"},{"style":{"height":16.71},"width":496.12,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-43.png","element":"img","alt":"rl,h = R⋆(sl,h, al,h) + εR,l,h","inline":true,"padRight":true},{"text":"and observes the next state ","element":"span"},{"style":{"height":16.71},"width":536.44,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-44.png","element":"img","alt":"sl,h+1 = P ⋆(sl,h, al,h) + εP,l,h","inline":true},{"text":". 
The goal of an episodic online RL algorithm is to maximize its cumulative reward across episodes, or, equivalently, minimize its cumulative regret: the loss incurred in terms of the value function due to not knowing the optimal policy ","element":"span"},{"style":{"height":10.64},"width":168.4,"height":26.6,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-45.png","element":"img","alt":" π⋆ := πM⋆","inline":true,"padRight":true},{"text":"of the unknown MDP ","element":"span"},{"style":{"height":13.1},"width":52.88,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-46.png","element":"img","alt":" M⋆","inline":true,"padRight":true},{"text":"beforehand and instead using the policy ","element":"span"},{"style":{"height":9.1},"width":32.56,"height":22.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-47.png","element":"img","alt":" πl","inline":true,"padRight":true},{"text":"for each episode ","element":"span"},{"text":"l","element":"span"},{"text":", ","element":"span"},{"text":"l ","element":"span"},{"text":"= 1","element":"span"},{"text":", ","element":"span"},{"text":"2","element":"span"},{"text":", . . .","element":"span"},{"text":". The cumulative (expected) regret of an RL algorithm ","element":"span"},{"style":{"height":16},"width":301.76,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-48.png","element":"img","alt":" π = {π1, π2, . . 
.}","inline":true,"padRight":true},{"text":"upto time horizon ","element":"span"},{"style":{"height":10.8},"width":139.68,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-49.png","element":"img","alt":"T = τH","inline":true,"padRight":true},{"text":"is defined as","element":"span"}],[{"style":{"width":"66%"},"width":647,"height":111,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-50.png","element":"img"}],[{"text":"where the initial states ","element":"span"},{"style":{"height":15.5},"width":159.68,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-51.png","element":"img","alt":" sl,1, l ≥ 1","inline":true,"padRight":true},{"text":"are assumed to be fixed.","element":"span"}],[{"text":"For the rest of the paper, unless otherwise specified, we define ","element":"span"},{"style":{"height":16},"width":661.12,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-52.png","element":"img","alt":" Z := S × A, z := (s, a), z′ := (s′, a′)","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":11.5},"width":115.48,"height":28.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-53.png","element":"img","alt":" zl,h :=","inline":true},{"style":{"height":16.7},"width":169.6,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-54.png","element":"img","alt":"(sl,h, al,h)","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"height":13.2},"width":86.24,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-55.png","element":"img","alt":" l ≥ 1","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":13.2},"width":185.28,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-56.png","element":"img","alt":" 1 ≤ h ≤ 
H","inline":true},{"text":".","element":"span"}]]},{"heading":"IV. ASSUMPTIONS","paragraphs":[[{"id":"id-5","text":"A. Smoothness of Value Function","element":"span"}],[{"style":{"width":"96%"},"width":941,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-57.png","element":"img"}],[{"style":{"height":13.2},"width":206.4,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-58.png","element":"img","alt":"1 ≤ h ≤ H","inline":true},{"text":", we define the one step future value function as the expected value of the optimal policy ","element":"span"},{"style":{"height":9.1},"width":54.56,"height":22.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-59.png","element":"img","alt":" πM","inline":true},{"text":", with the next state distributed according to ","element":"span"},{"style":{"height":10},"width":26,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-60.png","element":"img","alt":" ϕ","inline":true},{"text":", i.e. ","element":"span"},{"style":{"height":17.78},"width":187.96,"height":44.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-61.png","element":"img","alt":" U Mh (ϕ) :=","inline":true},{"style":{"height":28.8},"width":341.08,"height":72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-62.png","element":"img","alt":"Es′∼ϕ[V MπM,h+1(s′)]","inline":true},{"text":". We assume the following regularity condition on the future value function of any MDP (also made by [2]). 
For any two single-step transition distributions ","element":"span"},{"style":{"height":10},"width":103.84,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-63.png","element":"img","alt":"ϕ1, ϕ2","inline":true,"padRight":true},{"text":"over ","element":"span"},{"text":"S","element":"span"},{"text":", and ","element":"span"},{"style":{"height":13.2},"width":185.28,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-64.png","element":"img","alt":" 1 ≤ h ≤ H","inline":true},{"text":",","element":"span"}],[{"style":{"width":"84%"},"width":830,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-65.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":16.7},"width":395.12,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-66.png","element":"img","alt":"ϕ := Es′∼ϕ[s′] ∈ S","inline":true,"padRight":true},{"text":"denotes the mean of the distribution ","element":"span"},{"style":{"height":10},"width":26,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-67.png","element":"img","alt":" ϕ","inline":true},{"text":". In other words, the one-step future value functions for each period ","element":"span"},{"text":"h ","element":"span"},{"text":"are Lipschitz continuous with respect to the ","element":"span"},{"style":{"height":16.8},"width":66.88,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-68.png","element":"img","alt":" ∥·∥2","inline":true},{"text":"-norm of the mean","element":"span"},{"text":"2","element":"span"},{"text":", with global Lipschitz constant ","element":"span"},{"style":{"height":13.1},"width":59.36,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-69.png","element":"img","alt":" LM","inline":true},{"text":". 
We also assume that there is a known constant ","element":"span"},{"text":"L ","element":"span"},{"text":"such that ","element":"span"},{"style":{"height":14.64},"width":156.6,"height":36.6,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-70.png","element":"img","alt":" LM⋆ ≤ L","inline":true},{"text":".","element":"span"}],[{"text":"B. Smoothness of Mean Reward and Transition Functions","element":"span"}],[{"text":"Attaining sub-linear regret is impossible in general for arbitrary reward and transition distributions, and thus some regularity assumptions are needed. In this paper, we assume smoothness for the mean reward function","element":"span"},{"style":{"height":13.1},"width":249.8,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-71.png","element":"img","alt":"R⋆ : Z → R","inline":true,"padRight":true},{"text":"is induced by the structure of a kernel on ","element":"span"},{"text":"Z","element":"span"},{"text":". Specifically, we make the standard assumption of a p.s.d. 
kernel ","element":"span"},{"style":{"height":13.1},"width":77.72,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-72.png","element":"img","alt":" kR :","inline":true},{"style":{"height":11.2},"width":208.52,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-73.png","element":"img","alt":"Z × Z → R","inline":true,"padRight":true},{"text":"such that ","element":"span"},{"style":{"height":16},"width":212.48,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-74.png","element":"img","alt":" kR(z, z) ≤ 1","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"height":11.6},"width":103.48,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-75.png","element":"img","alt":" z ∈ Z","inline":true},{"text":", and","element":"span"},{"style":{"height":13.1},"width":44.72,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-76.png","element":"img","alt":"R⋆","inline":true,"padRight":true},{"text":"being an element of the reproducing kernel Hilbert space (RKHS) ","element":"span"},{"style":{"height":16},"width":138.88,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-77.png","element":"img","alt":" HkR(Z)","inline":true,"padRight":true},{"text":"of smooth real valued functions on ","element":"span"},{"text":"Z","element":"span"},{"text":". 
An RKHS of real-valued functions ","element":"span"},{"style":{"height":11.2},"width":125,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-78.png","element":"img","alt":" X → R","inline":true},{"text":", denoted by ","element":"span"},{"style":{"height":16},"width":118.72,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-79.png","element":"img","alt":" Hk(X)","inline":true},{"text":", is completely specified by its kernel function ","element":"span"},{"style":{"height":16},"width":93.28,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-80.png","element":"img","alt":" k(·, ·)","inline":true,"padRight":true},{"text":"and vice versa, with an inner product ","element":"span"},{"style":{"height":16},"width":87.56,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-81.png","element":"img","alt":" ⟨·, ·⟩k","inline":true,"padRight":true},{"text":"obeying the reproducing property ","element":"span"},{"style":{"height":16},"width":352.04,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-82.png","element":"img","alt":" f(x) = ⟨f, k(x, ·)⟩k","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"height":16},"width":220.96,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-83.png","element":"img","alt":" f ∈ Hk(X)","inline":true},{"text":". The induced RKHS norm ","element":"span"},{"style":{"height":20},"width":309.32,"height":50,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/1-84.png","element":"img","alt":" ∥f∥k = √⟨f, f⟩k","inline":true,"padRight":true},{"text":"is a measure of smoothness of ","element":"span"},{"text":"f ","element":"span"},{"text":"with respect to the kernel function ","element":"span"},{"text":"k","element":"span"},{"text":". 
We assume that the RKHS norm of","element":"span"},{"style":{"height":13.11},"width":44.72,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-0.png","element":"img","alt":"R⋆","inline":true,"padRight":true},{"text":"is bounded, i.e., ","element":"span"},{"style":{"height":22.11},"width":261.12,"height":55.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-1.png","element":"img","alt":"��R⋆��kR ≤ BR","inline":true,"padRight":true},{"text":"for some ","element":"span"},{"style":{"height":13.1},"width":168.16,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-2.png","element":"img","alt":" BR < ∞","inline":true},{"text":". Boundedness of ","element":"span"},{"style":{"height":13.1},"width":44.64,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-3.png","element":"img","alt":" kR","inline":true,"padRight":true},{"text":"along the diagonal holds for any stationary kernel, i.e., where ","element":"span"},{"style":{"height":16},"width":371.2,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-4.png","element":"img","alt":"kR(z, z′) = kR(z −z′)","inline":true},{"text":", e.g., the Squared Exponential kernel ","element":"span"},{"style":{"height":13.1},"width":53,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-5.png","element":"img","alt":"kSE","inline":true,"padRight":true},{"text":"and the Mat","element":"span"},{"style":{"height":10.8},"width":21.44,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-6.png","element":"img","alt":"´e","inline":true},{"text":"rn kernel ","element":"span"},{"style":{"height":13.1},"width":101.36,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-7.png","element":"img","alt":" 
kMat´ern","inline":true},{"text":":","element":"span"}],[{"style":{"width":"85%"},"width":842,"height":232,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-8.png","element":"img"}],[{"text":"where ","element":"span"},{"text":"l > ","element":"span"},{"text":"0 ","element":"span"},{"text":"and ","element":"span"},{"style":{"height":11.6},"width":97.76,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-9.png","element":"img","alt":" ν > 0","inline":true,"padRight":true},{"text":"are hyperparameters of the kernels,","element":"span"}],[{"style":{"width":"100%"},"width":980,"height":481,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-10.png","element":"img"}],[{"text":"C. Sub-Gaussian Noise Variables","element":"span"}],[{"text":"We assume that the random variables ","element":"span"},{"style":{"height":19.78},"width":304.44,"height":49.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-11.png","element":"img","alt":" {εR,l,h}l≥1,1≤h≤H","inline":true,"padRight":true},{"text":"is conditionally zero-mean and ","element":"span"},{"style":{"height":9.1},"width":46.56,"height":22.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-12.png","element":"img","alt":" σR","inline":true},{"text":"-sub-Gaussian, i.e., there exists a known ","element":"span"},{"style":{"height":13.1},"width":121.76,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-13.png","element":"img","alt":" σR > 0","inline":true,"padRight":true},{"text":"such that for any ","element":"span"},{"style":{"height":11.6},"width":101,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-14.png","element":"img","alt":" λ ∈ R","inline":true},{"text":",","element":"span"}],[{"style":{"width":"88%"},"width":869,"height":50,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-15.png","element":"img"}],[{"text":"where 
","element":"span"},{"style":{"height":15.91},"width":141.76,"height":39.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-16.png","element":"img","alt":"FR,l,h−1","inline":true,"padRight":true},{"text":"is ","element":"span"},{"text":"the ","element":"span"},{"text":"sigma ","element":"span"},{"text":"algebra ","element":"span"},{"text":"generated ","element":"span"},{"text":"by the ","element":"span"},{"text":"random ","element":"span"},{"text":"variables ","element":"span"},{"style":{"height":16.71},"width":544.44,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-17.png","element":"img","alt":"{sj,k, aj,k, εR,j,k}1≤j≤l−1,1≤k≤H","inline":true},{"text":", ","element":"span"},{"style":{"height":16.7},"width":511,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-18.png","element":"img","alt":"{sl,k, al,k, εR,l,k}1≤k≤h−1, sl,h","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":11.5},"width":59.8,"height":28.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-19.png","element":"img","alt":" al,h","inline":true},{"text":". 
Similarly, the random variables ","element":"span"},{"style":{"height":19.78},"width":301.08,"height":49.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-20.png","element":"img","alt":" {εP,l,h}l≥1,1≤h≤H","inline":true,"padRight":true},{"text":"are assumed to be conditionally component-wise independent, zero-mean and ","element":"span"},{"style":{"height":9.1},"width":46.56,"height":22.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-21.png","element":"img","alt":" σP","inline":true,"padRight":true},{"text":"-sub-Gaussian, in the sense that there exists a known ","element":"span"},{"style":{"height":13.1},"width":138.08,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-22.png","element":"img","alt":" σP > 0","inline":true,"padRight":true},{"text":"such that for any ","element":"span"},{"style":{"height":11.6},"width":101,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-23.png","element":"img","alt":" λ ∈ R","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":13.2},"width":175.16,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-24.png","element":"img","alt":" 1 ≤ i ≤ m","inline":true},{"text":",","element":"span"}],[{"style":{"width":"92%"},"width":909,"height":112,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-25.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":15.91},"width":138.88,"height":39.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-26.png","element":"img","alt":"FP,l,h−1","inline":true,"padRight":true},{"text":"is ","element":"span"},{"text":"the ","element":"span"},{"text":"sigma ","element":"span"},{"text":"algebra ","element":"span"},{"text":"generated ","element":"span"},{"text":"by the ","element":"span"},{"text":"random ","element":"span"},{"text":"variables 
","element":"span"},{"style":{"height":16.71},"width":541.56,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-27.png","element":"img","alt":"{sj,k, aj,k, εP,j,k}1≤j≤l−1,1≤k≤H","inline":true},{"text":", ","element":"span"},{"style":{"height":16.7},"width":506.68,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-28.png","element":"img","alt":"{sl,k, al,k, εP,l,k}1≤k≤h−1, sl,h","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":11.5},"width":59.8,"height":28.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-29.png","element":"img","alt":" al,h","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":13.1},"width":45.28,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-30.png","element":"img","alt":" Im","inline":true,"padRight":true},{"text":"denotes the identity matrix of rank ","element":"span"},{"text":"m","element":"span"},{"text":".","element":"span"}]]},{"heading":"V. ALGORITHM","paragraphs":[[{"text":"A. Kernel Approximation. For kernelized MDPs, [24] develop variants of the UCRL2 algorithm which, at every episode ","element":"span"},{"text":"l","element":"span"},{"text":", constructs confidence sets for the mean reward and the mean transition functions. The construction of each confidence set requires one inversion of the kernel (Gram) matrix, which takes ","element":"span"},{"style":{"height":17.36},"width":93.28,"height":43.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-31.png","element":"img","alt":" O(l3)","inline":true,"padRight":true},{"text":"time. This makes the algorithm quite prohibitive for a large number of episodes. 
To reduce this computational cost without compromising on the accuracy of the confidence sets, we incorporate two efficient kernel approximation schemes, namely the Quadrature Fourier Features (QFF) approximation [25] and the Nyström approximation [26].","element":"span"}],[{"text":"1) Quadrature Fourier Features (QFF) approximation:","element":"span"}],[{"text":"If ","element":"span"},{"text":"k ","element":"span"},{"text":"is a bounded, continuous, positive definite, stationary kernel defined over ","element":"span"},{"style":{"height":11.6},"width":159.48,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-32.png","element":"img","alt":" X ⊂ Rq","inline":true,"padRight":true},{"text":"and satisfies ","element":"span"},{"text":"k","element":"span"},{"text":"(","element":"span"},{"text":"x, x","element":"span"},{"text":") = 1 ","element":"span"},{"text":"for all ","element":"span"},{"style":{"height":11.6},"width":121.8,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-33.png","element":"img","alt":" x ∈ X","inline":true},{"text":", then by Bochner’s theorem [27], ","element":"span"},{"text":"k ","element":"span"},{"text":"is the Fourier transform of a probability measure ","element":"span"},{"text":"p","element":"span"},{"text":", i.e., ","element":"span"},{"text":"k","element":"span"},{"text":"(","element":"span"},{"text":"x, y","element":"span"},{"text":") = ","element":"span"},{"style":{"height":19.12},"width":460.84,"height":47.8,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-34.png","element":"img","alt":"∫Rq p(ω) cos(ωT (x − y))dω","inline":true},{"text":". 
For the Squared Exponential ","element":"span"},{"text":"kernel defined over ","element":"span"},{"style":{"height":11.6},"width":163.32,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-35.png","element":"img","alt":" X ⊂ Rq","inline":true},{"text":", this measure has density ","element":"span"},{"style":{"height":30.61},"width":393.68,"height":76.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-36.png","element":"img","alt":"p(ω) = (l/√(2π))^q e^(−l²∥ω∥₂²/2)","inline":true,"padRight":true},{"text":"(abusing notation for measure and density). [25] show that for any stationary kernel ","element":"span"},{"text":"k ","element":"span"},{"text":"on ","element":"span"},{"style":{"height":10.8},"width":43.8,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-37.png","element":"img","alt":"Rq","inline":true,"padRight":true},{"text":"whose inverse Fourier transform decomposes product-wise, i.e., ","element":"span"},{"style":{"height":20.06},"width":344.32,"height":50.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-38.png","element":"img","alt":" p(ω) = ∏qj=1 pj(ωj)","inline":true},{"text":", we can use Gauss-Hermite ","element":"span"},{"text":"quadrature [28] to approximate it. If ","element":"span"},{"style":{"height":16},"width":199.32,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-39.png","element":"img","alt":" X = [0, 1]q","inline":true},{"text":", the SE kernel is approximated as follows. 
Choose ","element":"span"},{"style":{"height":14.43},"width":133.16,"height":36.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-40.png","element":"img","alt":"¯d ∈ N","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":17.83},"width":140.28,"height":44.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-41.png","element":"img","alt":"d = ( ¯d)q","inline":true},{"text":", and construct the ","element":"span"},{"text":"2","element":"span"},{"text":"d","element":"span"},{"text":"-dimensional feature map","element":"span"}],[{"id":"id-6","style":{"width":"99%"},"width":976,"height":230,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-42.png","element":"img"}],[{"text":"Here the set ","element":"span"},{"style":{"height":16},"width":263.32,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-43.png","element":"img","alt":" {ω1, . . . , ωd} =","inline":true}],[{"style":{"height":15.44},"width":243.52,"height":38.6,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-44.png","element":"img","alt":"A ¯d × · · · × A ¯d","inline":true},{"text":", where ","element":"span"},{"style":{"height":15.44},"width":51.04,"height":38.6,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-45.png","element":"img","alt":" A ¯d","inline":true,"padRight":true},{"text":"is the set of ","element":"span"},{"style":{"height":13.82},"width":26.72,"height":34.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-46.png","element":"img","alt":"¯d","inline":true,"padRight":true},{"text":"(real) roots of the ","element":"span"},{"style":{"height":13.82},"width":27.2,"height":34.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-47.png","element":"img","alt":"¯d","inline":true},{"text":"-th Hermite polynomial 
","element":"span"},{"style":{"height":14.64},"width":54.4,"height":36.6,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-48.png","element":"img","alt":" H ¯d","inline":true},{"text":", and ","element":"span"},{"style":{"height":27.52},"width":411.92,"height":68.8,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-49.png","element":"img","alt":" ν(z) = ∏qj=1 2¯d−1 ¯d!¯d2H ¯d−1(zj)2","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"height":11.6},"width":112.92,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-50.png","element":"img","alt":" z ∈ Rq","inline":true},{"text":". ","element":"span"},{"text":"2) Nyström approximation: Unlike the QFF approximation where the basis functions (cosine and sine) do not depend on the data, the basis functions used by the Nyström method are data-dependent. For a set of points ","element":"span"},{"style":{"height":16},"width":333.48,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-51.png","element":"img","alt":"{x1, . . . , xt} ⊂ X","inline":true},{"text":", the Nyström method [26] approximates a kernel ","element":"span"},{"style":{"height":11.2},"width":369.32,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-52.png","element":"img","alt":" k : X × X → R","inline":true,"padRight":true},{"text":"as follows: First, randomly sample ","element":"span"},{"text":"d ","element":"span"},{"text":"points to construct a dictionary ","element":"span"},{"text":"D ","element":"span"},{"text":"= ","element":"span"},{"style":{"height":16.7},"width":386.36,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-53.png","element":"img","alt":"{xi1, . . . , xid}; ij ∈ [t]","inline":true},{"text":", according to the following distribution. 
For each ","element":"span"},{"style":{"height":16},"width":123.8,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-54.png","element":"img","alt":" i ∈ [t]","inline":true},{"text":", include ","element":"span"},{"style":{"height":9.1},"width":33.56,"height":22.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-55.png","element":"img","alt":" xi","inline":true,"padRight":true},{"text":"in ","element":"span"},{"text":"D ","element":"span"},{"text":"independently with some suitably chosen probability ","element":"span"},{"style":{"height":10},"width":31.16,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-56.png","element":"img","alt":" pi","inline":true},{"text":". (","element":"span"},{"style":{"height":10},"width":31.16,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-57.png","element":"img","alt":"pi","inline":true},{"text":"’s trade off between the quality and the size of the embedding.) Then, compute the (approximate) ","element":"span"},{"text":"d","element":"span"},{"text":"-dimensional feature embedding","element":"span"}],[{"style":{"height":32.34},"width":430.72,"height":80.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-58.png","element":"img","alt":"˜ϕ(x) = �K1/2D �†kD(x)","inline":true},{"text":", where ","element":"span"},{"style":{"height":16.7},"width":380.2,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-59.png","element":"img","alt":" KD = [k(u, v)]u,v∈D","inline":true},{"text":", ","element":"span"},{"style":{"height":17.39},"width":611.96,"height":43.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-60.png","element":"img","alt":"kD(x) = [k(xi1, x), . . . 
, k(xid, x)]T","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":13.36},"width":44.76,"height":33.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-61.png","element":"img","alt":" A†","inline":true,"padRight":true},{"text":"denotes the pseudo inverse of any matrix ","element":"span"},{"text":"A","element":"span"},{"text":".","element":"span"}],[{"text":"Now we will present our algorithm Kernel-UCRL using the Nyström approximation. The description and performance of Kernel-UCRL using the quadrature Fourier features approximation are deferred to the Appendix.","element":"span"}],[{"text":"B. Kernel-UCRL Algorithm under Nyström Approximation","element":"span"}],[{"text":"Kernel-UCRL (Algorithm ","element":"span"},{"href":"#id-0","text":"1) ","element":"a"},{"text":"is an optimistic algorithm based on the Upper Confidence Bound principle, which adapts the confidence sets of UCRL2 [7] to exploit the kernel structure. At the start of episode ","element":"span"},{"text":"l","element":"span"},{"text":", first we find feature embeddings ","element":"span"},{"style":{"height":18.06},"width":390.52,"height":45.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-62.png","element":"img","alt":"˜ϕR,l : Z → RdR,L","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":15.5},"width":115.16,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-63.png","element":"img","alt":"˜ϕP,l :","inline":true},{"style":{"height":14.03},"width":225.4,"height":35.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-64.png","element":"img","alt":"¯Z → RdP,L","inline":true,"padRight":true},{"text":"to efficiently approximate the kernels ","element":"span"},{"style":{"height":13.1},"width":44.64,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-65.png","element":"img","alt":" 
kR","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":13.1},"width":44.64,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-66.png","element":"img","alt":" kP","inline":true,"padRight":true},{"text":", respectively. First, we construct a dictionary ","element":"span"},{"style":{"height":15.5},"width":74.32,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-67.png","element":"img","alt":"DR,l","inline":true,"padRight":true},{"text":"by including every state-action pair ","element":"span"},{"text":"z ","element":"span"},{"text":"from the set ","element":"span"},{"style":{"height":16.7},"width":912.8,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-68.png","element":"img","alt":" Zl−1 := {zj,k}1≤j≤l−1,1≤k≤H = {z1,1, . . . , zl−1,H}","inline":true,"padRight":true},{"text":"in ","element":"span"},{"style":{"height":15.5},"width":74.32,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-69.png","element":"img","alt":" DR,l","inline":true,"padRight":true},{"text":"with probability ","element":"span"},{"style":{"height":16.7},"width":114.88,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-70.png","element":"img","alt":" bR,l(z)","inline":true,"padRight":true},{"text":"(to be defined later). 
Then, we define ","element":"span"},{"style":{"height":28.8},"width":314.8,"height":72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/2-71.png","element":"img","alt":" ˜ϕR,l(z) =�K1/2DR,l","inline":true}],[{"text":"we define ","element":"span"},{"style":{"height":32.33},"width":586.72,"height":80.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-0.png","element":"img","alt":" ˜ϕP,l(z, i) = �K1/2DP,l�†kDP,l(z, i)","inline":true},{"text":", where the dictionary ","element":"span"},{"style":{"height":15.5},"width":71.44,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-1.png","element":"img","alt":" DP,l","inline":true,"padRight":true},{"text":"is constructed by including every ","element":"span"},{"text":"(","element":"span"},{"text":"z, i","element":"span"},{"text":") ","element":"span"},{"text":"from the set ","element":"span"},{"style":{"height":22.59},"width":751.96,"height":56.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-2.png","element":"img","alt":"¯Zl−1 := �(zj,k, i)�1≤j≤l−1,1≤k≤H,1≤i≤m =","inline":true},{"style":{"height":19.2},"width":457.88,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-3.png","element":"img","alt":"�(z1,1, 1), . . . 
, (zl−1,H, m)�","inline":true},{"text":"with probability ","element":"span"},{"style":{"height":16.7},"width":143.68,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-4.png","element":"img","alt":" bP,l(z, i)","inline":true,"padRight":true},{"text":"(to be defined later).","element":"span"}],[{"text":"Then we construct confidence sets ","element":"span"},{"style":{"height":15.5},"width":64.72,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-5.png","element":"img","alt":" CR,l","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":15.5},"width":61.84,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-6.png","element":"img","alt":" CP,l","inline":true,"padRight":true},{"text":"for the mean reward and the mean transition functions, respectively, as follows. First, we compute ","element":"span"},{"style":{"height":22.22},"width":466.72,"height":55.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-7.png","element":"img","alt":"˜θR,l−1 = ˜V −1R,l−1 ˜ΦTR,l−1Rl−1","inline":true},{"text":", ","element":"span"},{"text":"where ","element":"span"},{"style":{"height":18.07},"width":489.56,"height":45.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-8.png","element":"img","alt":" Rl−1 := [r1,1, . . . 
, rl−1,H]T","inline":true,"padRight":true},{"text":"with ","element":"span"},{"style":{"height":11.51},"width":57.32,"height":28.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-9.png","element":"img","alt":" rj,k","inline":true,"padRight":true},{"text":"being the reward of the state-action pair ","element":"span"},{"style":{"height":16.7},"width":252.44,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-10.png","element":"img","alt":" zj,k, j ∈ [l − 1]","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":16},"width":129.08,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-11.png","element":"img","alt":" k ∈ [H]","inline":true},{"text":", ","element":"span"},{"style":{"height":19.54},"width":703.64,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-12.png","element":"img","alt":"˜ΦR,l−1 = [ ˜ϕR,l(z1,1), . . . , ˜ϕR,l(zl−1,H)]T","inline":true,"padRight":true},{"text":", and ","element":"span"},{"style":{"height":19.54},"width":163.96,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-13.png","element":"img","alt":" ˜VR,l−1 =","inline":true},{"style":{"height":21.65},"width":390.16,"height":54.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-14.png","element":"img","alt":"˜ΦTR,l−1 ˜ΦR,l−1 + HIdR,l","inline":true},{"text":". Fix any ","element":"span"},{"style":{"height":14.8},"width":314.24,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-15.png","element":"img","alt":" 0 < δ, εR, εP < 1","inline":true},{"text":". 
Now, ","element":"span"},{"text":"we define ","element":"span"},{"style":{"height":15.5},"width":64.72,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-16.png","element":"img","alt":" CR,l","inline":true,"padRight":true},{"text":"to be the set of all functions ","element":"span"},{"style":{"height":14},"width":180.2,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-17.png","element":"img","alt":" f : Z → R","inline":true,"padRight":true},{"text":"such that","element":"span"}],[{"style":{"width":"87%"},"width":855,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-18.png","element":"img"}],[{"text":"where ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":16.7},"width":145.24,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-19.png","element":"img","alt":"µR,l−1(z","inline":true},{"text":") = ˜","element":"span"},{"style":{"height":19.73},"width":251.68,"height":49.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-20.png","element":"img","alt":"ϕR,l(z)T ˜θR,l−1","inline":true},{"text":", ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":20.37},"width":143.8,"height":50.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-21.png","element":"img","alt":"σ2R,l−1(z","inline":true},{"text":") = ","element":"span"},{"style":{"height":16},"width":169.72,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-22.png","element":"img","alt":" kR(z, 
z)−","inline":true,"padRight":true},{"text":"˜","element":"span"},{"style":{"height":18.06},"width":146.36,"height":45.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-23.png","element":"img","alt":"ϕR,l(z)T","inline":true,"padRight":true},{"text":"˜","element":"span"},{"style":{"height":16.7},"width":106.36,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-24.png","element":"img","alt":"ϕR,l(z","inline":true},{"text":") + ","element":"span"},{"style":{"height":22.03},"width":292.48,"height":55.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-25.png","element":"img","alt":" H ˜ϕR,l(z)T ˜V −1R,l−1","inline":true,"padRight":true},{"text":"˜","element":"span"},{"style":{"height":16.7},"width":106.36,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-26.png","element":"img","alt":"ϕR,l(z","inline":true},{"text":") ","element":"span"},{"text":"and ","element":"span"},{"style":{"height":15.9},"width":66.16,"height":39.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-27.png","element":"img","alt":" βR,l","inline":true,"padRight":true},{"text":"=","element":"span"}],[{"style":{"width":"96%"},"width":947,"height":70,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-28.png","element":"img"}],[{"text":"Similarly, we compute ","element":"span"},{"style":{"height":22.23},"width":450.4,"height":55.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-29.png","element":"img","alt":"˜θP,l−1 = ˜V −1P,l−1 ˜ΦTP,l−1Sl−1","inline":true},{"text":", where ","element":"span"},{"style":{"height":20.37},"width":505.4,"height":50.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-30.png","element":"img","alt":"Sl−1 := [sT1,2, . . . 
, sTl−1,H+1]T","inline":true,"padRight":true},{"text":"with ","element":"span"},{"style":{"height":11.5},"width":99.04,"height":28.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-31.png","element":"img","alt":" sj,k+1","inline":true,"padRight":true},{"text":"being the next ","element":"span"},{"text":"state of the state-action pair ","element":"span"},{"style":{"height":16.7},"width":299.96,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-32.png","element":"img","alt":" zj,k, j ∈ [l − 1]","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":11.6},"width":73.56,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-33.png","element":"img","alt":" k ∈","inline":true,"padRight":true},{"text":"[","element":"span"},{"text":"H","element":"span"},{"text":"]","element":"span"},{"text":", ","element":"span"},{"style":{"height":19.53},"width":801.08,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-34.png","element":"img","alt":"˜ΦP,l−1 = [ ˜ϕP,l(z1,1, 1), . . . , ˜ϕP,l(zl−1,H, m)]T","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":21.84},"width":599.92,"height":54.6,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-35.png","element":"img","alt":"˜VP,l−1 = ˜ΦTP,l−1 ˜ΦP,l−1 + mHIdP,l","inline":true},{"text":". 
Now, we define ","element":"span"},{"style":{"height":15.5},"width":61.84,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-36.png","element":"img","alt":" CP,l","inline":true,"padRight":true},{"text":"to be the set of all functions ","element":"span"},{"style":{"height":14},"width":208,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-37.png","element":"img","alt":" f : Z → Rm","inline":true,"padRight":true},{"text":"such that","element":"span"}],[{"style":{"height":17.37},"width":827.32,"height":43.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-38.png","element":"img","alt":"∥f(z) − ˜µP,l−1(z)∥2 ≤ βP,l ∥˜σP,l−1(z)∥2 , ∀z ∈ Z","inline":true},{"text":", ","element":"span"},{"text":"(6) where ","element":"span"},{"style":{"height":16.7},"width":500.8,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-39.png","element":"img","alt":"˜µP,l−1(z) = [˜µP,l−1(z, 1)","inline":true},{"text":", . . . 
, ","element":"span"},{"style":{"height":16.7},"width":123.52,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-40.png","element":"img","alt":" ˜µP,l−1(","inline":true},{"text":"z, m","element":"span"},{"style":{"height":17.55},"width":49.4,"height":43.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-41.png","element":"img","alt":")]T","inline":true,"padRight":true},{"text":", ","element":"span"},{"style":{"height":16.71},"width":123.52,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-42.png","element":"img","alt":"˜µP,l−1(","inline":true},{"text":"z, i","element":"span"},{"style":{"height":16.71},"width":284.8,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-43.png","element":"img","alt":") = ˜ϕP,l(","inline":true},{"text":"z, i","element":"span"},{"style":{"height":19.73},"width":467.32,"height":49.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-44.png","element":"img","alt":")T ˜θP,l−1, ˜σP,l−1(z) =","inline":true},{"style":{"height":16.7},"width":207.52,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-45.png","element":"img","alt":"[˜σP,l−1(z, 1)","inline":true},{"text":", . . . 
, ","element":"span"},{"style":{"height":16.7},"width":122.08,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-46.png","element":"img","alt":" ˜σP,l−1(","inline":true},{"text":"z, m","element":"span"},{"style":{"height":17.36},"width":49.4,"height":43.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-47.png","element":"img","alt":")]T","inline":true,"padRight":true},{"text":", ","element":"span"},{"style":{"height":20.37},"width":122.56,"height":50.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-48.png","element":"img","alt":"˜σ2P,l−1(","inline":true},{"text":"z, i","element":"span"},{"text":") ","element":"span"},{"text":"= ","element":"span"},{"text":"k","element":"span"},{"style":{"height":16},"width":58.24,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-49.png","element":"img","alt":"P ((","inline":true},{"text":"z, i","element":"span"},{"text":")","element":"span"},{"text":", ","element":"span"},{"text":"(","element":"span"},{"text":"z, i","element":"span"},{"style":{"height":16.7},"width":364.48,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-50.png","element":"img","alt":")) − ˜ϕP,l(","inline":true},{"text":"z, i","element":"span"},{"style":{"height":18.06},"width":124.96,"height":45.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-51.png","element":"img","alt":")T ˜ϕP,l(","inline":true},{"text":"z, i","element":"span"},{"text":") ","element":"span"},{"text":"+ ","element":"span"},{"text":"mH ","element":"span"},{"style":{"height":16.7},"width":84.64,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-52.png","element":"img","alt":" ˜ϕP,l(","inline":true},{"text":"z, i","element":"span"},{"style":{"height":18.83},"width":66.56,"height":47.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-53.png","element":"img","alt":")T 
˜","inline":true},{"text":"V ","element":"span"},{"style":{"height":21.52},"width":168.64,"height":53.8,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-54.png","element":"img","alt":"−1P,l−1 ˜ϕP,l(","inline":true},{"text":"z, i","element":"span"},{"text":") ","element":"span"},{"text":"and ","element":"span"},{"text":"the ","element":"span"},{"text":"confidence","element":"span"}],[{"style":{"width":"99%"},"width":977,"height":143,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-55.png","element":"img"}],[{"text":"Next, we build the set ","element":"span"},{"style":{"height":13.1},"width":58,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-56.png","element":"img","alt":" Ml","inline":true,"padRight":true},{"text":"of all plausible MDPs ","element":"span"},{"text":"M ","element":"span"},{"text":"such that: ","element":"span"},{"text":"(","element":"span"},{"text":"i","element":"span"},{"text":") ","element":"span"},{"text":"the mean reward function","element":"span"},{"style":{"height":16.71},"width":261.76,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-57.png","element":"img","alt":"RM ∈ CR,l, (ii)","inline":true,"padRight":true},{"text":"the mean transition function","element":"span"},{"style":{"height":15.51},"width":177.04,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-58.png","element":"img","alt":"P M ∈ CP,l","inline":true,"padRight":true},{"text":"and ","element":"span"},{"text":"(","element":"span"},{"text":"iii","element":"span"},{"text":") ","element":"span"},{"text":"the global Lipschitz constant (of future value functions) ","element":"span"},{"style":{"height":13.2},"width":156.6,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-59.png","element":"img","alt":" LM ≤ L","inline":true},{"text":". 
Finally, we select an optimistic policy ","element":"span"},{"style":{"height":9.1},"width":32.56,"height":22.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-60.png","element":"img","alt":" πl","inline":true,"padRight":true},{"text":"for the family of MDPs ","element":"span"},{"style":{"height":13.1},"width":58,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-61.png","element":"img","alt":" Ml","inline":true,"padRight":true},{"text":"in the sense that ","element":"span"},{"style":{"height":21.33},"width":677.92,"height":53.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-62.png","element":"img","alt":" V Mlπl,1(sl,1) = maxπ maxM∈Ml V Mπ,1(sl,1)","inline":true},{"text":", ","element":"span"},{"text":"where ","element":"span"},{"style":{"height":11.5},"width":54.4,"height":28.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-63.png","element":"img","alt":" sl,1","inline":true,"padRight":true},{"text":"is the initial state and ","element":"span"},{"style":{"height":13.1},"width":48.88,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-64.png","element":"img","alt":" Ml","inline":true,"padRight":true},{"text":"is the most optimistic realization from ","element":"span"},{"style":{"height":13.1},"width":58,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-65.png","element":"img","alt":" Ml","inline":true},{"text":", and execute ","element":"span"},{"style":{"height":9.1},"width":32.56,"height":22.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-66.png","element":"img","alt":" πl","inline":true,"padRight":true},{"text":"for the entire episode. The pseudo-code of kernel-UCRL is given in Algorithm ","element":"span"},{"href":"#id-0","text":"1.","element":"a"}]]},{"heading":"VI. ANALYSIS OF KERNEL-UCRL UNDER NYSTR ¨OM APPROXIMATION","paragraphs":[[{"text":"A. 
Regret Bound of Kernel-UCRL","element":"span"}],[{"text":"Let ","element":"span"},{"style":{"height":20.18},"width":839.32,"height":50.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-67.png","element":"img","alt":"σ2R,l(z) := kR(z, z) − kR,l(z)T (KR,l +","inline":true},{"style":{"height":18.06},"width":271.36,"height":45.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-68.png","element":"img","alt":"HIlH)−1kR,l(z)","inline":true},{"text":", ","element":"span"},{"text":"where ","element":"span"},{"style":{"height":16.7},"width":364.12,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-69.png","element":"img","alt":"kR,l(z) :=","inline":true},{"style":{"height":18.06},"width":495.8,"height":45.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-70.png","element":"img","alt":"[kR(z1,1, z), . . . , kR(zl,H, z)]T","inline":true,"padRight":true},{"text":"denotes ","element":"span"},{"text":"the ","element":"span"},{"text":"vector ","element":"span"},{"text":"of kernel evaluations between ","element":"span"},{"text":"z ","element":"span"},{"text":"and elements of ","element":"span"},{"style":{"height":13.11},"width":38.8,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-71.png","element":"img","alt":" Zl","inline":true},{"text":", and ","element":"span"},{"style":{"height":15.51},"width":77.68,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-72.png","element":"img","alt":" KR,l","inline":true,"padRight":true},{"text":"denotes the kernel matrix computed at ","element":"span"},{"style":{"height":13.11},"width":38.8,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-73.png","element":"img","alt":" 
Zl","inline":true},{"text":".","element":"span"}],[{"style":{"width":"100%"},"width":981,"height":917,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-74.png","element":"img"}],[{"style":{"height":25.87},"width":325.44,"height":64.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-75.png","element":"img","alt":"ηR = 6λR ln(12T/δ)ε2R","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":20.37},"width":540.8,"height":50.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-76.png","element":"img","alt":" bR,l(z) = min{ηR˜σ2R,l−1(z), 1}","inline":true},{"text":". ","element":"span"},{"text":"Then, with probability at least ","element":"span"},{"style":{"height":16},"width":131.36,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-77.png","element":"img","alt":" 1 − δ/3","inline":true},{"text":", uniformly over all ","element":"span"},{"style":{"height":11.6},"width":100.12,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-78.png","element":"img","alt":"z ∈ Z","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":16},"width":105.08,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-79.png","element":"img","alt":" l ∈ [τ]","inline":true},{"text":", the following holds:","element":"span"}],[{"style":{"width":"100%"},"width":980,"height":842,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-80.png","element":"img"}],[{"style":{"height":33.7},"width":640,"height":84.24,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-81.png","element":"img","alt":"�� ˜V −1R,l−1 ˜ΦTR,l−1εR,l−1�� ˜VR,l−1˜σR,l−1(z)","inline":true},{"text":", ","element":"span"},{"text":"where 
","element":"span"},{"style":{"height":18.06},"width":695.48,"height":45.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-82.png","element":"img","alt":"εR,l−1 := [εR,1,1, . . . , εR,l−1,H]T","inline":true,"padRight":true},{"text":"denotes ","element":"span"},{"text":"the vector ","element":"span"},{"text":"of ","element":"span"},{"text":"reward ","element":"span"},{"text":"noise ","element":"span"},{"text":"variables. ","element":"span"},{"text":"Now ","element":"span"},{"text":"see ","element":"span"},{"text":"that ","element":"span"},{"style":{"height":35.61},"width":965.84,"height":89.04,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-83.png","element":"img","alt":"�� ˜V −1R,l−1 ˜ΦTR,l−1εR,l−1�� ˜VR,l−1 = ��˜ΦTR,l−1εR,l−1�� ˜V −1R,l−1","inline":true},{"text":". Then by [30, Theorem 1], for any ","element":"span"},{"style":{"height":16},"width":206.08,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-84.png","element":"img","alt":" δ ∈ (0, 1)","inline":true},{"text":", with probability at least ","element":"span"},{"style":{"height":16},"width":130.88,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-85.png","element":"img","alt":" 1 − δ/6","inline":true},{"text":", uniformly over all ","element":"span"},{"style":{"height":11.6},"width":107.32,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-86.png","element":"img","alt":" z ∈ Z","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":13.2},"width":85.76,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-87.png","element":"img","alt":"l ≥ 1","inline":true},{"text":",","element":"span"}],[{"id":"id-1","style":{"width":"98%"},"width":961,"height":202,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-88.png","element":"img"}],[{"text":"Further, from [31, Theorem 1], with probability at least 
","element":"span"},{"style":{"height":16},"width":133.76,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-89.png","element":"img","alt":"1 − δ/6","inline":true},{"text":", uniformly over all ","element":"span"},{"style":{"height":11.6},"width":115,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-90.png","element":"img","alt":" z ∈ Z","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":16},"width":119.48,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-91.png","element":"img","alt":" l ∈ [τ]","inline":true},{"text":", we have","element":"span"},{"style":{"height":7.6},"width":16,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-92.png","element":"img","alt":"1","inline":true},{"style":{"height":20.47},"width":748,"height":51.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-93.png","element":"img","alt":"λR σ2R,l−1(z) ≤ ˜σ2R,l−1(z) ≤ λRσ2R,l−1(z)","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":15.51},"width":124.12,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/3-94.png","element":"img","alt":" dR,l ≤","inline":true,"padRight":true},{"style":{"height":19.61},"width":460.48,"height":49.04,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-0.png","element":"img","alt":"6ηRλR�1 + 1H�γ(l−1)H(R)","inline":true},{"text":". 
Then, from [32, Equation 25], we have","element":"span"}],[{"id":"id-2","style":{"width":"92%"},"width":909,"height":86,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-1.png","element":"img"}],[{"text":"(8) Now the result follows by combining ","element":"span"},{"href":"#id-1","text":"7 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-2","text":"8, ","element":"a"},{"text":"and applying an union bound.","element":"span"}],[{"style":{"width":"95%"},"width":937,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-2.png","element":"img"}],[{"style":{"height":18.06},"width":363.04,"height":45.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-3.png","element":"img","alt":"mHImlH)−1kP,l(z, i)","inline":true,"padRight":true},{"text":"where ","element":"span"},{"style":{"height":16.7},"width":339.64,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-4.png","element":"img","alt":"kP,l(z, i) :=","inline":true},{"style":{"height":18.06},"width":773.72,"height":45.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-5.png","element":"img","alt":"[kP ((z1,1, 1), (z, i)), . . . 
, kP ((zl,H, m), (z, i))]T","inline":true,"padRight":true},{"text":"denotes the vector of kernel evaluations between ","element":"span"},{"text":"(","element":"span"},{"text":"z, i","element":"span"},{"text":") ","element":"span"},{"text":"and elements of ","element":"span"},{"style":{"height":15.93},"width":38.8,"height":39.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-6.png","element":"img","alt":"¯Zl","inline":true},{"text":", and ","element":"span"},{"style":{"height":15.5},"width":74.8,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-7.png","element":"img","alt":" KP,l","inline":true,"padRight":true},{"text":"denotes the kernel matrix computed at ","element":"span"},{"style":{"height":15.93},"width":38.8,"height":39.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-8.png","element":"img","alt":"¯Zl","inline":true},{"text":".","element":"span"}],[{"style":{"width":"95%"},"width":938,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-9.png","element":"img"}],[{"style":{"height":25.87},"width":303.84,"height":64.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-10.png","element":"img","alt":"ηP = 6λP ln(12T/δ)ε2P","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":20.37},"width":575.84,"height":50.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-11.png","element":"img","alt":" bP,l(z, i) = min{ηP ˜σ2P,l−1(z, i), 1}","inline":true},{"text":". 
","element":"span"},{"text":"Then, with probability at least ","element":"span"},{"style":{"height":16},"width":131.36,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-12.png","element":"img","alt":" 1 − δ/3","inline":true},{"text":", uniformly over all ","element":"span"},{"style":{"height":11.6},"width":100.12,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-13.png","element":"img","alt":"z ∈ Z","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":16},"width":105.08,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-14.png","element":"img","alt":" l ∈ [τ]","inline":true},{"text":", the following holds:","element":"span"}],[{"id":"id-4","style":{"width":"100%"},"width":980,"height":916,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-15.png","element":"img"}],[{"text":"where ","element":"span"},{"text":"L ","element":"span"},{"text":"is a known upper bound over ","element":"span"},{"style":{"height":14.64},"width":216.76,"height":36.6,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-16.png","element":"img","alt":" LM⋆, D =","inline":true},{"style":{"height":16.8},"width":329.44,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-17.png","element":"img","alt":"maxs,s′∈S ∥s − s′∥2","inline":true,"padRight":true},{"text":"denotes the diameter of ","element":"span"},{"style":{"height":15.5},"width":198.04,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-18.png","element":"img","alt":" S, CP,T =","inline":true}],[{"style":{"width":"99%"},"width":976,"height":397,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-19.png","element":"img"}],[{"text":"By construction of the set of MDPs 
","element":"span"},{"style":{"height":13.1},"width":58,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-20.png","element":"img","alt":" Ml","inline":true},{"text":", it follows that when the events ","element":"span"},{"style":{"height":15.5},"width":64.72,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-21.png","element":"img","alt":" ER,l","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":15.5},"width":61.84,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-22.png","element":"img","alt":" EP,l","inline":true,"padRight":true},{"text":"are true for all episodes ","element":"span"},{"style":{"height":16},"width":119,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-23.png","element":"img","alt":" l ∈ [τ]","inline":true},{"text":", the unknown MDP ","element":"span"},{"style":{"height":13.1},"width":52.88,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-24.png","element":"img","alt":" M⋆","inline":true,"padRight":true},{"text":"lies in ","element":"span"},{"style":{"height":13.1},"width":58,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-25.png","element":"img","alt":" Ml","inline":true},{"text":". 
Thus ","element":"span"},{"style":{"height":21.33},"width":225.4,"height":53.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-26.png","element":"img","alt":" V Mlπl,1(sl,1) ≥","inline":true},{"style":{"height":21.33},"width":173.44,"height":53.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-27.png","element":"img","alt":"V M⋆π⋆,1(sl,1)","inline":true},{"text":", since ","element":"span"},{"style":{"height":13.1},"width":48.88,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-28.png","element":"img","alt":" Ml","inline":true,"padRight":true},{"text":"is the most optimistic MDP of ","element":"span"},{"style":{"height":13.1},"width":58,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-29.png","element":"img","alt":" Ml","inline":true},{"text":". ","element":"span"},{"text":"This implies","element":"span"}],[{"id":"id-3","style":{"width":"94%"},"width":930,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-30.png","element":"img"}],[{"text":"Now by the reproducing property of RKHS and CauchySchwartz inequality ","element":"span"},{"style":{"height":20},"width":609.4,"height":50,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-31.png","element":"img","alt":"��R⋆(z)�� = ��⟨R⋆, kR(z, ·)⟩kR�� ≤","inline":true,"padRight":true},{"style":{"height":22.31},"width":385.44,"height":55.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-32.png","element":"img","alt":"��R⋆��kR kR(z, z) ≤ BR","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"height":11.6},"width":100.6,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-33.png","element":"img","alt":" z ∈ Z","inline":true},{"text":". 
Thus ","element":"span"},{"href":"#id-3","text":"9, ","element":"a"},{"text":"[24, Lemma ","element":"span"},{"text":"7] and [24, Lemma 9] together imply that for any ","element":"span"},{"style":{"height":12.4},"width":165.44,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-34.png","element":"img","alt":" 0 < δ < 1","inline":true},{"text":", with probability at least ","element":"span"},{"style":{"height":16},"width":128,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-35.png","element":"img","alt":" 1 − δ/3","inline":true},{"text":",","element":"span"}],[{"style":{"width":"99%"},"width":970,"height":179,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-36.png","element":"img"}],[{"text":"(10) Now, by triangle inequality ","element":"span"},{"style":{"height":19.81},"width":451.48,"height":49.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-37.png","element":"img","alt":"��RMl(zl,h) −R⋆(zl,h)�� ≤","inline":true},{"style":{"height":20},"width":968.68,"height":50,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-38.png","element":"img","alt":"��RMl(zl,h) − ˜µR,l−1(zl,h)�� + ��R⋆(zl,h) − ˜µR,l−1(zl,h)��","inline":true},{"text":". 
Therefore, when the event ","element":"span"},{"style":{"height":15.5},"width":64.72,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-39.png","element":"img","alt":" ER,l","inline":true,"padRight":true},{"text":"is true,","element":"span"}],[{"style":{"width":"90%"},"width":887,"height":51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-40.png","element":"img"}],[{"text":"since the mean reward function","element":"span"},{"style":{"height":14.83},"width":70.96,"height":37.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-41.png","element":"img","alt":"RMl","inline":true,"padRight":true},{"text":"lies in the confidence set ","element":"span"},{"style":{"height":15.5},"width":64.72,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-42.png","element":"img","alt":" CR,l","inline":true},{"text":". Similarly when the event ","element":"span"},{"style":{"height":15.5},"width":61.84,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-43.png","element":"img","alt":" EP,l","inline":true,"padRight":true},{"text":"is true,","element":"span"}],[{"style":{"width":"100%"},"width":980,"height":1124,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-44.png","element":"img"}],[{"text":"where we have used the fact that both ","element":"span"},{"style":{"height":15.91},"width":66.16,"height":39.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-45.png","element":"img","alt":" βR,l","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":15.91},"width":63.28,"height":39.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-46.png","element":"img","alt":" βP,l","inline":true,"padRight":true},{"text":"are non-decreasing with the number of episodes ","element":"span"},{"text":"l ","element":"span"},{"text":"and that 
","element":"span"},{"style":{"height":14.83},"width":167.64,"height":37.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-47.png","element":"img","alt":"LMl ≤ L","inline":true,"padRight":true},{"text":"by construction of ","element":"span"},{"style":{"height":13.11},"width":58,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-48.png","element":"img","alt":" Ml","inline":true,"padRight":true},{"text":"(and since ","element":"span"},{"style":{"height":13.11},"width":174.16,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-49.png","element":"img","alt":" Ml ∈ Ml","inline":true},{"text":"). Now the result follows from [24, Lemma 11] by noting that ","element":"span"},{"style":{"height":20.43},"width":745.12,"height":51.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-50.png","element":"img","alt":"�τl=1�Hh=1 σR,l−1(zl,h) ≤ �2eHT γT(R)","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":20.85},"width":833.44,"height":52.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-51.png","element":"img","alt":"�τl=1�Hh=1 ∥σP,l−1(zl,h)∥2 ≤�2emHT γmT(P)","inline":true},{"text":".","element":"span"}],[{"text":"B. 
Interpretation of the Bound","element":"span"}],[{"text":"Theorem ","element":"span"},{"href":"#id-4","text":"1 ","element":"a"},{"text":"implies that the cumulative regret of KernelUCRL after ","element":"span"},{"style":{"height":6.8},"width":21,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-52.png","element":"img","alt":" τ","inline":true,"padRight":true},{"text":"episodes is ","element":"span"},{"style":{"height":28.8},"width":536.44,"height":72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-53.png","element":"img","alt":"˜O��HγT (R) + γT (R)�√T +","inline":true}],[{"style":{"height":28.8},"width":715.2,"height":72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-54.png","element":"img","alt":"L��mHγmT(P)+γmT (P)�√T +H√T�","inline":true},{"text":"with high prob-","element":"span"}],[{"text":"ability. ( ","element":"span"},{"style":{"height":14.83},"width":30,"height":37.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-55.png","element":"img","alt":"˜O","inline":true,"padRight":true},{"text":"hides logarithmic factors.) 
Now we illustrate the growth of ","element":"span"},{"style":{"height":16},"width":107.68,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-56.png","element":"img","alt":" γT (R)","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":16},"width":136.96,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-57.png","element":"img","alt":" γmT (P)","inline":true,"padRight":true},{"text":"as functions of ","element":"span"},{"text":"T ","element":"span"},{"text":"with the following concrete examples.","element":"span"}],[{"text":"Let ","element":"span"},{"style":{"height":16},"width":521.92,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-58.png","element":"img","alt":" kR(z, z′) := k1(s, s′)+k2(a, a′)","inline":true},{"text":", i.e., ","element":"span"},{"style":{"height":13.1},"width":44.64,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-59.png","element":"img","alt":" kR","inline":true,"padRight":true},{"text":"is an additive kernel of ","element":"span"},{"style":{"height":13.1},"width":340.52,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-60.png","element":"img","alt":" k1 : S × S → R","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":13.9},"width":349.64,"height":34.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/4-61.png","element":"img","alt":" k2 : A × A → R","inline":true},{"text":". Then, from [33, Theorem 3], ","element":"span"},{"style":{"height":16},"width":485.08,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-0.png","element":"img","alt":" γT (R) ≤ γT (k1) + γT (k2) +","inline":true,"padRight":true},{"text":"2 ln ","element":"span"},{"text":"T ","element":"span"},{"text":". 
Now if both ","element":"span"},{"style":{"height":14},"width":332.48,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-1.png","element":"img","alt":" S ⊂ Rm, A ⊂ Rn","inline":true,"padRight":true},{"text":"are compact and convex sets, and both ","element":"span"},{"style":{"height":14},"width":92.8,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-2.png","element":"img","alt":" k1, k2","inline":true,"padRight":true},{"text":"are Squared Exponential (SE) kernels, then from [3, Theorem 4], ","element":"span"},{"style":{"height":19.2},"width":377.04,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-3.png","element":"img","alt":" γT (k1) = O�(ln T )m�","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":19.2},"width":374.16,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-4.png","element":"img","alt":" γT (k2) = O�(ln T )n�","inline":true},{"text":". 
Hence, in this case ","element":"span"},{"style":{"height":16},"width":157.24,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-5.png","element":"img","alt":" γT (R) =","inline":true},{"style":{"height":20.37},"width":318.48,"height":50.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-6.png","element":"img","alt":"˜O�(ln T )max{m,n}�","inline":true},{"text":".","element":"span"}],[{"text":"Further, let ","element":"span"},{"style":{"height":16},"width":602.08,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-7.png","element":"img","alt":" kP ((z, i), (z′, j)) := k3(z, z′)k4(i, j)","inline":true},{"text":", i.e., ","element":"span"},{"style":{"height":13.1},"width":44.64,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-8.png","element":"img","alt":" kP","inline":true,"padRight":true},{"text":"is a product kernel of ","element":"span"},{"style":{"height":13.1},"width":266.6,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-9.png","element":"img","alt":" k3 : Z ×Z → R","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":16},"width":276.64,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-10.png","element":"img","alt":" k4 : [m]×[m] →","inline":true,"padRight":true},{"text":"R","element":"span"},{"text":". Then, from [33, Theorem 2], ","element":"span"},{"style":{"height":16},"width":420.28,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-11.png","element":"img","alt":" γmT (P) ≤ mγmT (k3) +","inline":true,"padRight":true},{"text":"m ","element":"span"},{"text":"ln(","element":"span"},{"text":"mT ","element":"span"},{"text":")","element":"span"},{"text":", since all kernel matrices over any subset of ","element":"span"},{"text":"{","element":"span"},{"text":"1","element":"span"},{"text":", . . . 
, m","element":"span"},{"text":"} ","element":"span"},{"text":"have rank at most ","element":"span"},{"text":"m","element":"span"},{"text":". Now if ","element":"span"},{"style":{"height":13.1},"width":36.64,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-12.png","element":"img","alt":" k3","inline":true,"padRight":true},{"text":"is a Squared Exponential kernel on ","element":"span"},{"text":"Z","element":"span"},{"text":", then ","element":"span"},{"style":{"height":20.37},"width":499.44,"height":50.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-13.png","element":"img","alt":" γmT (k3) = ˜O�(ln(mT ))m+n�","inline":true},{"text":". Hence, in this case ","element":"span"},{"style":{"height":22.34},"width":537.84,"height":55.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-14.png","element":"img","alt":" γmT (P) = O�m�ln(mT )�m+n�","inline":true},{"text":".","element":"span"}],[{"text":"In essence, ","element":"span"},{"style":{"height":16},"width":107.68,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-15.png","element":"img","alt":" γT (R)","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":16},"width":136.48,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-16.png","element":"img","alt":" γmT (P)","inline":true,"padRight":true},{"text":"grow sublinearly with ","element":"span"},{"text":"T ","element":"span"},{"text":"for some popular kernels, e.g. Squared Exponential, polynomial and Mat´ern. 
Now, since the cumulative regret of KernelUCRL scales linearly with ","element":"span"},{"style":{"height":16},"width":107.68,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-17.png","element":"img","alt":" γT (R)","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":16},"width":136.48,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-18.png","element":"img","alt":" γmT (P)","inline":true},{"text":", it, in turn, grows sublinearly with ","element":"span"},{"text":"T ","element":"span"},{"text":"for these kernels.","element":"span"}]]},{"heading":"VII. ANALYSIS OF PSRL UNDER NYSTR ¨OM APPROXIMATION","paragraphs":[[{"text":"Optimizing for an optimistic policy is not computationally tractable in general, even though planning for the optimal policy is possible for a given MDP. A popular approach to overcome this difficulty is to sample a random MDP at every episode and solve for its optimal policy, called posterior sampling [34].","element":"span"}],[{"text":"PSRL (Algorithm ","element":"span"},{"href":"#id-0","text":"2)","element":"a"},{"text":", in its most general form, starts with a prior distribution ","element":"span"},{"style":{"height":10.8},"width":29,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-19.png","element":"img","alt":" Φ","inline":true,"padRight":true},{"text":"over MDPs. 
At the beginning of episode ","element":"span"},{"text":"l","element":"span"},{"text":", using the history of observations ","element":"span"},{"style":{"height":13.1},"width":84.64,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-20.png","element":"img","alt":" Hl−1","inline":true},{"text":", it updates the posterior ","element":"span"},{"style":{"height":13.1},"width":38.8,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-21.png","element":"img","alt":" Φl","inline":true,"padRight":true},{"text":"and samples an MDP ","element":"span"},{"style":{"height":13.1},"width":48.88,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-22.png","element":"img","alt":" Ml","inline":true,"padRight":true},{"text":"from ","element":"span"},{"style":{"height":13.1},"width":38.8,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-23.png","element":"img","alt":"Φl","inline":true},{"text":". (Sampling can be done using MCMC methods even if ","element":"span"},{"style":{"height":13.1},"width":38.8,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-24.png","element":"img","alt":" Φl","inline":true,"padRight":true},{"text":"doesn’t admit any closed form.) 
It then selects an optimal policy ","element":"span"},{"style":{"height":9.1},"width":32.56,"height":22.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-25.png","element":"img","alt":" πl","inline":true,"padRight":true},{"text":"of the sampled MDP ","element":"span"},{"style":{"height":13.1},"width":48.88,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-26.png","element":"img","alt":" Ml","inline":true},{"text":", in the sense that ","element":"span"},{"style":{"height":22.1},"width":436,"height":55.24,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-27.png","element":"img","alt":" V Mlπl,h(s) = maxπ V Mlπ,h(s)","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"height":11.6},"width":114.8,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-28.png","element":"img","alt":" s ∈ S","inline":true,"padRight":true},{"text":"and for all ","element":"span"},{"text":"h ","element":"span"},{"text":"= 1","element":"span"},{"text":", ","element":"span"},{"text":"2","element":"span"},{"text":", . . 
., H","element":"span"},{"text":", and executes ","element":"span"},{"style":{"height":9.1},"width":32.56,"height":22.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-29.png","element":"img","alt":" πl","inline":true,"padRight":true},{"text":"for the entire episode.","element":"span"}],[{"id":"id-0","style":{"width":"100%"},"width":980,"height":739,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-30.png","element":"img"}],[{"text":"[34] show that if we have a frequentist regret bound for UCRL in hand, then we can obtain a similar bound (upto a constant factor) on the Bayes regret, defined as the expected ","element":"span"},{"text":"regret under the prior distribution ","element":"span"},{"style":{"height":10.8},"width":29,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-31.png","element":"img","alt":" Φ","inline":true},{"text":", of PSRL. We use this idea to obtain a sublinear bound on the Bayes regret of PSRL under kernel approximation.","element":"span"}],[{"text":"Theorem 2 (Bayes regret of PSRL under Nystr¨om approximation): Let the assumptions in Section ","element":"span"},{"href":"#id-5","text":"IV ","element":"a"},{"text":"hold and ","element":"span"},{"style":{"height":10.8},"width":29,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-32.png","element":"img","alt":" Φ","inline":true,"padRight":true},{"text":"be a (known) prior distribution over MDPs ","element":"span"},{"style":{"height":13.1},"width":52.88,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-33.png","element":"img","alt":" M⋆","inline":true},{"text":". Then, the Bayes regret of PSRL satisfies","element":"span"}],[{"style":{"width":"100%"},"width":980,"height":503,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-34.png","element":"img"}]]},{"heading":"VIII. 
CONCLUSIONS","paragraphs":[[{"text":"Any MDP ","element":"span"},{"text":"M ","element":"span"},{"text":"whose mean reward function satisfies ","element":"span"},{"style":{"height":18.06},"width":376.48,"height":45.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-35.png","element":"img","alt":"RM(z) = θTR ˜ϕR,l(z)","inline":true,"padRight":true},{"text":"for some ","element":"span"},{"style":{"height":15.66},"width":215.92,"height":39.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-36.png","element":"img","alt":" θR ∈ RdR,l","inline":true,"padRight":true},{"text":"such that ","element":"span"},{"style":{"height":33.69},"width":530.8,"height":84.24,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-37.png","element":"img","alt":"��θR − ˜θR,l−1�� ˜VR,l−1≤√HβR,l","inline":true},{"text":", and mean transition func-","element":"span"}],[{"text":"tion satisfies ","element":"span"},{"style":{"height":18.06},"width":730.03,"height":45.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-38.png","element":"img","alt":"P M(z, i) = θTP ˜ϕP,l(z, i), i = 1, . . . 
, m","inline":true},{"text":", for some ","element":"span"},{"style":{"height":15.66},"width":207.76,"height":39.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-39.png","element":"img","alt":" θP ∈ RdP,l","inline":true,"padRight":true},{"text":"such that ","element":"span"},{"style":{"height":33.5},"width":392.92,"height":83.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-40.png","element":"img","alt":"��θP − ˜θP,l−1�� ˜VP,l−1 ≤","inline":true},{"style":{"height":18.94},"width":167.92,"height":47.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-41.png","element":"img","alt":"√mHβP,l","inline":true},{"text":", lies in the set ","element":"span"},{"style":{"height":13.1},"width":58,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-42.png","element":"img","alt":" Ml","inline":true},{"text":". However, there might be other MDPs in ","element":"span"},{"style":{"height":13.1},"width":58,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-43.png","element":"img","alt":" Ml","inline":true,"padRight":true},{"text":"which do not posses this linear structure. Therefore, optimal planning may be computationally intractable even for a single MDP. So it is common in the literature to assume access to an approximate MDP planner ","element":"span"},{"style":{"height":16},"width":133.6,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-44.png","element":"img","alt":" Γ(M, ε)","inline":true,"padRight":true},{"text":"which returns an ","element":"span"},{"style":{"height":7.2},"width":19,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-45.png","element":"img","alt":" ε","inline":true},{"text":"-optimal policy for ","element":"span"},{"text":"M","element":"span"},{"text":". 
Given such a planner ","element":"span"},{"style":{"height":10.8},"width":25,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-46.png","element":"img","alt":" Γ","inline":true},{"text":", if it is possible to obtain (through extended value iteration [7] or otherwise) an efficient planner ","element":"span"},{"style":{"height":18.83},"width":140.8,"height":47.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-47.png","element":"img","alt":"˜Γ(M, ε)","inline":true,"padRight":true},{"text":"which returns an ","element":"span"},{"style":{"height":7.2},"width":19,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-48.png","element":"img","alt":" ε","inline":true},{"text":"-optimal policy for the most optimistic MDP from a family ","element":"span"},{"text":"M","element":"span"},{"text":", then we modify PSRL and Kernel-UCRL to choose ","element":"span"},{"style":{"height":19.2},"width":324.64,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-49.png","element":"img","alt":" πl = Γ(Ml,�H/l)","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":9.1},"width":79.48,"height":22.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-50.png","element":"img","alt":" πl =","inline":true},{"style":{"height":19.6},"width":240.64,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-51.png","element":"img","alt":"˜Γ(Ml,�H/l)","inline":true,"padRight":true},{"text":"respectively at every episode ","element":"span"},{"text":"l","element":"span"},{"text":". It follows that this adds only an ","element":"span"},{"style":{"height":18.24},"width":125.44,"height":45.6,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-52.png","element":"img","alt":" O(√T)","inline":true,"padRight":true},{"text":"factor in the respective regret bounds. 
The design of such approximate planners for continuous state and action spaces remains a subject of active research, whereas our focus in this work is on the statistical efficiency of the online learning problem.","element":"span"}]]},{"heading":"APPENDIX","paragraphs":[[{"text":"In this section we will assume that ","element":"span"},{"style":{"height":11.6},"width":118.28,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-53.png","element":"img","alt":" S ⊂ R","inline":true},{"text":", i.e., ","element":"span"},{"text":"m ","element":"span"},{"text":"= 1 ","element":"span"},{"text":"and ","element":"span"},{"style":{"height":12.4},"width":133.76,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-54.png","element":"img","alt":" A ⊂ Rn","inline":true,"padRight":true},{"text":"for some ","element":"span"},{"text":"n ","element":"span"},{"text":"= ","element":"span"},{"text":"O","element":"span"},{"text":"(1)","element":"span"},{"text":". 
Also we will assume that ","element":"span"},{"style":{"height":13.1},"width":44.64,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-55.png","element":"img","alt":"kR","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":13.1},"width":44.64,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-56.png","element":"img","alt":" kP","inline":true,"padRight":true},{"text":"are Squared Exponential (SE) kernels defined over ","element":"span"},{"style":{"height":17.36},"width":140.32,"height":43.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-57.png","element":"img","alt":" [0, 1]n+1","inline":true,"padRight":true},{"text":"with length scale parameters ","element":"span"},{"style":{"height":13.1},"width":36,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-58.png","element":"img","alt":" lR","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":13.1},"width":36,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-59.png","element":"img","alt":" lP","inline":true,"padRight":true},{"text":", respectively.","element":"span"}],[{"text":"A. 
Kernel-UCRL under QFF Approximation","element":"span"}],[{"text":"First, we choose an ","element":"span"},{"style":{"height":13.1},"width":151.88,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-60.png","element":"img","alt":" mR ∈ N","inline":true,"padRight":true},{"text":"such that ","element":"span"},{"style":{"height":36.78},"width":979,"height":91.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-61.png","element":"img","alt":" 1/l2R ≤ mR ≤C1/l2R","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":20.19},"width":578.08,"height":50.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-62.png","element":"img","alt":" log4/e(T 6) ≤ mR ≤ C2 log4/e(T 6)","inline":true,"padRight":true},{"text":"for some ap- ","element":"span"},{"text":"propriate constants ","element":"span"},{"style":{"height":13.11},"width":44.32,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-63.png","element":"img","alt":" C1","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":13.11},"width":44.32,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-64.png","element":"img","alt":" C2","inline":true},{"text":". 
Then we set ","element":"span"},{"style":{"height":17.36},"width":271.84,"height":43.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-65.png","element":"img","alt":" dR = 2(mR)n+1","inline":true,"padRight":true},{"text":"and construct ","element":"span"},{"style":{"height":13.1},"width":44.64,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-66.png","element":"img","alt":" dR","inline":true,"padRight":true},{"text":"dimensional feature map ","element":"span"},{"style":{"height":16},"width":104.32,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-67.png","element":"img","alt":" ˜ϕR(z)","inline":true,"padRight":true},{"text":"using ","element":"span"},{"href":"#id-6","text":"4. ","element":"a"},{"text":"Similarly, we choose an ","element":"span"},{"style":{"height":13.1},"width":165.32,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-68.png","element":"img","alt":" mP ∈ N","inline":true,"padRight":true},{"text":"such that ","element":"span"},{"style":{"height":17.78},"width":133.24,"height":44.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/5-69.png","element":"img","alt":" 1/l2P ≤","inline":true,"padRight":true},{"style":{"height":17.78},"width":226.56,"height":44.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-0.png","element":"img","alt":"mP ≤ C3/l2P","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":20},"width":597.28,"height":50,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-1.png","element":"img","alt":" log4/e(T 6) ≤ mP ≤ C4 log4/e(T 6)","inline":true,"padRight":true},{"text":"for ","element":"span"},{"text":"some appropriate constants ","element":"span"},{"style":{"height":13.1},"width":44.32,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-2.png","element":"img","alt":" 
C3","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":13.1},"width":44.32,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-3.png","element":"img","alt":" C4","inline":true,"padRight":true},{"text":"and construct the feature map ","element":"span"},{"style":{"height":16},"width":104.32,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-4.png","element":"img","alt":" ˜ϕP (z)","inline":true,"padRight":true},{"text":"of dimension ","element":"span"},{"style":{"height":17.36},"width":292.48,"height":43.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-5.png","element":"img","alt":" dP = 2(mP )n+1","inline":true},{"text":". Now we construct confidence sets ","element":"span"},{"style":{"height":15.5},"width":64.72,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-6.png","element":"img","alt":" CR,l","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":15.5},"width":61.84,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-7.png","element":"img","alt":" CP,l","inline":true,"padRight":true},{"text":"as follows. First, we compute ","element":"span"},{"style":{"height":22.22},"width":480.64,"height":55.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-8.png","element":"img","alt":"˜θR,l−1 = ˜V −1R,l−1 ˜ΦTR,l−1Rl−1","inline":true},{"text":", where ","element":"span"},{"style":{"height":13.1},"width":132.76,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-9.png","element":"img","alt":" Rl−1 =","inline":true},{"style":{"height":18.06},"width":303.8,"height":45.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-10.png","element":"img","alt":"[r1,1, . . . 
, rl−1,H]T","inline":true,"padRight":true},{"text":", ","element":"span"},{"style":{"height":19.54},"width":645.56,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-11.png","element":"img","alt":"˜ΦR,l−1 = [ ˜ϕR(z1,1), . . . , ˜ϕR(zl−1,H)]T","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":21.65},"width":573,"height":54.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-12.png","element":"img","alt":"˜VR,l−1 = ˜ΦTR,l−1 ˜ΦR,l−1 + HIdR","inline":true},{"text":". Then we fix any ","element":"span"},{"style":{"height":12.4},"width":195.2,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-13.png","element":"img","alt":"0 < δ < 1","inline":true,"padRight":true},{"text":"and define ","element":"span"},{"style":{"height":15.5},"width":64.72,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-14.png","element":"img","alt":" CR,l","inline":true,"padRight":true},{"text":"to be the set of all functions ","element":"span"},{"style":{"height":14},"width":180.2,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-15.png","element":"img","alt":"f : Z → R","inline":true,"padRight":true},{"text":"such that","element":"span"}],[{"style":{"width":"99%"},"width":977,"height":264,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-16.png","element":"img"}],[{"text":"Similarly, we compute ","element":"span"},{"style":{"height":22.23},"width":503.2,"height":55.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-17.png","element":"img","alt":"˜θP,l−1 = ˜V −1P,l−1 ˜ΦTP,l−1Sl−1","inline":true},{"text":", ","element":"span"},{"text":"where ","element":"span"},{"style":{"height":18.07},"width":579.8,"height":45.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-18.png","element":"img","alt":"Sl−1 = [s1,2, . . . 
, sl−1,H+1]T","inline":true,"padRight":true},{"text":", ","element":"span"},{"style":{"height":19.54},"width":206.2,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-19.png","element":"img","alt":"˜ΦP,l−1 =","inline":true},{"style":{"height":18.26},"width":471.8,"height":45.64,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-20.png","element":"img","alt":"[ ˜ϕP (z1,1), . . . , ˜ϕP (zl−1,H)]T","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":21.65},"width":422.2,"height":54.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-21.png","element":"img","alt":"˜VP,l−1 = ˜ΦTP,l−1 ˜ΦP,l−1 +","inline":true},{"style":{"height":14.64},"width":90.56,"height":36.6,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-22.png","element":"img","alt":"HIdP","inline":true,"padRight":true},{"text":". Now, we define ","element":"span"},{"style":{"height":15.5},"width":61.84,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-23.png","element":"img","alt":" CP,l","inline":true,"padRight":true},{"text":"to be the set of all functions ","element":"span"},{"style":{"height":14},"width":208,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-24.png","element":"img","alt":"f : Z → Rm","inline":true,"padRight":true},{"text":"such that","element":"span"}],[{"style":{"width":"99%"},"width":977,"height":265,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-25.png","element":"img"}],[{"text":"Next, following the same approach as before, we build the set ","element":"span"},{"style":{"height":13.1},"width":58,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-26.png","element":"img","alt":" Ml","inline":true,"padRight":true},{"text":"of all plausible MDPs, choose an optimistic policy 
","element":"span"},{"style":{"height":9.1},"width":32.56,"height":22.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-27.png","element":"img","alt":"πl","inline":true,"padRight":true},{"text":"for ","element":"span"},{"style":{"height":13.1},"width":58,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-28.png","element":"img","alt":" Ml","inline":true,"padRight":true},{"text":"and execute ","element":"span"},{"style":{"height":9.1},"width":32.56,"height":22.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-29.png","element":"img","alt":" πl","inline":true,"padRight":true},{"text":"for the entire episode.","element":"span"}],[{"id":"id-8","text":"B. Regret Bound under QFF Approximation","element":"span"}],[{"style":{"width":"95%"},"width":938,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-30.png","element":"img"}],[{"text":"(0","element":"span"},{"text":", ","element":"span"},{"text":"1)","element":"span"},{"text":". 
Then, with probability at least ","element":"span"},{"style":{"height":16},"width":136.16,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-31.png","element":"img","alt":" 1 − δ/3","inline":true},{"text":", uniformly over all ","element":"span"},{"style":{"height":11.6},"width":100.12,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-32.png","element":"img","alt":" z ∈ Z","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":13.2},"width":159.72,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-33.png","element":"img","alt":" 1 ≤ l ≤ τ","inline":true},{"text":",","element":"span"}],[{"style":{"width":"92%"},"width":907,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-34.png","element":"img"}],[{"style":{"height":22.03},"width":646.24,"height":55.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-35.png","element":"img","alt":"˜σ2R,l−1(z) = H ˜ϕR,l(z)T ˜V −1R,l−1 ˜ϕR,l(z)","inline":true},{"text":", where ","element":"span"},{"style":{"height":15.5},"width":188.64,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-36.png","element":"img","alt":" ˜ϕR,l = ˜ϕR","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":15.5},"width":235.68,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-37.png","element":"img","alt":"dR,l = dR","inline":true,"padRight":true},{"text":"for ","element":"span"},{"text":"every ","element":"span"},{"text":"episode ","element":"span"},{"text":"l","element":"span"},{"text":". 
","element":"span"},{"text":"Now ","element":"span"},{"text":"define ","element":"span"},{"style":{"height":18.06},"width":712,"height":45.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-38.png","element":"img","alt":"αR,l(z) = kR,l(z)T (KR,l+HIlH)−1R⋆,l−1","inline":true},{"text":". Then from [32, Equation 7], we have","element":"span"},{"style":{"height":20},"width":609.76,"height":50,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-39.png","element":"img","alt":"��R⋆(z) − αR,l−1(z)�� ≤ BRσR,l−1(z)","inline":true},{"text":". Let ","element":"span"},{"style":{"height":20.67},"width":894.08,"height":51.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-40.png","element":"img","alt":" εdR := supz,z′∈Z��kR(z, z′) − ˜ϕR(z)T ˜ϕR(z′)�� < 1","inline":true},{"text":". Then from [32, Lemma 15],","element":"span"}],[{"style":{"width":"100%"},"width":980,"height":367,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-41.png","element":"img"}],[{"id":"id-7","text":"From [25, Theorem 1] ","element":"span"},{"style":{"height":25.07},"width":390.12,"height":62.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-42.png","element":"img","alt":" εdR ≤ (n+1)2n 1√2mmRR","inline":true}],[{"style":{"height":29.54},"width":295.2,"height":73.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-43.png","element":"img","alt":"O�� e4mRl2R�mR�","inline":true},{"text":", since ","element":"span"},{"text":"n ","element":"span"},{"text":"= ","element":"span"},{"text":"O","element":"span"},{"text":"(1)","element":"span"},{"text":". 
If ","element":"span"},{"style":{"height":17.78},"width":207.36,"height":44.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-44.png","element":"img","alt":" mR ≥ 1/l2R","inline":true},{"text":", then ","element":"span"},{"style":{"height":16},"width":349.6,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-45.png","element":"img","alt":"εdR = O ((e/4)mR)","inline":true},{"text":". Further if ","element":"span"},{"style":{"height":20.19},"width":317.92,"height":50.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-46.png","element":"img","alt":" mR ≥ log4/e(T 6)","inline":true},{"text":", then ","element":"span"},{"style":{"height":17.36},"width":262.24,"height":43.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-47.png","element":"img","alt":"εdR = O(1/T 6)","inline":true,"padRight":true},{"text":"and thus, in turn, ","element":"span"},{"style":{"height":22.99},"width":409.24,"height":57.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-48.png","element":"img","alt":" Hε1/2dR l2 = O(1/H2τ) =","inline":true,"padRight":true},{"text":"O","element":"span"},{"text":"(1","element":"span"},{"text":"/T ","element":"span"},{"text":") ","element":"span"},{"text":"for each ","element":"span"},{"style":{"height":13.2},"width":87.24,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-49.png","element":"img","alt":" l ≤ τ","inline":true},{"text":". 
Now the result follows by combining ","element":"span"},{"id":"id-9","href":"#id-1","text":"7 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-7","text":"13 ","element":"a"},{"text":"using the triangle inequality.","element":"span"}],[{"style":{"width":"95%"},"width":937,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-50.png","element":"img"}],[{"text":"(0","element":"span"},{"text":", ","element":"span"},{"text":"1)","element":"span"},{"text":". Then, with probability at least ","element":"span"},{"style":{"height":16},"width":136.16,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-51.png","element":"img","alt":" 1 − δ/3","inline":true},{"text":", uniformly over all ","element":"span"},{"style":{"height":11.6},"width":100.12,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-52.png","element":"img","alt":" z ∈ Z","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":13.2},"width":159.72,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-53.png","element":"img","alt":" 1 ≤ l ≤ τ","inline":true},{"text":",","element":"span"}],[{"style":{"width":"92%"},"width":904,"height":92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-54.png","element":"img"}],[{"text":"Theorem 3 (Regret bound for Kernel-UCRL): Let the assumptions in Section ","element":"span"},{"href":"#id-5","text":"IV ","element":"a"},{"text":"hold. 
Further, let ","element":"span"},{"text":"m ","element":"span"},{"text":"= 1","element":"span"},{"text":", ","element":"span"},{"text":"n ","element":"span"},{"text":"= ","element":"span"},{"text":"O","element":"span"},{"text":"(1) ","element":"span"},{"text":"and ","element":"span"},{"style":{"height":14},"width":125.28,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-55.png","element":"img","alt":" kR, kP","inline":true,"padRight":true},{"text":"are SE kernels on ","element":"span"},{"style":{"height":17.55},"width":140.32,"height":43.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-56.png","element":"img","alt":" [0, 1]n+1","inline":true,"padRight":true},{"text":"with length scale parameters ","element":"span"},{"style":{"height":14},"width":113.76,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-57.png","element":"img","alt":" lR, lP","inline":true,"padRight":true},{"text":", respectively. Then, for any ","element":"span"},{"style":{"height":12.4},"width":68.28,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-58.png","element":"img","alt":" δ ∈","inline":true,"padRight":true},{"text":"(0","element":"span"},{"text":", ","element":"span"},{"text":"1)","element":"span"},{"text":", Kernel-UCRL with QFF approximation enjoys, with probability at least ","element":"span"},{"style":{"height":11.6},"width":87.64,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-59.png","element":"img","alt":" 1 − δ","inline":true},{"text":", the regret 
bound","element":"span"}],[{"style":{"width":"100%"},"width":980,"height":366,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-60.png","element":"img"}],[{"style":{"height":15.5},"width":64.72,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-61.png","element":"img","alt":"ER,l","inline":true,"padRight":true},{"text":":=","element":"span"},{"style":{"height":20},"width":632.44,"height":50,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-62.png","element":"img","alt":"� ��R⋆(z) − ˜µR,l−1(z)�� ≤ βR,l˜σR,l−1(z","inline":true},{"text":") + ","element":"span"},{"style":{"height":19.2},"width":262.04,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-63.png","element":"img","alt":" O(BR/T ), ∀z�,","inline":true},{"style":{"height":15.51},"width":61.84,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-64.png","element":"img","alt":"EP,l","inline":true,"padRight":true},{"text":":=","element":"span"},{"style":{"height":19.81},"width":623.8,"height":49.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-65.png","element":"img","alt":"� ��P ⋆(z) − ˜µP,l−1(z)�� ≤ βP,l˜σP,l−1(z","inline":true},{"text":") + ","element":"span"},{"style":{"height":19.2},"width":262.04,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-66.png","element":"img","alt":" O(BP /T ), ∀z�.","inline":true}],[{"text":"When the events ","element":"span"},{"style":{"height":15.5},"width":64.72,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-67.png","element":"img","alt":" ER,l","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":15.5},"width":61.84,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-68.png","element":"img","alt":" EP,l","inline":true,"padRight":true},{"text":"are true for all episodes 
","element":"span"},{"style":{"height":11.6},"width":51.96,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-69.png","element":"img","alt":" l ∈","inline":true},{"style":{"height":16},"width":43.64,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-70.png","element":"img","alt":"[τ]","inline":true},{"text":", then using a similar approach as in the proof of Theorem ","element":"span"},{"href":"#id-4","text":"1 ","element":"a"},{"text":"we can show that for any ","element":"span"},{"style":{"height":12.4},"width":175.04,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-71.png","element":"img","alt":" 0 < δ < 1","inline":true},{"text":", with probability at least ","element":"span"},{"style":{"height":16},"width":128,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-72.png","element":"img","alt":" 1 − δ/3","inline":true},{"text":",","element":"span"}],[{"id":"id-10","style":{"width":"99%"},"width":973,"height":179,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-73.png","element":"img"}],[{"text":"(14) Also for every episode ","element":"span"},{"text":"l ","element":"span"},{"text":"the following holds:","element":"span"}],[{"style":{"width":"99%"},"width":975,"height":112,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-74.png","element":"img"}],[{"text":"By our choice of ","element":"span"},{"style":{"height":9.1},"width":59.04,"height":22.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-75.png","element":"img","alt":" mR","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":9.1},"width":59.04,"height":22.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-76.png","element":"img","alt":" mP","inline":true,"padRight":true},{"text":", Lemmas ","element":"span"},{"href":"#id-8","text":"3 ","element":"a"},{"text":"and 
","element":"span"},{"href":"#id-9","text":"4 ","element":"a"},{"text":"together imply that the events ","element":"span"},{"style":{"height":15.5},"width":64.72,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-77.png","element":"img","alt":" ER,l","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":15.5},"width":61.84,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-78.png","element":"img","alt":" EP,l","inline":true,"padRight":true},{"text":"are true for all episodes ","element":"span"},{"style":{"height":16},"width":109.88,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-79.png","element":"img","alt":"l ∈ [τ]","inline":true},{"text":". Further, since ","element":"span"},{"style":{"height":28.72},"width":528.64,"height":71.8,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-80.png","element":"img","alt":"12 ln det( ˜VR,l−1)det(HIdR ) = O(dR ln(lH))","inline":true,"padRight":true},{"text":"and","element":"span"}],[{"style":{"height":28.72},"width":644.08,"height":71.8,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-81.png","element":"img","alt":"2 ln det( ˜VP,l−1)det(HIdP ) = O(dP ln(lH)), βR,l","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":15.9},"width":63.28,"height":39.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-82.png","element":"img","alt":" βP,l","inline":true,"padRight":true},{"text":"are non-decreasing functions in ","element":"span"},{"text":"l","element":"span"},{"text":". 
Now, combining ","element":"span"},{"href":"#id-10","text":"14 ","element":"a"},{"text":"and 15, and applying a union bound we have, with probability at least ","element":"span"},{"style":{"height":11.6},"width":87.64,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-83.png","element":"img","alt":"1 − δ","inline":true},{"text":", that","element":"span"}],[{"style":{"width":"96%"},"width":941,"height":186,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-84.png","element":"img"}],[{"text":"From [24, Lemma 11], ","element":"span"},{"style":{"height":19.2},"width":567.6,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-85.png","element":"img","alt":" ˜σR,l−1(zl,h) = O�√HT dR ln T�","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":19.2},"width":594,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-86.png","element":"img","alt":" ˜σP,l−1(zl,h) = O�√HT dP ln T�","inline":true},{"text":", since ","element":"span"},{"style":{"height":16},"width":153.88,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-87.png","element":"img","alt":" γt(k) =","inline":true,"padRight":true},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"d ","element":"span"},{"text":"ln ","element":"span"},{"text":"t","element":"span"},{"text":") ","element":"span"},{"text":"for any linear kernel ","element":"span"},{"text":"k ","element":"span"},{"text":"defined over ","element":"span"},{"style":{"height":13.36},"width":45.8,"height":33.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-88.png","element":"img","alt":" Rd","inline":true},{"text":". 
Now the result follows by noting that ","element":"span"},{"style":{"height":19.2},"width":543.6,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-89.png","element":"img","alt":" dR = (mR)n+1 = O�(ln T )n+1�","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":19.2},"width":544.08,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1911.01871/images/6-90.png","element":"img","alt":" dP = (mP )n+1 = O�(ln T )n+1�","inline":true},{"text":"for ","element":"span"},{"text":"n ","element":"span"},{"text":"= ","element":"span"},{"text":"O","element":"span"},{"text":"(1)","element":"span"},{"text":".","element":"span"}]]},{"heading":"ACKNOWLEDGMENT","paragraphs":[[{"text":"Sayak Ray Chowdhury is supported by a Google PhD Fellowship. Aditya Gopalan is grateful for support from the DST INSPIRE faculty grant IFA13- ENG-69.","element":"span"}]]},{"heading":"REFERENCES","paragraphs":[[{"text":"[1] A. L. Strehl, L. Li, and M. L. Littman, “Reinforcement learning in finite MDPs: PAC analysis,” J. Mach. Learn. Res., vol. 10, pp. 2413– 2444, Dec. 2009.","element":"span"}],[{"text":"[2] I. Osband and B. Van Roy, “Model-based reinforcement learning and the eluder dimension,” in Advances in Neural Information Processing Systems, 2014, pp. 1466–1474.","element":"span"}],[{"text":"[3] N. Srinivas, A. Krause, S. M. Kakade, and M. Seeger, “Gaussian process optimization in the bandit setting: No regret and experimental design,” arXiv preprint arXiv:0912.3995, 2009.","element":"span"}],[{"text":"[4] S. R. Chowdhury and A. Gopalan, “On kernelized multi-armed bandits,” in Proceedings of the 34th International Conference on Machine Learning, 2017, pp. 844–853.","element":"span"}],[{"text":"[5] A. Durand, O.-A. Maillard, and J. Pineau, “Streaming kernel regression with provably adaptive mean, variance, and regularization,” arXiv preprint arXiv:1708.00768, 2017.","element":"span"}],[{"text":"[6] M. Valko, N. Korda, R. 
Munos, I. Flaounas, and N. Cristianini, “Finite-time analysis of kernelised contextual bandits,” arXiv preprint arXiv:1309.6869, 2013.","element":"span"}],[{"text":"[7] T. Jaksch, R. Ortner, and P. Auer, “Near-optimal regret bounds for reinforcement learning,” Journal of Machine Learning Research, vol. 11, no. Apr, pp. 1563–1600, 2010.","element":"span"}],[{"text":"[8] I. Osband, D. Russo, and B. Van Roy, “(More) efficient reinforcement learning via posterior sampling,” in Advances in Neural Information Processing Systems, 2013, pp. 3003–3011.","element":"span"}],[{"text":"[9] Y. Ouyang, M. Gagrani, A. Nayyar, and R. Jain, “Learning unknown markov decision processes: A thompson sampling approach,” in Advances in Neural Information Processing Systems, 2017, pp. 1333– 1342.","element":"span"}],[{"text":"[10] Y. Abbasi-Yadkori and C. Szepesv´ari, “Regret bounds for the adaptive control of linear quadratic systems,” in Proceedings of the 24th Annual Conference on Learning Theory, 2011, pp. 1–26.","element":"span"}],[{"text":"[11] ——, “Bayesian optimal control of smoothly parameterized systems.” in UAI. ","element":"span"},{"text":"Citeseer, 2015, pp. 1–11.","element":"span"}],[{"text":"[12] M. Ibrahimi, A. Javanmard, and B. V. Roy, “Efficient reinforcement learning for high dimensional linear quadratic systems,” in Advances in Neural Information Processing Systems, 2012, pp. 2636–2644.","element":"span"}],[{"text":"[13] M. Abeille and A. Lazaric, “Thompson sampling for linear-quadratic control problems,” arXiv preprint arXiv:1703.08972, 2017.","element":"span"}],[{"text":"[14] A. Gopalan and S. Mannor, “Thompson sampling for learning parameterized markov decision processes,” in Proceedings of The 28th Conference on Learning Theory, COLT 2015, Paris, France, July 3-6, 2015, 2015, pp. 861–898.","element":"span"}],[{"text":"[15] S. Agrawal and R. 
Jia, “Optimistic posterior sampling for reinforcement learning: worst-case regret bounds,” in Advances in Neural Information Processing Systems, 2017, pp. 1184–1194.","element":"span"}],[{"text":"[16] R. Ortner and D. Ryabko, “Online regret bounds for undiscounted continuous reinforcement learning,” in Advances in Neural Information Processing Systems, 2012, pp. 1763–1771.","element":"span"}],[{"text":"[17] K. Lakshmanan, R. Ortner, and D. Ryabko, “Improved regret bounds for undiscounted continuous reinforcement learning,” in International Conference on Machine Learning, 2015, pp. 524–532.","element":"span"}],[{"text":"[18] M. Turchetta, F. Berkenkamp, and A. Krause, “Safe exploration in finite markov decision processes with gaussian processes,” in Advances in Neural Information Processing Systems, 2016, pp. 4312–4320.","element":"span"}],[{"text":"[19] F. Berkenkamp, M. Turchetta, A. Schoellig, and A. Krause, “Safe model-based reinforcement learning with stability guarantees,” in Advances in Neural Information Processing Systems, 2017, pp. 908– 919.","element":"span"}],[{"text":"[20] M. Deisenroth and C. E. Rasmussen, “Pilco: A model-based and data-efficient approach to policy search,” in Proceedings of the 28th International Conference on machine learning (ICML-11), 2011, pp. 465–472.","element":"span"}],[{"text":"[21] T. Jung and P. Stone, “Gaussian processes for sample efficient reinforcement learning with rmax-like exploration,” in Joint European Conference on Machine Learning and Knowledge Discovery in Databases. ","element":"span"},{"text":"Springer, 2010, pp. 601–616.","element":"span"}],[{"text":"[22] R. Grande, T. Walsh, and J. How, “Sample efficient reinforcement learning with gaussian processes,” in International Conference on Machine Learning, 2014, pp. 1332–1340.","element":"span"}],[{"text":"[23] R. C. Grande, “Computationally efficient gaussian process changepoint detection and regression,” Ph.D. 
dissertation, Massachusetts Institute of Technology, 2014.","element":"span"}],[{"text":"[24] S. R. Chowdhury and A. Gopalan, “Online learning in kernelized markov decision processes,” in The 22nd International Conference on Artificial Intelligence and Statistics, 2019, pp. 3197–3205.","element":"span"}],[{"text":"[25] M. Mutny and A. Krause, “Efficient high dimensional bayesian optimization with additivity and quadrature fourier features,” in Advances in Neural Information Processing Systems, 2018, pp. 9005–9016.","element":"span"}],[{"text":"[26] T. Yang, Y.-F. Li, M. Mahdavi, R. Jin, and Z.-H. Zhou, “Nystr¨om method vs random fourier features: A theoretical and empirical comparison,” in Advances in neural information processing systems, 2012, pp. 476–484.","element":"span"}],[{"text":"[27] S. Bochner, Lectures on Fourier integrals. Princeton University Press, 1959.","element":"span"}],[{"text":"[28] F. B. Hildebrand, Introduction to numerical analysis. ","element":"span"},{"text":"Courier Corporation, 1987.","element":"span"}],[{"text":"[29] A. Rahimi and B. Recht, “Random features for large-scale kernel machines,” in Advances in neural information processing systems, 2008, pp. 1177–1184.","element":"span"}],[{"text":"[30] Y. Abbasi-Yadkori, D. P´al, and C. Szepesv´ari, “Improved algorithms for linear stochastic bandits,” in Advances in Neural Information Processing Systems, 2011, pp. 2312–2320.","element":"span"}],[{"text":"[31] D. Calandriello, L. Carratino, A. Lazaric, M. Valko, and L. Rosasco, “Gaussian process optimization with adaptive sketching: Scalable and no regret,” In Conference on Learning Theory, 2019.","element":"span"}],[{"text":"[32] S. R. Chowdhury and A. Gopalan, “Bayesian optimization under heavy-tailed payoffs,” arXiv preprint arXiv:1909.07040, 2019.","element":"span"}],[{"text":"[33] A. Krause and C. S. Ong, “Contextual gaussian process bandit optimization,” in Advances in Neural Information Processing Systems, 2011, pp. 
2447–2455.","element":"span"}],[{"text":"[34] I. Osband and B. Van Roy, “Why is posterior sampling better than optimism for reinforcement learning?” arXiv preprint arXiv:1607.00215, 2016.","element":"span"}]]}],"_version":"3.3.2"},"paperNode":"$1b:props:children:props:children:0:props:product"}]]]}]}]