28:["$","$L30",null,{"isWhiteLabelled":false,"children":["$","$Lc",null,{"pt":{"compact":0,"expanded":3},"children":[["$","$L31",null,{"noStar":true,"publisher":true,"task":true,"params":true,"size":"xl","product":{"id":"eyJwYXBlcklEIjoiMTgwOC4wMTgxMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","updated":"2019-01-19T07:56:22.000Z","paperID":"1808.01813","published":"2018-08-06T10:34:40.000Z","authors":"[\"Ronald Ortner\"]","title":"Regret Bounds for Reinforcement Learning via Markov Chain Concentration","scoreTrending":null,"summary":"We give a simple optimistic algorithm for which it is easy to derive regret\nbounds of $\\tilde{O}(\\sqrt{t_{\\rm mix} SAT})$ after $T$ steps in uniformly\nergodic Markov decision processes with $S$ states, $A$ actions, and mixing time\nparameter $t_{\\rm mix}$. These bounds are the first regret bounds in the\ngeneral, non-episodic setting with an optimal dependence on all given\nparameters. They could only be improved by using an alternative mixing time\nparameter.","lastCheckedForCode":"2022-09-05T08:35:50.955Z","links":[{"id":"eyJ1cmwiOiJodHRwczovL3BhcGVyc3dpdGhjb2RlLmNvbS9wYXBlci9yZWdyZXQtYm91bmRzLWZvci1yZWluZm9yY2VtZW50LWxlYXJuaW5nLXZpYSJ9","type":"pwc","url":"https://paperswithcode.com/paper/regret-bounds-for-reinforcement-learning-via","data":null}],"reposConnection":{"edges":[]},"models":[],"tags":[],"summaries":[],"emailsConnection":{"edges":[{"author":"ronald ortner","node":{"id":"eyJhZGRyZXNzIjoicm9ydG5lckB1bmlsZW9iZW4uYWMuYXQifQ==","address":"rortner@unileoben.ac.at","name":"R. 
Ortner","avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[{"name":"Montanuniversität Leoben"}],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[],"scholar":[{"thirdPartyID":"9zM7wvkAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiIyZjUxOTA1MS04Y2ZmLTQ4ZjItOTIyMi1mYWE4MzY3YjA0OTMifQ==","name":"ronald ortner","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMTQwNS4yNjUyIiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1405.2652"},{"id":"eyJwYXBlcklEIjoiMTMwMi4yNTUwIiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1302.2550"},{"id":"eyJwYXBlcklEIjoiMTgwNS4xMDA2NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1805.10066"},{"id":"eyJwYXBlcklEIjoiMTkwNS4wNTg1NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1905.05857"},{"id":"eyJwYXBlcklEIjoiMTgwOC4wMTgxMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1808.01813"},{"id":"eyJwYXBlcklEIjoiMjIwMi4wMTE4MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2202.01182"}]}]}}]},"__typename":"paper","authorArray":["Ronald 
Ortner"]}}],["$","$L25",null,{"container":true,"columns":100,"spacing":{"compact":0,"expanded":2,"large":3},"children":[["$","$L25",null,{"size":{"compact":100,"expanded":100,"large":68},"children":[["$","$8",null,{"children":["$","$L32",null,{"publisher":"arxiv","paperID":"1808.01813","product":{"paper":"$28:props:children:props:children:0:props:product","models":"$28:props:children:props:children:0:props:product:models"},"isWhiteLabelled":false}]}],["$","$8",null,{"children":["$","$L33",null,{"article":"$L34","model":"$undefined"}]}]]}],["$","$L25",null,{"size":"grow","children":["$","$L35",null,{}]}]]}],["$","$8",null,{"children":null}],[["$","audio",null,{"id":"tts"}],["$","$L36",null,{"paperID":"1808.01813","publisher":"arxiv","paperJSON":{"title":"Regret Bounds for Reinforcement Learning via Markov Chain Concentration","paperID":"1808.01813","avgLineHeight":14.4,"imgScale":4,"sections":[{"heading":"Abstract","paragraphs":[[{"text":"We give a simple optimistic algorithm for which it is easy to derive regret bounds of ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":18.64},"width":415.48,"height":46.6,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/0-0.png","element":"img","alt":"O(√tmixSAT) after T","inline":true,"padRight":true},{"text":"steps in uniformly ergodic Markov decision processes with ","element":"span"},{"text":"S ","element":"span"},{"text":"states, ","element":"span"},{"text":"A ","element":"span"},{"text":"actions, and mixing time parameter ","element":"span"},{"style":{"height":13.89},"width":71.76,"height":34.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/0-1.png","element":"img","alt":" tmix","inline":true},{"text":". These bounds are the first regret bounds in the general, non-episodic setting with an optimal dependence on all given parameters. 
","element":"span"},{"text":"They could only be improved by using an alternative mixing time parameter.","element":"span"}]]},{"heading":"1 Introduction","paragraphs":[[{"text":"Starting with ","element":"span"},{"href":"#id-0","referenceIndex":9,"text":"[9]","element":"a"},{"text":", regret bounds for reinforcement learning have addressed the question of how difficult it is to learn optimal behavior in an unknown Markov decision process (MDP). Some of these bounds like the one derived in the mentioned ","element":"span"},{"href":"#id-0","referenceIndex":9,"text":"[9","element":"a"},{"text":"] depend on particular properties of the underlying MDP, typically some kind of gap that specifies the distance between an optimal and a sub-optimal action or policy (see e.g. ","element":"span"},{"href":"#id-1","referenceIndex":17,"text":"[17] ","element":"a"},{"text":"for a recent refinement of such bounds). The first so-called problem independent bounds that have no dependence on any gap-parameter were obtained in ","element":"span"},{"href":"#id-2","referenceIndex":13,"text":"[13","element":"a"},{"text":"]. For MDPs with ","element":"span"},{"text":"S ","element":"span"},{"text":"states, ","element":"span"},{"text":"A ","element":"span"},{"text":"actions and diameter ","element":"span"},{"text":"D ","element":"span"},{"text":"the regret of the UCRL algorithm was shown to be ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":22.13},"width":236.08,"height":55.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/1-0.png","element":"img","alt":"O(DS√AT","inline":true},{"text":") after any ","element":"span"},{"text":"T ","element":"span"},{"text":"steps. 
A corresponding lower bound of Ω(","element":"span"},{"style":{"height":19.6},"width":180.88,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/1-1.png","element":"img","alt":"√DSAT","inline":true},{"text":") left the open question of the true dependence of the regret on the parameters ","element":"span"},{"text":"S ","element":"span"},{"text":"and ","element":"span"},{"text":"D","element":"span"},{"text":". Recently, regret bounds of ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":22.32},"width":236.08,"height":55.8,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/1-2.png","element":"img","alt":"O(D√SAT","inline":true},{"text":") have been claimed in ","element":"span"},{"href":"#id-3","referenceIndex":1,"text":"[1]","element":"a"},{"text":", however there seems to be a gap in the proof, cf. Sec. 38.9 of ","element":"span"},{"href":"#id-4","referenceIndex":24,"text":"[24]","element":"a"},{"text":", so that the original bounds of ","element":"span"},{"href":"#id-2","referenceIndex":13,"text":"[13] ","element":"a"},{"text":"are still the best known bounds.","element":"span"}],[{"text":"In the simpler episodic setting, the gap between upper and lower bounds has been closed in ","element":"span"},{"href":"#id-5","referenceIndex":6,"text":"[6","element":"a"},{"text":"], showing that the regret is of order ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":22.13},"width":402.12,"height":55.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/1-3.png","element":"img","alt":"O(√HSAT), where","inline":true,"padRight":true},{"text":"H ","element":"span"},{"text":"is the length of an episode. However, while bounds for the non-episodic setting can be easily transferred to the episodic setting, the reverse is not true. We also note that another kind of regret bounds that appears in the literature assumes an MDP sampled from some distribution (see e.g. 
","element":"span"},{"href":"#id-6","referenceIndex":20,"text":"[20","element":"a"},{"text":"] for a recent contribution). Regret bounds in this Bayesian setting cannot be turned into bounds for the worst case setting as considered here.","element":"span"}],[{"text":"There is also quite some work on bounds on the number of samples from a generative model necessary to approximate the optimal policy by an error of at most ","element":"span"},{"style":{"height":9.2},"width":22,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/1-4.png","element":"img","alt":" ε","inline":true},{"text":". Obviously, having access to a generative model makes learning the optimal policy easier than in the online setting considered here. However, for ergodic MDPs it could be argued that any policy reaches any state so that in this case sample complexity bounds could in principle be turned into regret bounds. We first note that this seems difficult for bounds in the discounted setting, which make up the majority in the literature. Bounds in the discounted setting (see e.g. ","element":"span"},{"href":"#id-7","referenceIndex":4,"text":"[4] ","element":"a"},{"text":"or ","element":"span"},{"href":"#id-8","referenceIndex":23,"text":"[23","element":"a"},{"text":"] for a more recent contribution obtaining near-optimal bounds) depend on the term 1 ","element":"span"},{"style":{"height":18},"width":409.8,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/1-5.png","element":"img","alt":" − γ, where γ is the","inline":true,"padRight":true},{"text":"discount factor, and it is not clear how this term translates into a mixing time parameter in the average reward case. 
For the few results in the average reward setting the best sample complexity bound we are aware of is the bound of ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":28.03},"width":235.6,"height":70.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/1-6.png","element":"img","alt":"O� τ 2t2mixSAε2 �","inline":true,"padRight":true},{"text":"of ","element":"span"},{"href":"#id-9","referenceIndex":27,"text":"[27]","element":"a"},{"text":", where ","element":"span"},{"style":{"height":14.88},"width":72.72,"height":37.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/1-7.png","element":"img","alt":" tmix","inline":true,"padRight":true},{"text":"is a mixing time parameter like ours (cf. below) and ","element":"span"},{"style":{"height":8.8},"width":24,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/1-8.png","element":"img","alt":" τ","inline":true,"padRight":true},{"text":"characterizes the range of stationary distributions across policies. Translated into respective regret bounds, these would have a worse (i.e., linear) dependence on the mixing time and would depend on the additional parame","element":"span"},{"href":"#id-10","referenceIndex":14,"text":"ter ","element":"a"},{"style":{"height":10.8},"width":75.88,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/1-9.png","element":"img","alt":" τ >","inline":true,"padRight":true},{"text":"1, which does not appear in our bounds.","element":"span"}],[{"text":"Starting with ","element":"span"},{"href":"#id-10","referenceIndex":14,"text":"[14","element":"a"},{"text":", ","element":"span"},{"href":"#id-11","referenceIndex":8,"text":"8] ","element":"a"},{"text":"there are also sample complexity bounds in the literature that were derived for settings without generative sampling model. 
Although this is obviously harder, there are bounds for the discounted case where the dependence with respect to ","element":"span"},{"style":{"height":17.2},"width":232.24,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/1-10.png","element":"img","alt":" S, A, and ε","inline":true,"padRight":true},{"text":"is the same as for the case with a generative sampling model ","element":"span"},{"href":"#id-12","referenceIndex":25,"text":"[25]","element":"a"},{"text":". However, we are not aware of any such bounds for the undiscounted setting that would translate into online","element":"span"}],[{"text":"regret bounds optimal in ","element":"span"},{"text":"S","element":"span"},{"text":", ","element":"span"},{"text":"A","element":"span"},{"text":", and ","element":"span"},{"text":"T","element":"span"},{"text":".","element":"span"}],[{"text":"In this note, we present a simple algorithm that allows the derivation of regret bounds of ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":20.78},"width":270.64,"height":51.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/2-0.png","element":"img","alt":"O(√tmixSAT","inline":true},{"text":") for uniformly ergodic MDPs with mixing time ","element":"span"},{"style":{"height":14.88},"width":72.72,"height":37.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/2-1.png","element":"img","alt":" tmix","inline":true},{"text":", a parameter that measures how long it takes to approximate the stationary distribution induced by any policy. These bounds are optimal with respect to the parameters ","element":"span"},{"style":{"height":17.2},"width":346.32,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/2-2.png","element":"img","alt":" S, A, T, and tmix","inline":true},{"text":". 
The only possible improvement is a replacement of ","element":"span"},{"style":{"height":14.88},"width":72.72,"height":37.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/2-3.png","element":"img","alt":" tmix","inline":true,"padRight":true},{"text":"by a parameter that may be smaller for some MDPs, such as the diameter ","element":"span"},{"href":"#id-2","referenceIndex":13,"text":"[13] ","element":"a"},{"text":"or the bias span ","element":"span"},{"href":"#id-13","referenceIndex":7,"text":"[7, ","element":"a"},{"href":"#id-14","referenceIndex":11,"text":"11]","element":"a"},{"text":". We note, however, that it is easy to give MDPs for which ","element":"span"},{"style":{"height":14.88},"width":72.72,"height":37.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/2-4.png","element":"img","alt":" tmix","inline":true,"padRight":true},{"text":"is basically of the same size as the mentioned alternative parameters.","element":"span"},{"style":{"height":8.4},"width":17,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/2-5.png","element":"img","alt":"1 ","inline":true,"padRight":true},{"text":"Accordingly, the obtained bound basically closes the gap between upper and lower bound on the regret for a subclass of MDPs.","element":"span"}],[{"text":"Algorithmically, the algorithm we propose works like an optimistic bandit algorithm such as UCB ","element":"span"},{"href":"#id-15","referenceIndex":3,"text":"[3]","element":"a"},{"text":". Such algorithms have been proposed before for MDP settings with a limited set of policies ","element":"span"},{"href":"#id-16","referenceIndex":5,"text":"[5]","element":"a"},{"text":". The main difference to the latter approach is that due to the re-use of samples we obtain regret bounds that do not scale with the number of policies but with the number of state-action pairs. 
We note however that as ","element":"span"},{"href":"#id-16","referenceIndex":5,"text":"[5] ","element":"a"},{"text":"our algorithm needs to evaluate each policy independently, which makes it impractical. ","element":"span"},{"text":"The proof of the regret bound is much simpler than for bounds achieved before and relies on concentration results for Markov chains.","element":"span"}]]},{"heading":"2 Setting","paragraphs":[[{"text":"We consider reinforcement learning in an average reward ","element":"span"},{"text":"Markov decision process ","element":"span"},{"text":"(","element":"span"},{"text":"MDP","element":"span"},{"text":") with finite state space ","element":"span"},{"text":"S ","element":"span"},{"text":"and finite action space ","element":"span"},{"text":"A","element":"span"},{"text":". We assume that each stationary policy ","element":"span"},{"style":{"height":15.2},"width":215.16,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/2-6.png","element":"img","alt":" π : S → A","inline":true,"padRight":true},{"text":"induces a uniformly ergodic","element":"span"},{"style":{"height":15.71},"width":189.16,"height":39.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/2-7.png","element":"img","alt":"2 Markov","inline":true,"padRight":true},{"text":"chain on the state space. 
In such MDPs, which we call ","element":"span"},{"text":"uniformly ergodic","element":"span"},{"text":", the chain induced by a policy ","element":"span"},{"style":{"height":8.8},"width":27,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/2-8.png","element":"img","alt":" π","inline":true,"padRight":true},{"text":"has a unique stationary distribution ","element":"span"},{"style":{"height":18},"width":153.68,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/2-9.png","element":"img","alt":" µπ, and","inline":true,"padRight":true},{"text":"the (state-independent) average reward ","element":"span"},{"style":{"height":13.2},"width":44,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/2-10.png","element":"img","alt":" ρπ","inline":true,"padRight":true},{"text":"can be written as ","element":"span"},{"style":{"height":13.6},"width":249.16,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/2-11.png","element":"img","alt":" ρπ = µ⊤π rπ,","inline":true,"padRight":true},{"text":"where ","element":"span"},{"style":{"height":19.6},"width":784.48,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/2-12.png","element":"img","alt":" µπ = (µπ(s))s and rπ = (r(s, π(s))s","inline":true,"padRight":true},{"text":"are the (column) vectors for the stationary distribution and the average reward under ","element":"span"},{"style":{"height":8.8},"width":27,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/2-13.png","element":"img","alt":" π","inline":true},{"text":", respectively. We assume that the reward distribution for each state-action pair (","element":"span"},{"text":"s, a","element":"span"},{"text":") has support in [0","element":"span"},{"text":", ","element":"span"},{"text":"1].","element":"span"}],[{"text":"The maximal average reward is known (cf. 
","element":"span"},{"href":"#id-17","referenceIndex":22,"text":"[22","element":"a"},{"text":"]) to be achieved by a stationary policy ","element":"span"},{"style":{"height":13.31},"width":45.32,"height":33.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/3-0.png","element":"img","alt":" π∗ ","inline":true,"padRight":true},{"text":"that gives average reward ","element":"span"},{"style":{"height":17.31},"width":182.52,"height":43.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/3-1.png","element":"img","alt":" ρ∗ := ρπ∗","inline":true},{"text":". We are interested in the ","element":"span"},{"text":"regret ","element":"span"},{"text":"accumulated by an algorithm after any number of ","element":"span"},{"text":"T ","element":"span"},{"text":"steps defined as","element":"span"},{"style":{"height":8.4},"width":17,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/3-2.png","element":"img","alt":"3","inline":true}],[{"style":{"width":"25%"},"width":394,"height":94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/3-3.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":11.68},"width":33.12,"height":29.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/3-4.png","element":"img","alt":" rt","inline":true,"padRight":true},{"text":"are the (random) rewards collected by the algorithm at each step ","element":"span"},{"text":"t","element":"span"},{"text":".","element":"span"}]]},{"heading":"3 Preliminaries on Markov Chains","paragraphs":[[{"text":"In this section, we give some definitions and results about Markov chain concentration that we will use in the following.","element":"span"}],[{"text":"3.1 ","element":"span"},{"text":"Mixing Times","element":"span"}],[{"text":"For two distributions ","element":"span"},{"text":"P, Q ","element":"span"},{"text":"over the same state space 
(","element":"span"},{"style":{"height":19.6},"width":481.48,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/3-5.png","element":"img","alt":"S, F) with σ-algebra F,","inline":true,"padRight":true},{"text":"let","element":"span"}],[{"style":{"width":"42%"},"width":659,"height":78,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/3-6.png","element":"img"}],[{"text":"be the ","element":"span"},{"text":"total variational distance ","element":"span"},{"text":"between ","element":"span"},{"text":"P ","element":"span"},{"text":"and ","element":"span"},{"text":"Q","element":"span"},{"text":". A Markov chain with a transition kernel ","element":"span"},{"text":"p ","element":"span"},{"text":"and a stationary distribution ","element":"span"},{"style":{"height":13.2},"width":28,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/3-7.png","element":"img","alt":" µ","inline":true,"padRight":true},{"text":"is said to be ","element":"span"},{"text":"uniformly ergodic","element":"span"},{"text":", if there are a ","element":"span"},{"style":{"height":14.4},"width":73.48,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/3-8.png","element":"img","alt":" θ <","inline":true,"padRight":true},{"text":"1 and a finite ","element":"span"},{"text":"L ","element":"span"},{"text":"such that","element":"span"}],[{"style":{"width":"34%"},"width":530,"height":78,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/3-9.png","element":"img"}],[{"text":"Furthermore, the ","element":"span"},{"style":{"height":16.8},"width":334.32,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/3-10.png","element":"img","alt":" mixing time tmix","inline":true,"padRight":true},{"text":"of the Markov chain is defined 
as","element":"span"}],[{"style":{"width":"53%"},"width":837,"height":82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/3-11.png","element":"img"}],[{"text":"For a uniformly ergodic MDP we set the mixing time ","element":"span"},{"style":{"height":17.31},"width":72.72,"height":43.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/3-12.png","element":"img","alt":" tπmix ","inline":true,"padRight":true},{"text":"of a policy ","element":"span"},{"style":{"height":12.4},"width":87.8,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/3-13.png","element":"img","alt":" π to","inline":true,"padRight":true},{"text":"be the mixing time of the Markov chain induced by ","element":"span"},{"style":{"height":8.8},"width":27,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/3-14.png","element":"img","alt":" π","inline":true},{"text":", and define the ","element":"span"},{"text":"mixing ","element":"span"},{"style":{"height":18.4},"width":844.84,"height":46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/3-15.png","element":"img","alt":"time of the MDP to be tmix := maxπ tπmix.","inline":true}],[{"text":"3.2 ","element":"span"},{"text":"McDiarmid’s Inequality for Markov Chains","element":"span"}],[{"text":"Our results mainly rely on the following version of McDiarmid’s inequality for Markov chains from ","element":"span"},{"href":"#id-18","referenceIndex":21,"text":"[21","element":"a"},{"text":"].","element":"span"}],[{"id":"id-19","text":"Lemma 1. ","element":"span"},{"text":"(Corollary 2.10 and the following Remark 2.11 of ","element":"span"},{"href":"#id-18","referenceIndex":21,"text":"[21]","element":"a"},{"text":") Consider a uniformly ergodic Markov chain ","element":"span"},{"style":{"height":16.8},"width":222.6,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/4-0.png","element":"img","alt":" X1, . . . 
, Xn","inline":true,"padRight":true},{"text":"with state space ","element":"span"},{"text":"S ","element":"span"},{"text":"and mixing time ","element":"span"},{"style":{"height":17.6},"width":521.84,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/4-1.png","element":"img","alt":" tmix. Let f : Sn → R with","inline":true}],[{"id":"id-20","style":{"width":"99%"},"width":1542,"height":335,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/4-2.png","element":"img"}],[{"text":"Lemma ","element":"span"},{"href":"#id-19","text":"1 ","element":"a"},{"text":"can be used to obtain a concentration result for the empirical average reward of any policy ","element":"span"},{"style":{"height":8.8},"width":27,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/4-3.png","element":"img","alt":" π","inline":true,"padRight":true},{"text":"in an MDP. This works analogously to the concentration bounds for the total variational distance between the empirical and the stationary distribution (Proposition 2.18 in ","element":"span"},{"href":"#id-18","referenceIndex":21,"text":"[21]","element":"a"},{"text":").","element":"span"}],[{"id":"id-21","text":"Corollary 1. ","element":"span"},{"text":"Consider an MDP and a policy ","element":"span"},{"style":{"height":8.8},"width":27,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/4-4.png","element":"img","alt":" π","inline":true,"padRight":true},{"text":"that induces a uniformly ergodic Markov chain with mixing time ","element":"span"},{"style":{"height":14.88},"width":72.72,"height":37.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/4-5.png","element":"img","alt":" tmix","inline":true},{"text":". 
Using (column) vector notation ","element":"span"},{"style":{"height":19.6},"width":737.44,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/4-6.png","element":"img","alt":"µ := (µπ(s))s and r := (r(s, π(s))s","inline":true,"padRight":true},{"text":"for the stationary distribution and the reward function under ","element":"span"},{"style":{"height":8.8},"width":27,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/4-7.png","element":"img","alt":" π","inline":true},{"text":", and writing ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":18.45},"width":55.08,"height":46.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/4-8.png","element":"img","alt":"µn ","inline":true,"padRight":true},{"text":"for the empirical distribution after ","element":"span"},{"text":"n ","element":"span"},{"text":"steps defined as ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":22.42},"width":549.6,"height":56.04,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/4-9.png","element":"img","alt":"µn(s) := 1n�ni=1 1{Xi = s}","inline":true},{"text":", it holds that","element":"span"}],[{"style":{"width":"99%"},"width":1549,"height":206,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/4-10.png","element":"img"}],[{"text":"dition ","element":"span"},{"href":"#id-20","text":"(1) ","element":"a"},{"text":"holds choosing ","element":"span"},{"style":{"height":22.42},"width":459.52,"height":56.04,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/4-11.png","element":"img","alt":" ci = 1n for i = 1, . . . 
, n","inline":true,"padRight":true},{"text":"and the claim follows from ","element":"span"},{"text":"Lemma ","element":"span"},{"href":"#id-19","text":"1.","element":"a"}],[{"text":"Choosing the error probability to be ","element":"span"},{"style":{"height":14},"width":22,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/4-12.png","element":"img","alt":" δ","inline":true},{"text":", we obtain the following confidence interval that will be used by our algorithm.","element":"span"}],[{"id":"id-33","text":"Corollary 2. ","element":"span"},{"text":"Using the same assumptions and notation of Corollary ","element":"span"},{"href":"#id-21","text":"1, ","element":"a"},{"text":"with probability at least ","element":"span"},{"text":"1 ","element":"span"},{"style":{"height":17.2},"width":84.56,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/4-13.png","element":"img","alt":" − δ,","inline":true}],[{"style":{"width":"40%"},"width":631,"height":145,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/4-14.png","element":"img"}],[{"text":"3.3 ","element":"span"},{"text":"Concentration of the Empirical Distribution","element":"span"}],[{"text":"We will also need the following results on the concentration of the empirical state distribution of Markov chains from ","element":"span"},{"href":"#id-18","referenceIndex":21,"text":"[21]","element":"a"},{"text":". In the following, consider a uniformly ergodic Markov chain ","element":"span"},{"style":{"height":16.8},"width":222.6,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/5-0.png","element":"img","alt":" X1, . . . 
, Xn","inline":true,"padRight":true},{"text":"with a stationary distribution ","element":"span"},{"style":{"height":13.2},"width":28,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/5-1.png","element":"img","alt":" µ","inline":true,"padRight":true},{"text":"and a mixing time ","element":"span"},{"style":{"height":18},"width":237.48,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/5-2.png","element":"img","alt":" tmix. Let ˆµn ","inline":true,"padRight":true},{"text":"be the empirical distribution after performing ","element":"span"},{"text":"n ","element":"span"},{"text":"steps in the chain.","element":"span"}],[{"id":"id-23","style":{"width":"87%"},"width":1354,"height":562,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/5-3.png","element":"img"}],[{"id":"id-22","text":"Lemma 4. ","element":"span"},{"text":"(Proposition 3.4 in ","element":"span"},{"href":"#id-18","referenceIndex":21,"text":"[21","element":"a"},{"text":"]) In uniformly ergodic Markov chains, the pseudo-spectral gap ","element":"span"},{"style":{"height":17.2},"width":28,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/5-4.png","element":"img","alt":" β","inline":true,"padRight":true},{"text":"can be bounded via the mixing time ","element":"span"},{"style":{"height":14.88},"width":135.2,"height":37.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/5-5.png","element":"img","alt":" tmix as","inline":true}],[{"style":{"width":"12%"},"width":197,"height":64,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/5-6.png","element":"img"}],[{"text":"We summarize these results in the following corollary.","element":"span"}],[{"id":"id-39","style":{"width":"69%"},"width":1073,"height":227,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/5-7.png","element":"img"}],[{"text":"Proof. 
","element":"span"},{"text":"Using the bound of Lemma ","element":"span"},{"href":"#id-22","text":"4 ","element":"a"},{"text":"in Lemma ","element":"span"},{"href":"#id-23","text":"3 ","element":"a"},{"text":"and setting the error probability in Lemma ","element":"span"},{"href":"#id-23","text":"2 ","element":"a"},{"text":"to ","element":"span"},{"style":{"height":14},"width":22,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/5-8.png","element":"img","alt":" δ","inline":true},{"text":", one obtains by Jensen’s inequality","element":"span"}],[{"style":{"width":"57%"},"width":899,"height":145,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/5-9.png","element":"img"}],[{"text":"and the claim of the corollary follows immediately.","element":"span"}]]},{"heading":"4 Algorithm","paragraphs":[[{"text":"At the core, the ","element":"span"},{"text":"Osp ","element":"span"},{"text":"algorithm we propose works like the UCB algorithm in the bandit setting. In our case, each policy corresponds to an arm, and the concentration results of the previous chapter are used to obtain suitable confidence intervals for the MDP setting.","element":"span"}],[{"id":"id-24","style":{"width":"100%"},"width":1555,"height":1313,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/6-0.png","element":"img"}],[{"text":"Osp ","element":"span"},{"text":"(shown in detail as Algorithm 1) does not evaluate the policies at each time step. ","element":"span"},{"text":"Instead, it proceeds in phases","element":"span"},{"style":{"height":8.4},"width":17,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/6-1.png","element":"img","alt":"5 ","inline":true,"padRight":true},{"text":"(cf. 
line ","element":"span"},{"href":"#id-24","text":"3 ","element":"a"},{"text":"of ","element":"span"},{"text":"Osp","element":"span"},{"text":"), where in each phase ","element":"span"},{"text":"k ","element":"span"},{"text":"an optimistic policy ","element":"span"},{"style":{"height":11.28},"width":44.4,"height":28.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/6-2.png","element":"img","alt":" πk","inline":true,"padRight":true},{"text":"is selected (line ","element":"span"},{"href":"#id-24","text":"8)","element":"a"},{"text":". ","element":"span"},{"text":"This is done (cf. line ","element":"span"},{"href":"#id-24","text":"5) ","element":"a"},{"text":"by first constructing for each policy ","element":"span"},{"style":{"height":8.8},"width":27,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/6-3.png","element":"img","alt":" π","inline":true,"padRight":true},{"text":"a sample path ","element":"span"},{"style":{"height":24.22},"width":576.68,"height":60.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/6-4.png","element":"img","alt":"Pπ =((st, π(st), rt, st+1))nt=1 ","inline":true,"padRight":true},{"text":"from the observations so far. Accordingly, the","element":"span"}],[{"id":"id-25","style":{"width":"100%"},"width":1555,"height":735,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/7-0.png","element":"img"}],[{"text":"algorithm keeps a record of all observations. 
That is, after choosing in a state ","element":"span"},{"text":"s ","element":"span"},{"text":"an action ","element":"span"},{"text":"a","element":"span"},{"text":", obtaining the reward ","element":"span"},{"text":"r","element":"span"},{"text":", and observing a transition to the next state ","element":"span"},{"style":{"height":9.2},"width":38.08,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/7-1.png","element":"img","alt":" s′","inline":true},{"text":", the respective observation (","element":"span"},{"style":{"height":12.4},"width":167.68,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/7-2.png","element":"img","alt":"s, a, r, s′","inline":true},{"text":") is appended to the sequence of observations ","element":"span"},{"text":"O ","element":"span"},{"text":"(cf. line ","element":"span"},{"href":"#id-24","text":"10)","element":"a"},{"text":".","element":"span"}],[{"text":"The sample path ","element":"span"},{"style":{"height":16.08},"width":53.12,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/7-3.png","element":"img","alt":" Pπ","inline":true,"padRight":true},{"text":"constructed from the observation sequence ","element":"span"},{"text":"O ","element":"span"},{"text":"contains each observation from ","element":"span"},{"text":"O ","element":"span"},{"text":"at most once. 
","element":"span"},{"text":"Further, the path ","element":"span"},{"style":{"height":24.22},"width":596.36,"height":60.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/7-4.png","element":"img","alt":"Pπ = �(st, π(st), rt, st+1)�nt=1 ","inline":true,"padRight":true},{"text":"is such that there is no unused observation ","element":"span"},{"text":"(","element":"span"},{"style":{"height":19.6},"width":467.12,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/7-5.png","element":"img","alt":"sn+1, π(sn+1), r, s) in O","inline":true,"padRight":true},{"text":"that could be used to extend the path by appending the observation. In the following, we say that such a path is ","element":"span"},{"text":"non-extendible","element":"span"},{"text":". Algorithm ","element":"span"},{"href":"#id-25","text":"2 ","element":"a"},{"text":"provides an algorithm for constructing a non-extendible path. Alternative constructions could be used for obtaining non-extendible paths as well.","element":"span"}],[{"text":"For each possible policy ","element":"span"},{"style":{"height":8.8},"width":27,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/7-6.png","element":"img","alt":" π","inline":true,"padRight":true},{"text":"the algorithm computes an estimate of the average reward ","element":"span"},{"style":{"height":13.2},"width":44,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/7-7.png","element":"img","alt":" ρπ","inline":true,"padRight":true},{"text":"from the sample path ","element":"span"},{"style":{"height":16.08},"width":53.12,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/7-8.png","element":"img","alt":" Pπ","inline":true,"padRight":true},{"text":"and considers an optimistic upper confidence value 
˜","element":"span"},{"style":{"height":13.2},"width":44,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/7-9.png","element":"img","alt":"ρπ","inline":true,"padRight":true},{"text":"(cf. line ","element":"span"},{"href":"#id-24","text":"6 ","element":"a"},{"text":"of ","element":"span"},{"text":"Osp","element":"span"},{"text":") using the concentration results of Section ","element":"span"},{"text":"3. ","element":"span"},{"text":"The policy with the maximal ˜","element":"span"},{"style":{"height":13.2},"width":44,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/7-10.png","element":"img","alt":"ρπ","inline":true,"padRight":true},{"text":"is chosen for use in phase ","element":"span"},{"text":"k","element":"span"},{"text":". The length ","element":"span"},{"style":{"height":17.2},"width":271.72,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/7-11.png","element":"img","alt":" nk of phase k","inline":true},{"text":", in which the chosen policy ","element":"span"},{"style":{"height":11.28},"width":44.4,"height":28.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/7-12.png","element":"img","alt":" πk","inline":true,"padRight":true},{"text":"is used, depends on the length ","element":"span"},{"style":{"height":19.58},"width":250.12,"height":48.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1808.01813/images/7-13.png","element":"img","alt":" n