1b:["$","$L29",null,{"isWhiteLabelled":false,"children":["$","$Lb",null,{"pt":{"compact":0,"expanded":3},"children":[["$","$L2a",null,{"noStar":true,"publisher":true,"task":true,"params":true,"size":"xl","product":{"id":"eyJwYXBlcklEIjoiMjAwMi4xMDYyMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","updated":"2020-02-25T01:40:31.000Z","paperID":"2002.10620","published":"2020-02-25T01:40:31.000Z","authors":"[\"Devavrat Shah\",\"Varun Somani\",\"Qiaomin Xie\",\"Zhi Xu\"]","title":"On Reinforcement Learning for Turn-based Zero-sum Markov Games","scoreTrending":null,"summary":"$2b","lastCheckedForCode":"2022-09-04T22:01:25.807Z","links":[{"id":"eyJ1cmwiOiJodHRwczovL3BhcGVyc3dpdGhjb2RlLmNvbS9wYXBlci9vbi1yZWluZm9yY2VtZW50LWxlYXJuaW5nLWZvci10dXJuLWJhc2VkLXplcm8ifQ==","type":"pwc","url":"https://paperswithcode.com/paper/on-reinforcement-learning-for-turn-based-zero","data":null}],"reposConnection":{"edges":[]},"models":[],"tags":[],"summaries":[],"emailsConnection":{"edges":[{"author":"qiaomin xie","node":{"id":"eyJhZGRyZXNzIjoicWlhb21pbi54aWVAY29ybmVsbC5lZHUifQ==","address":"qiaomin.xie@cornell.edu","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[],"scholar":[{"thirdPartyID":"RVNcy4EAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiI5ZTIyMGRkYi01NzFhLTQ2OTMtYTYwMy05YTg5OWQ5ODZkMzQifQ==","name":"qiaomin 
xie","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMjAwMi4wNzA2NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.07066"},{"id":"eyJwYXBlcklEIjoiMjEwNi4xNTc3OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.15779"},{"id":"eyJwYXBlcklEIjoiMTgwMi4wMzkwMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1802.03900"},{"id":"eyJwYXBlcklEIjoiMTkwMi4wNTIxMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1902.05213"},{"id":"eyJwYXBlcklEIjoiMjAwNi4wNDM1MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.04353"},{"id":"eyJwYXBlcklEIjoiMjAxMC4wNDIxMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.04211"},{"id":"eyJwYXBlcklEIjoiMjAwNi4xMzgyNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.13827"},{"id":"eyJwYXBlcklEIjoiMjAwNy4wMDE0OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2007.00148"},{"id":"eyJwYXBlcklEIjoiMjAwNi4wNDY3MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.04672"},{"id":"eyJwYXBlcklEIjoiMjMwNi4xNjM5NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2306.16394"},{"id":"eyJwYXBlcklEIjoiMjAwMi4xMDYyMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.10620"},{"id":"eyJwYXBlcklEIjoiMjQwMS4xMzg4NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2401.13884"},{"id":"eyJwYXBlcklEIjoiMjIwNi4wMTg4OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2206.01888"},{"id":"eyJwYXBlcklEIjoiMjQwMi4wNTY4OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2402.05689"},{"id":"eyJwYXBlcklEIjoiMjMwNi4wODA0MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2306.08041"},{"id":"eyJwYXBlcklEIjoiMjIxMC4wMDk1MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2210.00953"},{"id":"eyJwYXBlcklEIjoiMjQwNS4xNjczMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2405.16732"},{"id":"eyJwYXBlcklEIjoiMjQwNS4xNzg4MiIs
InB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2405.17882"},{"id":"eyJwYXBlcklEIjoiMjQwNC4wNjAyMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2404.06023"},{"id":"eyJwYXBlcklEIjoiMjMwNi4wMDE5NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2306.00196"},{"id":"eyJwYXBlcklEIjoiMjMwNi4xNjUwMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2306.16502"},{"id":"eyJwYXBlcklEIjoiMjMwNy4wOTY1MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2307.09652"},{"id":"eyJwYXBlcklEIjoiNzE5MDciLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"71907"},{"id":"eyJwYXBlcklEIjoiMjMxMi4wMDE5OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2312.00198"},{"id":"eyJwYXBlcklEIjoiMjMxMi4xMDg5NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2312.10894"},{"id":"eyJwYXBlcklEIjoiMjQwNi4xNzExNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2406.17114"}]}]}},{"author":"devavrat shah","node":{"id":"eyJhZGRyZXNzIjoiZGV2YXZyYXRAbWl0LmVkdSJ9","address":"devavrat@mit.edu","name":"Devavrat Shah","avatar":"https://img.fullcontact.com/static/59a8d8024cef5e83a632edb920dfa3fc_f38b0f381e0af411d47edbfe14545f5b108587943e11aa40f79adc41f4acf305","linkedin":"https://www.linkedin.com/in/devavrat-joshi-231837104","bio":"Robotics and Process Automation. 
3D Modelling and 3D Printing for Scale Models.","site":null,"override":null,"membership":[{"name":"Scale Models."}],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[{"avatar":"https://avatars.githubusercontent.com/u/1966383?v=4","username":"devavrat"}],"scholar":[{"thirdPartyID":"3qPiYJoAAAAJ"}],"twitter":[{"avatar":null,"username":"devavratshah"}],"location":[{"
formatted":"Boston, MA, USA"}],"owner":[{"id":"eyJ1aWQiOiI3NjlmZTFjMi1kYmI0LTQ2N2QtOTQ0OS01MDA3YzFiZTdmNTQifQ==","name":"Devavrat Shah","github":[],"email":[{"avatar":"https://img.fullcontact.com/static/59a8d8024cef5e83a632edb920dfa3fc_f38b0f381e0af411d47edbfe14545f5b108587943e11aa40f79adc41f4acf305"}],"authored":[{"id":"eyJwYXBlcklEIjoiMTExMC4zNTY0IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1110.3564"},{"id":"eyJwYXBlcklEIjoiMTIwOS4xNjg4IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1209.1688"},{"id":"eyJwYXBlcklEIjoiMjAwNi4wNzY5MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.07691"},{"id":"eyJwYXBlcklEIjoiMTQxMS42NTkxIiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1411.6591"},{"id":"eyJwYXBlcklEIjoiMTQxMC4xMjMxIiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1410.1231"},{"id":"eyJwYXBlcklEIjoiMTMwMi4zNjM5IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1302.3639"},{"id":"eyJwYXBlcklEIjoiMTQxMi4xNDQzIiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1412.1443"},{"id":"eyJwYXBlcklEIjoiMTgwMi4wOTA2NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1802.09064"},{"id":"eyJwYXBlcklEIjoiMTQxMS4wMDczIiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1411.0073"},{"id":"eyJwYXBlcklEIjoiMTUwNy4wNTM3MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1507.05371"},{"id":"eyJwYXBlcklEIjoiMTQwOS4zODM2IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1409.3836"},{"id":"eyJwYXBlcklEIjoiMTgwMi4wMzkwMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1802.03900"},{"id":"eyJwYXBlcklEIjoiMTkwMi4wNTIxMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1902.05213"},{"id":"eyJwYXBlcklEIjoiMjAxMC4xNDQ0OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.14449"},{"id":"eyJwYXBlcklEIjoiMjAwNi4wNDM1MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.04353
"},{"id":"eyJwYXBlcklEIjoiMjAwMi4xMDYyMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.10620"},{"id":"eyJwYXBlcklEIjoiMTgxMi4xMTkxNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1812.11917"},{"id":"eyJwYXBlcklEIjoiMjAwNi4wNjEzNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.06135"},{"id":"eyJwYXBlcklEIjoiMjIxMC4xMTM1NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2210.11355"},{"id":"eyJwYXBlcklEIjoiMjIxMS4wODIwOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2211.08209"},{"id":"eyJwYXBlcklEIjoiMTQxMC43NjU5IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1410.7659"},{"id":"eyJwYXBlcklEIjoiMjQwMi4wMDc5MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2402.00793"},{"id":"eyJwYXBlcklEIjoiMTcxMS4wNjk0MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1711.06940"},{"id":"eyJwYXBlcklEIjoiMjAxMS4wMjUyMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2011.02522"},{"id":"eyJwYXBlcklEIjoiMjAxMC4xNTAzMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.15031"},{"id":"eyJwYXBlcklEIjoiMjAwNi4xMDg5NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.10897"},{"id":"eyJwYXBlcklEIjoiMjEwMi4wNjk2MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.06961"},{"id":"eyJwYXBlcklEIjoiMjMwOS4wNjQxMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2309.06413"},{"id":"eyJwYXBlcklEIjoiMjAwNi4wODE4OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.08189"},{"id":"eyJwYXBlcklEIjoiMjIwMS4wNDk2MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2201.04960"},{"id":"eyJwYXBlcklEIjoiMjMwNi4wNDc3NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2306.04775"},{"id":"eyJwYXBlcklEIjoiMTkwOC4wMTI0MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1908.01241"},{"id":"eyJwYXBlcklEIjoiMjIwMS4wMTk1NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publish
er":"arxiv","paperID":"2201.01954"},{"id":"eyJwYXBlcklEIjoiMjMwNi4wMTY0NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2306.01646"},{"id":"eyJwYXBlcklEIjoiMjMwNC4xMDUyNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2304.10525"},{"id":"eyJwYXBlcklEIjoiMjEwNC4xNDA5OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2104.14098"},{"id":"eyJwYXBlcklEIjoiMjIwNi4wODI1NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2206.08257"},{"id":"eyJwYXBlcklEIjoiMjAwNS4wMDA3MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2005.00072"},{"id":"eyJwYXBlcklEIjoiMTkwMy4wNzA5NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1903.07097"},{"id":"eyJwYXBlcklEIjoiMTgxMC4wNjMxMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1810.06313"},{"id":"eyJwYXBlcklEIjoiMjAwNi4xMzQ0OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.13448"},{"id":"eyJwYXBlcklEIjoiMjEwMi4xMDE5NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.10196"},{"id":"eyJwYXBlcklEIjoiMjExMC4xNTM5NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.15397"},{"id":"eyJwYXBlcklEIjoiMjIwMS4wMTgxMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2201.01811"},{"id":"eyJwYXBlcklEIjoiMjMwMi4wMjA5NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2302.02096"},{"id":"eyJwYXBlcklEIjoiMjMwNS4xNjQ5MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2305.16491"},{"id":"eyJwYXBlcklEIjoiMjMxMS4wMjI4NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2311.02287"},{"id":"eyJwYXBlcklEIjoiNzMwNTMiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"73053"},{"id":"eyJwYXBlcklEIjoiNzE0NjgiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"71468"}]}]}},{"author":"zhi 
xu","node":{"id":"eyJhZGRyZXNzIjoiemhpeHVAbWl0LmVkdSJ9","address":"zhixu@mit.edu","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[],"scholar":[{"thirdPartyID":"CbGdL4cAAAAJ"}],"twitter":[],"location":[],"owner":[]}},{"author":"varun somani","node":{"id":"eyJhZGRyZXNzIjoidnM0NzJAY29ybmVsbC5lZHUifQ==","address":"vs472@cornell.edu","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}}],"github":[],"scholar":[],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiI1M2IwODkwOS05YWRiLTQ3N2EtOTljZi00MzEwZWE4NTNkODAifQ==","name":"varun somani","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMjAwMi4xMDYyMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.10620"}]}]}}]},"__typename":"paper","authorArray":["Devavrat Shah","Varun Somani","Qiaomin Xie","Zhi 
Xu"]}}],["$","$L18",null,{"container":true,"columns":100,"spacing":{"compact":0,"expanded":2,"large":3},"children":[["$","$L18",null,{"size":{"compact":100,"expanded":100,"large":68},"children":[["$","$7",null,{"children":["$","$L2c",null,{"publisher":"arxiv","paperID":"2002.10620","product":{"paper":"$1b:props:children:props:children:0:props:product","models":"$1b:props:children:props:children:0:props:product:models"},"isWhiteLabelled":false}]}],["$","$7",null,{"children":["$","$L2d",null,{"article":"$L2e","model":"$undefined"}]}]]}],["$","$L18",null,{"size":"grow","children":["$","$L2f",null,{}]}]]}],["$","$7",null,{"children":null}],[["$","audio",null,{"id":"tts"}],["$","$L30",null,{"paperID":"2002.10620","publisher":"arxiv","paperJSON":{"title":"On Reinforcement Learning for Turn-based Zero-sum Markov Games","paperID":"2002.10620","avgLineHeight":11.97,"imgScale":4,"sections":[{"heading":"Abstract","paragraphs":[[{"text":"We consider the problem of finding Nash equilibrium for two-player turn-based zero-sum games. Inspired by the AlphaGo Zero (AGZ) algorithm (","element":"span"},{"href":"#id-0","referenceIndex":31,"text":"Silver et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-0","referenceIndex":31,"text":"2017b","element":"a"},{"text":"), we develop a Reinforcement Learning based approach. Specifically, we propose Explore-Improve-Supervise (EIS) method that combines “exploration”, “policy improvement” and “supervised learning” to find the value function and policy associated with Nash equilibrium. We identify sufficient conditions for convergence and correctness for such an approach. 
For a concrete instance of EIS where random policy is used for “exploration”, Monte-Carlo Tree Search is used for “policy improvement” and Nearest Neighbors is used for “supervised learning”, we establish that this method finds an ","element":"span"},{"style":{"height":6.4},"width":17,"height":16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/0-0.png","element":"img","alt":" ε","inline":true},{"text":"-approximate value function of Nash equilibrium in ","element":"span"},{"style":{"height":0},"width":11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/0-1.png","element":"img","alt":"�","inline":true},{"style":{"height":18.49},"width":182.62,"height":46.24,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/0-2.png","element":"img","alt":"O(ε−(d+4))","inline":true,"padRight":true},{"text":"steps when the underlying state-space of the game is continuous and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d","element":"span"},{"text":"-dimensional. This is nearly optimal as we establish a lower bound of ","element":"span"},{"style":{"height":18.49},"width":183.8,"height":46.24,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/0-3.png","element":"img","alt":"�Ω(ε−(d+2))","inline":true,"padRight":true},{"text":"for any policy.","element":"span"}]]},{"heading":"1 Introduction","paragraphs":[[{"text":"In 2016, AlphaGo (","element":"span"},{"href":"#id-1","referenceIndex":29,"text":"Silver et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-1","referenceIndex":29,"text":"2016","element":"a"},{"text":") became the first program to defeat the world champion in the game of Go. 
Soon after, another program, AlphaGo Zero (AGZ) (","element":"span"},{"href":"#id-0","referenceIndex":31,"text":"Silver et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-0","referenceIndex":31,"text":"2017b","element":"a"},{"text":"), achieved even stronger performance despite learning the game from scratch given only the rules. Starting ","element":"span"},{"style":{"fontStyle":"italic"},"text":"tabula rasa","element":"span"},{"text":", AGZ mastered the game of Go entirely through self-play using a new reinforcement learning algorithm. The same algorithm was shown to achieve superhuman performance in Chess and Shogi (","element":"span"},{"href":"#id-2","referenceIndex":30,"text":"Silver et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-2","referenceIndex":30,"text":"2017a","element":"a"},{"text":").","element":"span"}],[{"text":"One key innovation of AGZ is to learn a policy and value function using supervised learning from samples generated via Monte-Carlo Tree Search. Motivated by the remarkable success of this method, in this work we study the problem of finding Nash Equilibrium for two-player turn-based zero-sum games and in particular consider a reinforcement learning based approach.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Our Contributions. ","element":"span"},{"text":"The central contribution of this work is the Explore-Improve-Supervise (EIS) method for finding Nash Equilibrium for two-player turn-based zero-sum games with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"continuous ","element":"span"},{"text":"state space, modeled through the framework of Markov game. 
It is an iterative method where in each iteration three components are intertwined carefully: “explore” that allows for measured exploration of the state space, “improve” which allows for improving the current value and policy for the state being explored, and “supervise” which learns the improved value and policy over the explored states so as to generalize over the entire state space.","element":"span"}],[{"text":"Importantly, we identify sufficient conditions, in terms of each of the “explore”, “improve” and “supervise” modules, under which convergence to the value function of the Nash equilibrium is guaranteed. In particular, we establish finite sample complexity bounds for such a generic method to find the ","element":"span"},{"style":{"height":7.2},"width":19,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/0-4.png","element":"img","alt":" ε","inline":true},{"text":"-approximate value function of Nash equilibrium. See Theorem ","element":"span"},{"href":"#id-3","text":"2 ","element":"a"},{"text":"and Proposition ","element":"span"},{"href":"#id-4","text":"3 ","element":"a"},{"text":"for the precise statements.","element":"span"}],[{"text":"We establish that when random sampling is used for “explore”, Monte-Carlo-Tree-Search (MCTS) is used for “policy improvement” and Nearest Neighbor is used for “supervised learning”, the theoretical conditions identified for convergence of EIS policy are satisfied. 
Using our finite sample bound for EIS policy, and quantification of conditions as stated above, we conclude that such an instance of EIS method find ","element":"span"},{"style":{"height":7.2},"width":19,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/1-0.png","element":"img","alt":" ε","inline":true,"padRight":true},{"text":"approximate value function of Nash equilibrium in ","element":"span"},{"style":{"height":20.87},"width":195.79,"height":52.18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/1-1.png","element":"img","alt":"˜O�ε−(d+4))","inline":true,"padRight":true},{"text":"steps, where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d ","element":"span"},{"text":"is the dimension of the state space of the game (cf. Theorem ","element":"span"},{"href":"#id-5","text":"8","element":"a"},{"text":"). We also establish a mini-max lower bound on the number of steps required for learning ","element":"span"},{"style":{"height":7.2},"width":19,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/1-2.png","element":"img","alt":" ε","inline":true},{"text":"-approximate value function of Nash equilibrium as ","element":"span"},{"style":{"height":20.88},"width":197.9,"height":52.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/1-3.png","element":"img","alt":"˜Ω�ε−(d+2))","inline":true,"padRight":true},{"text":"for any method (cf. Theorem ","element":"span"},{"href":"#id-6","text":"4","element":"a"},{"text":"). This establishes near-optimality of an instance of EIS.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Related Work. ","element":"span"},{"text":"The Markov Decision Processes (MDP) provide a canonical framework to study the single-agent setting. 
Its natural extension, the Markov Games, provide a canonical framework to study multi-agent settings (","element":"span"},{"href":"#id-7","referenceIndex":13,"text":"Littman","element":"a"},{"text":", ","element":"span"},{"href":"#id-7","referenceIndex":13,"text":"1994","element":"a"},{"text":"). In this work, we consider an instance of it—turn-based two players or agents with zero-sum rewards. Analogous to learning the optimal policy in MDP setting, here we consider finding the Nash Equilibrium in the setting of Markov Games. There has been a rich literature on existence, uniqueness as well as algorithms for finding the Nash Equilibrium. In what follows, we describe the most relevant literature in that regard.","element":"span"}],[{"text":"To start with, in ","element":"span"},{"href":"#id-8","referenceIndex":27,"text":"Shapley ","element":"a"},{"text":"(","element":"span"},{"href":"#id-8","referenceIndex":27,"text":"1953","element":"a"},{"text":"), for finite state and action spaces where game would terminate in a finite number of stages with positive probability, the existence of optimal stationary strategies and uniqueness of the optimal value function are established. For generic state space, the existence of Nash Equilibrium has been established for Markov Games with discounted rewards. Particularly, when the state space is a compact metric space, ","element":"span"},{"href":"#id-9","referenceIndex":16,"text":"Maitra & Parthasarathy ","element":"a"},{"text":"(","element":"span"},{"href":"#id-9","referenceIndex":16,"text":"1970","element":"a"},{"text":", ","element":"span"},{"href":"#id-10","referenceIndex":17,"text":"1971","element":"a"},{"text":") and ","element":"span"},{"href":"#id-11","referenceIndex":19,"text":"Parthasarathy ","element":"a"},{"text":"(","element":"span"},{"href":"#id-11","referenceIndex":19,"text":"1973","element":"a"},{"text":") show the uniqueness of value function and existence of optimal stationary policy. 
The same result has been established by ","element":"span"},{"href":"#id-12","referenceIndex":10,"text":"Kumar & Shiau ","element":"a"},{"text":"(","element":"span"},{"href":"#id-12","referenceIndex":10,"text":"1981","element":"a"},{"text":") when the state space is complete, separable metric space. For two-player zero-sum discounted Markov games, the Bellman operator corresponding to the Nash equilibrium is a contraction and hence, the value function is unique and there exists a deterministic stationary optimal policy (","element":"span"},{"href":"#id-13","referenceIndex":33,"text":"Szepesvári & Littman","element":"a"},{"text":", ","element":"span"},{"href":"#id-13","referenceIndex":33,"text":"1996","element":"a"},{"text":"; ","element":"span"},{"href":"#id-14","referenceIndex":3,"text":"Hansen et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-14","referenceIndex":3,"text":"2013","element":"a"},{"text":"). We also note that existence of Nash equilibrium for a general class of games (stochastic shortest path) is established by ","element":"span"},{"href":"#id-15","referenceIndex":20,"text":"Patek ","element":"a"},{"text":"(","element":"span"},{"href":"#id-15","referenceIndex":20,"text":"1997","element":"a"},{"text":"). It argues that the optimal value function is unique and can be achieved by mixed stationary strategies.","element":"span"}],[{"text":"For computing or finding optimal value function and policy associated with the Nash equilibrium, there are two settings considered in the literature: (i) when system model is entirely known, and (ii) when model is not known but one can sample from the underlying model. 
In the first setting, classical approaches from the setting of MDPs such as value/policy iteration are adapted to find the optimal value function or policy associated with the Nash equilibrium (","element":"span"},{"href":"#id-15","referenceIndex":20,"text":"Patek","element":"a"},{"text":", ","element":"span"},{"href":"#id-15","referenceIndex":20,"text":"1997","element":"a"},{"text":"; ","element":"span"},{"href":"#id-14","referenceIndex":3,"text":"Hansen et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-14","referenceIndex":3,"text":"2013","element":"a"},{"text":"). In the second setting which is considered here, various approximate dynamic programming algorithms have been proposed (","element":"span"},{"href":"#id-13","referenceIndex":33,"text":"Szepesvári & ","element":"a"},{"href":"#id-13","referenceIndex":33,"text":"Littman","element":"a"},{"text":", ","element":"span"},{"href":"#id-13","referenceIndex":33,"text":"1996","element":"a"},{"text":"; ","element":"span"},{"href":"#id-16","referenceIndex":1,"text":"Bowling & Veloso","element":"a"},{"text":", ","element":"span"},{"href":"#id-16","referenceIndex":1,"text":"2001","element":"a"},{"text":"; ","element":"span"},{"href":"#id-17","referenceIndex":14,"text":"Littman","element":"a"},{"text":", ","element":"span"},{"href":"#id-17","referenceIndex":14,"text":"2001a","element":"a"},{"text":",","element":"span"},{"href":"#id-18","referenceIndex":15,"text":"b","element":"a"},{"text":"; ","element":"span"},{"href":"#id-19","referenceIndex":5,"text":"Hu et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-19","referenceIndex":5,"text":"1998","element":"a"},{"text":"; ","element":"span"},{"href":"#id-20","referenceIndex":4,"text":"Hu & Wellman","element":"a"},{"text":", ","element":"span"},{"href":"#id-20","referenceIndex":4,"text":"2003","element":"a"},{"text":"; ","element":"span"},{"href":"#id-21","referenceIndex":11,"text":"Lagoudakis 
","element":"a"},{"href":"#id-21","referenceIndex":11,"text":"& Parr","element":"a"},{"text":", ","element":"span"},{"href":"#id-21","referenceIndex":11,"text":"2002","element":"a"},{"text":"; ","element":"span"},{"href":"#id-22","referenceIndex":21,"text":"Perolat et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-22","referenceIndex":21,"text":"2015","element":"a"},{"text":"; ","element":"span"},{"href":"#id-23","referenceIndex":22,"text":"Pérolat et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-23","referenceIndex":22,"text":"2016","element":"a"},{"text":"). More recently, with the advance of deep reinforcement learning (","element":"span"},{"href":"#id-24","referenceIndex":18,"text":"Mnih et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-24","referenceIndex":18,"text":"2015","element":"a"},{"text":"; ","element":"span"},{"href":"#id-25","referenceIndex":12,"text":"Lillicrap et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-25","referenceIndex":12,"text":"2016","element":"a"},{"text":"; ","element":"span"},{"href":"#id-26","referenceIndex":23,"text":"Schulman et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-26","referenceIndex":23,"text":"2015","element":"a"},{"text":", ","element":"span"},{"href":"#id-27","referenceIndex":24,"text":"2017","element":"a"},{"text":"; ","element":"span"},{"href":"#id-28","referenceIndex":37,"text":"Yang et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-28","referenceIndex":37,"text":"2019a","element":"a"},{"text":"), recent work approximates the value function/policy by deep neural networks (","element":"span"},{"href":"#id-1","referenceIndex":29,"text":"Silver et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-1","referenceIndex":29,"text":"2016","element":"a"},{"text":", 
","element":"span"},{"href":"#id-2","referenceIndex":30,"text":"2017a","element":"a"},{"text":",","element":"span"},{"href":"#id-0","referenceIndex":31,"text":"b","element":"a"},{"text":").","element":"span"}],[{"text":"In terms of theoretical results, there has been work establishing asymptotic convergence to the optimal value function when the state space is finite. For example, Q-learning for MDP adapted to the setting of two-player zero-sum games asymptotically converges (","element":"span"},{"href":"#id-13","referenceIndex":33,"text":"Szepesvári & Littman","element":"a"},{"text":", ","element":"span"},{"href":"#id-13","referenceIndex":33,"text":"1996","element":"a"},{"text":"). Non-asymptotic results are available for model-based algorithms developed for Markov games with finite states, including R-max algorithm (","element":"span"},{"href":"#id-29","referenceIndex":2,"text":"Brafman & Tennenholtz","element":"a"},{"text":", ","element":"span"},{"href":"#id-29","referenceIndex":2,"text":"2002","element":"a"},{"text":") and an algorithm that extends upper confidence reinforcement learning algorithm (","element":"span"},{"href":"#id-30","referenceIndex":36,"text":"Wei et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-30","referenceIndex":36,"text":"2017","element":"a"},{"text":"). Recent work by ","element":"span"},{"href":"#id-31","referenceIndex":28,"text":"Sidford et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-31","referenceIndex":28,"text":"2019","element":"a"},{"text":") provides an algorithm that computes an ","element":"span"},{"style":{"height":7.2},"width":19,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/1-4.png","element":"img","alt":" ε","inline":true},{"text":"-optimal strategy with near-optimal sample complexity for Markov games with finite states. 
For Markov games where the transition function can be embedded in a given feature space, the work by ","element":"span"},{"href":"#id-32","referenceIndex":6,"text":"Jia ","element":"a"},{"href":"#id-32","referenceIndex":6,"text":"et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-32","referenceIndex":6,"text":"2019","element":"a"},{"text":") analyzes the sample complexity of a Q-learning algorithm. However, non-asymptotic or finite sample analysis for continuous state space without a special structure, such as that considered in this work, receives less attention in the literature.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Comparison with Prior Work. ","element":"span"},{"text":"In this work, we develop Explore-Improve-Supervise (EIS) policy when the model is unknown, but one is able to sample from the underlying model. We study the convergence and sample complexity of our approach. Our goal is to provide a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"formal ","element":"span"},{"text":"study on the general framework of ","element":"span"},{"text":"EIS. The overall framework is inspired by AlphaGo Zero and inherits similar components. However, we take an important step towards bridging the gap between sound intuitions and theoretical guarantees, which is valuable for a better understanding on applying or extending this framework with different instantiations. We note that EIS bears certain similarities with another AlphaGo-inspired study (","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"Shah et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"2019","element":"a"},{"text":"). Both works follow the main idea of coupling improvements with supervised learning. However, there are major differences. ","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"Shah et al. 
","element":"a"},{"text":"(","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"2019","element":"a"},{"text":") focus on approximating the value function in deterministic MDPs and only study a particular instance of the modules. In contrast, we focus on a broader class of algorithms, formulating general principles and studying the guarantees. This poses different challenges and requires generic formulations on properties of the modules that are technically precise and practically implementable.","element":"span"}],[{"text":"Finally, as mentioned previously, non-asymptotic analysis for continuous state space, considered in this work, is scarce for Markov games. While there are some results for finite states, the bounds are not directly comparable. For example, the complexity in ","element":"span"},{"href":"#id-22","referenceIndex":21,"text":"Perolat et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-22","referenceIndex":21,"text":"2015","element":"a"},{"text":") depends on some oracle complexities for linear programming and regression.","element":"span"}],[{"text":"For the setting with continuous state space, the sample complexity results in ","element":"span"},{"href":"#id-32","referenceIndex":6,"text":"Jia et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-32","referenceIndex":6,"text":"2019","element":"a"},{"text":") for Q-learning rely on the assumption of linear structure of the transition kernel. The recent work by ","element":"span"},{"href":"#id-34","referenceIndex":38,"text":"Yang et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-34","referenceIndex":38,"text":"2019b","element":"a"},{"text":") studies the finite-sample performance of minimax deep Q-learning for two-player zero-sum games, where the convergence rate depends on the family of neural networks. We remark that these belong to a different class of algorithms. 
We also derive a fundamental mini-max lower bound on sample-complexity for any method (cf. Theorem ","element":"span"},{"href":"#id-6","text":"4","element":"a"},{"text":"). The lower bound is interesting on its own. Moreover, it shows near optimal dependence on dimension for an instance of our EIS framework.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Organization. ","element":"span"},{"text":"The remainder of the paper is organized as follows. We formally introduce the framework of Markov Games and Nash equilibrium in Section ","element":"span"},{"text":"2","element":"span"},{"text":". Section ","element":"span"},{"text":"3 ","element":"span"},{"text":"describes a generic Explore-Improve-Supervise (EIS) algorithm. The precise technical properties for the modules of EIS are then stated in Section ","element":"span"},{"text":"4","element":"span"},{"text":", under which we establish our main results, convergence and sample complexity of EIS, in Section ","element":"span"},{"text":"5","element":"span"},{"text":". Finally, a concrete instantiation is provided in Section ","element":"span"},{"text":"6","element":"span"},{"text":", demonstrating the applicability of the generic EIS algorithm. All the proofs are presented in Appendices.","element":"span"}]]},{"heading":"2 Two-Player Markov Games and Nash Equilibrium","paragraphs":[[{"text":"We introduce the framework of Markov Games (MGs) (also called Stochastic Games (","element":"span"},{"href":"#id-8","referenceIndex":27,"text":"Shapley","element":"a"},{"text":", ","element":"span"},{"href":"#id-8","referenceIndex":27,"text":"1953","element":"a"},{"text":")) with two players and zero-sum rewards. 
The goal in this setting is to learn the Nash equilibrium.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"2.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Two-player Zero-sum Markov Game","element":"span"}],[{"text":"We consider two-player ","element":"span"},{"style":{"fontStyle":"italic"},"text":"turn-based ","element":"span"},{"text":"Markov games like Go and Chess, where players take turns to make decisions. We denote the two players as ","element":"span"},{"text":"P1 ","element":"span"},{"text":"and ","element":"span"},{"text":"P2","element":"span"},{"text":". Formally, a Markov game can be expressed as a tuple ","element":"span"},{"style":{"height":17.2},"width":396.91,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/2-0.png","element":"img","alt":"(S1, S2, A1, A2, r, P, γ)","inline":true},{"text":", where ","element":"span"},{"style":{"height":13.59},"width":40.13,"height":33.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/2-1.png","element":"img","alt":" S1","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":13.59},"width":40.13,"height":33.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/2-2.png","element":"img","alt":" S2","inline":true,"padRight":true},{"text":"are the set of states controlled by ","element":"span"},{"text":"P1 ","element":"span"},{"text":"and ","element":"span"},{"text":"P2 ","element":"span"},{"text":"respectively, ","element":"span"},{"style":{"height":13.99},"width":47.82,"height":34.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/2-3.png","element":"img","alt":" A1","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":13.99},"width":47.82,"height":34.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/2-4.png","element":"img","alt":"A2","inline":true,"padRight":true},{"text":"are the set of actions players can take, 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"r ","element":"span"},{"text":"represents reward function, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"P ","element":"span"},{"text":"represents transition kernel and ","element":"span"},{"style":{"height":17.2},"width":157.1,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/2-5.png","element":"img","alt":"γ ∈ [0, 1)","inline":true,"padRight":true},{"text":"is the discount factor. Specifically, for ","element":"span"},{"style":{"height":17.2},"width":302.18,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/2-6.png","element":"img","alt":" i = 1, 2, let Ai(s)","inline":true,"padRight":true},{"text":"be the set of feasible actions for player ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i ","element":"span"},{"text":"in a given state ","element":"span"},{"style":{"height":13.72},"width":102.52,"height":34.3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/2-7.png","element":"img","alt":" s ∈ Si","inline":true},{"text":". We assume that ","element":"span"},{"style":{"height":16.17},"width":230.91,"height":40.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/2-8.png","element":"img","alt":" S1 ∩ S2 = ∅1.","inline":true}],[{"text":"Let ","element":"span"},{"style":{"height":13.59},"width":213.4,"height":33.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/2-9.png","element":"img","alt":" S = S1 ∪ S2","inline":true},{"text":". 
For each state ","element":"span"},{"style":{"height":12},"width":98.77,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/2-10.png","element":"img","alt":" s ∈ S","inline":true},{"text":", let ","element":"span"},{"style":{"height":17.2},"width":147.6,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/2-11.png","element":"img","alt":" I(s) ∈ {","inline":true},{"text":"1, 2","element":"span"},{"style":{"fontStyle":"italic"},"text":"} ","element":"span"},{"text":"indicate the current player to play. At state ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":", upon taking action ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a ","element":"span"},{"text":"by the corresponding player ","element":"span"},{"style":{"fontStyle":"italic"},"text":"I","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":")","element":"span"},{"text":", player ","element":"span"},{"style":{"height":16},"width":88.14,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/2-12.png","element":"img","alt":" i ∈ {","inline":true},{"text":"1, 2","element":"span"},{"style":{"fontStyle":"italic"},"text":"} ","element":"span"},{"text":"receives a reward ","element":"span"},{"style":{"height":18.19},"width":123.31,"height":45.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/2-13.png","element":"img","alt":" ri(s, a)","inline":true},{"text":". In zero-sum games, ","element":"span"},{"style":{"height":18.19},"width":349.07,"height":45.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/2-14.png","element":"img","alt":" r1(s, a) = −r2(s, a)","inline":true},{"text":". 
Without loss of generality, we let ","element":"span"},{"text":"P1 ","element":"span"},{"text":"be our reference and use the notation ","element":"span"},{"style":{"height":18.18},"width":293.38,"height":45.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/2-15.png","element":"img","alt":" r(s, a) ≜ r1(s, a)","inline":true,"padRight":true},{"text":"for the definitions of value functions.","element":"span"}],[{"text":"Let ","element":"span"},{"style":{"fontStyle":"italic"},"text":"P","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a","element":"span"},{"text":") ","element":"span"},{"text":"be the distribution of the new state after playing action ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a","element":"span"},{"text":", in state ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":", by player ","element":"span"},{"style":{"fontStyle":"italic"},"text":"I","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":")","element":"span"},{"text":". In this paper, we focus on the setting where the state transitions are deterministic. 
This means that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"P","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a","element":"span"},{"text":") ","element":"span"},{"text":"is supported on a single state, ","element":"span"},{"style":{"height":14},"width":289.86,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/2-16.png","element":"img","alt":" s ◦ a, where s ◦ a","inline":true,"padRight":true},{"text":"denotes the state after taking action ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a ","element":"span"},{"text":"at state ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":".","element":"span"}],[{"text":"For each ","element":"span"},{"style":{"height":16},"width":273.98,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-0.png","element":"img","alt":" i ∈ {1, 2}, let πi","inline":true,"padRight":true},{"text":"be the policy for player ","element":"span"},{"style":{"height":17.2},"width":261.8,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-1.png","element":"img","alt":" i, where πi(·|s)","inline":true,"padRight":true},{"text":"is a probability distribution over ","element":"span"},{"style":{"height":17.2},"width":108.64,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-2.png","element":"img","alt":" Ai(s).","inline":true,"padRight":true},{"text":"Denote by ","element":"span"},{"style":{"height":13.72},"width":44.65,"height":34.3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-3.png","element":"img","alt":" Πi","inline":true,"padRight":true},{"text":"the set of all stationary policies of player ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":", and let 
","element":"span"},{"style":{"height":13.59},"width":244.47,"height":33.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-4.png","element":"img","alt":" Π = Π1 × Π2","inline":true,"padRight":true},{"text":"be the set of all policies for the game. A two-player zero-sum game can be seen as player ","element":"span"},{"text":"P1 ","element":"span"},{"text":"aiming to maximize the accumulated discounted reward while ","element":"span"},{"text":"P2 ","element":"span"},{"text":"attempting to minimize it. The value function and Q function for a zero-sum Markov game can be defined in a manner analogous to the MDP setting:","element":"span"}],[{"style":{"width":"58%"},"width":1089,"height":251,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-5.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":19.98},"width":958.94,"height":49.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-6.png","element":"img","alt":" al ∼ πI(sl)(·|sl) and sl+1 ∼ P(sl, al). That is, Vπ1,π2(s)","inline":true,"padRight":true},{"text":"is the expected total discounted reward for ","element":"span"},{"text":"P1 ","element":"span"},{"text":"if the game starts from state ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":", and players ","element":"span"},{"text":"P1 ","element":"span"},{"text":"and ","element":"span"},{"text":"P2 ","element":"span"},{"text":"use the policies ","element":"span"},{"style":{"height":13.19},"width":169,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-7.png","element":"img","alt":" π1 and π2","inline":true,"padRight":true},{"text":"respectively. 
The interpretation for Q-value is similar.","element":"span"}],[{"text":"To simplify the notation, we assume that ","element":"span"},{"style":{"height":13.99},"width":247.4,"height":34.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-8.png","element":"img","alt":" A1 = A2 ≜ A","inline":true},{"text":", where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A ","element":"span"},{"text":"is a finite set. We consider ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"text":"to be a compact subset of ","element":"span"},{"style":{"height":14.59},"width":49.28,"height":36.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-9.png","element":"img","alt":" Rd","inline":true,"padRight":true},{"text":"(","element":"span"},{"style":{"height":13.2},"width":66.93,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-10.png","element":"img","alt":"d ≥","inline":true,"padRight":true},{"text":"1). The rewards ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a","element":"span"},{"text":") ","element":"span"},{"text":"are independent random variables taking value in ","element":"span"},{"style":{"height":17.2},"width":258.38,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-11.png","element":"img","alt":"[−Rmax, Rmax]","inline":true,"padRight":true},{"text":"for some ","element":"span"},{"style":{"height":13.19},"width":137.99,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-12.png","element":"img","alt":" Rmax >","inline":true,"padRight":true},{"text":"0. 
Define ","element":"span"},{"style":{"height":17.2},"width":387.28,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-13.png","element":"img","alt":" Vmax ≜ Rmax/(1 − γ)","inline":true},{"text":". It follows that the absolute value of the value function and Q function for any policy is bounded by ","element":"span"},{"style":{"height":13.19},"width":96.46,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-14.png","element":"img","alt":" Vmax.","inline":true}],[{"style":{"fontStyle":"italic"},"text":"Regarding Deterministic Transitions. ","element":"span"},{"text":"Let us clarify this assumption. In fact, our approach and main results of the EIS framework (i.e., Sections ","element":"span"},{"text":"4 ","element":"span"},{"text":"and ","element":"span"},{"text":"5","element":"span"},{"text":") apply to general non-deterministic cases as well. However, the example in Section ","element":"span"},{"text":"6 ","element":"span"},{"text":"considers deterministic cases. In particular, the improvement module is instantiated by a variant of Monte Carlo Tree Search, where a clean non-asymptotic analysis has only been established for the deterministic case (","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"Shah et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"2019","element":"a"},{"text":"). To facilitate a coherent exposition, we focus on deterministic cases here. Indeed, many games, such as Go and Chess, are deterministic. Additionally, note that one could instantiate our EIS framework with other methods for the non-deterministic cases—for instance, by adapting the sparse sampling oracle (","element":"span"},{"href":"#id-35","referenceIndex":8,"text":"Kearns et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-35","referenceIndex":8,"text":"2002","element":"a"},{"text":") as the improvement module—to obtain a similar analysis. 
As a proof of concept, we provide empirical results in Appendix ","element":"span"},{"href":"#id-36","text":"J ","element":"a"},{"text":"on a non-deterministic game with the sparse sampling oracle.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"2.2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Nash Equilibrium","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Definition 1 ","element":"span"},{"text":"(Optimal Counter Policy)","element":"span"},{"style":{"fontWeight":"bold"},"text":". ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Given a policy ","element":"span"},{"style":{"height":13.59},"width":143.62,"height":33.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-15.png","element":"img","alt":" π2 ∈ Π2","inline":true},{"style":{"fontStyle":"italic"},"text":", policy ","element":"span"},{"style":{"height":13.59},"width":143.63,"height":33.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-16.png","element":"img","alt":" π1 ∈ Π1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"for ","element":"span"},{"text":"P1 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is said to be an optimal counter-policy against ","element":"span"},{"style":{"height":9.19},"width":38.72,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-17.png","element":"img","alt":" π2","inline":true},{"style":{"fontStyle":"italic"},"text":", if and only if for every ","element":"span"},{"style":{"height":12},"width":95.81,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-18.png","element":"img","alt":" s ∈ S","inline":true},{"style":{"fontStyle":"italic"},"text":", we have ","element":"span"},{"style":{"height":21.34},"width":552.11,"height":53.34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-19.png","element":"img","alt":" Vπ1,π2(s) ≥ Vπ′1,π2(s), ∀π′1 ∈ 
Π1","inline":true},{"text":". ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Similarly, a policy ","element":"span"},{"style":{"height":14.4},"width":264.7,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-20.png","element":"img","alt":" π2 ∈ Π2 for P2","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is said to be an optimal counter-policy against a given policy ","element":"span"},{"style":{"height":14.4},"width":208.33,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-21.png","element":"img","alt":" π1 ∈ Π1 for","inline":true,"padRight":true},{"text":"P1","element":"span"},{"style":{"fontStyle":"italic"},"text":", if and only if for every ","element":"span"},{"style":{"height":19.74},"width":563.17,"height":49.34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-22.png","element":"img","alt":" s ∈ S Vπ1,π2 ≤ Vπ1,π′2, ∀π′2 ∈ Π2.","inline":true}],[{"text":"In a two-player zero-sum game, it has been shown that the pairs of optimal policies coincide with the Nash equilibria of this game (","element":"span"},{"href":"#id-9","referenceIndex":16,"text":"Maitra & Parthasarathy","element":"a"},{"text":", ","element":"span"},{"href":"#id-9","referenceIndex":16,"text":"1970","element":"a"},{"text":"; ","element":"span"},{"href":"#id-11","referenceIndex":19,"text":"Parthasarathy","element":"a"},{"text":", ","element":"span"},{"href":"#id-11","referenceIndex":19,"text":"1973","element":"a"},{"text":"; ","element":"span"},{"href":"#id-12","referenceIndex":10,"text":"Kumar & Shiau","element":"a"},{"text":", ","element":"span"},{"href":"#id-12","referenceIndex":10,"text":"1981","element":"a"},{"text":"). 
In particular, a pair of policies ","element":"span"},{"style":{"height":17.37},"width":135.04,"height":43.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-23.png","element":"img","alt":" (π∗1, π∗2)","inline":true,"padRight":true},{"text":"is called an equilibrium solution of the game, if ","element":"span"},{"style":{"height":15.96},"width":40.15,"height":39.89,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-24.png","element":"img","alt":" π∗1 ","inline":true,"padRight":true},{"text":"is an optimal counter ","element":"span"},{"text":"policy against ","element":"span"},{"style":{"height":15.96},"width":40.14,"height":39.89,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-25.png","element":"img","alt":" π∗2","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":15.96},"width":40.14,"height":39.89,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-26.png","element":"img","alt":" π∗2","inline":true,"padRight":true},{"text":"is an optimal counter policy against ","element":"span"},{"style":{"height":15.96},"width":40.15,"height":39.89,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-27.png","element":"img","alt":" π∗1","inline":true},{"text":". The value function of the optimal policy, ","element":"span"},{"style":{"height":18.48},"width":347.58,"height":46.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-28.png","element":"img","alt":"Vπ∗1,π∗2, is the unique","inline":true,"padRight":true},{"text":"fixed point of a ","element":"span"},{"style":{"height":10.4},"width":22,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-29.png","element":"img","alt":" γ","inline":true},{"text":"-contraction operator. 
In the sequel, we will simply refer to the strategy ","element":"span"},{"style":{"height":17.37},"width":233.6,"height":43.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-30.png","element":"img","alt":"π∗ = (π∗1, π∗2)","inline":true,"padRight":true},{"text":"as the optimal policy. Finally, we use the concise notation ","element":"span"},{"style":{"height":11.38},"width":48.1,"height":28.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-31.png","element":"img","alt":" V ∗","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":14.18},"width":47.5,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-32.png","element":"img","alt":" Q∗","inline":true,"padRight":true},{"text":"to denote the optimal ","element":"span"},{"text":"value function and the optimal Q-value, respectively, i.e., ","element":"span"},{"style":{"height":20.47},"width":825.35,"height":51.18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-33.png","element":"img","alt":" V ∗(s) = Vπ∗1,π∗2 (s) and Q∗(s, a) = Qπ∗1,π∗2 (s, a).","inline":true}]]},{"heading":"3 EIS: Explore-Improve-Supervise","paragraphs":[[{"text":"We describe Explore-Improve-Supervise (EIS) algorithm for learning the optimal value function ","element":"span"},{"style":{"height":11.38},"width":48.1,"height":28.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-34.png","element":"img","alt":" V ∗ ","inline":true,"padRight":true},{"text":"and optimal policy ","element":"span"},{"style":{"height":11.38},"width":40.15,"height":28.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/3-35.png","element":"img","alt":" π∗","inline":true},{"text":". The algorithm consists of three separate, but intertwined modules: exploration, improvement and supervised learning. Below is a brief summary of these modules. 
The precise, formal description of properties","element":"span"}],[{"id":"id-37","style":{"width":"99%"},"width":1872,"height":1010,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/4-0.png","element":"img"}],[{"text":"desired from these modules is stated in Section ","element":"span"},{"text":"4","element":"span"},{"text":", which will lead to convergence and correctness of the EIS algorithm as stated in Theorem ","element":"span"},{"href":"#id-3","text":"2","element":"a"},{"text":". Section ","element":"span"},{"text":"6 ","element":"span"},{"text":"provides a concrete example of modules of EIS satisfying properties stated in Section ","element":"span"},{"text":"4","element":"span"},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Exploration Module. ","element":"span"},{"text":"To extract meaningful information for the entire game, sufficient exploration is required so that enough ","element":"span"},{"style":{"fontStyle":"italic"},"text":"representative ","element":"span"},{"text":"states will be visited. This is commonly achieved by an appropriate exploration policy, such as ","element":"span"},{"style":{"height":0},"width":11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/4-1.png","element":"img","alt":" ϵ","inline":true},{"text":"-greedy policy and Boltzmann policy. We require the existence of an exploration module guaranteeing sufficient exploration.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Improvement Module. ","element":"span"},{"text":"For the overall learning to make any progress, the improvement module improves the existing estimates of the optimal solution. 
In particular, given the current estimates ","element":"span"},{"style":{"height":14.62},"width":154.46,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/4-2.png","element":"img","alt":"ˆV for V ∗","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":11.2},"width":23,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/4-3.png","element":"img","alt":" ˆπ","inline":true,"padRight":true},{"text":"for ","element":"span"},{"style":{"height":11.38},"width":40.15,"height":28.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/4-4.png","element":"img","alt":" π∗","inline":true},{"text":", for a state ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":", a query of the improvement module produces better estimates ","element":"span"},{"style":{"height":19.02},"width":95.1,"height":47.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/4-5.png","element":"img","alt":"ˆV ′(s)","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":17.2},"width":109.28,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/4-6.png","element":"img","alt":" ˆπ′(·|s)","inline":true,"padRight":true},{"text":"that are ","element":"span"},{"style":{"fontStyle":"italic"},"text":"closer ","element":"span"},{"text":"to the optimal ","element":"span"},{"style":{"height":17.2},"width":322.07,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/4-7.png","element":"img","alt":" V ∗(s) and π∗(·|s).","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Supervised Learning Module. ","element":"span"},{"text":"The previous two modules can be collectively viewed as a data generation process: the exploration module samples sufficient representative states, while a query of the improvement module provides improved estimates for the optimal value and policy. 
With these as training data, supervised learning module would learn and generalize the improvement of the training data to the entire state space. Subsequently, the trained supervised learning module produces better estimates for ","element":"span"},{"style":{"height":11.38},"width":48.1,"height":28.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/4-8.png","element":"img","alt":" V ∗","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":11.39},"width":53.23,"height":28.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/4-9.png","element":"img","alt":" π∗.","inline":true}],[{"text":"Combining together, the three modules naturally lead to the following iterative algorithm whose pseudocode is provided in Algorithm ","element":"span"},{"href":"#id-37","text":"1","element":"a"},{"text":". Initially, the algorithm starts with an arbitrary model for value function and policy. In each iteration ","element":"span"},{"style":{"height":13.2},"width":54.74,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/4-10.png","element":"img","alt":" l ≥","inline":true,"padRight":true},{"text":"1, it performs two steps:","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"Step 1. Data Generation. 
","element":"span"},{"text":"Given current model ","element":"span"},{"style":{"height":17.2},"width":345.48,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/4-11.png","element":"img","alt":" fl−1 = (Vl−1, πl−1)","inline":true},{"text":": for current state ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":", query the improvement module to obtain better estimates ","element":"span"},{"style":{"height":19.02},"width":271.02,"height":47.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/4-12.png","element":"img","alt":"ˆV (s) and ˆπ(·|s)","inline":true,"padRight":true},{"text":"than the current estimates ","element":"span"},{"style":{"height":17.2},"width":300.43,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/4-13.png","element":"img","alt":" fl−1(s); and then","inline":true,"padRight":true},{"text":"query the exploration module to arrive at the next state ","element":"span"},{"style":{"height":6.8},"width":26.68,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/4-14.png","element":"img","alt":" s′","inline":true},{"text":"; repeat the above process to obtain training data of ","element":"span"},{"style":{"height":19.39},"width":613.49,"height":48.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/4-15.png","element":"img","alt":" n samples, {(si, ˆV (si), ˆπ(·|si))}ni=1.","inline":true}],[{"style":{"fontStyle":"italic"},"text":"Step 2. Supervised Learning. 
","element":"span"},{"text":"Given the improved estimates ","element":"span"},{"style":{"height":17.2},"width":66.7,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/4-16.png","element":"img","alt":" {(si","inline":true},{"text":", ","element":"span"},{"style":{"height":19.39},"width":317.12,"height":48.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/4-17.png","element":"img","alt":"ˆV (si), ˆπ(·|si))}ni=1","inline":true},{"text":", use the supervised ","element":"span"},{"text":"learning module to build a new model ","element":"span"},{"style":{"height":17.2},"width":219.66,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/4-18.png","element":"img","alt":" fl = (Vl, πl).","inline":true}],[{"text":"Intuitively, the iterative algorithm keeps improving our estimation after each iteration, and eventually converges to optimal solutions. The focus of this paper is to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"formally ","element":"span"},{"text":"understand under what conditions on each of the exploration, improvement and supervised learning modules the algorithm works. Of course,","element":"span"}],[{"text":"the proof is in the pudding—we provide examples of existence of such modules in Section ","element":"span"},{"text":"6","element":"span"},{"text":".","element":"span"}]]},{"heading":"4 Properties of Modules","paragraphs":[[{"text":"In this section we formally state the desired properties of each of the three modules of EIS. With these properties, we establish convergence and correctness of the EIS algorithm in Section ","element":"span"},{"text":"5 ","element":"span"},{"text":"to follow. We remark that the properties are not made for the ease of technical analysis. 
Examples satisfying them shall be provided in Section ","element":"span"},{"text":"6","element":"span"},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"4.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Improvement Module","element":"span"}],[{"text":"This module improves both value function and policy. The value function is real-valued, whereas policy for each given state can be viewed as a probability distribution over all possible actions. This requires a careful choice of metric for quantifying improvement. Let ","element":"span"},{"style":{"height":19.02},"width":266.76,"height":47.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/5-0.png","element":"img","alt":"ˆV (s) and ˆπ(·|s)","inline":true,"padRight":true},{"text":"be the estimates output by the improvement module in the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"l","element":"span"},{"text":"-th iteration of EIS. Improvement of value function means ","element":"span"},{"style":{"height":19.02},"width":569.55,"height":47.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/5-1.png","element":"img","alt":" | ˆV (s) − V ∗(s)| < |Vl(s) − V ∗(s)|","inline":true},{"text":". Improvement for policy is measured by the KL divergence between ","element":"span"},{"style":{"height":17.2},"width":98.56,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/5-2.png","element":"img","alt":" ˆπ(·|s)","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":17.2},"width":116.64,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/5-3.png","element":"img","alt":" π∗(·|s)","inline":true},{"text":". 
Here some care is needed as KL divergence would become infinite if supports of the distributions mismatch.","element":"span"}],[{"text":"Note that the optimal policy ","element":"span"},{"style":{"height":11.38},"width":40.14,"height":28.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/5-4.png","element":"img","alt":" π∗","inline":true,"padRight":true},{"text":"only assigns positive probability to the optimal actions. On the other hand, there is no guarantee that ","element":"span"},{"style":{"height":17.2},"width":98.56,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/5-5.png","element":"img","alt":" ˆπ(·|s)","inline":true,"padRight":true},{"text":"always has a full support on ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A","element":"span"},{"text":". To overcome these challenges, we instead measure the KL divergence with an alternative \"optimal policy\" that guarantees a full support on ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A","element":"span"},{"text":". 
This naturally leads to the optimal Boltzmann policy: given a temperature ","element":"span"},{"style":{"height":9.6},"width":64,"height":24,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/5-6.png","element":"img","alt":" τ >","inline":true,"padRight":true},{"text":"0, the optimal Boltzmann policy is given by","element":"span"}],[{"id":"id-61","style":{"width":"69%"},"width":1299,"height":107,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/5-7.png","element":"img"}],[{"text":"If ","element":"span"},{"style":{"height":17.2},"width":555.56,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/5-8.png","element":"img","alt":" I(s) is player P2, use −Q∗(s, a)","inline":true,"padRight":true},{"text":"instead in the above equation to construct the Boltzmann policy (Recall that player ","element":"span"},{"text":"P1 ","element":"span"},{"text":"is set to be our reference in Section ","element":"span"},{"text":"2","element":"span"},{"text":"). By definition, ","element":"span"},{"style":{"height":19.2},"width":361.85,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/5-9.png","element":"img","alt":" DKL�ˆπ(·|s)||P ∗τ (·|s)�","inline":true},{"text":"is guaranteed to be finite for any estimate ","element":"span"},{"style":{"height":17.2},"width":98.56,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/5-10.png","element":"img","alt":" ˆπ(·|s)","inline":true},{"text":". Furthermore, ","element":"span"},{"style":{"height":15.33},"width":47.12,"height":38.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/5-11.png","element":"img","alt":" P ∗τ ","inline":true,"padRight":true},{"text":"converges to ","element":"span"},{"style":{"height":11.38},"width":176.84,"height":28.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/5-12.png","element":"img","alt":" π∗ as τ →","inline":true,"padRight":true},{"text":"0. 
Therefore, we could use the KL ","element":"span"},{"text":"divergence ","element":"span"},{"style":{"height":19.2},"width":361.85,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/5-13.png","element":"img","alt":" DKL�ˆπ(·|s)||P ∗τ (·|s)�","inline":true},{"text":"with a small enough ","element":"span"},{"style":{"height":6.8},"width":21,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/5-14.png","element":"img","alt":" τ","inline":true,"padRight":true},{"text":"to measure the improvement of the estimates.","element":"span"}],[{"text":"Finally, it makes sense to take into account the number of samples (i.e., observed state transitions) required by the module to improve the policy and value function. We now formally lay down the following property for the improvement module.","element":"span"}],[{"id":"id-38","style":{"fontWeight":"bold"},"text":"Property 1. (Improvement Property) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Suppose the current model ","element":"span"},{"style":{"height":17.2},"width":380.22,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/5-15.png","element":"img","alt":" f(s) = (V (s), π(·|s))","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"(potentially random) has estimation errors ","element":"span"},{"style":{"height":15.59},"width":362.01,"height":38.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/5-16.png","element":"img","alt":" ε0,v > 0 and ε0,p > 0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"for the value and policy estimates, respectively, i.e.,","element":"span"}],[{"style":{"width":"37%"},"width":703,"height":164,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/5-17.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where the expectations are taken with respect to the randomness of the model 
","element":"span"},{"style":{"height":17.2},"width":199.36,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/5-18.png","element":"img","alt":" f = (V , π).","inline":true}],[{"style":{"width":"101%"},"width":1902,"height":376,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/5-19.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where the expectations are with respect to the randomness in the model ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and the improvement module.","element":"span"}],[{"text":"Property ","element":"span"},{"href":"#id-38","text":"1 ","element":"a"},{"text":"allows for a randomized improvement module, but requires that on average, the errors for the value and policy estimates should strictly shrink.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"4.2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Supervised Learning Module","element":"span"}],[{"text":"To direct the model update in an improving manner, the supervised learning step (line 10 of Algorithm ","element":"span"},{"href":"#id-37","text":"1","element":"a"},{"text":") should be able to learn from the training data, ","element":"span"},{"style":{"height":14.62},"width":31,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/6-0.png","element":"img","alt":"ˆV","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":11.2},"width":23,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/6-1.png","element":"img","alt":" ˆπ","inline":true},{"text":", and generalize to unseen states by preserving the same order of error as the training data. 
Generically speaking, generalization would require two conditions: (1) sufficiently many training data that are “representative” of the underlying state space; (2) the model itself is expressive enough to capture the characteristics of the function that is desired to be learned.","element":"span"}],[{"text":"Before specifying the generalization property, let us provide a few remarks on the above conditions. Condition (1) is typically ensured by using an effective exploration module. Recall that the state space is continuous. The exploration module should be capable of navigating the space until sufficiently many different states are visited. Intuitively, these ","element":"span"},{"style":{"fontStyle":"italic"},"text":"finite ","element":"span"},{"text":"states should properly cover the entire space, i.e., they are representative of the entire space so that learning from these states provide enough information for other states. Formally, this means that given the current estimation errors ","element":"span"},{"style":{"height":11.99},"width":59.61,"height":29.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/6-2.png","element":"img","alt":" ε1,v","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":11.99},"width":60.61,"height":29.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/6-3.png","element":"img","alt":" ε1,p","inline":true,"padRight":true},{"text":"for the optimal value and policy, there exists a sufficiently large set of ","element":"span"},{"style":{"height":17.59},"width":211.88,"height":43.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/6-4.png","element":"img","alt":" N(ε1,v, ε1,p)","inline":true,"padRight":true},{"text":"training states, such that supervised learning applied to those training data would generalize to the entire state space with the same order of accuracy. 
The precise definition of representative states may depend on the particular supervised learning algorithm.","element":"span"}],[{"text":"Regarding condition (2), generalization performance of traditional models has been well studied in classical statistical learning theory. More recently, deep neural networks exhibit superior empirical generalization ability, although a complete rigorous proof seems beyond the reach of existing techniques. Our goal is to seek general principle underlying the supervised learning step and as such, we do not limit ourselves to specific models—the learning model could be a parametric model that learns via minimizing empirical squared loss and cross-entropy loss, or it could be a non-parametric model such as nearest neighbors regression. With the above conditions in mind, we state the following general property for the supervised learning module:","element":"span"}],[{"id":"id-39","style":{"fontWeight":"bold"},"text":"Property 2. (Generalization Property) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Let temperature ","element":"span"},{"style":{"height":11.2},"width":95.07,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/6-5.png","element":"img","alt":" τ > 0","inline":true},{"style":{"fontStyle":"italic"},"text":", estimation errors ","element":"span"},{"style":{"height":15.59},"width":360.27,"height":38.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/6-6.png","element":"img","alt":" ε1,v > 0 and ε1,p > 0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"be given. 
There exists at least one set of finite states, denoted by ","element":"span"},{"style":{"height":17.59},"width":241.79,"height":43.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/6-7.png","element":"img","alt":" S(τ, ε1,v, ε1,p)","inline":true},{"style":{"fontStyle":"italic"},"text":", with size ","element":"span"},{"style":{"height":17.59},"width":332.74,"height":43.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/6-8.png","element":"img","alt":" NS(τ, ε1,v, ε1,p), so","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"that the following ","element":"span"},{"style":{"fontWeight":"bold"},"text":"generalization bound ","element":"span"},{"style":{"fontStyle":"italic"},"text":"holds:","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"Suppose that a training dataset","element":"span"},{"style":{"height":21.36},"width":986.95,"height":53.41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/6-9.png","element":"img","alt":"��si, ˆV (si), ˆπ(·|si)��ni=1 satisfies S(τ, ε1,v, ε1,p) ⊂ {si}ni=1 ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and the following ","element":"span"},{"style":{"fontStyle":"italic"},"text":"error guarantees:","element":"span"}],[{"style":{"width":"39%"},"width":744,"height":164,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/6-10.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where the expectation is taken with respect to the randomness of the value ","element":"span"},{"style":{"height":19.02},"width":97.32,"height":47.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/6-11.png","element":"img","alt":"ˆV (si)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"height":17.2},"width":111.51,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/6-12.png","element":"img","alt":" 
ˆπ(·|si)","inline":true},{"style":{"fontStyle":"italic"},"text":". Then, there exist non-negative universal constants ","element":"span"},{"style":{"height":15.59},"width":159.63,"height":38.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/6-13.png","element":"img","alt":" cp and cv","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"such that after querying the supervised learning module, i.e., ","element":"span"},{"style":{"height":17.2},"width":152.21,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/6-14.png","element":"img","alt":"(V , π) =","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"Supervised Learning Module","element":"span"},{"style":{"height":20.11},"width":702.02,"height":50.26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/6-15.png","element":"img","alt":"({�si, ˆV (si), ˆπ(·|si)�}ni=1), (V , π) satisfy","inline":true}],[{"style":{"width":"42%"},"width":788,"height":164,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/6-16.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"4.3 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Exploration Module","element":"span"}],[{"text":"With the above development, it is now straightforward to identify the desired property of the exploration module. In particular, as part of the data generation step, it should be capable of exploring the space so that a set of representative states ","element":"span"},{"style":{"height":17.59},"width":242.6,"height":43.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/6-17.png","element":"img","alt":" S(τ, ε1,v, ε1,p)","inline":true,"padRight":true},{"text":"are visited. Consequently, the supervised learning module can then leverage the training data to generalize. 
Formally, let ","element":"span"},{"style":{"fontStyle":"italic"},"text":"E ","element":"span"},{"text":"be the set of all possible representative sets that satisfy the Generalization Property:","element":"span"}],[{"style":{"width":"78%"},"width":1468,"height":73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/6-18.png","element":"img"}],[{"text":"Denote by ","element":"span"},{"style":{"height":18.16},"width":258.67,"height":45.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/7-0.png","element":"img","alt":" T (t) ≜ {si}ti=1","inline":true,"padRight":true},{"text":"the set of states explored by querying the exploration module up to time ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":", with ","element":"span"},{"style":{"height":9.19},"width":34.68,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/7-1.png","element":"img","alt":"s1","inline":true,"padRight":true},{"text":"being the initial state and ","element":"span"},{"style":{"height":10.52},"width":117.57,"height":26.3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/7-2.png","element":"img","alt":" si+1 =","inline":true,"padRight":true},{"text":"Exploration Module","element":"span"},{"style":{"height":17.2},"width":64.72,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/7-3.png","element":"img","alt":"(si)","inline":true,"padRight":true},{"text":"(cf. line 7 of Algorithm ","element":"span"},{"href":"#id-37","text":"1","element":"a"},{"text":"). 
We now state the exploration property, which stipulates that starting at an arbitrary state ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":", the explored states should contain one of the representative sets in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"E","element":"span"},{"text":", within a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"finite ","element":"span"},{"text":"number of steps.","element":"span"}],[{"id":"id-40","style":{"fontWeight":"bold"},"text":"Property 3. (Exploration Property) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Given the temperature ","element":"span"},{"style":{"height":11.6},"width":95.12,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/7-4.png","element":"img","alt":" τ > 0","inline":true},{"style":{"fontStyle":"italic"},"text":", and estimation errors ","element":"span"},{"style":{"height":15.59},"width":215.32,"height":38.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/7-5.png","element":"img","alt":" ε1,v > 0 and","inline":true},{"style":{"height":15.59},"width":134.94,"height":38.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/7-6.png","element":"img","alt":"ε1,p > 0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"for the value and policy, define","element":"span"}],[{"style":{"width":"76%"},"width":1435,"height":73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/7-7.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Then, the exploration module satisfies that ","element":"span"},{"style":{"height":14.4},"width":128.64,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/7-8.png","element":"img","alt":" ∀s ∈ 
S,","inline":true}],[{"style":{"width":"35%"},"width":658,"height":73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/7-9.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"for some ","element":"span"},{"style":{"height":17.59},"width":340.62,"height":43.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/7-10.png","element":"img","alt":" B(τ, ε1,v, ε1,p) < ∞","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"independent of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"style":{"fontStyle":"italic"},"text":". The above expectation is taken with respect to the randomness in the exploration module and the environment (i.e., state transitions).","element":"span"}],[{"text":"In the sequel, when the context is clear or the initial state does not matter, we usually drop the dependence in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s ","element":"span"},{"text":"to simplify the notation, i.e., ","element":"span"},{"style":{"height":17.59},"width":255.53,"height":43.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/7-11.png","element":"img","alt":" T(τ, ε1,v, ε1,p).","inline":true}]]},{"heading":"5 Main Results: Convergence Guarantees and Sample Complex-","paragraphs":[[{"style":{"width":"3%"},"width":72,"height":51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/7-12.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"5.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Convergence Guarantees","element":"span"}],[{"text":"As the main result of this paper, we establish convergence of the EIS algorithm under the three desired properties given in Section ","element":"span"},{"text":"4","element":"span"},{"text":", and quantify the corresponding finite sample complexity. 
We also provide an algorithm-independent minimax lower bound; in Section ","element":"span"},{"text":"6 ","element":"span"},{"text":"we introduce an instance of EIS that essentially matches this lower bound.","element":"span"}],[{"id":"id-3","style":{"fontWeight":"bold"},"text":"Theorem 2. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Given a small enough ","element":"span"},{"style":{"height":9.6},"width":68.22,"height":24,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/7-13.png","element":"img","alt":" τ >","inline":true,"padRight":true},{"text":"0","element":"span"},{"style":{"fontStyle":"italic"},"text":", let Properties ","element":"span"},{"href":"#id-38","style":{"fontStyle":"italic"},"text":"1","element":"a"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"href":"#id-39","style":{"fontStyle":"italic"},"text":"2 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"href":"#id-40","style":{"fontStyle":"italic"},"text":"3 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"hold. Let ","element":"span"},{"style":{"height":16.79},"width":347.67,"height":41.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/7-14.png","element":"img","alt":" C0,v = ∥V0 − V ∗∥∞","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"height":19.2},"width":632.84,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/7-15.png","element":"img","alt":"C0,p = sups∈S DKL�π0(·|s)∥P ∗τ (·|s)�","inline":true},{"style":{"fontStyle":"italic"},"text":"be initialization errors. 
Then for a given ","element":"span"},{"style":{"height":17.2},"width":88.57,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/7-16.png","element":"img","alt":" ρ ∈ (","inline":true},{"text":"0, 1","element":"span"},{"text":")","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"with appropriate parameters for Algorithm ","element":"span"},{"href":"#id-37","style":{"fontStyle":"italic"},"text":"1","element":"a"},{"style":{"fontStyle":"italic"},"text":", the output ","element":"span"},{"style":{"height":17.2},"width":380.89,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/7-17.png","element":"img","alt":" fL = (VL, πL) after L","inline":true},{"style":{"fontStyle":"italic"},"text":"-th iteration satisfies","element":"span"}],[{"id":"id-42","style":{"width":"96%"},"width":1808,"height":234,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/7-18.png","element":"img"}],[{"text":"value function ","element":"span"},{"style":{"height":11.38},"width":48.1,"height":28.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/7-19.png","element":"img","alt":" V ∗ ","inline":true,"padRight":true},{"text":"and the optimal Boltzmann policy ","element":"span"},{"style":{"height":15.32},"width":47.12,"height":38.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/7-20.png","element":"img","alt":" P ∗τ ","inline":true,"padRight":true},{"text":"exponentially with respect to the number of iterations. 
","element":"span"},{"text":"In particular, after","element":"span"}],[{"id":"id-41","style":{"width":"35%"},"width":657,"height":103,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/7-21.png","element":"img"}],[{"text":"iterations, we can obtain estimates for both ","element":"span"},{"style":{"height":15.32},"width":183.63,"height":38.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/7-22.png","element":"img","alt":" V ∗ and P ∗τ ","inline":true,"padRight":true},{"text":"that are within ","element":"span"},{"style":{"height":7.2},"width":19,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/7-23.png","element":"img","alt":" ε","inline":true,"padRight":true},{"text":"estimation errors. We note that with ","element":"span"},{"text":"a sufficiently small temperature, ","element":"span"},{"style":{"height":15.33},"width":47.12,"height":38.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/7-24.png","element":"img","alt":" P ∗τ","inline":true,"padRight":true},{"text":"is close to the optimal policy ","element":"span"},{"style":{"height":11.39},"width":40.15,"height":28.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/7-25.png","element":"img","alt":" π∗","inline":true},{"text":". 
Therefore, the model ","element":"span"},{"style":{"height":17.2},"width":244.18,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/7-26.png","element":"img","alt":" fL = (VL, πL)","inline":true,"padRight":true},{"text":"can be close to ","element":"span"},{"style":{"height":17.2},"width":143.22,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/7-27.png","element":"img","alt":" (V ∗, π∗)","inline":true,"padRight":true},{"text":"for a large ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L","element":"span"},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"5.2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Sample Complexity","element":"span"}],[{"text":"We can also characterize the sample complexity of the EIS algorithm. Recall that the sample complexity is defined as the total number of state transitions required for the algorithm to learn ","element":"span"},{"style":{"height":0},"width":11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/7-28.png","element":"img","alt":" ϵ","inline":true},{"text":"-approximate value/policy ","element":"span"},{"text":"function. The sample complexity of Algorithm ","element":"span"},{"href":"#id-37","text":"1 ","element":"a"},{"text":"comes from two parts: the improvement module and the exploration module. Recall that the improvement module requires ","element":"span"},{"style":{"height":17.59},"width":346.19,"height":43.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/8-0.png","element":"img","alt":" κ(τ, ε0,v, ε0,p, ζv, ζp)","inline":true,"padRight":true},{"text":"samples for each call (cf. Property ","element":"span"},{"href":"#id-38","text":"1","element":"a"},{"text":"). 
The sample complexity of exploration module is proportional to ","element":"span"},{"style":{"height":17.59},"width":369.25,"height":43.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/8-1.png","element":"img","alt":" T(τ, ε1,v, ε1,p), which","inline":true,"padRight":true},{"text":"satisfies ","element":"span"},{"style":{"height":17.59},"width":600.92,"height":43.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/8-2.png","element":"img","alt":" E[T(τ, ε1,v, ε1,p)] ≤ B(τ, ε1,v, ε1,p)","inline":true,"padRight":true},{"text":"(cf. Property ","element":"span"},{"href":"#id-40","text":"3","element":"a"},{"text":"). The following proposition bounds the sample complexity in terms of the above relevant quantities.","element":"span"}],[{"id":"id-4","style":{"fontWeight":"bold"},"text":"Proposition 3. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Consider the setting of Theorem ","element":"span"},{"href":"#id-3","style":{"fontStyle":"italic"},"text":"2","element":"a"},{"style":{"fontStyle":"italic"},"text":". 
Then, with probability at least ","element":"span"},{"text":"1 ","element":"span"},{"style":{"height":11.6},"width":57.72,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/8-3.png","element":"img","alt":" − δ","inline":true},{"style":{"fontStyle":"italic"},"text":", the convergence result (i.e., Eqs (","element":"span"},{"href":"#id-41","style":{"fontStyle":"italic"},"text":"5","element":"a"},{"style":{"fontStyle":"italic"},"text":") and (","element":"span"},{"href":"#id-42","style":{"fontStyle":"italic"},"text":"6","element":"a"},{"style":{"fontStyle":"italic"},"text":")) is achieved with sample complexity ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M ","element":"span"},{"style":{"fontStyle":"italic"},"text":"such that","element":"span"}],[{"style":{"width":"71%"},"width":1337,"height":121,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/8-4.png","element":"img"}],[{"text":"In Section ","element":"span"},{"text":"6","element":"span"},{"text":", we provide a concrete instance of EIS that finds ","element":"span"},{"style":{"height":7.2},"width":19,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/8-5.png","element":"img","alt":" ε","inline":true},{"text":"-approximate value function and policy of Nash equilibrium with ","element":"span"},{"style":{"height":19.78},"width":194.62,"height":49.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/8-6.png","element":"img","alt":"�O(ε−(d+4))","inline":true,"padRight":true},{"text":"transitions.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"5.3 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A Generic Lower Bound","element":"span"}],[{"text":"To understand how good the above sample complexity upper bound is, we establish a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"lower bound ","element":"span"},{"text":"for any algorithm under any sampling policy. 
In particular, we leverage the the minimax lower bound for the problem of non-parametric regression (","element":"span"},{"href":"#id-43","referenceIndex":34,"text":"Tsybakov","element":"a"},{"text":", ","element":"span"},{"href":"#id-43","referenceIndex":34,"text":"2009","element":"a"},{"text":"; ","element":"span"},{"href":"#id-44","referenceIndex":32,"text":"Stone","element":"a"},{"text":", ","element":"span"},{"href":"#id-44","referenceIndex":32,"text":"1982","element":"a"},{"text":") to establish the lower bound, as stated in the following theorem.","element":"span"}],[{"id":"id-6","style":{"fontWeight":"bold"},"text":"Theorem 4. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Given an algorithm, let ","element":"span"},{"style":{"height":13.59},"width":46.25,"height":33.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/8-7.png","element":"img","alt":" VT","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"be the estimate of ","element":"span"},{"style":{"height":11.39},"width":48.1,"height":28.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/8-8.png","element":"img","alt":" V ∗","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"after ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T ","element":"span"},{"style":{"fontStyle":"italic"},"text":"samples of transitions for the given Markov game. 
Then, for each ","element":"span"},{"style":{"height":17.2},"width":158.72,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/8-9.png","element":"img","alt":" δ ∈ (0, 1)","inline":true},{"style":{"fontStyle":"italic"},"text":", there exists a two-player zero-sum Markov game such that in order to achieve ","element":"span"},{"style":{"height":28.8},"width":512.42,"height":72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/8-10.png","element":"img","alt":" P(‖ˆVT − V ∗‖∞ < ε) ≥ 1 − δ,","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"it must be that","element":"span"}],[{"style":{"width":"25%"},"width":484,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/8-11.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where ","element":"span"},{"style":{"height":11.6},"width":115.19,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/8-12.png","element":"img","alt":" C′ > 0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is an algorithm-independent constant.","element":"span"}]]},{"heading":"6 Implementation: A Concrete Instantiation of the Key Modules","paragraphs":[[{"text":"In this section, we demonstrate the applicability of the generic EIS algorithm by giving a concrete instantiation. Specifically, we will use a variant of Monte Carlo Tree Search (MCTS) as the improvement module, nearest neighbor regression as the supervised learning module, and random sampling as the exploration module. We prove that all properties in Section ","element":"span"},{"text":"4 ","element":"span"},{"text":"are satisfied. This shows that these properties are reasonable and hence gives strong support for the generic recipe developed in this paper. 
Due to space limit, we provide high-level discussions here with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"informal ","element":"span"},{"text":"technical results, and defer precise statements to Appendix ","element":"span"},{"href":"#id-45","text":"E","element":"a"},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Improvement Module: ","element":"span"},{"style":{"fontWeight":"bold"},"text":"MCTS. ","element":"span"},{"text":"Recall that the improvement module should be able to provide improved estimates for the value and policy functions, at the queried state ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":". Since both the value and policy are related to the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Q ","element":"span"},{"text":"function, one approach for estimate improvement is to first obtain better estimates ","element":"span"},{"style":{"height":17.42},"width":90.72,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/8-13.png","element":"img","alt":"ˆQ for","inline":true},{"style":{"height":14.19},"width":47.5,"height":35.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/8-14.png","element":"img","alt":"Q∗","inline":true,"padRight":true},{"text":"and then construct the improved estimates of value and policy from ","element":"span"},{"style":{"height":17.42},"width":32,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/8-15.png","element":"img","alt":"ˆQ","inline":true},{"text":". 
We will take this approach in this example and use MCTS to obtain the estimates of ","element":"span"},{"style":{"height":14.18},"width":47.5,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/8-16.png","element":"img","alt":" Q∗ ","inline":true,"padRight":true},{"text":"(see Algorithm ","element":"span"},{"href":"#id-46","text":"2 ","element":"a"},{"text":"in Appendix ","element":"span"},{"href":"#id-45","text":"E","element":"a"},{"text":"). We assume the existence of a generative model (i.e., a simulator). The following theorem states the property of this specific improvement module, which directly implies the desired improvement property, i.e., Property ","element":"span"},{"href":"#id-38","text":"1","element":"a"},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Theorem 5 ","element":"span"},{"text":"(Informal Statement, Theorem ","element":"span"},{"href":"#id-47","text":"11","element":"a"},{"text":", Appendix ","element":"span"},{"href":"#id-45","text":"E","element":"a"},{"text":")","element":"span"},{"style":{"fontWeight":"bold"},"text":". ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Suppose that the state transitions are deterministic. Given the current model ","element":"span"},{"style":{"height":17.2},"width":186.54,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/8-17.png","element":"img","alt":" f = (V , π)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"such that the value model ","element":"span"},{"style":{"height":19.6},"width":703.46,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/8-18.png","element":"img","alt":" V satisfies E�||V − V ∗||∞�≤ ε0,v. 
Then,","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"with appropriately chosen parameters for MCTS, for each query state ","element":"span"},{"style":{"height":13.59},"width":111.51,"height":33.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/8-19.png","element":"img","alt":" s0 ∈ S","inline":true},{"style":{"fontStyle":"italic"},"text":", the output","element":"span"},{"style":{"height":20.1},"width":319.14,"height":50.26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/8-20.png","element":"img","alt":"� ˆV (s0), ˆπ(·|s0)� =","inline":true}],[{"style":{"fontStyle":"italic"},"text":"MCTS","element":"span"},{"style":{"height":17.2},"width":262.38,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/9-0.png","element":"img","alt":"(f, s0) satisfies","inline":true}],[{"style":{"width":"34%"},"width":652,"height":164,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/9-1.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"The above is achieved with a sample complexity of","element":"span"}],[{"style":{"width":"55%"},"width":1041,"height":73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/9-2.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Supervised Learning Module: Nearest Neighbor Regression. ","element":"span"},{"text":"We employ a nearest neighbor algorithm to learn the optimal value function and policy. Intuitively, suppose that the optimal value function and the Boltzmann policy is Lipschitz in the state space, then this algorithm will generalize if there are sufficiently many (say ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K","element":"span"},{"text":") training data points around each state in the state space ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S","element":"span"},{"text":". 
Quantitatively, consider covering ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"text":"with balls of diameter ","element":"span"},{"style":{"fontStyle":"italic"},"text":"h > ","element":"span"},{"text":"0. We call the training data ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"h","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K","element":"span"},{"text":")","element":"span"},{"text":"-","element":"span"},{"style":{"fontStyle":"italic"},"text":"representative ","element":"span"},{"text":"if each covering ball has at least ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K ","element":"span"},{"text":"training data. Here, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"h ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K ","element":"span"},{"text":"would depend on the temperature ","element":"span"},{"style":{"height":6.8},"width":21,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/9-3.png","element":"img","alt":" τ","inline":true,"padRight":true},{"text":"and estimation errors of the training data.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Proposition 6 ","element":"span"},{"text":"(Informal Statement, Proposition ","element":"span"},{"href":"#id-48","text":"12","element":"a"},{"text":", Appendix ","element":"span"},{"href":"#id-45","text":"E","element":"a"},{"text":")","element":"span"},{"style":{"fontWeight":"bold"},"text":". 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Under appropriate regularity conditions, if the training data is representative with respect to appropriate chosen ","element":"span"},{"style":{"fontStyle":"italic"},"text":"h > ","element":"span"},{"text":"0 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K > ","element":"span"},{"text":"0","element":"span"},{"style":{"fontStyle":"italic"},"text":", the nearest neighbor supervised learning satisfies Property ","element":"span"},{"href":"#id-39","style":{"fontStyle":"italic"},"text":"2","element":"a"},{"style":{"fontStyle":"italic"},"text":". In particular, given training data with estimation errors ","element":"span"},{"style":{"height":9.59},"width":34.58,"height":23.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/9-4.png","element":"img","alt":" εv","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"height":11.99},"width":35.58,"height":29.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/9-5.png","element":"img","alt":" εp","inline":true},{"style":{"fontStyle":"italic"},"text":", we have","element":"span"}],[{"style":{"width":"43%"},"width":806,"height":164,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/9-6.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where the constant ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c > ","element":"span"},{"text":"0 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is independent of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n","element":"span"},{"style":{"fontStyle":"italic"},"text":", the size of training data.","element":"span"}],[{"text":"As discussed in Appendix ","element":"span"},{"href":"#id-45","text":"E","element":"a"},{"text":", the representative number of data points for training required in the above 
for generalization depends on the property of the state-space. For example, if state space is the unit ball in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d ","element":"span"},{"text":"dimension, for generalization error scaling with ","element":"span"},{"style":{"height":7.2},"width":19,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/9-7.png","element":"img","alt":" ε","inline":true,"padRight":true},{"text":"we require representative data points scaling as ","element":"span"},{"style":{"height":15.38},"width":140.53,"height":38.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/9-8.png","element":"img","alt":" ε−(2+d).","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Exploration Module: Random Sampling Policy. ","element":"span"},{"text":"In the above supervised learning module, the sampled states for nearest neighbor regression should be ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"h","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K","element":"span"},{"text":")","element":"span"},{"text":"-representative. In other words, to satisfy the exploration property, the exploration module must visit a set of ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"h","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K","element":"span"},{"text":")","element":"span"},{"text":"-representative states within a finite expected number of steps. We show that a uniformly random sampling policy achieves this. 
Let ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"h","element":"span"},{"text":") ","element":"span"},{"text":"be the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"h","element":"span"},{"text":"/","element":"span"},{"text":"2-covering number of the compact state space.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Proposition 7 ","element":"span"},{"text":"(Informal Statement, Proposition ","element":"span"},{"href":"#id-49","text":"13","element":"a"},{"text":", Appendix ","element":"span"},{"href":"#id-45","text":"E","element":"a"},{"text":")","element":"span"},{"style":{"fontWeight":"bold"},"text":". ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Under appropriate regularity conditions, with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"h","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K ","element":"span"},{"style":{"fontStyle":"italic"},"text":"chosen as per the desired estimation errors, ","element":"span"},{"style":{"height":15.59},"width":160.85,"height":38.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/9-9.png","element":"img","alt":" εv and εp","inline":true},{"style":{"fontStyle":"italic"},"text":", for the value and policy, the expected number of steps to obtain a set of ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"h","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K","element":"span"},{"text":")","element":"span"},{"style":{"fontStyle":"italic"},"text":"-representative states under the random sampling policy is upper bounded by","element":"span"}],[{"style":{"width":"32%"},"width":617,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/9-10.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Convergence 
Guarantees and Sample Complexity of the Instance. ","element":"span"},{"text":"For this instance of EIS, we have shown that each module satisfies the desired properties. Therefore, the convergence result stated in Theorem ","element":"span"},{"href":"#id-3","text":"2 ","element":"a"},{"text":"holds for this specific instance. Below we make this result explicit, providing concrete bounds on the estimation errors and sample complexity. In the following, the ","element":"span"},{"style":{"height":6.8},"width":25.25,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/9-11.png","element":"img","alt":" c′","inline":true},{"text":"s denote appropriate constants. Please refer to Appendix ","element":"span"},{"href":"#id-45","text":"E ","element":"a"},{"text":"for details.","element":"span"}],[{"id":"id-5","style":{"fontWeight":"bold"},"text":"Theorem 8 ","element":"span"},{"text":"(Informal Statement, Theorem ","element":"span"},{"href":"#id-50","text":"14","element":"a"},{"text":", Appendix ","element":"span"},{"href":"#id-45","text":"E","element":"a"},{"text":")","element":"span"},{"style":{"fontWeight":"bold"},"text":". ","element":"span"},{"style":{"fontStyle":"italic"},"text":"For a given ","element":"span"},{"style":{"height":17.2},"width":159.45,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/9-12.png","element":"img","alt":" ρ ∈ (0, 1)","inline":true},{"style":{"fontStyle":"italic"},"text":", there exist appropriately chosen parameters for this instance such that:","element":"span"}],[{"style":{"width":"81%"},"width":1529,"height":516,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/10-0.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"3. 
In particular, if ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is a unit volume hypercube in ","element":"span"},{"style":{"height":14.58},"width":49.28,"height":36.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/10-1.png","element":"img","alt":" Rd","inline":true},{"style":{"fontStyle":"italic"},"text":", then the total sample complexity to achieve ","element":"span"},{"style":{"height":7.2},"width":121.42,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/10-2.png","element":"img","alt":" ε-error","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"value function and policy is given by","element":"span"}],[{"style":{"width":"34%"},"width":640,"height":91,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/10-3.png","element":"img"}],[{"text":"Theorem ","element":"span"},{"href":"#id-5","text":"8 ","element":"a"},{"text":"states that for a unit hypercube, the sample complexity of the instance of EIS scales as ","element":"span"},{"style":{"height":20.87},"width":197.29,"height":52.18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/10-4.png","element":"img","alt":"Õ(ε−(4+d))","inline":true},{"text":"(omitting the logarithmic factor). Note that the minimax lower bound in Theorem ","element":"span"},{"href":"#id-6","text":"4 ","element":"a"},{"text":"scales as ","element":"span"},{"style":{"height":20.87},"width":199.4,"height":52.18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/10-5.png","element":"img","alt":"Ω̃(ε−(2+d))","inline":true},{"text":". Hence, in terms of the dependence on the dimension, the instance we consider here is nearly optimal. 
We note that the ","element":"span"},{"style":{"height":20.87},"width":197.3,"height":52.18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/10-6.png","element":"img","alt":"Õ(ε−(4+d))","inline":true},{"text":"sample complexity results from two parts: the MCTS contributes a sample complexity scaling as ","element":"span"},{"style":{"height":13.78},"width":59.61,"height":34.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/10-7.png","element":"img","alt":" ε−2","inline":true,"padRight":true},{"text":"due to simulating the search tree, while nearest neighbor requires ","element":"span"},{"style":{"height":15.38},"width":126.92,"height":38.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/10-8.png","element":"img","alt":" ε−(2+d)","inline":true,"padRight":true},{"text":"samples due to the need for sufficiently many good neighbors. Obtaining a tighter bound with a potentially more powerful improvement module or supervised learning module, such as neural networks, is an interesting future avenue.","element":"span"}]]},{"heading":"7 Conclusion","paragraphs":[[{"text":"In this paper, we take theoretical steps towards understanding reinforcement learning for zero-sum turn-based Markov games. We develop the Explore-Improve-Supervise (EIS) method with three intuitive modules intertwined carefully. Such an abstraction of three key modules allows us to isolate the fundamental principles from the implementation details. Importantly, we identify conditions for successfully finding the optimal solutions, backed by a concrete instance satisfying those conditions. Overall, the abstraction and the generic properties developed in this paper could serve as some guidelines, with the potential of finding broader applications with different instantiations. Finally, it would be interesting to extend this framework to general Markov games with simultaneous moves. 
We believe the generic modeling techniques in Section ","element":"span"},{"text":"4 ","element":"span"},{"text":"could be applied, but a key challenge is to develop an improvement module with rigorous non-asymptotic guarantees that satisfies the desired property. We believe that addressing this challenge and formally establishing the framework is a fruitful future direction.","element":"span"}]]},{"heading":"References","paragraphs":[[{"id":"id-16","text":"Bowling, M. and Veloso, M. Rational and convergent learning in stochastic games. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International joint conference on artificial intelligence","element":"span"},{"text":", volume 17, pp. 1021–1026. Lawrence Erlbaum Associates Ltd, 2001.","element":"span"}],[{"id":"id-29","text":"Brafman, R. I. and Tennenholtz, M. R-max-a general polynomial time algorithm for near-optimal reinforcement ","element":"span"},{"text":"learning. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Machine Learning Research","element":"span"},{"text":", 3(Oct):213–231, 2002.","element":"span"}],[{"id":"id-14","text":"Hansen, T. D., Miltersen, P. B., and Zwick, U. Strategy iteration is strongly polynomial for 2-player turn-based ","element":"span"},{"text":"stochastic games with a constant discount factor. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"J. ACM","element":"span"},{"text":", 60(1):1:1–1:16, February 2013. ISSN 0004-5411.","element":"span"}],[{"id":"id-20","text":"Hu, J. and Wellman, M. P. Nash q-learning for general-sum stochastic games. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of machine learning research","element":"span"},{"text":", 4(Nov):1039–1069, 2003.","element":"span"}],[{"id":"id-19","text":"Hu, J., Wellman, M. P., et al. Multiagent reinforcement learning: theoretical framework and an algorithm. 
","element":"span"},{"text":"In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"ICML","element":"span"},{"text":", volume 98, pp. 242–250. Citeseer, 1998.","element":"span"}],[{"id":"id-32","text":"Jia, Z., Yang, L. F., and Wang, M. Feature-based q-learning for two-player stochastic games, 2019.","element":"span"}],[{"id":"id-59","text":"Kaufmann, E. and Koolen, W. M. Monte-carlo tree search by best arm identification. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":", pp. 4897–4906, 2017.","element":"span"}],[{"id":"id-35","text":"Kearns, M., Mansour, Y., and Ng, A. Y. A sparse sampling algorithm for near-optimal planning in large ","element":"span"},{"text":"markov decision processes. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Machine learning","element":"span"},{"text":", 49(2-3):193–208, 2002.","element":"span"}],[{"id":"id-58","text":"Kocsis, L., Szepesvári, C., and Willemson, J. Improved Monte-Carlo search. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Univ. Tartu, Estonia, Tech. Rep","element":"span"},{"text":", 2006.","element":"span"}],[{"id":"id-12","text":"Kumar, P. R. and Shiau, T.-H. Existence of value and randomized strategies in zero-sum discrete-time ","element":"span"},{"text":"stochastic dynamic games. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"SIAM Journal on Control and Optimization","element":"span"},{"text":", 19(5):617–634, 1981.","element":"span"}],[{"id":"id-21","text":"Lagoudakis, M. G. and Parr, R. Value function approximation in zero-sum markov games. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the Eighteenth conference on Uncertainty in artificial intelligence","element":"span"},{"text":", pp. 283–292. Morgan Kaufmann Publishers Inc., 2002.","element":"span"}],[{"id":"id-25","text":"Lillicrap, T. P., Hunt, J. 
J., Pritzel, A., Heess, N., Erez, T., Tassa, Y., Silver, D., and Wierstra, D. Continuous ","element":"span"},{"text":"control with deep reinforcement learning. 2016.","element":"span"}],[{"id":"id-7","text":"Littman, M. L. Markov games as a framework for multi-agent reinforcement learning. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Machine Learning Proceedings 1994","element":"span"},{"text":", pp. 157–163. Elsevier, 1994.","element":"span"}],[{"id":"id-17","text":"Littman, M. L. Friend-or-foe q-learning in general-sum games. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"ICML","element":"span"},{"text":", volume 1, pp. 322–328, 2001a.","element":"span"}],[{"id":"id-18","text":"Littman, M. L. Value-function reinforcement learning in markov games. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Cognitive Systems Research","element":"span"},{"text":", 2(1): 55–66, 2001b.","element":"span"}],[{"id":"id-9","text":"Maitra, A. and Parthasarathy, T. On stochastic games. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Optimization Theory and Applications","element":"span"},{"text":", 5 (4):289–300, 1970.","element":"span"}],[{"id":"id-10","text":"Maitra, A. and Parthasarathy, T. On stochastic games, ii. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Optimization Theory and Applications","element":"span"},{"text":", 8(2):154–160, 1971.","element":"span"}],[{"id":"id-24","text":"Mnih, V., Kavukcuoglu, K., Silver, D., Rusu, A. A., Veness, J., Bellemare, M. G., Graves, A., Riedmiller, M., ","element":"span"},{"text":"Fidjeland, A. K., Ostrovski, G., et al. Human-level control through deep reinforcement learning. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Nature","element":"span"},{"text":", 518(7540):529, 2015.","element":"span"}],[{"id":"id-11","text":"Parthasarathy, T. Discounted, positive, and noncooperative stochastic games. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Journal of Game Theory","element":"span"},{"text":", 2(1):25–37, 1973.","element":"span"}],[{"id":"id-15","text":"Patek, S. D. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Stochastic Shortest Path Games: Theory and Algorithms","element":"span"},{"text":". PhD dissertation, Massachusetts Institute of Technology, 1997.","element":"span"}],[{"id":"id-22","text":"Perolat, J., Scherrer, B., Piot, B., and Pietquin, O. Approximate dynamic programming for two-player ","element":"span"},{"text":"zero-sum markov games. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 32nd International Conference on Machine Learning","element":"span"},{"text":", pp. 1321–1329, 2015.","element":"span"}],[{"id":"id-23","text":"Pérolat, J., Piot, B., Geist, M., Scherrer, B., and Pietquin, O. Softened approximate policy iteration for ","element":"span"},{"text":"markov games. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"ICML 2016-33rd International Conference on Machine Learning","element":"span"},{"text":", 2016.","element":"span"}],[{"id":"id-26","text":"Schulman, J., Levine, S., Abbeel, P., Jordan, M., and Moritz, P. Trust region policy optimization. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Machine Learning","element":"span"},{"text":", pp. 1889–1897, 2015.","element":"span"}],[{"id":"id-27","text":"Schulman, J., Wolski, F., Dhariwal, P., Radford, A., and Klimov, O. Proximal policy optimization algorithms. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1707.06347","element":"span"},{"text":", 2017.","element":"span"}],[{"id":"id-55","text":"Shah, D. and Xie, Q. Q-learning with nearest neighbors. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems 31","element":"span"},{"text":", pp. 3115–3125. 
2018.","element":"span"}],[{"id":"id-33","text":"Shah, D., Xie, Q., and Xu, Z. On reinforcement learning using Monte Carlo tree search with supervised ","element":"span"},{"text":"learning: Non-asymptotic analysis. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1902.05213","element":"span"},{"text":", 2019.","element":"span"}],[{"id":"id-8","text":"Shapley, L. S. Stochastic games. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the National Academy of Sciences","element":"span"},{"text":", 39(10):1095–1100, 1953.","element":"span"}],[{"id":"id-31","text":"Sidford, A., Wang, M., Yang, L. F., and Ye, Y. Solving discounted stochastic two-player games with ","element":"span"},{"text":"near-optimal time and sample complexity, 2019.","element":"span"}],[{"id":"id-1","text":"Silver, D., Huang, A., Maddison, C. J., Guez, A., Sifre, L., Van Den Driessche, G., Schrittwieser, J., ","element":"span"},{"text":"Antonoglou, I., Panneershelvam, V., Lanctot, M., et al. Mastering the game of Go with deep neural networks and tree search. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Nature","element":"span"},{"text":", 529(7587):484–489, 2016.","element":"span"}],[{"id":"id-2","text":"Silver, D., Hubert, T., Schrittwieser, J., Antonoglou, I., Lai, M., Guez, A., Lanctot, M., Sifre, L., Kumaran, ","element":"span"},{"text":"D., Graepel, T., et al. Mastering chess and Shogi by self-play with a general reinforcement learning algorithm. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1712.01815","element":"span"},{"text":", 2017a.","element":"span"}],[{"id":"id-0","text":"Silver, D., Schrittwieser, J., Simonyan, K., Antonoglou, I., Huang, A., Guez, A., Hubert, T., Baker, L., Lai, ","element":"span"},{"text":"M., Bolton, A., et al. Mastering the game of Go without human knowledge. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Nature","element":"span"},{"text":", 550(7676):354, 2017b.","element":"span"}],[{"id":"id-44","text":"Stone, C. J. Optimal global rates of convergence for nonparametric regression. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"The Annals of Statistics","element":"span"},{"text":", pp. 1040–1053, 1982.","element":"span"}],[{"id":"id-13","text":"Szepesvári, C. and Littman, M. L. Generalized Markov decision processes: Dynamic-programming and ","element":"span"},{"text":"reinforcement-learning algorithms. ","element":"span"},{"text":"In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of International Conference of Machine Learning","element":"span"},{"text":", volume 96, 1996.","element":"span"}],[{"id":"id-43","text":"Tsybakov, A. B. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Introduction to Nonparametric Estimation","element":"span"},{"text":". Springer Series in Statistics. Springer, 2009.","element":"span"}],[{"id":"id-64","text":"Wainwright, M. J. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"High-dimensional statistics: A non-asymptotic viewpoint","element":"span"},{"text":", volume 48. Cambridge University Press, 2019.","element":"span"}],[{"id":"id-30","text":"Wei, C.-Y., Hong, Y.-T., and Lu, C.-J. Online reinforcement learning in stochastic games. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":", pp. 4987–4997, 2017.","element":"span"}],[{"id":"id-28","text":"Yang, Y., Zhang, G., Xu, Z., and Katabi, D. Harnessing structures for value-based planning and reinforcement ","element":"span"},{"text":"learning. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1909.12255","element":"span"},{"text":", 2019a.","element":"span"}],[{"id":"id-34","text":"Yang, Z., Xie, Y., and Wang, Z. 
A theoretical analysis of deep q-learning, 2019b.","element":"span"}]]},{"heading":"Appendices","paragraphs":[[{"id":"id-71","style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Preliminary Facts","element":"span"}],[{"text":"The following inequalities are used for developing our technical results: ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Jensen’s Inequality: ","element":"span"},{"text":"Let ","element":"span"},{"style":{"fontStyle":"italic"},"text":"X ","element":"span"},{"text":"be a random variable and ","element":"span"},{"style":{"height":14},"width":24,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/12-0.png","element":"img","alt":" φ","inline":true,"padRight":true},{"text":"be a convex function, then ","element":"span"},{"style":{"height":17.2},"width":362.36,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/12-1.png","element":"img","alt":" φ(E[X]) ≤ E[φ(X)].","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Pinsker’s Inequality: ","element":"span"},{"text":"Let ","element":"span"},{"style":{"height":10},"width":24,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/13-0.png","element":"img","alt":" µ","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":6.8},"width":21,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/13-1.png","element":"img","alt":" ν","inline":true,"padRight":true},{"text":"be two probability distributions, then the total variation distance TV","element":"span"},{"style":{"height":17.2},"width":97.04,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/13-2.png","element":"img","alt":"(µ, ν)","inline":true,"padRight":true},{"text":"and the KL divergences 
","element":"span"},{"style":{"height":17.2},"width":179.84,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/13-3.png","element":"img","alt":" DKL(µ∥ν)","inline":true,"padRight":true},{"text":"satisfy the bound","element":"span"}],[{"style":{"width":"24%"},"width":457,"height":97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/13-4.png","element":"img"}],[{"text":"Note that if ","element":"span"},{"style":{"height":14},"width":135.56,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/13-5.png","element":"img","alt":" µ and ν","inline":true,"padRight":true},{"text":"are discrete distributions, then TV","element":"span"},{"style":{"height":19.82},"width":898.26,"height":49.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/13-6.png","element":"img","alt":"(µ, ν) = (1/2) Σω∈Ω |µ(ω) − ν(ω)| = (1/2)∥µ − ν∥1, where","inline":true},{"style":{"height":16},"width":82.42,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/13-7.png","element":"img","alt":"∥ · ∥1","inline":true,"padRight":true},{"text":"denotes the total variation (or ","element":"span"},{"style":{"height":16},"width":165.35,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/13-8.png","element":"img","alt":" ℓ1) norm.","inline":true}],[{"style":{"width":"96%"},"width":1814,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/13-9.png","element":"img"}],[{"text":"then","element":"span"}],[{"style":{"width":"65%"},"width":1235,"height":364,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/13-10.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"B ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Proof of Theorem ","element":"span"},{"href":"#id-3","style":{"fontWeight":"bold"},"text":"2","element":"a"}],[{"style":{"fontStyle":"italic"},"text":"Proof. 
","element":"span"},{"text":"With the three detailed properties, the proof is conceptually straightforward. At each iteration, the improvement module would produce better estimates for the explored states, by factors of ","element":"span"},{"style":{"height":14},"width":33.43,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/13-11.png","element":"img","alt":" ζv","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":15.59},"width":34.44,"height":38.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/13-12.png","element":"img","alt":" ζp","inline":true},{"text":". The exploration continues until one of the desired representative sets in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"E ","element":"span"},{"text":"has been visited, and the exploration property guarantees that the exploration time will be finite. The current iteration then ends by calling the supervised learning module to generalize the improvement to the entire state space. In what follows, we make these statements formal.","element":"span"}],[{"text":"Let us first introduce some notation. We will use the term iteration to refer to a complete round of improvement, exploration and supervised learning (cf. Line 2 of Algorithm ","element":"span"},{"href":"#id-37","text":"1","element":"a"},{"text":"). 
In general, at each iteration, we use a superscript ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"l","element":"span"},{"text":") ","element":"span"},{"text":"to denote quantities relevant to the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"l","element":"span"},{"text":"-th iteration, except that for the supervised learning module, we follow the convention in the paper and use a subscript ","element":"span"},{"style":{"height":17.2},"width":317.74,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/13-13.png","element":"img","alt":" l, i.e., fl = (Vl, πl)","inline":true},{"text":". We denote by ","element":"span"},{"style":{"height":15.38},"width":65.39,"height":38.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/13-14.png","element":"img","alt":"Z(l) ","inline":true,"padRight":true},{"text":"all the information during the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"l","element":"span"},{"text":"-th iteration. Let ","element":"span"},{"style":{"height":19.38},"width":110.47,"height":48.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/13-15.png","element":"img","alt":" {F(l)}","inline":true,"padRight":true},{"text":"be the sigma-algebra generated by the stochastic process ","element":"span"},{"style":{"height":19.38},"width":107.92,"height":48.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/13-16.png","element":"img","alt":" {Z(l)}","inline":true},{"text":", where the randomness comes from the environment and any randomness that may be used in the three modules. 
Let ","element":"span"},{"style":{"height":22.93},"width":216.25,"height":57.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/13-17.png","element":"img","alt":" ω(l)v and ω(l)p","inline":true,"padRight":true},{"text":"be the estimation errors of the model at the beginning of the iteration, i.e.,","element":"span"}],[{"style":{"width":"35%"},"width":673,"height":174,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/13-18.png","element":"img"}],[{"text":"We use ","element":"span"},{"style":{"height":29.2},"width":199.72,"height":73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/13-19.png","element":"img","alt":" D(l) = {(si","inline":true},{"text":", ","element":"span"},{"style":{"height":31.09},"width":387.88,"height":77.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/13-20.png","element":"img","alt":"ˆV (l)(si), ˆπ(l)(·|si))}nli=1","inline":true,"padRight":true},{"text":"to denote the set of training data generated by the exploration ","element":"span"},{"text":"module and querying the improvement module during the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"l","element":"span"},{"text":"-th iteration. Let ","element":"span"},{"style":{"height":20.45},"width":246.67,"height":51.13,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/13-21.png","element":"img","alt":" S(l) = {si}nli=1","inline":true,"padRight":true},{"text":"be the set of ","element":"span"},{"text":"states visited by the exploration module. 
Correspondingly, the estimation errors for the value function and the optimal policy after querying the improvement module are denoted by ","element":"span"},{"style":{"height":22.93},"width":201.21,"height":57.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/13-22.png","element":"img","alt":" ε(l)v and ε(l)p ","inline":true,"padRight":true},{"text":", respectively:","element":"span"}],[{"style":{"width":"36%"},"width":682,"height":203,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/13-23.png","element":"img"}],[{"text":"At the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"l","element":"span"},{"text":"-th iteration, the supervised learning module takes the outputs of the improvement module, ","element":"span"},{"style":{"height":18.59},"width":80.79,"height":46.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/14-0.png","element":"img","alt":" D(l),","inline":true,"padRight":true},{"text":"as the training data. 
Let ","element":"span"},{"style":{"height":21.45},"width":54.6,"height":53.63,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/14-1.png","element":"img","alt":" ξ(l)v","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":22.93},"width":54.6,"height":57.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/14-2.png","element":"img","alt":" ξ(l)p","inline":true,"padRight":true},{"text":"denote the estimation errors for the new model ","element":"span"},{"style":{"height":17.2},"width":213.24,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/14-3.png","element":"img","alt":" fl = (Vl, πl)","inline":true},{"text":", after querying the supervised learning module:","element":"span"}],[{"id":"id-51","style":{"width":"99%"},"width":1868,"height":496,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/14-4.png","element":"img"}],[{"text":"For the supervised learning module, according to the generalization property (cf. Property ","element":"span"},{"href":"#id-39","text":"2","element":"a"},{"text":"), when the size of training set ","element":"span"},{"style":{"height":9.72},"width":33.92,"height":24.3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/14-5.png","element":"img","alt":" nl","inline":true,"padRight":true},{"text":"is sufficiently large and the sampled states ","element":"span"},{"style":{"height":20.46},"width":243.81,"height":51.14,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/14-6.png","element":"img","alt":" S(l) = {si}nli=1 ","inline":true,"padRight":true},{"text":"are representative of the state ","element":"span"},{"text":"space, the same order of accuracy of the training data will be generalized to the entire state space. 
For now, let us assume that this is the case; we will come back to verify the generalization bound in Property ","element":"span"},{"href":"#id-39","text":"2 ","element":"a"},{"text":"can indeed be satisfied by ","element":"span"},{"style":{"height":15.38},"width":62.07,"height":38.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/14-7.png","element":"img","alt":" S(l)","inline":true},{"text":". Then, the following bounds hold:","element":"span"}],[{"id":"id-53","style":{"width":"11%"},"width":212,"height":134,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/14-8.png","element":"img"}],[{"text":"Hence","element":"span"}],[{"style":{"width":"22%"},"width":416,"height":133,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/14-9.png","element":"img"}],[{"text":"Therefore, when querying the improvement module, if we select the improvement factors to be","element":"span"}],[{"style":{"width":"61%"},"width":1154,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/14-10.png","element":"img"}],[{"text":"then we have","element":"span"}],[{"id":"id-52","style":{"width":"59%"},"width":1116,"height":134,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/14-11.png","element":"img"}],[{"text":"It is worth taking note of the fact that ","element":"span"},{"style":{"height":9.19},"width":33.25,"height":22.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/14-12.png","element":"img","alt":" cv","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":11.59},"width":34.25,"height":28.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/14-13.png","element":"img","alt":" cp","inline":true,"padRight":true},{"text":"would be larger than 1 (cf. 
Property ","element":"span"},{"href":"#id-39","text":"2","element":"a"},{"text":"): a reasonable supervised learning model may generalize the same order of accuracy as the training data, but the error is unlikely to be smaller; hence, ","element":"span"},{"style":{"height":15.59},"width":161.39,"height":38.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/14-14.png","element":"img","alt":" ζv and ζp","inline":true,"padRight":true},{"text":"are required to be smaller than 1 in Property ","element":"span"},{"href":"#id-38","text":"1 ","element":"a"},{"text":"so that ","element":"span"},{"style":{"height":14},"width":104.66,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/14-15.png","element":"img","alt":" ρ < 1.","inline":true}],[{"text":"By definition, ","element":"span"},{"style":{"height":23.04},"width":484.97,"height":57.61,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/14-16.png","element":"img","alt":" ω(1)v = C0,v and ω(1)p = C0,p","inline":true},{"text":". Therefore, we have the desired inequalities:","element":"span"}],[{"style":{"width":"42%"},"width":801,"height":164,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/14-17.png","element":"img"}],[{"text":"Finally, to complete the proof, as we mentioned before, we need to verify that we could sample enough representative states at each iteration in finite time steps. This is indeed guaranteed by the exploration property. 
In particular, note that at the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"l","element":"span"},{"text":"-th iteration, we would like to sample enough representative states that are of errors ","element":"span"},{"style":{"height":21.45},"width":97.75,"height":53.63,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/14-18.png","element":"img","alt":" ζvω(l)v","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":23.04},"width":97.2,"height":57.61,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/14-19.png","element":"img","alt":" ζpω(l)p","inline":true,"padRight":true},{"text":"for the value and policy functions (cf. Eqs. (","element":"span"},{"href":"#id-51","text":"7","element":"a"},{"text":") and (","element":"span"},{"href":"#id-51","text":"8","element":"a"},{"text":")). By a ","element":"span"},{"text":"recursive argument (cf. Eqs. (","element":"span"},{"href":"#id-52","text":"10","element":"a"},{"text":") and (","element":"span"},{"href":"#id-52","text":"11","element":"a"},{"text":")), it is not hard to see that at the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"l","element":"span"},{"text":"-th iteration, we need to query the exploration module until the sampled states, ","element":"span"},{"style":{"height":20.45},"width":247.17,"height":51.13,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/15-0.png","element":"img","alt":" S(l) = {si}nli=1","inline":true},{"text":", contain one of the representative sets in ","element":"span"},{"style":{"height":18.98},"width":478.49,"height":47.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/15-1.png","element":"img","alt":"E(τ, ζvC0,vρl−1, ζpC0,pρl−1)","inline":true},{"text":", i.e., we immediately stop querying the exploration module at time 
","element":"span"},{"style":{"height":13.72},"width":207.91,"height":34.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/15-2.png","element":"img","alt":" nl when the","inline":true,"padRight":true},{"text":"following holds:","element":"span"}],[{"style":{"width":"52%"},"width":987,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/15-3.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":15.59},"width":151.24,"height":38.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/15-4.png","element":"img","alt":" ζv and ζp","inline":true,"padRight":true},{"text":"are given by Eq. (","element":"span"},{"href":"#id-53","text":"9","element":"a"},{"text":"). From the exploration property, we know that ","element":"span"},{"style":{"height":18.97},"width":538.08,"height":47.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/15-5.png","element":"img","alt":" E[T(τ, ζvC0,vρl−1, ζpC0,pρl−1)]","inline":true,"padRight":true},{"text":"is finite, which implies that ","element":"span"},{"style":{"height":9.72},"width":33.92,"height":24.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/15-6.png","element":"img","alt":" nl","inline":true,"padRight":true},{"text":"is also finite with high probability. Therefore, we are guaranteed that the training data ","element":"span"},{"style":{"height":15.38},"width":69.43,"height":38.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/15-7.png","element":"img","alt":" D(l)","inline":true,"padRight":true},{"text":"contains one of the representative sets, and hence the supervised learning module will generalize at each iteration. 
This completes the proof of Theorem ","element":"span"},{"href":"#id-3","text":"2","element":"a"},{"text":".","element":"span"}],[{"style":{"width":"1%"},"width":28,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/15-8.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"C ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Proof of Proposition ","element":"span"},{"href":"#id-4","style":{"fontWeight":"bold"},"text":"3","element":"a"}],[{"text":"To prove Proposition ","element":"span"},{"href":"#id-4","text":"3","element":"a"},{"text":", we first establish the following useful lemma:","element":"span"}],[{"id":"id-54","style":{"fontWeight":"bold"},"text":"Lemma 9. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Consider the exploration module and suppose that ","element":"span"},{"style":{"height":17.59},"width":606.88,"height":43.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/15-9.png","element":"img","alt":" E[T(τ, ε1,v, ε1,p)] ≤ B(τ, ε1,v, ε1,p)","inline":true},{"style":{"fontStyle":"italic"},"text":". Then, with probability at least ","element":"span"},{"text":"1 ","element":"span"},{"style":{"height":14.8},"width":69.96,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/15-10.png","element":"img","alt":" − δ,","inline":true}],[{"style":{"width":"68%"},"width":1290,"height":82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/15-11.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Proof of Lemma ","element":"span"},{"href":"#id-54","style":{"fontStyle":"italic"},"text":"9","element":"a"},{"style":{"fontStyle":"italic"},"text":". 
","element":"span"},{"text":"Consider a total time steps of ","element":"span"},{"style":{"height":19.77},"width":441.95,"height":49.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/15-12.png","element":"img","alt":" n = eB(τ, ε1,v, ε1,p) log 1δ","inline":true,"padRight":true},{"text":". All the states, ","element":"span"},{"style":{"height":16.77},"width":123.72,"height":41.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/15-13.png","element":"img","alt":" {si}ni=1","inline":true},{"text":", are ","element":"span"},{"text":"sampled via querying the exploration module. Let us divide the total time steps ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n ","element":"span"},{"text":"into ","element":"span"},{"style":{"height":17.2},"width":258.53,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/15-14.png","element":"img","alt":" M ≜ log(1/δ)","inline":true,"padRight":true},{"text":"segments, each consisting of ","element":"span"},{"style":{"height":17.59},"width":355.24,"height":43.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/15-15.png","element":"img","alt":" h ≜ eB(τ, ε1,v, ε1,p)","inline":true,"padRight":true},{"text":"states. Denote by ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"m","element":"span"},{"text":") ","element":"span"},{"text":"the set of states in the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m","element":"span"},{"text":"-th segment, i.e., ","element":"span"},{"style":{"height":24.89},"width":373.01,"height":62.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/15-16.png","element":"img","alt":" S(m) = {si}mh−1i=(m−1)h","inline":true},{"text":". 
The key idea of the proof is to argue that with high probability, at least ","element":"span"},{"text":"one of the sets ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"m","element":"span"},{"text":")","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"= ","element":"span"},{"text":"1, 2, ","element":"span"},{"style":{"fontStyle":"italic"},"text":". . . ","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M ","element":"span"},{"text":"will contain a representative set in ","element":"span"},{"style":{"height":17.59},"width":251.3,"height":43.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/15-17.png","element":"img","alt":" E(τ, ε1,v, ε1,p).","inline":true}],[{"text":"Denote by ","element":"span"},{"style":{"height":13.19},"width":57.42,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/15-18.png","element":"img","alt":" Em","inline":true,"padRight":true},{"text":"the event that the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m","element":"span"},{"text":"-th segment does not contain any of the representative sets, i.e.,","element":"span"}],[{"style":{"width":"46%"},"width":878,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/15-19.png","element":"img"}],[{"text":"Let ","element":"span"},{"style":{"height":13.19},"width":56.64,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/15-20.png","element":"img","alt":" Fm","inline":true,"padRight":true},{"text":"be the filtration containing information until the end of segment ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m","element":"span"},{"text":". 
Since ","element":"span"},{"style":{"height":17.59},"width":346.56,"height":43.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/15-21.png","element":"img","alt":" E[T(τ, ε1,v, ε1,p)] ≤","inline":true},{"style":{"height":17.59},"width":247.34,"height":43.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/15-22.png","element":"img","alt":"B(τ, ε1,v, ε1,p)","inline":true},{"text":", by Markov inequality, we have,","element":"span"}],[{"style":{"width":"65%"},"width":1221,"height":90,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/15-23.png","element":"img"}],[{"text":"This then implies that","element":"span"}],[{"style":{"width":"28%"},"width":535,"height":76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/15-24.png","element":"img"}],[{"text":"Therefore,","element":"span"}],[{"style":{"width":"68%"},"width":1292,"height":235,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/15-25.png","element":"img"}],[{"text":"which completes the proof of Lemma ","element":"span"},{"href":"#id-54","text":"9","element":"a"},{"text":".","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"Proof of Proposition ","element":"span"},{"href":"#id-4","style":{"fontStyle":"italic"},"text":"3","element":"a"},{"style":{"fontStyle":"italic"},"text":". ","element":"span"},{"text":"With Lemma ","element":"span"},{"href":"#id-54","text":"9","element":"a"},{"text":", we are now ready to prove Proposition ","element":"span"},{"href":"#id-4","text":"3","element":"a"},{"text":". This is achieved by simply counting the sample complexity for each of the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L ","element":"span"},{"text":"iterations. 
As discussed in the convergence proof of Theorem ","element":"span"},{"href":"#id-3","text":"2","element":"a"},{"text":", at the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"l","element":"span"},{"text":"-th iteration, we need to query the exploration module until the sampled states, ","element":"span"},{"style":{"height":20.45},"width":245.61,"height":51.13,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/15-26.png","element":"img","alt":" S(l) = {si}nli=1","inline":true},{"text":", ","element":"span"},{"text":"contains one of the representative sets in ","element":"span"},{"style":{"height":18.98},"width":446.04,"height":47.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/16-0.png","element":"img","alt":" E(τ, C0,vρl/cv, C0,pρl/cp)","inline":true},{"text":". For each of the explored states, a query of the improvement module incurs a deterministic sample complexity of ","element":"span"},{"style":{"height":28.8},"width":505.83,"height":72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/16-1.png","element":"img","alt":" κ�τ, C0,vρl−1, C0,pρl−1, ρcv , ρcp","inline":true}],[{"text":"the required improvement factors ","element":"span"},{"style":{"height":16.79},"width":435.15,"height":41.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/16-2.png","element":"img","alt":" ζv = ρ/cv and ζp = ρ/cp","inline":true},{"text":". Let us now apply Lemma ","element":"span"},{"href":"#id-54","text":"9","element":"a"},{"text":". 
Then, we know","element":"span"}],[{"style":{"width":"46%"},"width":862,"height":102,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/16-3.png","element":"img"}],[{"text":"That is, with probability at most ","element":"span"},{"style":{"height":14},"width":71.37,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/16-4.png","element":"img","alt":" δ/L","inline":true},{"text":", the sample complexity of the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"l","element":"span"},{"text":"-th iteration is larger than","element":"span"}],[{"style":{"width":"59%"},"width":1111,"height":102,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/16-5.png","element":"img"}],[{"text":"Finally, applying union bound over the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L ","element":"span"},{"text":"iterations, we have","element":"span"}],[{"style":{"width":"56%"},"width":1064,"height":394,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/16-6.png","element":"img"}],[{"text":"Therefore, with probability at least 1 ","element":"span"},{"style":{"height":11.6},"width":57.74,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/16-7.png","element":"img","alt":" − δ","inline":true},{"text":", for every ","element":"span"},{"style":{"height":30.17},"width":581.12,"height":75.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/16-8.png","element":"img","alt":" l ∈ [L], nl ≤ e · B�τ, C0,vρlcv , C0,pρlcp","inline":true}],[{"style":{"height":28.8},"width":135.21,"height":72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/16-9.png","element":"img","alt":"�· log Lδ ","inline":true,"padRight":true},{"text":". 
Equivalently, ","element":"span"},{"text":"with probability at least 1 ","element":"span"},{"style":{"height":11.6},"width":57.74,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/16-10.png","element":"img","alt":" − δ","inline":true},{"text":", the total sample complexity is upper bounded by","element":"span"}],[{"style":{"width":"81%"},"width":1522,"height":182,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/16-11.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"D ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Proof of Theorem ","element":"span"},{"href":"#id-6","style":{"fontWeight":"bold"},"text":"4","element":"a"}],[{"text":"The recent work ","element":"span"},{"href":"#id-55","referenceIndex":25,"text":"Shah & Xie ","element":"a"},{"text":"(","element":"span"},{"href":"#id-55","referenceIndex":25,"text":"2018","element":"a"},{"text":") establishes a lower bound on the sample complexity for reinforcement learning algorithms on MDPs. We follow a similar argument to establish a lower bound on the sample complexity for two-player zero-sum Markov games. We provide the proof for completeness. The key idea is to connect the problem of estimating the value function to the problem of non-parametric regression, and then leverage known minimax lower bounds for the latter. In particular, we show that a class of non-parametric regression problems can be embedded in a Markov game problem, so any algorithm for the latter can be used to solve the former. 
Prior work on non-parametric regression (","element":"span"},{"href":"#id-43","referenceIndex":34,"text":"Tsybakov","element":"a"},{"text":", ","element":"span"},{"href":"#id-43","referenceIndex":34,"text":"2009","element":"a"},{"text":"; ","element":"span"},{"href":"#id-44","referenceIndex":32,"text":"Stone","element":"a"},{"text":", ","element":"span"},{"href":"#id-44","referenceIndex":32,"text":"1982","element":"a"},{"text":") establishes that a certain number of observations is ","element":"span"},{"style":{"fontStyle":"italic"},"text":"necessary ","element":"span"},{"text":"to achieve a given accuracy using ","element":"span"},{"style":{"fontStyle":"italic"},"text":"any ","element":"span"},{"text":"algorithms, hence leading to a corresponding necessary condition for the sample size of estimating the value function in a Markov game problem. We now provide the details. ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Step 1. Non-parametric regression","element":"span"}],[{"text":"Consider the following non-parametric regression problem: Let ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"text":":","element":"span"},{"text":"= [","element":"span"},{"text":"0, 1","element":"span"},{"style":{"height":18.59},"width":29.01,"height":46.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/16-12.png","element":"img","alt":"]d","inline":true,"padRight":true},{"text":"and assume that we have ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T ","element":"span"},{"text":"independent pairs of random variables ","element":"span"},{"style":{"height":17.2},"width":539.86,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/16-13.png","element":"img","alt":" (x1, y1), . . . 
, (xT , yT ) such that","inline":true}],[{"style":{"width":"63%"},"width":1183,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/16-14.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":17.2},"width":290.09,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/16-15.png","element":"img","alt":" xt ∼ uniform(S)","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":14.4},"width":185.17,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/16-16.png","element":"img","alt":" f : S → R","inline":true,"padRight":true},{"text":"is the unknown regression function. Suppose that the conditional distribution of ","element":"span"},{"style":{"height":13.6},"width":257.8,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/16-17.png","element":"img","alt":" yt given xt = x","inline":true,"padRight":true},{"text":"is a Bernoulli distribution with mean ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":")","element":"span"},{"text":". 
We also assume that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"is 1-Lipschitz continuous with respect to the Euclidean norm, i.e.,","element":"span"}],[{"style":{"width":"36%"},"width":675,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/16-18.png","element":"img"}],[{"text":"Let ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F ","element":"span"},{"text":"be the collection of all 1-Lipschitz continuous function on ","element":"span"},{"style":{"fontStyle":"italic"},"text":"X","element":"span"},{"text":", i.e.,","element":"span"}],[{"style":{"width":"38%"},"width":728,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/17-0.png","element":"img"}],[{"text":"The goal is to estimate ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"given the observations ","element":"span"},{"style":{"height":17.2},"width":362.45,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/17-1.png","element":"img","alt":" (x1, y1), . . . , (xT , yT )","inline":true,"padRight":true},{"text":"and the prior knowledge that ","element":"span"},{"style":{"height":14},"width":116.1,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/17-2.png","element":"img","alt":" f ∈ F.","inline":true}],[{"text":"It is easy to verify that the above problem is a special case of the non-parametric regression problem considered in the work by ","element":"span"},{"href":"#id-44","referenceIndex":32,"text":"Stone ","element":"a"},{"text":"(","element":"span"},{"href":"#id-44","referenceIndex":32,"text":"1982","element":"a"},{"text":") (in particular, Example 2 therein). 
Let ","element":"span"},{"style":{"height":17.99},"width":42.51,"height":44.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/17-3.png","element":"img","alt":"ˆfT","inline":true,"padRight":true},{"text":"denote an arbitrary (measurable) estimator of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"based on the training samples ","element":"span"},{"style":{"height":17.2},"width":363.34,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/17-4.png","element":"img","alt":" (x1, y1), . . . , (xT , yT )","inline":true},{"text":". By Theorem 1 in ","element":"span"},{"href":"#id-44","referenceIndex":32,"text":"Stone ","element":"a"},{"text":"(","element":"span"},{"href":"#id-44","referenceIndex":32,"text":"1982","element":"a"},{"text":"), we have the following result: there exists a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c > ","element":"span"},{"text":"0 such that","element":"span"}],[{"style":{"width":"72%"},"width":1366,"height":107,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/17-5.png","element":"img"}],[{"text":"where the infimum is taken over all possible estimators ","element":"span"},{"style":{"height":17.99},"width":55.71,"height":44.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/17-6.png","element":"img","alt":"ˆfT .","inline":true,"padRight":true},{"text":"Translating this result to the non-asymptotic regime, we obtain the following theorem.","element":"span"}],[{"id":"id-57","style":{"fontWeight":"bold"},"text":"Theorem 10. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Under the above assumptions, for any number ","element":"span"},{"style":{"height":17.2},"width":158.24,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/17-7.png","element":"img","alt":" δ ∈ (0, 1)","inline":true},{"style":{"fontStyle":"italic"},"text":", there exist some numbers ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c > ","element":"span"},{"text":"0 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and","element":"span"}],[{"style":{"width":"79%"},"width":1489,"height":135,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/17-8.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Step 2. Two-player zero-sum Markov game ","element":"span"},{"text":"Consider a class of (degenerate) two-player zero-sum discounted Markov games ","element":"span"},{"style":{"height":17.2},"width":404.45,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/17-9.png","element":"img","alt":" (S1, S2, A1, A2, r, P, γ)","inline":true},{"text":", where","element":"span"}],[{"style":{"width":"35%"},"width":656,"height":286,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/17-10.png","element":"img"}],[{"text":"In words, the transition is deterministic, and the expected reward is independent of the action taken and the current state.","element":"span"}],[{"text":"Let ","element":"span"},{"style":{"height":13.19},"width":42.26,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/17-11.png","element":"img","alt":" Rt","inline":true,"padRight":true},{"text":"be the observed reward at step ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":". 
We assume that the distribution of ","element":"span"},{"style":{"height":19.2},"width":533.22,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/17-12.png","element":"img","alt":" Rt given xt is Bernoulli(r(xt)),","inline":true,"padRight":true},{"text":"independently of ","element":"span"},{"style":{"height":17.2},"width":299.26,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/17-13.png","element":"img","alt":" (x1, x2, . . . , xt−1)","inline":true},{"text":". The expected reward function ","element":"span"},{"style":{"height":17.37},"width":360.44,"height":43.42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/17-14.png","element":"img","alt":" r(xt) = E [R(xt)|xt]","inline":true,"padRight":true},{"text":"is assumed to be 1-Lipschitz and bounded. It is easy to see that for all ","element":"span"},{"style":{"height":14.8},"width":235.54,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/17-15.png","element":"img","alt":" x ∈ S, a ∈ A,","inline":true}],[{"id":"id-56","style":{"width":"56%"},"width":1056,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/17-16.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Step 3. Reduction from regression to a Markov game ","element":"span"},{"text":"Given a non-parametric regression problem as described in Step 1, we may reduce it to the problem of estimating the value function ","element":"span"},{"style":{"height":11.38},"width":48.1,"height":28.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/17-17.png","element":"img","alt":" V ∗ ","inline":true,"padRight":true},{"text":"of the Markov game described in Step 2. 
To do this, we set","element":"span"}],[{"style":{"width":"22%"},"width":428,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/17-18.png","element":"img"}],[{"text":"and","element":"span"}],[{"style":{"width":"25%"},"width":482,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/17-19.png","element":"img"}],[{"text":"In this case, it follows from equations (","element":"span"},{"href":"#id-56","text":"15","element":"a"},{"text":") that the value function is given by ","element":"span"},{"style":{"height":14.59},"width":128.6,"height":36.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/17-20.png","element":"img","alt":" V ∗ = f","inline":true},{"text":". Moreover, the expected reward function ","element":"span"},{"style":{"height":17.2},"width":63.75,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/17-21.png","element":"img","alt":" r(·)","inline":true,"padRight":true},{"text":"is 1-Lipschitz, so the assumptions of the Markov game in Step 2 are satisfied. 
This reduction shows that the Markov game problem is at least as hard as the nonparametric regression problem, so a lower bound for the latter is also a lower bound for the former.","element":"span"}],[{"text":"Applying Theorem ","element":"span"},{"href":"#id-57","text":"10 ","element":"a"},{"text":"yields the following result: for any number ","element":"span"},{"style":{"height":17.2},"width":85.11,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/18-0.png","element":"img","alt":" δ ∈ (","inline":true},{"text":"0, 1","element":"span"},{"text":")","element":"span"},{"text":", there exist some numbers ","element":"span"},{"style":{"height":13.72},"width":263.8,"height":34.3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/18-1.png","element":"img","alt":"c > 0 and Tδ >","inline":true,"padRight":true},{"text":"0, such that","element":"span"}],[{"style":{"width":"62%"},"width":1164,"height":115,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/18-2.png","element":"img"}],[{"text":"Consequently, for any reinforcement learning algorithm ","element":"span"},{"style":{"height":17.4},"width":46.25,"height":43.51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/18-3.png","element":"img","alt":"ˆVT","inline":true,"padRight":true},{"text":"and any sufficiently small ","element":"span"},{"style":{"height":9.6},"width":60.69,"height":24,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/18-4.png","element":"img","alt":" ε >","inline":true,"padRight":true},{"text":"0, there exists a Markov game problem such that in order to achieve","element":"span"}],[{"style":{"width":"27%"},"width":510,"height":73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/18-5.png","element":"img"}],[{"text":"one must 
have","element":"span"}],[{"style":{"width":"24%"},"width":468,"height":93,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/18-6.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":11.6},"width":84.12,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/18-7.png","element":"img","alt":" C′ >","inline":true,"padRight":true},{"text":"0 is a constant.","element":"span"}],[{"id":"id-45","style":{"fontWeight":"bold"},"text":"E ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Details: A Concrete Instantiation of the Key Modules","element":"span"}],[{"text":"In Section ","element":"span"},{"text":"6","element":"span"},{"text":", we provide a sketch of our instantiation of the key modules and their informal properties. In this appendix, we close the gap by giving a detailed treatment on each of the three modules. We discuss in details each instantiation and its formal property. Combining together, we provide a precise statement on the sample complexity of the overall EIS algorithm.","element":"span"}],[{"id":"id-67","style":{"fontWeight":"bold"},"text":"E.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Improvement Module: MCTS","element":"span"}],[{"text":"Recall that the improvement module should be capable of providing improved estimates for both the value and policy functions, at the queried state ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":". 
Since both the value and the policy are closely related to the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Q ","element":"span"},{"text":"function, one simple approach to simultaneously produce improved estimates is to obtain better estimates of ","element":"span"},{"style":{"height":14.18},"width":47.5,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/18-8.png","element":"img","alt":"Q∗ ","inline":true,"padRight":true},{"text":"first and then construct the improved estimates of value and policy from ","element":"span"},{"style":{"height":17.42},"width":32,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/18-9.png","element":"img","alt":"ˆQ","inline":true},{"text":". We will take this approach in this example and use MCTS to obtain the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Q ","element":"span"},{"text":"estimates.","element":"span"}],[{"text":"MCTS is a class of popular search algorithms for sequential decision-making, which work by building search trees and randomly sampling the state space. It is also one of the key components underlying the success of AlphaGo Zero. Most variants of MCTS in the literature use some form of upper confidence bound (UCB) algorithm to select actions at each depth of the search tree. 
Since our focus is to demonstrate the improvement property, we employ a fixed ","element":"span"},{"style":{"fontStyle":"italic"},"text":"H","element":"span"},{"text":"-depth MCTS, which takes the current model of the value function ","element":"span"},{"style":{"height":13.72},"width":33.24,"height":34.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/18-10.png","element":"img","alt":" Vl","inline":true,"padRight":true},{"text":"as inputs and outputs a value estimate ","element":"span"},{"style":{"height":19.02},"width":84.38,"height":47.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/18-11.png","element":"img","alt":"ˆV (s)","inline":true,"padRight":true},{"text":"of the root node ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":". The current model ","element":"span"},{"style":{"height":13.72},"width":33.24,"height":34.3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/18-12.png","element":"img","alt":" Vl","inline":true,"padRight":true},{"text":"of the value function is used for evaluating the value of the leaf nodes at depth ","element":"span"},{"style":{"fontStyle":"italic"},"text":"H ","element":"span"},{"text":"during the Monte Carlo simulation. This fixed depth MCTS has been rigorously analyzed in ","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"Shah et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"2019","element":"a"},{"text":") with non-asymptotic error bound for the root node, when the state transition is deterministic.","element":"span"}],[{"text":"We refer readers to ","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"Shah et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"2019","element":"a"},{"text":") (precisely, Algorithm 2) for the details of the pseudo code. 
We remark that in principle, many other variants of MCTS that have a precise error guarantee could be used instead; we choose the fixed ","element":"span"},{"style":{"fontStyle":"italic"},"text":"H","element":"span"},{"text":"-depth variant here to provide a concrete example.","element":"span"}],[{"text":"We now lay down the overall algorithm of the improvement module in Algorithm ","element":"span"},{"href":"#id-46","text":"2 ","element":"a"},{"text":"below. Recall that the state transition is deterministic and the reward ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a","element":"span"},{"text":") ","element":"span"},{"text":"could be random (cf. Section ","element":"span"},{"text":"2","element":"span"},{"text":"). Given the queried state ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":", note that the Q-value estimate ","element":"span"},{"style":{"height":19.02},"width":122.75,"height":47.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/18-13.png","element":"img","alt":"ˆQ(s, a)","inline":true,"padRight":true},{"text":"for each ","element":"span"},{"style":{"height":12.4},"width":101.8,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/18-14.png","element":"img","alt":" a ∈ A","inline":true,"padRight":true},{"text":"is given by the sum of two components: (1) empirical average of the reward ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a","element":"span"},{"text":")","element":"span"},{"text":"; (2) the estimated value 
","element":"span"},{"style":{"height":19.02},"width":141.03,"height":47.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/18-15.png","element":"img","alt":"ˆV (s ◦ a)","inline":true,"padRight":true},{"text":"for the next state, returned from calling the fixed depth MCTS algorithm with ","element":"span"},{"style":{"height":7.2},"width":75.08,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/18-16.png","element":"img","alt":" s ◦ a","inline":true,"padRight":true},{"text":"being the root node. Further recall that we use player ","element":"span"},{"text":"P1 ","element":"span"},{"text":"as the reference (i.e., ","element":"span"},{"style":{"height":18.19},"width":293.49,"height":45.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/18-17.png","element":"img","alt":" r(s, a) ≜ r1(s, a)","inline":true},{"text":"). The module then obtains improved value estimate ","element":"span"},{"style":{"height":19.02},"width":84.38,"height":47.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/18-18.png","element":"img","alt":"ˆV (s)","inline":true,"padRight":true},{"text":"by taking proper max or min of the Q-value estimates ","element":"span"},{"style":{"height":19.02},"width":122.7,"height":47.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/18-19.png","element":"img","alt":"ˆQ(s, a)","inline":true},{"text":"—depending on whether ","element":"span"},{"style":{"fontStyle":"italic"},"text":"I","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":") ","element":"span"},{"text":"is player ","element":"span"},{"text":"P1 ","element":"span"},{"text":"(maximizer) or player ","element":"span"},{"text":"P2 ","element":"span"},{"text":"(minimizer)—and improved policy estimate 
","element":"span"},{"style":{"height":17.2},"width":98.56,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/18-20.png","element":"img","alt":" ˆπ(·|s)","inline":true,"padRight":true},{"text":"as the Boltzmann policy based on ","element":"span"},{"style":{"height":19.02},"width":122.78,"height":47.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/18-21.png","element":"img","alt":"ˆQ(s, a)","inline":true},{"text":". It is worth mentioning that the fixed depth MCTS algorithm was designed for discounted MDP in ","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"Shah ","element":"a"},{"href":"#id-33","referenceIndex":26,"text":"et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"2019","element":"a"},{"text":"), but extending to game setting is straightforward as in literature (","element":"span"},{"href":"#id-58","referenceIndex":9,"text":"Kocsis et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-58","referenceIndex":9,"text":"2006","element":"a"},{"text":"; ","element":"span"},{"href":"#id-59","referenceIndex":7,"text":"Kaufmann ","element":"a"},{"href":"#id-59","referenceIndex":7,"text":"& Koolen","element":"a"},{"text":", ","element":"span"},{"href":"#id-59","referenceIndex":7,"text":"2017","element":"a"},{"text":"), i.e., by alternating between max nodes (i.e., ","element":"span"},{"text":"P1 ","element":"span"},{"text":"plays) and min nodes (i.e., ","element":"span"},{"text":"P2","element":"span"},{"text":") for each depth in the tree. 
We defer details to Appendix ","element":"span"},{"href":"#id-60","text":"F","element":"a"},{"text":", where we prove the key theorem, Theorem ","element":"span"},{"href":"#id-47","text":"11","element":"a"},{"text":", of our improvement module.","element":"span"}],[{"id":"id-46","style":{"width":"100%"},"width":1875,"height":1104,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/19-0.png","element":"img"}],[{"text":"The following theorem states the property of this specific improvement module (Algorithm ","element":"span"},{"href":"#id-46","text":"2","element":"a"},{"text":"). It is not hard to see that it directly implies the desired improvement property, i.e., Property ","element":"span"},{"href":"#id-38","text":"1","element":"a"},{"text":".","element":"span"}],[{"id":"id-47","style":{"fontWeight":"bold"},"text":"Theorem 11. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Suppose that the state transitions are deterministic. Given the current model ","element":"span"},{"style":{"height":17.2},"width":186.95,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/19-1.png","element":"img","alt":" f = (V , π)","inline":true},{"style":{"fontStyle":"italic"},"text":", a small temperature ","element":"span"},{"style":{"height":9.6},"width":64,"height":24,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/19-2.png","element":"img","alt":" τ <","inline":true,"padRight":true},{"text":"1","element":"span"},{"style":{"fontStyle":"italic"},"text":", and the improvement factors ","element":"span"},{"text":"0 ","element":"span"},{"style":{"height":14},"width":120.32,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/19-3.png","element":"img","alt":" < ζv <","inline":true,"padRight":true},{"text":"1 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"text":"0 
","element":"span"},{"style":{"height":15.59},"width":119.76,"height":38.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/19-4.png","element":"img","alt":" < ζp <","inline":true,"padRight":true},{"text":"1","element":"span"},{"style":{"fontStyle":"italic"},"text":". Suppose that the current value model, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"V ","element":"span"},{"style":{"fontStyle":"italic"},"text":", satisfies that","element":"span"}],[{"style":{"width":"21%"},"width":398,"height":73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/19-5.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Then, with appropriately chosen parameters for Algorithm ","element":"span"},{"href":"#id-46","style":{"fontStyle":"italic"},"text":"2","element":"a"},{"style":{"fontStyle":"italic"},"text":", for each query state ","element":"span"},{"style":{"height":20.11},"width":457.93,"height":50.26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/19-6.png","element":"img","alt":" s0 ∈ S, ( ˆV (s0), ˆπ(·|s0)) =","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"Improvement Module","element":"span"},{"style":{"height":17.2},"width":287.92,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/19-7.png","element":"img","alt":"(f, s0), we have:","inline":true}],[{"style":{"fontStyle":"italic"},"text":"1. ","element":"span"},{"style":{"height":28.8},"width":1333.68,"height":72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/19-8.png","element":"img","alt":" E[| ˆV (s0) − V ∗(s0)|] ≤ ζv · ε0,v, and E[DKL(ˆπ(·|s0)||P ∗τ (·|s0))] ≤ ζp · ε0,v.","inline":true}],[{"style":{"fontStyle":"italic"},"text":"2. 
The above is achieved with a sample complexity of","element":"span"}],[{"style":{"width":"46%"},"width":866,"height":96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/19-9.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"E.2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Supervised Learning Module: Nearest Neighbor Regression.","element":"span"}],[{"text":"To establish the generalization property of nearest neighbor supervised learning algorithm for estimating the optimal value function and policy, we make the following structural assumption about the Markov game. Specifically, we assume that the optimal solutions (i.e., true regression function) are smooth in some sense.","element":"span"}],[{"id":"id-63","style":{"fontWeight":"bold"},"text":"Assumption 1. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"(A1.) The state space ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is a compact subset of ","element":"span"},{"style":{"height":14.59},"width":49.28,"height":36.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-0.png","element":"img","alt":" Rd","inline":true},{"style":{"fontStyle":"italic"},"text":". 
The chosen distance metric ","element":"span"},{"style":{"height":11.6},"width":204.15,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-1.png","element":"img","alt":" d : S × S →","inline":true},{"style":{"height":14.79},"width":57.66,"height":36.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-2.png","element":"img","alt":"R+","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"associated with the state space ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"style":{"fontStyle":"italic"},"text":"satisfies that ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"S","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d","element":"span"},{"text":") ","element":"span"},{"style":{"fontStyle":"italic"},"text":"forms a compact metric space. (A2.) The optimal value function ","element":"span"},{"style":{"height":11.78},"width":215.44,"height":29.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-3.png","element":"img","alt":" V ∗ : S → R","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is bounded by ","element":"span"},{"style":{"height":13.19},"width":83.14,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-4.png","element":"img","alt":" Vmax","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and satisfies Lipschitz continuity with parameter ","element":"span"},{"style":{"height":13.19},"width":43.12,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-5.png","element":"img","alt":" Lv","inline":true},{"style":{"fontStyle":"italic"},"text":", i.e., ","element":"span"},{"style":{"height":17.2},"width":722.22,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-6.png","element":"img","alt":"∀s, s′ ∈ S, |V ∗(s) − V ∗(s′)| 
≤ Lvd(s, s′)","inline":true},{"text":". ","element":"span"},{"style":{"fontStyle":"italic"},"text":"(A3.) The optimal Boltzmann policy ","element":"span"},{"style":{"height":15.32},"width":47.12,"height":38.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-7.png","element":"img","alt":" P ∗τ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"defined in Eq. (","element":"span"},{"href":"#id-61","style":{"fontStyle":"italic"},"text":"4","element":"a"},{"style":{"fontStyle":"italic"},"text":") is ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Lipschitz continuous with parameter ","element":"span"},{"style":{"height":17.59},"width":1179.65,"height":43.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-8.png","element":"img","alt":" Lp(τ), i.e., ∀s, s′ ∈ S, ∀a ∈ A, |P ∗τ (a|s) − P ∗τ (a|s′)| ≤ Lp(τ)d(s, s′).","inline":true}],[{"text":"For each ","element":"span"},{"style":{"fontStyle":"italic"},"text":"h > ","element":"span"},{"text":"0, the compact ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"text":"has a finite ","element":"span"},{"style":{"fontStyle":"italic"},"text":"h","element":"span"},{"text":"/","element":"span"},{"text":"2-covering number ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"h","element":"span"},{"text":")","element":"span"},{"text":". 
There exists a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"partition ","element":"span"},{"text":"of ","element":"span"},{"style":{"height":35.81},"width":1877.25,"height":89.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-9.png","element":"img","alt":" S, {Bj, j ∈ [N(h)]}","inline":true},{"text":", such that each ","element":"span"},{"style":{"height":15.72},"width":44.22,"height":39.3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-10.png","element":"img","alt":" Bj","inline":true,"padRight":true},{"text":"has a diameter at most ","element":"span"},{"style":{"fontStyle":"italic"},"text":"h","element":"span"},{"text":", that is, sup","element":"span"},{"style":{"height":20.82},"width":309.3,"height":52.05,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-11.png","element":"img","alt":"x,y∈Bj d(x, y) ≤ h","inline":true},{"text":". We assume that states in the training set, ","element":"span"},{"style":{"height":17.2},"width":295.84,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-12.png","element":"img","alt":" T := {si, i ∈ [n]}","inline":true},{"text":", are sufficiently representative in the sense that for any given ","element":"span"},{"style":{"fontStyle":"italic"},"text":"h ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K","element":"span"},{"text":", the sample size ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n ","element":"span"},{"text":"can be chosen large enough to ensure that","element":"span"},{"style":{"height":19.96},"width":645.84,"height":49.91,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-13.png","element":"img","alt":"|Bj ∩ T| ≥ K for all j ∈ [N(h)]. 
If T","inline":true,"padRight":true},{"text":"satisfies this condition, we call it ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"h","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K","element":"span"},{"text":")","element":"span"},{"style":{"fontStyle":"italic"},"text":"-representative","element":"span"},{"text":".","element":"span"}],[{"style":{"width":"96%"},"width":1809,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-14.png","element":"img"}],[{"text":"algorithm: set","element":"span"}],[{"style":{"width":"41%"},"width":781,"height":108,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-15.png","element":"img"}],[{"text":"For each ","element":"span"},{"style":{"height":12.4},"width":108.25,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-16.png","element":"img","alt":" a ∈ A","inline":true},{"text":", a similar algorithm can be used to fit the action probability ","element":"span"},{"style":{"height":17.2},"width":297.95,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-17.png","element":"img","alt":" πNN(a|·) : S → [","inline":true},{"text":"0, 1","element":"span"},{"text":"]","element":"span"},{"text":". 
The proposition below, proved in Appendix ","element":"span"},{"href":"#id-62","text":"G","element":"a"},{"text":", shows that this algorithm has the desired generalization property.","element":"span"}],[{"text":"To simplify the notation, we use ","element":"span"},{"style":{"height":9.59},"width":34.58,"height":23.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-18.png","element":"img","alt":" εv","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":11.99},"width":35.58,"height":29.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-19.png","element":"img","alt":" εp","inline":true,"padRight":true},{"text":"to represent the estimation errors of the value function and the policy, respectively, for the training data. That is, ","element":"span"},{"style":{"height":15.59},"width":396.72,"height":38.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-20.png","element":"img","alt":" εv ≜ ε1,v and εp ≜ ε1,p","inline":true,"padRight":true},{"text":"in Property 2.","element":"span"}],[{"id":"id-48","style":{"fontWeight":"bold"},"text":"Proposition 12. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Suppose Assumption ","element":"span"},{"href":"#id-63","style":{"fontStyle":"italic"},"text":"1 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"holds. If the training data is representative with respect to appropriate ","element":"span"},{"style":{"fontStyle":"italic"},"text":"h > ","element":"span"},{"text":"0 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K > ","element":"span"},{"text":"0","element":"span"},{"style":{"fontStyle":"italic"},"text":", the above regression algorithm satisfies Property ","element":"span"},{"href":"#id-39","style":{"fontStyle":"italic"},"text":"2","element":"a"},{"style":{"fontStyle":"italic"},"text":". 
In particular, if","element":"span"}],[{"id":"id-65","style":{"width":"83%"},"width":1564,"height":103,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-21.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"we have","element":"span"}],[{"style":{"width":"43%"},"width":810,"height":111,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-22.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where the constant ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c > ","element":"span"},{"text":"0 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is independent of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n","element":"span"},{"style":{"fontStyle":"italic"},"text":", the size of the training data.","element":"span"}],[{"text":"The size of a representative data set should at least scale as ","element":"span"},{"style":{"fontStyle":"italic"},"text":"KN","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"h","element":"span"},{"text":")","element":"span"},{"text":". Consider a simple setting where the state space is a unit volume hypercube in ","element":"span"},{"style":{"height":14.58},"width":49.28,"height":36.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-23.png","element":"img","alt":" Rd","inline":true,"padRight":true},{"text":"with ","element":"span"},{"style":{"height":13.19},"width":43.89,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-24.png","element":"img","alt":" l∞","inline":true,"padRight":true},{"text":"metric. 
By (","element":"span"},{"href":"#id-64","referenceIndex":35,"text":"Wainwright","element":"a"},{"text":", ","element":"span"},{"href":"#id-64","referenceIndex":35,"text":"2019","element":"a"},{"text":", Lemma 5.7), the covering number ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"h","element":"span"},{"text":") ","element":"span"},{"text":"of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"text":"scales as ","element":"span"},{"style":{"height":19.67},"width":189.22,"height":49.18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-25.png","element":"img","alt":" Θ((1/h)^d)","inline":true},{"text":". Let ","element":"span"},{"style":{"height":16},"width":195.2,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-26.png","element":"img","alt":" ε = min{εv","inline":true},{"text":", ","element":"span"},{"style":{"height":18.09},"width":89.98,"height":45.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-27.png","element":"img","alt":"√εp}","inline":true},{"text":". 
Note that ","element":"span"},{"style":{"height":17.2},"width":162.87,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-28.png","element":"img","alt":" h = Θ(ε)","inline":true,"padRight":true},{"text":"Therefore, to achieve the desired generalization property, the size of the training dataset should satisfy","element":"span"}],[{"style":{"width":"38%"},"width":728,"height":96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-29.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"E.3 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Exploration Module: Random Sampling Policy","element":"span"}],[{"text":"As stated in Proposition ","element":"span"},{"href":"#id-48","text":"12","element":"a"},{"text":", the sampled states for nearest neighbor regression should be ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"h","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K","element":"span"},{"text":")","element":"span"},{"text":"-representative, where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"h","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K ","element":"span"},{"text":"are given by Eq. (","element":"span"},{"href":"#id-65","text":"19","element":"a"},{"text":"). We will show that a random sampling policy—uniformly sampling the state space—is able to visit a set of ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"h","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K","element":"span"},{"text":")","element":"span"},{"text":"-representative states within a finite expected number of steps. We need to assume that the state space ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"text":"is sufficiently regular near the boundary. 
In particular, we impose the following assumption which is naturally satisfied by convex compact sets in ","element":"span"},{"style":{"height":14.58},"width":49.28,"height":36.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-30.png","element":"img","alt":" Rd","inline":true},{"text":", for example.","element":"span"}],[{"id":"id-66","style":{"fontWeight":"bold"},"text":"Assumption 2. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"The partition ","element":"span"},{"style":{"height":17.72},"width":529.11,"height":44.3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-31.png","element":"img","alt":" {Bj, j ∈ [N(h)]} of S satisfies","inline":true}],[{"style":{"width":"64%"},"width":1217,"height":97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-32.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"for some constant ","element":"span"},{"style":{"height":14},"width":270.4,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-33.png","element":"img","alt":" c0 > 0, where λ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is the Lebesgue measure in ","element":"span"},{"style":{"height":14.58},"width":62.83,"height":36.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/20-34.png","element":"img","alt":" Rd.","inline":true}],[{"id":"id-49","style":{"fontWeight":"bold"},"text":"Proposition 13. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Suppose that the state space ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is a compact subset of ","element":"span"},{"style":{"height":14.59},"width":49.28,"height":36.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/21-0.png","element":"img","alt":" Rd ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"satisfying Assumption ","element":"span"},{"href":"#id-66","style":{"fontStyle":"italic"},"text":"2","element":"a"},{"style":{"fontStyle":"italic"},"text":". Given temperature ","element":"span"},{"style":{"height":11.6},"width":95.07,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/21-1.png","element":"img","alt":" τ > 0","inline":true},{"style":{"fontStyle":"italic"},"text":", and estimation errors ","element":"span"},{"style":{"height":15.59},"width":310.83,"height":38.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/21-2.png","element":"img","alt":" εv > 0 and εp > 0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"for the value and policy respectively, define ","element":"span"},{"style":{"fontStyle":"italic"},"text":"h","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K ","element":"span"},{"style":{"fontStyle":"italic"},"text":"as in Eq. (","element":"span"},{"href":"#id-65","style":{"fontStyle":"italic"},"text":"19","element":"a"},{"style":{"fontStyle":"italic"},"text":"). 
Then the expected number of steps to obtain a set of ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"h","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K","element":"span"},{"text":")","element":"span"},{"style":{"fontStyle":"italic"},"text":"-representative states under the random sampling policy is upper bounded by","element":"span"}],[{"style":{"width":"35%"},"width":662,"height":97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/21-3.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"E.4 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Convergence Guarantees of the Instance","element":"span"}],[{"text":"For the instance of EIS algorithm with MCTS, random sampling policy and nearest neighbor supervised learning, we have shown that each module satisfies the desired properties (cf. Theorem ","element":"span"},{"href":"#id-47","text":"11 ","element":"a"},{"text":"and Propositions ","element":"span"},{"href":"#id-48","text":"12","element":"a"},{"text":"–","element":"span"},{"href":"#id-49","text":"13","element":"a"},{"text":"). Therefore, the convergence result stated in Theorem ","element":"span"},{"href":"#id-3","text":"2 ","element":"a"},{"text":"holds for the specific instance we consider here. Moreover, the non-asymptotic analysis of these three methods provides an explicit upper bound on the sample complexity of this instance. The following theorem states the precise result.","element":"span"}],[{"id":"id-50","style":{"fontWeight":"bold"},"text":"Theorem 14. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Suppose that Assumptions ","element":"span"},{"href":"#id-63","style":{"fontStyle":"italic"},"text":"1 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"href":"#id-66","style":{"fontStyle":"italic"},"text":"2 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"hold. 
For a given ","element":"span"},{"style":{"height":17.2},"width":159.45,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/21-4.png","element":"img","alt":" ρ ∈ (0, 1)","inline":true},{"style":{"fontStyle":"italic"},"text":", and a small ","element":"span"},{"style":{"height":13.6},"width":105.59,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/21-5.png","element":"img","alt":" τ < 1,","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"there exist appropriately chosen parameters for the instance of Algorithm ","element":"span"},{"href":"#id-37","style":{"fontStyle":"italic"},"text":"1 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"with MCTS, random sampling and nearest neighbor supervised learning, such that:","element":"span"}],[{"style":{"width":"81%"},"width":1529,"height":674,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/21-6.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"3. 
In particular, if ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is a unit volume hypercube in ","element":"span"},{"style":{"height":14.59},"width":49.28,"height":36.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/21-7.png","element":"img","alt":" Rd","inline":true},{"style":{"fontStyle":"italic"},"text":", then the total sample complexity to achieve ","element":"span"},{"style":{"height":7.2},"width":121.42,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/21-8.png","element":"img","alt":" ε-error","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"value function and policy is given by","element":"span"}],[{"style":{"width":"34%"},"width":652,"height":97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/21-9.png","element":"img"}],[{"text":"Theorem ","element":"span"},{"href":"#id-50","text":"14 ","element":"a"},{"text":"states that the sample complexity of the instance of EIS algorithm scales as ","element":"span"},{"style":{"height":21.34},"width":316.66,"height":53.35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/21-10.png","element":"img","alt":"Θ̃(1/ε^{4+d}) (omitting","inline":true},{"text":"the logarithmic factor). 
Note that Theorem ","element":"span"},{"href":"#id-6","text":"4 ","element":"a"},{"text":"implies that for any policy to learn the optimal value function within ","element":"span"},{"style":{"height":7.2},"width":19,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/21-11.png","element":"img","alt":" ε","inline":true,"padRight":true},{"text":"approximation error, the number of samples required must scale as ","element":"span"},{"style":{"height":21.34},"width":146.23,"height":53.34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/21-12.png","element":"img","alt":"Ω̃(1/ε^{2+d})","inline":true},{"text":". 
Hence in terms of the dependence on the dimension, the instance we consider here is nearly optimal.","element":"span"}],[{"id":"id-60","style":{"fontWeight":"bold"},"text":"F ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Example Improvement Module and Proof of Theorem ","element":"span"},{"href":"#id-47","style":{"fontWeight":"bold"},"text":"11","element":"a"}],[{"text":"In this section, we formally show the improvement property of the specific example in Section ","element":"span"},{"href":"#id-67","text":"E.1","element":"a"},{"text":". To this end, we first elaborate some details regarding Algorithm ","element":"span"},{"href":"#id-46","text":"2 ","element":"a"},{"text":"in Appendix ","element":"span"},{"href":"#id-68","text":"F.1","element":"a"},{"text":". We then state two useful lemmas in Appendix ","element":"span"},{"href":"#id-69","text":"F.2 ","element":"a"},{"text":"and finally, we complete the proof of Theorem ","element":"span"},{"href":"#id-47","text":"11 ","element":"a"},{"text":"in Appendix ","element":"span"},{"href":"#id-70","text":"F.3","element":"a"},{"text":".","element":"span"}],[{"id":"id-68","style":{"fontWeight":"bold"},"text":"F.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Details of the Improvement Module Example","element":"span"}],[{"text":"Before proving the theorem, let us first discuss some details of the improvement module (i.e., Algorithm ","element":"span"},{"href":"#id-46","text":"2","element":"a"},{"text":"). It is worth mentioning some necessary modifications for applying the fixed ","element":"span"},{"style":{"fontStyle":"italic"},"text":"H","element":"span"},{"text":"-depth MCTS algorithm (","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"Shah et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"2019","element":"a"},{"text":") in Algorithm 2. 
In particular, the original algorithm was introduced and analyzed for infinite-horizon discounted MDPs, but extending it to a game setting is straightforward, following the literature (","element":"span"},{"href":"#id-58","referenceIndex":9,"text":"Kocsis et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-58","referenceIndex":9,"text":"2006","element":"a"},{"text":"; ","element":"span"},{"href":"#id-59","referenceIndex":7,"text":"Kaufmann & Koolen","element":"a"},{"text":", ","element":"span"},{"href":"#id-59","referenceIndex":7,"text":"2017","element":"a"},{"text":"). We now elaborate on both the algorithmic and the technical extensions of the fixed ","element":"span"},{"style":{"fontStyle":"italic"},"text":"H","element":"span"},{"text":"-depth MCTS algorithm for Markov games.","element":"span"}],[{"text":"Algorithmically, for turn-based zero-sum games, each layer in the tree would alternate between max nodes (i.e., player ","element":"span"},{"text":"P1","element":"span"},{"text":"’s turn) and min nodes (i.e., player ","element":"span"},{"text":"P2","element":"span"},{"text":"’s turn). For max nodes, the algorithm proceeds as usual by selecting the action with the maximum sum of the empirical average and the upper confidence term. For min nodes, the algorithm could choose the action with the minimum value of the empirical average minus the upper confidence term. More precisely, if the current node is a min node, then Line 6 (action selection) of the fixed depth MCTS algorithm in ","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"Shah et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"2019","element":"a"},{"text":") should be modified to (using the notation in ","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"Shah et al. 
","element":"a"},{"text":"(","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"2019","element":"a"},{"text":") to be consistent):","element":"span"}],[{"style":{"width":"89%"},"width":1682,"height":144,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/22-0.png","element":"img"}],[{"text":"Alternatively, the algorithm could first negate the empirical average and then choose the action that maximizes the sum of the negated empirical average and the upper confidence term. With these modifications, the fixed depth MCTS algorithm could be used to estimate values for the game setting considered in this paper.","element":"span"}],[{"text":"Technically, we note that one could readily obtain the same guarantees for the fixed depth MCTS algorithm for the game setting as the algorithm for MDPs in ","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"Shah et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"2019","element":"a"},{"text":"), by following essentially the same proof in Appendix A of ","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"Shah et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"2019","element":"a"},{"text":"). We only remark some technical points in the following:","element":"span"}],[{"text":"1. The concentration results (cf. Appendix A.4 of ","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"Shah et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"2019","element":"a"},{"text":")) still hold in the game setting. The original concentration inequalities in ","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"Shah et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"2019","element":"a"},{"text":") are two-sided. 
Therefore, they apply to both the max nodes and min nodes.","element":"span"}],[{"text":"2. The technical results were derived for rewards that are bounded in ","element":"span"},{"text":"[","element":"span"},{"text":"0, 1","element":"span"},{"text":"] ","element":"span"},{"text":"for convenience. It is not hard to see (cf. Remark 1 in Appendix A.3 of ","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"Shah et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"2019","element":"a"},{"text":")) that the same proof applies seamlessly for our setting, i.e., bounded rewards in ","element":"span"},{"style":{"height":17.2},"width":269.18,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/22-1.png","element":"img","alt":" [−Rmax, Rmax].","inline":true}],[{"text":"3. Since the original derivation was for MDPs, Lemma 4 in Appendix A.8 of ","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"Shah et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"2019","element":"a"},{"text":") used the Bellman equation for MDPs. In the game setting, it is straightforward to replace it with the Bellman equation for the Markov games:","element":"span"}],[{"style":{"width":"76%"},"width":1436,"height":200,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/22-2.png","element":"img"}],[{"id":"id-69","style":{"fontWeight":"bold"},"text":"F.2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Two Useful Lemmas","element":"span"}],[{"text":"We first state two useful lemmas. The first lemma bounds the difference between the two Boltzmann policies in terms of the difference of the underlying ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Q ","element":"span"},{"text":"values that are used to construct the policies.","element":"span"}],[{"id":"id-75","style":{"fontWeight":"bold"},"text":"Lemma 15. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Fix a state ","element":"span"},{"style":{"height":12},"width":93.41,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/22-3.png","element":"img","alt":" s ∈ S","inline":true},{"style":{"fontStyle":"italic"},"text":". Suppose that the Q-value estimates satisfy","element":"span"}],[{"style":{"width":"36%"},"width":675,"height":73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/22-4.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Consider the two Boltzmann policies with temperature ","element":"span"},{"style":{"height":11.6},"width":107.99,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/23-0.png","element":"img","alt":" τ > 0:","inline":true}],[{"style":{"width":"73%"},"width":1383,"height":249,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/23-1.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"Since ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s ","element":"span"},{"text":"is fixed, we drop it for the ease of exposition. Let ","element":"span"},{"style":{"height":21.52},"width":299.1,"height":53.81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/23-2.png","element":"img","alt":" C = Σa′ e^{ˆQ(a′)/τ}","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":20.13},"width":334.45,"height":50.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/23-3.png","element":"img","alt":" C∗ = Σa′ e^{Q∗(a′)/τ}","inline":true},{"text":". ","element":"span"},{"text":"Then","element":"span"}],[{"style":{"width":"66%"},"width":1251,"height":605,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/23-4.png","element":"img"}],[{"text":"The second term above can be bounded using the log-sum inequality (cf. 
Appendix ","element":"span"},{"href":"#id-71","text":"A","element":"a"},{"text":"), which gives ","element":"span"},{"style":{"height":20.85},"width":208.1,"height":52.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/23-5.png","element":"img","alt":" C∗ log(C∗/C) ≤","inline":true},{"style":{"height":28.6},"width":402.42,"height":71.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/23-6.png","element":"img","alt":"Σa e^{Q∗(a)/τ} log(e^{Q∗(a)/τ} / e^{ˆQ(a)/τ})","inline":true,"padRight":true},{"text":". We then continue the above chain of inequalities to obtain","element":"span"}],[{"style":{"width":"63%"},"width":1182,"height":520,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/23-7.png","element":"img"}],[{"text":"Taking expectation, we have the bound","element":"span"}],[{"style":{"width":"53%"},"width":1000,"height":301,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/23-8.png","element":"img"}],[{"text":"as desired.","element":"span"}],[{"text":"The following lemma states a generic result regarding the maximum difference of two vectors.","element":"span"}],[{"id":"id-72","style":{"width":"98%"},"width":1851,"height":245,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/24-0.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"Assume that ","element":"span"},{"style":{"height":19.18},"width":963.51,"height":47.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/24-1.png","element":"img","alt":" i∗ ∈ arg maxi∈[n]{xi}, and j∗ ∈ arg maxj∈[n]{yj}. 
Then","inline":true}],[{"style":{"width":"74%"},"width":1403,"height":541,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/24-2.png","element":"img"}],[{"text":"The same argument holds for the other inequality, and this completes the proof of Lemma ","element":"span"},{"href":"#id-72","text":"16","element":"a"},{"text":".","element":"span"}],[{"style":{"width":"1%"},"width":28,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/24-3.png","element":"img"}],[{"id":"id-70","style":{"fontWeight":"bold"},"text":"F.3 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Improvement Property: Proof of Theorem ","element":"span"},{"href":"#id-47","style":{"fontWeight":"bold"},"text":"11","element":"a"}],[{"text":"We are now ready to prove Theorem ","element":"span"},{"href":"#id-47","text":"11","element":"a"},{"text":". As discussed in Appendix ","element":"span"},{"href":"#id-68","text":"F.1","element":"a"},{"text":", the same guarantees of the modified fixed ","element":"span"},{"style":{"fontStyle":"italic"},"text":"H","element":"span"},{"text":"-depth MCTS algorithm for the game setting can be established as in ","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"Shah et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"2019","element":"a"},{"text":"). In the sequel, we extend these guarantees, together with the previous two lemmas, to analyze the improvement module example. In particular, we derive error bounds for the outputs of Algorithm ","element":"span"},{"href":"#id-46","text":"2","element":"a"},{"text":", ","element":"span"},{"style":{"height":14.62},"width":144.47,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/24-4.png","element":"img","alt":"ˆV and ˆπ","inline":true},{"text":", and analyze the corresponding sample complexity.","element":"span"}],[{"text":"Consider deterministic state transitions. 
The complete proof proceeds in two steps. The first step is to analyze the outputs of querying the fixed ","element":"span"},{"style":{"fontStyle":"italic"},"text":"H","element":"span"},{"text":"-depth MCTS algorithm. Based on those outputs, as the next step we then analyze the outcomes of Algorithm ","element":"span"},{"href":"#id-46","text":"2","element":"a"},{"text":", ","element":"span"},{"style":{"height":14.62},"width":31,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/24-5.png","element":"img","alt":"ˆV","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":11.2},"width":23,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/24-6.png","element":"img","alt":" ˆπ","inline":true,"padRight":true},{"text":". Finally, we characterize the corresponding sample complexity of the overall process. Throughout the first two steps, since the current model ","element":"span"},{"style":{"height":17.2},"width":186.65,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/24-7.png","element":"img","alt":" f = (V , π)","inline":true,"padRight":true},{"text":"may be random, let us fix a realization; we will take expectation in the end to arrive at the desired results in Theorem ","element":"span"},{"href":"#id-47","text":"11","element":"a"},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Step 1: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Error bounds for outputs of the fixed depth MCTS algorithm. ","element":"span"},{"text":"In ","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"Shah et al. 
","element":"a"},{"text":"(","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"2019","element":"a"},{"text":"), the authors establish the following concentration result for the estimated value function of the root node ","element":"span"},{"style":{"height":19.02},"width":271.23,"height":47.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/24-8.png","element":"img","alt":" s, ˆVm(s), under","inline":true,"padRight":true},{"text":"the fixed ","element":"span"},{"style":{"fontStyle":"italic"},"text":"H","element":"span"},{"text":"-depth MCTS algorithm with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"simulations (cf. Proof of Theorem 1 in ","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"Shah et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"2019","element":"a"},{"text":")): there exist constants ","element":"span"},{"style":{"height":17.2},"width":573.14,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/24-9.png","element":"img","alt":" β > 1 and ξ > 0 and η ∈ [1/2, 1)","inline":true,"padRight":true},{"text":"such that for every ","element":"span"},{"style":{"height":14},"width":104.34,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/24-10.png","element":"img","alt":" z ≥ 1,","inline":true}],[{"style":{"width":"32%"},"width":610,"height":84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/24-11.png","element":"img"}],[{"text":"where the probability is measured with respect to the randomness in the MCTS algorithm, and ","element":"span"},{"style":{"height":21.45},"width":213.52,"height":53.63,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/24-12.png","element":"img","alt":" µ(0)s satisfies","inline":true,"padRight":true},{"text":"the following 
condition","element":"span"}],[{"style":{"width":"20%"},"width":387,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/24-13.png","element":"img"}],[{"text":"Here, ","element":"span"},{"style":{"height":9.59},"width":34.58,"height":23.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/24-14.png","element":"img","alt":" ε0","inline":true,"padRight":true},{"text":"denotes the error when evaluating the leaf nodes using current model, i.e., ","element":"span"},{"style":{"height":16},"width":420.74,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/24-15.png","element":"img","alt":" ε0 = ∥V − V ∗∥∞, where","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"V ","element":"span"},{"text":"is the current value model. Note that ","element":"span"},{"style":{"height":17.2},"width":131.48,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/24-16.png","element":"img","alt":" η ∈ [1/","inline":true},{"text":"2, 1","element":"span"},{"text":") ","element":"span"},{"text":"is a hyper-parameter for the MCTS algorithm that could be freely chosen. Throughout the proof, we set","element":"span"}],[{"style":{"width":"5%"},"width":112,"height":82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/24-17.png","element":"img"}],[{"text":"Recall that ","element":"span"},{"style":{"height":17.2},"width":88.05,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/25-0.png","element":"img","alt":" γ ∈ (","inline":true},{"text":"0, 1","element":"span"},{"text":") ","element":"span"},{"text":"is the discount factor. In addition, ","element":"span"},{"style":{"height":14},"width":18,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/25-1.png","element":"img","alt":" ξ","inline":true,"padRight":true},{"text":"is larger than 1 (cf. 
Section A.6.4 of ","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"Shah et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"2019","element":"a"},{"text":")). Therefore, for every ","element":"span"},{"style":{"height":16.99},"width":173.86,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/25-2.png","element":"img","alt":" t ≥ mη−1,","inline":true}],[{"style":{"width":"35%"},"width":671,"height":85,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/25-3.png","element":"img"}],[{"text":"It follows that","element":"span"}],[{"style":{"width":"71%"},"width":1349,"height":435,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/25-4.png","element":"img"}],[{"text":"Thus","element":"span"}],[{"style":{"width":"53%"},"width":998,"height":137,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/25-5.png","element":"img"}],[{"text":"This leads to a variant of Theorem 1 in ","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"Shah et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-33","referenceIndex":26,"text":"2019","element":"a"},{"text":") for the performance of the fixed depth MCTS, as stated below.","element":"span"}],[{"id":"id-73","style":{"fontWeight":"bold"},"text":"Proposition 17. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Let ","element":"span"},{"style":{"height":17.2},"width":186.86,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/25-6.png","element":"img","alt":" f = (V , π)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"be the current model and consider the fixed depth MCTS algorithm (with depth ","element":"span"},{"style":{"fontStyle":"italic"},"text":"H","element":"span"},{"style":{"fontStyle":"italic"},"text":") employed in Algorithm ","element":"span"},{"href":"#id-46","style":{"fontStyle":"italic"},"text":"2","element":"a"},{"style":{"fontStyle":"italic"},"text":". Then, for each query state ","element":"span"},{"style":{"height":11.6},"width":93.38,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/25-7.png","element":"img","alt":" s ∈ S","inline":true},{"style":{"fontStyle":"italic"},"text":", the following claim holds for the output ","element":"span"},{"style":{"height":17.01},"width":51.24,"height":42.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/25-8.png","element":"img","alt":"ˆVm","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"of the fixed ","element":"span"},{"style":{"fontStyle":"italic"},"text":"H","element":"span"},{"style":{"fontStyle":"italic"},"text":"-depth MCTS with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"style":{"fontStyle":"italic"},"text":"simulations:","element":"span"}],[{"style":{"width":"47%"},"width":888,"height":73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/25-9.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where ","element":"span"},{"style":{"height":17.2},"width":127.25,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/25-10.png","element":"img","alt":" η ∈ [1/","inline":true},{"text":"2, 1","element":"span"},{"text":") 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"is a constant and the expectation is taken with respect to the randomness in the MCTS simulations.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Step 2: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Error bounds for outputs of the improvement module. ","element":"span"},{"text":"Now, we are ready to obtain a non-asymptotic analysis for the outputs of Algorithm ","element":"span"},{"href":"#id-46","text":"2","element":"a"},{"text":". Consider Line 4 of Algorithm ","element":"span"},{"href":"#id-46","text":"2","element":"a"},{"text":", where we call the fixed depth MCTS algorithm on the state ","element":"span"},{"style":{"height":14.8},"width":492.2,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/25-11.png","element":"img","alt":" s0 ◦ a. For each a ∈ A, as m","inline":true,"padRight":true},{"text":"simulations are performed with root node ","element":"span"},{"style":{"height":9.59},"width":93.18,"height":23.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/25-12.png","element":"img","alt":"s0 ◦ a","inline":true,"padRight":true},{"text":"during the simulation, Proposition ","element":"span"},{"href":"#id-73","text":"17 ","element":"a"},{"text":"implies that","element":"span"}],[{"style":{"width":"76%"},"width":1440,"height":73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/25-13.png","element":"img"}],[{"text":"Recall that state transitions are assumed to be deterministic and note that the estimated Q-value for ","element":"span"},{"style":{"height":17.2},"width":108.52,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/25-14.png","element":"img","alt":" (s0, a)","inline":true,"padRight":true},{"text":"is given by (i.e., Line 6 of Algorithm 
","element":"span"},{"href":"#id-46","text":"2","element":"a"},{"text":")","element":"span"}],[{"style":{"width":"30%"},"width":565,"height":47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/25-15.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":19.77},"width":477.87,"height":49.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/25-16.png","element":"img","alt":" ˆr(s0, a) = 1m�mi=1 ri(s0, a)","inline":true,"padRight":true},{"text":"is the empirical average of the immediate rewards for playing action ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a ","element":"span"},{"text":"when in state ","element":"span"},{"style":{"height":9.19},"width":34.68,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/25-17.png","element":"img","alt":" s0","inline":true},{"text":". Note that ","element":"span"},{"style":{"height":17.2},"width":191.29,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/25-18.png","element":"img","alt":" {ri(s0, a)}i","inline":true,"padRight":true},{"text":"are independent random variables that satisfy ","element":"span"},{"style":{"height":17.2},"width":385.27,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/25-19.png","element":"img","alt":" |ri(s0, a)| ≤ Rmax. 
By","inline":true,"padRight":true},{"text":"Hoeffding inequality, it holds that","element":"span"}],[{"style":{"width":"45%"},"width":844,"height":101,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/25-20.png","element":"img"}],[{"text":"Thus","element":"span"}],[{"style":{"width":"60%"},"width":1139,"height":321,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/26-0.png","element":"img"}],[{"text":"It follows that","element":"span"}],[{"id":"id-76","style":{"width":"86%"},"width":1619,"height":436,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/26-1.png","element":"img"}],[{"text":"where the last inequality follows from the fact that ","element":"span"},{"style":{"height":17.2},"width":213.16,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/26-2.png","element":"img","alt":" η ∈ [1/2, 1).","inline":true}],[{"style":{"width":"96%"},"width":1807,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/26-3.png","element":"img"}],[{"text":"separately:","element":"span"}],[{"style":{"width":"96%"},"width":1810,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/26-4.png","element":"img"}],[{"text":"i.e., ","element":"span"},{"style":{"height":19.02},"width":428.51,"height":47.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/26-5.png","element":"img","alt":"ˆV (s0) = maxa∈A ˆQ(s, a)","inline":true},{"text":", we apply Lemma ","element":"span"},{"href":"#id-72","text":"16 ","element":"a"},{"text":"from the previous section. 
For the query state ","element":"span"},{"style":{"height":17.2},"width":187.69,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/26-6.png","element":"img","alt":" s0, if I(s0)","inline":true,"padRight":true},{"text":"is player ","element":"span"},{"text":"P1","element":"span"},{"text":", applying Lemma ","element":"span"},{"href":"#id-72","text":"16 ","element":"a"},{"text":"yields","element":"span"}],[{"style":{"width":"78%"},"width":1473,"height":66,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/26-7.png","element":"img"}],[{"text":"Therefore,","element":"span"}],[{"id":"id-74","style":{"width":"76%"},"width":1430,"height":271,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/26-8.png","element":"img"}],[{"text":"Similarly, if ","element":"span"},{"style":{"height":17.2},"width":308.98,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/26-9.png","element":"img","alt":" I(s0) is player P2","inline":true},{"text":", applying Lemma ","element":"span"},{"href":"#id-72","text":"16 ","element":"a"},{"text":"also yields the same desired result, Eq. 
(","element":"span"},{"href":"#id-74","text":"23","element":"a"},{"text":").","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"(b) Policy estimate ","element":"span"},{"style":{"height":17.2},"width":130.24,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/26-10.png","element":"img","alt":" ˆπ(·|s0):","inline":true,"padRight":true},{"text":"In order to obtain an error bound for the policy estimate of the query state ","element":"span"},{"style":{"height":9.19},"width":34.68,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/26-11.png","element":"img","alt":"s0","inline":true},{"text":", i.e., ","element":"span"},{"style":{"height":22.32},"width":684.85,"height":55.81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/26-12.png","element":"img","alt":" ˆπ(a|s0) = e ˆQ(s0,a)/τ/ �a′∈A e ˆQ(s0,a′)/τ","inline":true},{"text":", we apply Lemma ","element":"span"},{"href":"#id-75","text":"15 ","element":"a"},{"text":"from the previous section. Together ","element":"span"},{"text":"with Eq. (","element":"span"},{"href":"#id-76","text":"22","element":"a"},{"text":"), Lemma ","element":"span"},{"href":"#id-75","text":"15 ","element":"a"},{"text":"yields the following bound:","element":"span"}],[{"style":{"width":"78%"},"width":1479,"height":85,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/26-13.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Step 3: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Completing the proof of Theorem ","element":"span"},{"href":"#id-47","style":{"fontStyle":"italic"},"text":"11","element":"a"},{"style":{"fontStyle":"italic"},"text":". 
","element":"span"},{"text":"Recall that by the assumption of Theorem ","element":"span"},{"href":"#id-47","text":"11","element":"a"},{"text":",","element":"span"}],[{"id":"id-77","style":{"width":"21%"},"width":398,"height":73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/26-14.png","element":"img"}],[{"text":"Now, taking expectation of Eqs. (","element":"span"},{"href":"#id-74","text":"23","element":"a"},{"text":") and (","element":"span"},{"href":"#id-77","text":"24","element":"a"},{"text":") over the randomness in the current model ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":", we have","element":"span"}],[{"id":"id-78","style":{"width":"75%"},"width":1408,"height":175,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/26-15.png","element":"img"}],[{"text":"Recall that in Theorem ","element":"span"},{"href":"#id-47","text":"11","element":"a"},{"text":", our goal for improvement is as follows:","element":"span"}],[{"style":{"width":"84%"},"width":1580,"height":73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/27-0.png","element":"img"}],[{"text":"It is not hard to see that we can choose the parameters of the fixed depth MCTS algorithm, in particular, the depth ","element":"span"},{"style":{"fontStyle":"italic"},"text":"H ","element":"span"},{"text":"and the number of simulations ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m","element":"span"},{"text":", in an appropriate way such that Eqs. (","element":"span"},{"href":"#id-77","text":"25","element":"a"},{"text":") and (","element":"span"},{"href":"#id-78","text":"26","element":"a"},{"text":") satisfy the desired improvement bound. 
In particular, we could choose the depth ","element":"span"},{"style":{"fontStyle":"italic"},"text":"H ","element":"span"},{"text":"such that","element":"span"}],[{"id":"id-80","style":{"width":"67%"},"width":1265,"height":86,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/27-1.png","element":"img"}],[{"text":"and choose the number of simulations, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m","element":"span"},{"text":", to be large enough such that the term ","element":"span"},{"style":{"height":18.19},"width":160.43,"height":45.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/27-2.png","element":"img","alt":" O(mη−1)","inline":true,"padRight":true},{"text":"is less than ","element":"span"},{"style":{"height":18.98},"width":126.97,"height":47.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/27-3.png","element":"img","alt":" γHε0,v.","inline":true,"padRight":true},{"text":"For small temperature ","element":"span"},{"style":{"height":9.6},"width":64,"height":24,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/27-4.png","element":"img","alt":" τ <","inline":true,"padRight":true},{"text":"1, choosing the depth","element":"span"}],[{"id":"id-79","style":{"width":"18%"},"width":345,"height":119,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/27-5.png","element":"img"}],[{"text":"would satisfy the condition Eq. (","element":"span"},{"href":"#id-79","text":"28","element":"a"},{"text":"). Recall that the tunable hyper-parameter ","element":"span"},{"style":{"height":15.6},"width":304.93,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/27-6.png","element":"img","alt":" η is set to η = 1/","inline":true},{"text":"2. 
With the above ","element":"span"},{"style":{"fontStyle":"italic"},"text":"H","element":"span"},{"text":", this implies that the number of simulations should be","element":"span"}],[{"style":{"width":"60%"},"width":1134,"height":102,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/27-7.png","element":"img"}],[{"text":"To summarize, we show that the desired improvement, Eq. (","element":"span"},{"href":"#id-80","text":"27","element":"a"},{"text":"), can indeed be satisfied with appropriate algorithmic parameters. Finally, regarding the sample complexity, we note that with a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"H","element":"span"},{"text":"-depth tree, each simulation of the fixed depth MCTS algorithm incurs ","element":"span"},{"style":{"fontStyle":"italic"},"text":"H ","element":"span"},{"text":"state transitions. Therefore, the total sample complexity of querying the improvement module is ","element":"span"},{"style":{"height":10.8},"width":97.56,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/27-8.png","element":"img","alt":" m · H","inline":true},{"text":", which is equal to","element":"span"}],[{"style":{"width":"46%"},"width":866,"height":96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/27-9.png","element":"img"}],[{"id":"id-62","style":{"fontWeight":"bold"},"text":"G ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Proof of Proposition ","element":"span"},{"href":"#id-48","style":{"fontWeight":"bold"},"text":"12","element":"a"}],[{"style":{"fontStyle":"italic"},"text":"Proof. 
","element":"span"},{"text":"Let ","element":"span"},{"style":{"height":14.8},"width":198.82,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/27-10.png","element":"img","alt":" h, K and ∆","inline":true,"padRight":true},{"text":"be positive numbers to be chosen later, and recall that ","element":"span"},{"style":{"height":17.2},"width":522.46,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/27-11.png","element":"img","alt":" N ≡ N(h) is the h/2-covering","inline":true,"padRight":true},{"text":"number of ","element":"span"},{"style":{"height":19.96},"width":1106.22,"height":49.91,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/27-12.png","element":"img","alt":" S. Let Tj := Bj ∩ T and Kj :=��Tj��. For each j ∈ [N], we have","inline":true}],[{"style":{"width":"79%"},"width":1493,"height":468,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/27-13.png","element":"img"}],[{"text":"where the second step follows from the Jensen’s inequality, and the last step follows from the premise on the training error of the value function. To bound the first term of RHS above, we note that the ","element":"span"},{"style":{"height":15.72},"width":47.85,"height":39.3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/27-14.png","element":"img","alt":" Kj","inline":true,"padRight":true},{"text":"random variables","element":"span"},{"style":{"height":20.1},"width":264.55,"height":50.26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/27-15.png","element":"img","alt":"� ˆV (x), x ∈ Tj�","inline":true},{"text":"are independent and bounded by ","element":"span"},{"style":{"height":13.19},"width":83.14,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/27-16.png","element":"img","alt":" Vmax","inline":true},{"text":". 
So Hoeffding’s inequality ensures that","element":"span"}],[{"style":{"width":"73%"},"width":1379,"height":146,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/27-17.png","element":"img"}],[{"text":"Combining the last two equations and applying a union bound over ","element":"span"},{"style":{"height":17.2},"width":128.77,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/28-0.png","element":"img","alt":" j ∈ [N]","inline":true},{"text":", we obtain","element":"span"}],[{"style":{"width":"64%"},"width":1217,"height":146,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/28-1.png","element":"img"}],[{"text":"Since the random variable ","element":"span"},{"style":{"height":29.53},"width":1118.87,"height":73.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/28-2.png","element":"img","alt":" Z := max_{j∈[N]} |(1/K_j) ∑_{x∈T_j} (ˆV(x) − V∗(x))| satisfies |Z| ≤ 2Vmax","inline":true},{"text":", we may convert the above inequality into an expectation bound:","element":"span"}],[{"style":{"width":"38%"},"width":719,"height":230,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/28-3.png","element":"img"}],[{"text":"We are now ready to bound the quantity of interest:","element":"span"}],[{"style":{"width":"77%"},"width":1453,"height":675,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/28-4.png","element":"img"}],[{"text":"where step ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":") ","element":"span"},{"text":"holds because ","element":"span"},{"style":{"height":13.77},"width":146.38,"height":34.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/28-5.png","element":"img","alt":" V ∗ is Lv","inline":true},{"text":"-Lipschitz, and step ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"ii","element":"span"},{"text":") 
","element":"span"},{"text":"holds because the sets ","element":"span"},{"style":{"height":15.72},"width":137.46,"height":39.3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/28-6.png","element":"img","alt":" Tj ⊆ Bj","inline":true,"padRight":true},{"text":"have diameter at most ","element":"span"},{"style":{"fontStyle":"italic"},"text":"h","element":"span"},{"text":". Now taking ","element":"span"},{"style":{"height":14.8},"width":282.12,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/28-7.png","element":"img","alt":" ∆ = εv, we have","inline":true}],[{"id":"id-81","style":{"width":"99%"},"width":1871,"height":469,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/28-8.png","element":"img"}],[{"text":"For each ","element":"span"},{"style":{"height":12.4},"width":101.86,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/28-9.png","element":"img","alt":" a ∈ A","inline":true},{"text":", let us fit the action probability ","element":"span"},{"style":{"height":17.2},"width":285.15,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/28-10.png","element":"img","alt":" πNN(a|·) : S → [","inline":true},{"text":"0, 1","element":"span"},{"text":"] ","element":"span"},{"text":"using a similar Nearest Neighbor type algorithm as ","element":"span"},{"style":{"height":13.65},"width":84.44,"height":34.14,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/28-11.png","element":"img","alt":" VNN:","inline":true}],[{"style":{"width":"43%"},"width":820,"height":103,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/28-12.png","element":"img"}],[{"text":"Note that ","element":"span"},{"style":{"height":17.2},"width":889.71,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/28-13.png","element":"img","alt":" ∀x ∈ T, ∀a ∈ A, ˆπ(a|x) ∈ [0, 1] and P ∗τ (a|s) ∈ [0, 1]","inline":true},{"text":". 
Applying a similar argument as above for the ","element":"span"},{"text":"fitted action probability function ","element":"span"},{"style":{"height":17.2},"width":149.72,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/28-14.png","element":"img","alt":" πNN(a|·)","inline":true,"padRight":true},{"text":"w.r.t. the squared error, we have","element":"span"}],[{"style":{"width":"65%"},"width":1232,"height":97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/28-15.png","element":"img"}],[{"text":"Choosing ","element":"span"},{"style":{"height":19.2},"width":183.97,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/29-0.png","element":"img","alt":" ∆ =�εp/","inline":true},{"text":"2, we obtain that","element":"span"}],[{"style":{"width":"79%"},"width":1487,"height":96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/29-1.png","element":"img"}],[{"text":"Taking","element":"span"}],[{"style":{"width":"83%"},"width":1563,"height":103,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/29-2.png","element":"img"}],[{"text":"from Eqs (","element":"span"},{"href":"#id-81","text":"29","element":"a"},{"text":")-(","element":"span"},{"href":"#id-82","text":"30","element":"a"},{"text":"), we have the following bounds","element":"span"}],[{"id":"id-82","style":{"width":"32%"},"width":600,"height":164,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/29-3.png","element":"img"}],[{"text":"This proves the first inequality of the proposition. We now focus on the inequality for the policy function. 
By Jensen’s inequality, we have","element":"span"}],[{"style":{"width":"64%"},"width":1214,"height":97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/29-4.png","element":"img"}],[{"text":"This is equivalent to saying that for each ","element":"span"},{"style":{"height":14},"width":105.12,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/29-5.png","element":"img","alt":" s ∈ S,","inline":true}],[{"id":"id-83","style":{"width":"69%"},"width":1294,"height":180,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/29-6.png","element":"img"}],[{"text":"On the other hand, for each ","element":"span"},{"style":{"height":14.8},"width":310.32,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/29-7.png","element":"img","alt":" s ∈ S, each a ∈ A","inline":true},{"text":", we have the bound","element":"span"}],[{"style":{"width":"31%"},"width":591,"height":210,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/29-8.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":9.6},"width":67.7,"height":24,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/29-9.png","element":"img","alt":" α >","inline":true,"padRight":true},{"text":"0. Therefore, by the reverse Pinsker’s inequality (cf. 
Appendix ","element":"span"},{"href":"#id-71","text":"A","element":"a"},{"text":") we have","element":"span"}],[{"style":{"width":"55%"},"width":1037,"height":190,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/29-10.png","element":"img"}],[{"text":"Combining with the bound (","element":"span"},{"href":"#id-83","text":"32","element":"a"},{"text":"), we obtain that","element":"span"}],[{"style":{"width":"43%"},"width":816,"height":146,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/29-11.png","element":"img"}],[{"text":"This completes the proof of the second inequality.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"H ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Proof of Proposition ","element":"span"},{"href":"#id-49","style":{"fontWeight":"bold"},"text":"13","element":"a"}],[{"text":"In the sequel, we use the shorthand ","element":"span"},{"style":{"height":17.2},"width":182.41,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/30-0.png","element":"img","alt":" N ≡ N(h)","inline":true,"padRight":true},{"text":"and refer to each ","element":"span"},{"style":{"height":15.72},"width":44.23,"height":39.29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/30-1.png","element":"img","alt":" Bj","inline":true,"padRight":true},{"text":"as a ball. For each integer ","element":"span"},{"style":{"height":14},"width":218.15,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/30-2.png","element":"img","alt":" t ≥ 1, let Wt","inline":true,"padRight":true},{"text":"be the number of balls visited up to time ","element":"span"},{"style":{"height":16},"width":554.63,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/30-3.png","element":"img","alt":" t. Let T ≜ inf {t ≥ 1 : Wt = N}","inline":true,"padRight":true},{"text":"be the first time when all balls are visited. 
For each ","element":"span"},{"style":{"height":16},"width":835.77,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/30-4.png","element":"img","alt":" w ∈ {1, 2, . . . , N}, let Tw ≜ inf {t ≥ 1 : Wt = w}","inline":true,"padRight":true},{"text":"be the first time when the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"w","element":"span"},{"text":"-th ball is visited, and let ","element":"span"},{"style":{"height":13.19},"width":296.64,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/30-5.png","element":"img","alt":" Dw ≜ Tw − Tw−1","inline":true,"padRight":true},{"text":"be the time to visit the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"w","element":"span"},{"text":"-th ball after ","element":"span"},{"style":{"height":17.2},"width":128.12,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/30-6.png","element":"img","alt":" (w − 1)","inline":true,"padRight":true},{"text":"balls have been visited. We use the convention that ","element":"span"},{"style":{"height":13.19},"width":192.43,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/30-7.png","element":"img","alt":" T0 = D0 =","inline":true,"padRight":true},{"text":"0. 
By definition, we have ","element":"span"},{"style":{"height":20.3},"width":270.11,"height":50.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/30-8.png","element":"img","alt":" T = ∑_{w=1}^{N} D_w.","inline":true}],[{"text":"When ","element":"span"},{"style":{"height":6.8},"width":68.35,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/30-9.png","element":"img","alt":" w −","inline":true,"padRight":true},{"text":"1 balls have been visited, the probability of visiting a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"new ","element":"span"},{"text":"ball is at least","element":"span"}],[{"style":{"width":"43%"},"width":810,"height":394,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/30-10.png","element":"img"}],[{"text":"where the last inequality follows from the regularity assumption. Therefore, the time to visit the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"w","element":"span"},{"text":"-th ball after ","element":"span"},{"style":{"height":6.8},"width":68.45,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/30-11.png","element":"img","alt":" w −","inline":true,"padRight":true},{"text":"1 balls have been visited, ","element":"span"},{"style":{"height":13.19},"width":55.99,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/30-12.png","element":"img","alt":" Dw","inline":true},{"text":", is stochastically dominated by a geometric random variable with mean at most ","element":"span"},{"style":{"height":24.32},"width":173.62,"height":60.8,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/30-13.png","element":"img","alt":"N/((N−w+1)c0) ","inline":true,"padRight":true},{"text":". 
It follows that","element":"span"}],[{"style":{"width":"51%"},"width":974,"height":120,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/30-14.png","element":"img"}],[{"text":"This proves that the expected time to sample a ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"h","element":"span"},{"text":", 1","element":"span"},{"text":")","element":"span"},{"text":"-representative set is upper bounded by ","element":"span"},{"style":{"height":28.8},"width":227.13,"height":72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/30-15.png","element":"img","alt":" O((N/c0) log N)","inline":true},{"text":".","element":"span"}],[{"text":"Note that if the trajectory samples ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"h","element":"span"},{"text":", 1","element":"span"},{"text":")","element":"span"},{"text":"-representative sets, then each ball must be visited at least ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K ","element":"span"},{"text":"times. 
Therefore, ","element":"span"},{"style":{"height":11.6},"width":122.29,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/30-16.png","element":"img","alt":" K · ET","inline":true,"padRight":true},{"text":"gives an upper bound for the expected time to sample a ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"h","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K","element":"span"},{"text":")","element":"span"},{"text":"-representative set, hence","element":"span"}],[{"style":{"width":"29%"},"width":548,"height":96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/30-17.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"I ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Proof of Theorem ","element":"span"},{"href":"#id-50","style":{"fontWeight":"bold"},"text":"14","element":"a"}],[{"text":"We will reuse the notation introduced in the proof of Theorem ","element":"span"},{"href":"#id-3","text":"2","element":"a"},{"text":". We initialize the value model ","element":"span"},{"style":{"height":17.2},"width":138.75,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/30-18.png","element":"img","alt":" V0(s) =","inline":true,"padRight":true},{"text":"0, ","element":"span"},{"style":{"height":12},"width":120.56,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/30-19.png","element":"img","alt":"∀s ∈ S","inline":true},{"text":". Hence","element":"span"},{"style":{"height":20.7},"width":358.74,"height":51.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/30-20.png","element":"img","alt":"��V0 − V ∗��∞ ≤ Vmax","inline":true},{"text":". Consider the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"l","element":"span"},{"text":"-th iteration. 
Let ","element":"span"},{"style":{"height":20.53},"width":61.57,"height":51.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/30-21.png","element":"img","alt":" ω(l)v","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":22.93},"width":61.57,"height":57.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/30-22.png","element":"img","alt":" ω(l)p","inline":true,"padRight":true},{"text":"denote the estimation errors for the model ","element":"span"},{"style":{"height":17.2},"width":331.42,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/30-23.png","element":"img","alt":" fl−1 = (Vl−1, πl−1)","inline":true},{"text":", at the beginning of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"l","element":"span"},{"text":"-th iteration:","element":"span"}],[{"style":{"width":"96%"},"width":1816,"height":274,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/30-24.png","element":"img"}],[{"text":"We require ","element":"span"},{"style":{"height":19.79},"width":367.76,"height":49.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/31-0.png","element":"img","alt":" S(l) to be (h(l), K(l))","inline":true},{"text":"-representative, where","element":"span"}],[{"id":"id-84","style":{"width":"79%"},"width":1498,"height":347,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/31-1.png","element":"img"}],[{"text":"We use ","element":"span"},{"style":{"height":29.2},"width":210.55,"height":73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/31-2.png","element":"img","alt":" D(l) =��si","inline":true},{"text":", ","element":"span"},{"style":{"height":31.09},"width":387.92,"height":77.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/31-3.png","element":"img","alt":"ˆV (l)(si), ˆπ(l(·|si)��nli=1","inline":true,"padRight":true},{"text":"to denote the set of training data generated 
by querying ","element":"span"},{"text":"MCTS. Consider choosing the depth ","element":"span"},{"style":{"height":15.39},"width":71.7,"height":38.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/31-4.png","element":"img","alt":" H(l)","inline":true,"padRight":true},{"text":"and simulation number ","element":"span"},{"style":{"height":15.39},"width":70.32,"height":38.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/31-5.png","element":"img","alt":" m(l)","inline":true,"padRight":true},{"text":"parameters for MCTS at the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"l","element":"span"},{"text":"-th iteration as follows:","element":"span"}],[{"id":"id-85","style":{"width":"60%"},"width":1140,"height":222,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/31-6.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":9.19},"width":33.24,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/31-7.png","element":"img","alt":" c1","inline":true,"padRight":true},{"text":"is a sufficiently large constant. 
By Theorem ","element":"span"},{"href":"#id-47","text":"11","element":"a"},{"text":", for each query state ","element":"span"},{"style":{"height":9.32},"width":29.68,"height":23.3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/31-8.png","element":"img","alt":" si","inline":true},{"text":", the output ","element":"span"},{"style":{"height":19.02},"width":260.8,"height":47.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/31-9.png","element":"img","alt":" ( ˆV (si), ˆπ(·|si))","inline":true,"padRight":true},{"text":"from MCTS satisfies","element":"span"}],[{"style":{"width":"66%"},"width":1246,"height":74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/31-10.png","element":"img"}],[{"text":"That is, the improvement factors for the value function and policy, ","element":"span"},{"style":{"height":15.59},"width":161.39,"height":38.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/31-11.png","element":"img","alt":" ζv and ζp","inline":true},{"text":", of MCTS are as follows:","element":"span"}],[{"style":{"width":"11%"},"width":220,"height":73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/31-12.png","element":"img"}],[{"text":"Note that the training set ","element":"span"},{"style":{"height":15.39},"width":67.18,"height":38.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/31-13.png","element":"img","alt":" D(l)","inline":true,"padRight":true},{"text":"has estimation error ","element":"span"},{"style":{"height":23.75},"width":82.83,"height":59.37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/31-14.png","element":"img","alt":"ρ4ω(l)v","inline":true,"padRight":true},{"text":"for both value and policy, and the sampled states ","element":"span"},{"style":{"height":15.38},"width":62.07,"height":38.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/31-15.png","element":"img","alt":" S(l) 
","inline":true,"padRight":true},{"text":"of the training set are ","element":"span"},{"style":{"height":19.78},"width":186.12,"height":49.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/31-16.png","element":"img","alt":" (h(l), K(l))","inline":true},{"text":"-representative. By Proposition ","element":"span"},{"href":"#id-48","text":"12","element":"a"},{"text":", the output of nearest neighbor supervised learning at the end of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"l","element":"span"},{"text":"-th iteration satisfies the following generalization property:","element":"span"}],[{"style":{"width":"44%"},"width":839,"height":176,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/31-17.png","element":"img"}],[{"text":"Therefore,","element":"span"}],[{"style":{"width":"35%"},"width":673,"height":73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/31-18.png","element":"img"}],[{"text":"Since ","element":"span"},{"style":{"height":21.45},"width":373.12,"height":53.63,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/31-19.png","element":"img","alt":" ω(1)v = Vmax, we have","inline":true}],[{"style":{"width":"40%"},"width":750,"height":82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/31-20.png","element":"img"}],[{"text":"That is,","element":"span"}],[{"style":{"width":"44%"},"width":836,"height":172,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/31-21.png","element":"img"}],[{"text":"During ","element":"span"},{"style":{"fontStyle":"italic"},"text":"l","element":"span"},{"text":"-th iteration, the total sample complexity ","element":"span"},{"style":{"height":15.39},"width":78.34,"height":38.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/32-0.png","element":"img","alt":" M(l) ","inline":true,"padRight":true},{"text":"is given by 
","element":"span"},{"style":{"height":18.31},"width":317.52,"height":45.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/32-1.png","element":"img","alt":" M(l) = H(l)m(l)nl","inline":true},{"text":". From Eqs. (","element":"span"},{"href":"#id-84","text":"33","element":"a"},{"text":")-(","element":"span"},{"href":"#id-85","text":"37","element":"a"},{"text":"), we have","element":"span"}],[{"style":{"width":"21%"},"width":409,"height":372,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/32-2.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":10},"width":192.36,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/32-3.png","element":"img","alt":" c2, c3, c4, c5","inline":true,"padRight":true},{"text":"are positive constants independent of ","element":"span"},{"style":{"height":14},"width":123.37,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/32-4.png","element":"img","alt":" ρ and l","inline":true},{"text":". By Proposition ","element":"span"},{"href":"#id-49","text":"13","element":"a"},{"text":", we have","element":"span"}],[{"style":{"width":"60%"},"width":1136,"height":100,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/32-5.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":11.19},"width":77.4,"height":27.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/32-6.png","element":"img","alt":" c6 >","inline":true,"padRight":true},{"text":"0 is a constant. 
Following the argument in the proof of Proposition ","element":"span"},{"href":"#id-4","text":"3","element":"a"},{"text":", we have: with probability at least 1 ","element":"span"},{"style":{"height":14.8},"width":68.96,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/32-7.png","element":"img","alt":" − δ,","inline":true}],[{"style":{"width":"73%"},"width":1383,"height":553,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/32-8.png","element":"img"}],[{"text":"Thus","element":"span"}],[{"style":{"width":"67%"},"width":1259,"height":122,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/32-9.png","element":"img"}],[{"text":"Given 0 ","element":"span"},{"style":{"height":19.77},"width":831.6,"height":49.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/32-10.png","element":"img","alt":" < ρ < 1, for L = Θ(log(1/ε)), i.e., ρL ≍ ε, we have","inline":true}],[{"style":{"width":"68%"},"width":1281,"height":121,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/32-11.png","element":"img"}],[{"text":"In particular, if the state space ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"text":"is the unit hypercube in ","element":"span"},{"style":{"height":19.67},"width":500.21,"height":49.18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/32-12.png","element":"img","alt":" Rd, we have N(ε) = O(ε−d)","inline":true},{"text":". Therefore,","element":"span"}],[{"style":{"width":"45%"},"width":851,"height":122,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/32-13.png","element":"img"}],[{"id":"id-36","style":{"fontWeight":"bold"},"text":"J ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Empirical Results: A Case-Study","element":"span"}],[{"text":"To understand how well the EIS method does for turn-based Markov games, we consider a simple game as a proof of concept. 
We shall design a simple non-deterministic game and apply our EIS framework. As ","element":"span"},{"text":"mentioned in Section ","element":"span"},{"text":"2","element":"span"},{"text":", the sparse sampling oracle (","element":"span"},{"href":"#id-35","referenceIndex":8,"text":"Kearns et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-35","referenceIndex":8,"text":"2002","element":"a"},{"text":") could be used for the improvement module in this case. This oracle is simple but suffices to convey the insights. In what follows, we shall demonstrate the effectiveness of EIS by comparing its final estimates of value function with the optimal one obtained via standard value iteration for games.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Setup. ","element":"span"},{"text":"Consider a two-player turn-based Markov game ","element":"span"},{"style":{"height":17.2},"width":397.97,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/33-0.png","element":"img","alt":" (S1, S2, A1, A2, r, P, γ)","inline":true},{"text":", where ","element":"span"},{"style":{"height":17.2},"width":111.13,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/33-1.png","element":"img","alt":" S1 = [","inline":true},{"text":"0.1, 1.1","element":"span"},{"text":"] ","element":"span"},{"text":"and ","element":"span"},{"style":{"height":17.2},"width":301.92,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/33-2.png","element":"img","alt":"S2 = [−1.1, −0.1]","inline":true,"padRight":true},{"text":"are the set of states controlled by ","element":"span"},{"text":"P1 ","element":"span"},{"text":"and ","element":"span"},{"text":"P2","element":"span"},{"text":", respectively; ","element":"span"},{"style":{"height":16},"width":230.63,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/33-3.png","element":"img","alt":" A1 = A2 = {","inline":true},{"text":"0.1, 
0.2, 0.3, 0.4, 0.5","element":"span"},{"style":{"fontStyle":"italic"},"text":"} ","element":"span"},{"text":"are the set of actions; ","element":"span"},{"style":{"height":17.2},"width":298.45,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/33-4.png","element":"img","alt":" r(s, a) = 3(|s| −","inline":true,"padRight":true},{"text":"0.5","element":"span"},{"style":{"height":18.18},"width":104.91,"height":45.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/33-5.png","element":"img","alt":")2 − a","inline":true,"padRight":true},{"text":"is the reward received by ","element":"span"},{"text":"P1 ","element":"span"},{"text":"when the corresponding player ","element":"span"},{"style":{"fontStyle":"italic"},"text":"I","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":") ","element":"span"},{"text":"takes action ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a ","element":"span"},{"text":"at state ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":". 
For each real number ","element":"span"},{"style":{"fontStyle":"italic"},"text":"u","element":"span"},{"text":", define two clipping operators ","element":"span"},{"style":{"height":17.75},"width":175.32,"height":44.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/33-6.png","element":"img","alt":" ΠSi(u) =","inline":true,"padRight":true},{"text":"min","element":"span"},{"style":{"height":16},"width":285.82,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/33-7.png","element":"img","alt":"{max{min Si, u}","inline":true},{"text":", max ","element":"span"},{"style":{"height":16},"width":187.59,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/33-8.png","element":"img","alt":" Si}, i ∈ {","inline":true},{"text":"1, 2","element":"span"},{"style":{"fontStyle":"italic"},"text":"}","element":"span"},{"text":". That is, ","element":"span"},{"style":{"height":17.75},"width":124.19,"height":44.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/33-9.png","element":"img","alt":" ΠSi(u)","inline":true,"padRight":true},{"text":"projects ","element":"span"},{"style":{"fontStyle":"italic"},"text":"u ","element":"span"},{"text":"to the state space of player ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":". 
At state ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":", upon taking an action ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a ","element":"span"},{"text":"by the corresponding player ","element":"span"},{"style":{"fontStyle":"italic"},"text":"I","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":")","element":"span"},{"text":", the system transitions to the next state ","element":"span"},{"style":{"height":19.2},"width":387.44,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/33-10.png","element":"img","alt":"s′ ∼ ΠSi(− |s| + N (a","inline":true},{"text":", 1","element":"span"},{"style":{"height":19.2},"width":34.6,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/33-11.png","element":"img","alt":"))","inline":true},{"text":", where ","element":"span"},{"style":{"height":17.2},"width":112.24,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/33-12.png","element":"img","alt":" N (·, ·)","inline":true,"padRight":true},{"text":"is the Gaussian distribution, and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i ","element":"span"},{"text":"= ","element":"span"},{"text":"1 if ","element":"span"},{"style":{"fontStyle":"italic"},"text":"I","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":") = ","element":"span"},{"text":"2, and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i ","element":"span"},{"text":"= ","element":"span"},{"text":"2 if ","element":"span"},{"style":{"fontStyle":"italic"},"text":"I","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":") = ","element":"span"},{"text":"1. 
We consider the case ","element":"span"},{"style":{"height":14.4},"width":140.18,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/33-13.png","element":"img","alt":" γ = 0.8.","inline":true}],[{"text":"Before proceeding, let us intuitively understand how the solution of this game should be. Recall that ","element":"span"},{"text":"P1 ","element":"span"},{"text":"is the referenced reward maximizer. Since the reward ","element":"span"},{"style":{"height":18.18},"width":443.18,"height":45.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/33-14.png","element":"img","alt":" r(s, a) = 3(|s| − 0.5)2 − a","inline":true},{"text":", by design, ","element":"span"},{"text":"P1 ","element":"span"},{"text":"would prefer to stay at states that are far from 0.5. That is, we expect the value function for ","element":"span"},{"text":"P1 ","element":"span"},{"text":"in the range ","element":"span"},{"text":"[","element":"span"},{"text":"0.1, 1.1","element":"span"},{"text":"] ","element":"span"},{"text":"(recall that ","element":"span"},{"style":{"height":17.2},"width":241.51,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/33-15.png","element":"img","alt":" S1 = [0.1, 1.1]","inline":true},{"text":") to be larger at states far from 0.5. 
On the contrary, as the reward minimizer, ","element":"span"},{"text":"P2 ","element":"span"},{"text":"would like to stay at state ","element":"span"},{"style":{"height":4.4},"width":31,"height":11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/33-16.png","element":"img","alt":" −","inline":true},{"text":"0.5 (recall that ","element":"span"},{"style":{"height":17.2},"width":303.62,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/33-17.png","element":"img","alt":" S2 = [−1.1, −0.1]","inline":true},{"text":") and potentially take large action ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a ","element":"span"},{"text":"to minimize the reward. As such, we expect the value function in ","element":"span"},{"style":{"height":17.2},"width":174.38,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/33-18.png","element":"img","alt":" [−1.1, 0.1]","inline":true,"padRight":true},{"text":"to be small around state ","element":"span"},{"style":{"height":4.4},"width":31,"height":11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/33-19.png","element":"img","alt":" −","inline":true},{"text":"0.5. We will confirm this intuition with both value iteration and our EIS method.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Algorithm. ","element":"span"},{"text":"We apply the EIS algorithm to learn the Nash equilibrium of the Markov game described above. Specifically, we use nearest neighbor regression as the supervised learning module and random sampling as the exploration module (cf. Section ","element":"span"},{"text":"6","element":"span"},{"text":"). As the Markov game is stochastic, we will use a variant of sparse sampling method ","element":"span"},{"href":"#id-35","referenceIndex":8,"text":"Kearns et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-35","referenceIndex":8,"text":"2002","element":"a"},{"text":") as the improvement module. 
This algorithm is simple to describe and analyze, yet suffices to convey essential insights.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Sparse Sampling Oracle. ","element":"span"},{"text":"The sparse sampling algorithm can be viewed as a form of non-adaptive tree search for estimating the value of a given state. In particular, each node on the tree represents a state and each edge is labeled by an action and a reward. Starting from the root node, i.e., the queried state ","element":"span"},{"style":{"height":9.19},"width":34.68,"height":22.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/33-20.png","element":"img","alt":" s0","inline":true,"padRight":true},{"text":"of the oracle, the tree is built in a simple manner: consider a node (state) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":", for each action ","element":"span"},{"style":{"height":12.4},"width":104.46,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/33-21.png","element":"img","alt":" a ∈ A","inline":true},{"text":", call the generative model ","element":"span"},{"style":{"fontStyle":"italic"},"text":"C ","element":"span"},{"text":"times on the state-action pair ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a","element":"span"},{"text":") ","element":"span"},{"text":"and obtain ","element":"span"},{"style":{"fontStyle":"italic"},"text":"C ","element":"span"},{"text":"children of the action ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a","element":"span"},{"text":"; the children nodes are the states returned by the generative model, and each edge is labeled by the action ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a ","element":"span"},{"text":"and the reward returned by the generative model. 
The process is repeated for all nodes of each level, and then moves on to the next level until reaching a depth of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"H","element":"span"},{"text":". In essence, this process builds a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"|A|","element":"span"},{"style":{"fontStyle":"italic"},"text":"C","element":"span"},{"text":"-ary tree of depth ","element":"span"},{"style":{"fontStyle":"italic"},"text":"H","element":"span"},{"text":". It represents a partial ","element":"span"},{"style":{"fontStyle":"italic"},"text":"H","element":"span"},{"text":"-step look-ahead tree starting from the queried state ","element":"span"},{"style":{"height":9.19},"width":34.68,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/33-22.png","element":"img","alt":" s0","inline":true},{"text":", and hence the term sparse sampling oracle.","element":"span"}],[{"text":"To obtain estimates for the value of the queried state, estimates from the supervised learning module are assigned to the leaf nodes at depth ","element":"span"},{"style":{"fontStyle":"italic"},"text":"H","element":"span"},{"text":". These values, together with the associated rewards on the edges, are then backed-up to find estimates of values for their parents, i.e., nodes at depth ","element":"span"},{"style":{"height":10.8},"width":75.12,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/33-23.png","element":"img","alt":" H −","inline":true,"padRight":true},{"text":"1. The backup is just a simple average over the children, followed by taking appropriate max or min operation depending on who is the acting player at this layer of the tree. 
The process is recursively applied from the leaves up to the root level to find estimates of ","element":"span"},{"style":{"height":19.01},"width":102.46,"height":47.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/33-24.png","element":"img","alt":"ˆV (s0)","inline":true,"padRight":true},{"text":"for the root node ","element":"span"},{"style":{"height":9.19},"width":34.68,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/33-25.png","element":"img","alt":" s0","inline":true},{"text":". In the experiments, we set ","element":"span"},{"style":{"fontStyle":"italic"},"text":"H ","element":"span"},{"text":"= ","element":"span"},{"text":"2, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"C ","element":"span"},{"text":"= ","element":"span"},{"text":"30.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Evaluation. ","element":"span"},{"text":"We first use approximate value iteration to compute the optimal value function ","element":"span"},{"style":{"height":11.39},"width":48.1,"height":28.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/33-26.png","element":"img","alt":" V ∗","inline":true},{"text":". The value iteration operator ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T ","element":"span"},{"text":"is defined as follows:","element":"span"}],[{"style":{"width":"65%"},"width":1227,"height":121,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/33-27.png","element":"img"}],[{"text":"For the continuous game considered here, we discretize the state space of each player to be 1,500 equally spaced states. The above value iteration operator is then applied to obtain an approximate optimal value function ","element":"span"},{"style":{"height":11.38},"width":48.1,"height":28.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/33-28.png","element":"img","alt":"V ∗","inline":true},{"text":". 
The approximate ","element":"span"},{"style":{"height":11.38},"width":48.1,"height":28.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/33-29.png","element":"img","alt":" V ∗ ","inline":true,"padRight":true},{"text":"generated by 30 iterations of value iteration is plotted in red in Figure ","element":"span"},{"href":"#id-86","text":"1 ","element":"a"},{"text":"(Left). As","element":"span"}],[{"style":{"width":"97%"},"width":1834,"height":629,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/34-0.png","element":"img"}],[{"text":"Figure 1: Results of EIS for various iterations. Left: Approximate optimal ","element":"figcaption","subtype":"caption"},{"style":{"height":14.62},"width":48.1,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/34-1.png","element":"img","alt":"ˆV ∗","inline":true,"padRight":true},{"id":"id-86","text":"and the value function ","element":"figcaption","subtype":"caption"},{"text":"estimation ","element":"figcaption","subtype":"caption"},{"style":{"height":13.19},"width":35.24,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/34-2.png","element":"img","alt":" Vt","inline":true,"padRight":true},{"text":"of EIS obtained at various iterations. 
Right: Average distance and maximum distance between ","element":"figcaption","subtype":"caption"},{"style":{"height":17.01},"width":176.2,"height":42.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/34-3.png","element":"img","alt":"ˆV ∗ and Vt","inline":true,"padRight":true},{"text":"at each EIS iteration.","element":"figcaption","subtype":"caption"}],[{"text":"expected, the result is consistent with our intuition that ","element":"span"},{"text":"P1 ","element":"span"},{"text":"attempts to stay away from 0.5 and ","element":"span"},{"text":"P2 ","element":"span"},{"text":"tries to minimize the reward by staying around ","element":"span"},{"style":{"height":11.2},"width":92.91,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/34-4.png","element":"img","alt":" −0.5.","inline":true}],[{"text":"Next, we evaluate our EIS method and compare the outputs against the approximate ","element":"span"},{"style":{"height":14.62},"width":48.1,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/34-5.png","element":"img","alt":"ˆV ∗","inline":true},{"text":". In particular, let ","element":"span"},{"style":{"height":13.19},"width":35.25,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/34-6.png","element":"img","alt":" Vt","inline":true,"padRight":true},{"text":"denote the value function obtained by EIS after ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"iterations. Figure ","element":"span"},{"href":"#id-86","text":"1 ","element":"a"},{"text":"(Left) shows the progress of EIS (i.e., ","element":"span"},{"style":{"height":13.19},"width":35.25,"height":32.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/34-7.png","element":"img","alt":" Vt","inline":true},{"text":") at various iterations. It is clear that EIS gradually improves the estimation of value function. 
On the right of Figure ","element":"span"},{"href":"#id-86","text":"1","element":"a"},{"text":", we plot the average distance as well as the maximum distance between ","element":"span"},{"style":{"height":17.01},"width":261.95,"height":42.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/34-8.png","element":"img","alt":" Vt and ˆV ∗ over","inline":true,"padRight":true},{"text":"15 iterations. We remark that there is an inevitable gap between ","element":"span"},{"style":{"height":14.62},"width":187.27,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.10620/images/34-9.png","element":"img","alt":"ˆV ∗ and V ∗ ","inline":true,"padRight":true},{"text":"due to discretization. As can be seen, the error of EIS output decays gradually.","element":"span"}]]}],"_version":"3.3.2"},"paperNode":"$1b:props:children:props:children:0:props:product"}]]]}]}]