1b:["$","$L29",null,{"isWhiteLabelled":false,"children":["$","$Lb",null,{"pt":{"compact":0,"expanded":3},"children":[["$","$L2a",null,{"noStar":true,"publisher":true,"task":true,"params":true,"size":"xl","product":{"id":"eyJwYXBlcklEIjoiMjAwMi4wNjQ4NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","updated":"2021-08-07T18:51:37.000Z","paperID":"2002.06487","published":"2020-02-16T02:02:23.000Z","authors":"[\"Qingfeng Lan\",\"Yangchen Pan\",\"Alona Fyshe\",\"Martha White\"]","title":"Maxmin Q-learning: Controlling the Estimation Bias of Q-learning","scoreTrending":null,"summary":"$2b","lastCheckedForCode":"2025-06-16T20:02:49.762Z","links":[{"id":"eyJ1cmwiOiJodHRwczovL3BhcGVyc3dpdGhjb2RlLmNvbS9wYXBlci9tYXhtaW4tcS1sZWFybmluZy1jb250cm9sbGluZy10aGUtZXN0aW1hdGlvbi0xIn0=","type":"pwc","url":"https://paperswithcode.com/paper/maxmin-q-learning-controlling-the-estimation-1","data":null},{"id":"eyJ1cmwiOiJodHRwczovL2dpdGh1Yi5jb20vcWxhbjMvZXhwbG9yZXIifQ==","type":"code","url":"https://github.com/qlan3/explorer","data":"{\"framework\":\"pytorch\"}"}],"reposConnection":{"edges":[{"official":null,"node":{"id":"eyJyZXBvSUQiOiIxODgyNjM4NDMiLCJzb3VyY2UiOiJnaXRodWIifQ==","source":"github","repoID":"188263843","url":"https://github.com/qlan3/Explorer","title":"Explorer","language":"python","stars":92,"forks":14,"framework":null,"scoreTrending":null,"updated":null,"created":null,"downloads":null,"likes":null,"owner":[{"username":"qlan3","avatar":"https://avatars.githubusercontent.com/u/46080716?v=4"}]}},{"official":null,"node":{"id":"eyJyZXBvSUQiOiI1MDcxNTQxNDUiLCJzb3VyY2UiOiJnaXRodWIifQ==","source":"github","repoID":"507154145","url":"https://github.com/homayoonfarrahi/rpg-ur5","title":"rpg-ur5","language":"python","stars":1,"forks":0,"framework":null,"scoreTrending":null,"updated":null,"created":null,"downloads":null,"likes":null,"owner":[{"username":"homayoonfarrahi","avatar":"https://avatars.githubusercontent.com/u/4238934?v=4"}]}}]},"models":[],"tags":[{"id":"eyJuYW1lIjoicS1sZ
WFybmluZyIsInR5cGUiOiJ0YXNrIn0=","name":"q-learning","description":"Q-learning is a reinforcement learning algorithm where the input is the current state and action, and the output is the updated Q-value for that state-action pair. It's used in real-world scenarios like game playing and robotics, where an agent learns to make optimal decisions by interacting with its environment.","scoreTrending":0.13242120207138006,"count":{"stars":3312,"papers":1618,"models":1750},"__typename":"Tag"}],"summaries":[{"model":"gpt-4o-mini","header":"paper.summary.expertise.beginner","summary":"This research paper discusses a problem in a popular type of machine learning called Q-learning, where it often makes errors by overestimating values. The authors introduce a new method called Maxmin Q-learning, which helps control these errors by using multiple estimates and can reduce both overestimation and estimation mistakes. They show through experiments that this new method performs better than traditional Q-learning and other similar methods in different environments, making it easier for machines to learn effectively. 
Additionally, they confirm that Maxmin Q-learning helps improve learning in various scenarios and can be used alongside deep learning techniques."}],"emailsConnection":{"edges":[{"author":"martha white","node":{"id":"eyJhZGRyZXNzIjoid2hpdGVtQHVhbGJlcnRhLmNhIn0=","address":"whitem@ualberta.ca","name":"Martha White","avatar":null,"linkedin":"https://www.linkedin.com/in/martha-white-49713a59","bio":null,"site":"http://marthawhite.ca/","override":null,"membership":[{"name":"University of Alberta"}],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":
{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[{"avatar":"https://avatars.githubusercontent.com/u/11522871?v=4","username":"marthawhite"}],"scholar":[{"thirdPartyID":"1GqGhcsAAAAJ"},{"thirdPartyID":"t5zdD_IAAAAJ"}],"twitter":[],"location":[{"formatted":"Bloomington, IN, USA"}],"owner":[{"id":"eyJ1aWQiOiI0Y2ViMWFlYS00NTIwLTQ2ZDMtOWU5YS03ODFiNTIzNWM2MDcifQ==","name":"martha white","github":[],"email":[{"avatar":"https://img.fullcontact.com/static/b3c1ce3f91f62dbfa68f0b11427d52b4_0847b6223b397938ea9748e33b98c22b3fa24e3f98fcc3cb55ba9fc4847fe34f"}],"authored":[{"id":"eyJwYXBlcklEIjoiMTYwOS4wMTk5NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1609.01995"},{"id":"eyJwYXBlcklEIjoiMTkwNS4xMjU4OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1905.12588"},{"id":"eyJwYXBlcklEIjoiMjAwMi4wNjQ4NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.06487"},{"id":"eyJwYXBlcklEIjoiMTYwMS4wMTk0NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1601.01944"},{"id":"eyJwYXBlcklEIjoiMTYwMi4wODc3MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1602.08771"},{"id":"eyJwYXBlcklEIjoiMTYwNy4wMDQ0NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1607.00446"},{"id":"eyJwYXBlcklEIjoiMTgxMS4wMjU5NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1811.02597"},{"id":"eyJwYXBlcklEIjoiMTYwNi4wODU2MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1606.08561"},{"id":"eyJwYXBlcklEIjoiMTgwNi4wNDYyNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1806.04624"},{"id":"eyJwYXBlcklEIjoiMTgwNy4wNjc2MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1807.06763"},{"id":"eyJwYXBlcklEIjoiMTUxMS4wODQ5NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1511.08495"},{"id":"eyJwYXBlcklEIjoiMTYxMS4wOTMyOCIsInB1Ymxpc2hlciI6ImF
yeGl2In0=","publisher":"arxiv","paperID":"1611.09328"},{"id":"eyJwYXBlcklEIjoiMTgxMS4wNjYyNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1811.06626"},{"id":"eyJwYXBlcklEIjoiMTgxMS4wOTAxMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1811.09013"},{"id":"eyJwYXBlcklEIjoiMjMwNC4wMTMxNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2304.01315"},{"id":"eyJwYXBlcklEIjoiMTgwNi4wNDYxMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1806.04613"},{"id":"eyJwYXBlcklEIjoiMTgxMC4wOTEwMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1810.09103"},{"id":"eyJwYXBlcklEIjoiMTgwMS4wODI4NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1801.08287"},{"id":"eyJwYXBlcklEIjoiMTkwNy4wNzc1MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1907.07751"},{"id":"eyJwYXBlcklEIjoiMTcwNy4wODMxNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1707.08316"},{"id":"eyJwYXBlcklEIjoiMjAxMS4wNDU5MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2011.04590"},{"id":"eyJwYXBlcklEIjoiMjExMi4wNzc3NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2112.07774"},{"id":"eyJwYXBlcklEIjoiMjAwMy4wNzQxNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2003.07417"},{"id":"eyJwYXBlcklEIjoiMjExMi4wNzgwNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2112.07806"},{"id":"eyJwYXBlcklEIjoiMTkwNC4wMTE5MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1904.01191"},{"id":"eyJwYXBlcklEIjoiMjMxMC4xNTcxOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2310.15719"},{"id":"eyJwYXBlcklEIjoiMjMwMi4xNDM3MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2302.14372"},{"id":"eyJwYXBlcklEIjoiMTgwOC4wOTEyNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1808.09127"},{"id":"eyJwYXBlcklEIjoiMjEwNy4wODI4NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2107.08285"},{"id":"eyJwYXBlcklEIjoiM
jEwNy4wNTQwNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2107.05405"},{"id":"eyJwYXBlcklEIjoiMTkwNi4wNzg2NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1906.07865"},{"id":"eyJwYXBlcklEIjoiMjExMS4wODE3MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2111.08172"},{"id":"eyJwYXBlcklEIjoiMTkwNi4wNzc5MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1906.07791"},{"id":"eyJwYXBlcklEIjoiMjEwNC4xMzg0NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2104.13844"},{"id":"eyJwYXBlcklEIjoiMjQwMi4xMzQyNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2402.13425"},{"id":"eyJwYXBlcklEIjoiMjAwNS4wNDkxMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2005.04912"},{"id":"eyJwYXBlcklEIjoiMjQwNC4wMjExMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2404.02113"},{"id":"eyJwYXBlcklEIjoiMTkxMS4wODA2OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1911.08068"},{"id":"eyJwYXBlcklEIjoiMjAwNy4wMzgwNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2007.03807"},{"id":"eyJwYXBlcklEIjoiMjIwNi4wMjkwMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2206.02902"},{"id":"eyJwYXBlcklEIjoiMjIwNC4wMDU2NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2204.00565"},{"id":"eyJwYXBlcklEIjoiMjMxMi4wMjM1NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2312.02355"},{"id":"eyJwYXBlcklEIjoiMjMwMi4wNTMyNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2302.05326"},{"id":"eyJwYXBlcklEIjoiMjMxMi4wMTIwMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2312.01203"},{"id":"eyJwYXBlcklEIjoiMjMxMi4wMTYyNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2312.01624"},{"id":"eyJwYXBlcklEIjoiMjIwNS4wODcxNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2205.08716"},{"id":"eyJwYXBlcklEIjoiMTcwOC4wMTI5OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"170
8.01298"},{"id":"eyJwYXBlcklEIjoiMjQwOC4wNzI0NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2408.07245"},{"id":"eyJwYXBlcklEIjoiMjQwNi4xMjI4NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2406.12284"},{"id":"eyJwYXBlcklEIjoiMjExMS4wODA2NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2111.08066"},{"id":"eyJwYXBlcklEIjoiMjMwNy4wNDg4NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2307.04887"},{"id":"eyJwYXBlcklEIjoiMjQwOS4wMTQ0OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2409.01449"},{"id":"eyJwYXBlcklEIjoiMTgxMi4wMDkxNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1812.00914"},{"id":"eyJwYXBlcklEIjoiMTgxMS4wNjYyOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1811.06629"},{"id":"eyJwYXBlcklEIjoiMTcwMi4wMDUxOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1702.00518"},{"id":"eyJwYXBlcklEIjoiMTkxMC4wMTcwNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1910.01705"},{"id":"eyJwYXBlcklEIjoiMjAxMC4wNzQzNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.07435"},{"id":"eyJwYXBlcklEIjoiMjAwNy4wOTU2OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2007.09569"},{"id":"eyJwYXBlcklEIjoiMjIwMi4xMTEzMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2202.11133"},{"id":"eyJwYXBlcklEIjoiMjIwMy4xMTk5MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2203.11992"},{"id":"eyJwYXBlcklEIjoiMjMwMS4xMTQ3NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2301.11476"},{"id":"eyJwYXBlcklEIjoiMjIxMS4wNzgwNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2211.07805"},{"id":"eyJwYXBlcklEIjoiMjQwMi4xMDMzOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2402.10339"},{"id":"eyJwYXBlcklEIjoiNzI5NzciLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"72977"},{"id":"eyJwYXBlcklEIjoiMjQwNi4wMTU2MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publis
her":"arxiv","paperID":"2406.01562"},{"id":"eyJwYXBlcklEIjoiMjQwNy4xODg0MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2407.18840"},{"id":"eyJwYXBlcklEIjoiMjQwNy4wOTcwMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2407.09702"}]}]}},{"author":"qingfeng lan","node":{"id":"eyJhZGRyZXNzIjoicWxhbjNAdWFsYmVydGEuY2EifQ==","address":"qlan3@ualberta.ca","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[{"avatar":"https://avatars.githubusercontent.com/u/46080716?v=4","username":"qlan3"}],"scholar":[{"thirdPartyID":"xq6um_0AAAAJ"},{"thirdPartyID":"I6FrxcsAAAAJ"},{"thirdPartyID":"qpcsAL4AAAAJ"},{"thirdPartyID":"jKD9jpoAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiI5ZTIxNDA3NS1mODcxLTQ2NjUtYjQzMi00YzU3M2EzM2I3MDYifQ==","name":"chen 
qingfeng","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMjAwMi4wNjQ4NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.06487"},{"id":"eyJwYXBlcklEIjoiMjMwMi4wMTQ3MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2302.01470"},{"id":"eyJwYXBlcklEIjoiMjMwNS4xODI0NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2305.18246"},{"id":"eyJwYXBlcklEIjoiMjExMi4xMTkyMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2112.11921"},{"id":"eyJwYXBlcklEIjoiMjQwNS4xNjc3MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2405.16771"},{"id":"eyJwYXBlcklEIjoiMjQwNy4wMTcwNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2407.01704"},{"id":"eyJwYXBlcklEIjoiMTkxMi4wOTA0MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1912.09040"},{"id":"eyJwYXBlcklEIjoiMjEwMy4wNTE0NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2103.05147"},{"id":"eyJwYXBlcklEIjoiMjEwNi4wOTg3NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.09876"},{"id":"eyJwYXBlcklEIjoiMjMxMC4wMTM2NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2310.01365"},{"id":"eyJwYXBlcklEIjoiMjIwNS4xMDg2OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2205.10868"},{"id":"eyJwYXBlcklEIjoiMjEwNS4xNDIxNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2105.14214"},{"id":"eyJwYXBlcklEIjoiMjQwNy4xMzIwNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2407.13205"},{"id":"eyJwYXBlcklEIjoiMjQwNy4xOTk0NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2407.19944"}]}]}},{"author":"alona fyshe","node":{"id":"eyJhZGRyZXNzIjoiYWxvbmFAdWFsYmVydGEuY2EifQ==","address":"alona@ualberta.ca","name":"Alona Fyshe","avatar":null,"linkedin":"https://www.linkedin.com/in/alona-fyshe-8054225","bio":null,"site":"http://www.cs.cmu.edu/~afyshe/","override":null,"membership":[{"name":"University of 
Victoria"}],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[],"scholar":[{"thirdPartyID":"Vw8z7qwAAAAJ"}],"twitter":[],"location":[{"formatted":"Victoria, BC, Canada"}],"owner":[{"id":"eyJ1aWQiOiJlYzQ4ZTE5MC0zNGJkLTRhMjMtYTQwNy01ZjUyZGNhNzM2YjYifQ==","name":"Alona Fyshe","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMjAwMi4wNjQ4NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.06487"},{"id":"eyJwYXBlcklEIjoiMjIwNC4wMjkwOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2204.02908"},{"id":"eyJwYXBlcklEIjoiMjQwNS4xODU3MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2405.18570"},{"id":"eyJwYXBlcklEIjoiMjMxMC4xNzM2OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2310.17369"},{"id":"eyJwYXBlcklEIjoiMjQwNS4wMTAxMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2405.01012"},{"id":"eyJwYXBlcklEIjoiMjIwNS4wODcxNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2205.08716"},{"id":"eyJwYXBlcklEIjoiMjIwOS4wMjU4MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2209.02582"},{"id":"eyJwYXBlcklEIjoiMjQwNS4xNzY2MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2405.17663"},{"id":"eyJwYXBlcklEIjoiMjMwNi4wMzM3NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2306.03375"},{"id":"eyJwYXBlcklEIjoiMjMwNC4wNDg4MSIsInB1Ymxpc2hlciI6ImFyeGl2I
n0=","publisher":"arxiv","paperID":"2304.04881"},{"id":"eyJwYXBlcklEIjoiMTkwNS4xMDY3OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1905.10679"},{"id":"eyJwYXBlcklEIjoiMjAxMC4wNzQzNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.07435"},{"id":"eyJwYXBlcklEIjoiMjIwMy4xMTk5MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2203.11992"},{"id":"eyJwYXBlcklEIjoiMjMwMS4wOTY0MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2301.09640"},{"id":"eyJwYXBlcklEIjoiMjIwNi4wMjg5MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2206.02892"},{"id":"eyJwYXBlcklEIjoiMjMwNi4wNTM4NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2306.05387"},{"id":"eyJwYXBlcklEIjoiMjIwOC4xMDU3NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2208.10576"},{"id":"eyJwYXBlcklEIjoiMjMwNy4wNTY0NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2307.05646"},{"id":"eyJwYXBlcklEIjoiMjQwMy4wMjI3MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2403.02271"}]}]}},{"author":"yangchen pan","node":{"id":"eyJhZGRyZXNzIjoicGFuNkB1YWxiZXJ0YS5jYSJ9","address":"pan6@ualberta.ca","name":"Pan","avatar":"https://img.fullcontact.com/static/1eeb5b08de2550317e7a468699a9fd7b_a4565e028d064416f721041f6b9507c0e28f90fe1f50984586ebd4cfa534cfca","linkedin":"https://www.linkedin.com/in/jonathan-pan-4b56bb61","bio":null,"site":null,"override":null,"membership":[{"name":"Cenovus 
Energy"}],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[{"avatar":"https://avatars.githubusercontent.com/u/14236651?v=4","username":"yannickycpan"}],"scholar":[{"thirdPartyID":"QyAsyYEAAAAJ"}],"twitter":[],"location":[{"formatted":"Canada"}],"owner":[{"id":"eyJ1aWQiOiI2NGQ3ZDAxZC02YzExLTQ5Y2EtOWE3Ny05MjVmMjIzM2YyZTQifQ==","name":"yangchen pan","github":[],"email":[{"avatar":"https://img.fullcontact.com/static/1eeb5b08de2550317e7a468699a9fd7b_a4565e028d064416f721041f6b9507c0e28f90fe1f50984586ebd4cfa534cfca"}],"authored":[{"id":"eyJwYXBlcklEIjoiMjAwMi4wNjQ4NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.06487"},{"id":"eyJwYXBlcklEIjoiMTgwNi4wNDYyNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1806.04624"},{"id":"eyJwYXBlcklEIjoiMTUxMS4wODQ5NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1511.08495"},{"id":"eyJwYXBlcklEIjoiMTYxMS4wOTMyOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1611.09328"},{"id":"eyJwYXBlcklEIjoiMTgxMC4wOTEwMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1810.09103"},{"id":"eyJwYXBlcklEIjoiMTgwNi4wNjkzMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1806.06931"},{"id":"eyJwYXBlcklEIjoiMjIxMS4xNDk2MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2211.14960"},{"id":"eyJwYXBlcklEIjoiMjMwMi4xNDM3MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2302.14372"},{"id":"eyJwYXBlcklEIjoiMjAwMi4wNTgyMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.05822"},{"id":"eyJwYXBlcklEIjoiMTkwNi4wNzc5MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1906.07791"},{"id":"eyJwYXBlcklEIjoiMjAwMi4wNjE5NSIsInB1Ymxpc2h
lciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.06195"},{"id":"eyJwYXBlcklEIjoiMTkxMS4wODA2OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1911.08068"},{"id":"eyJwYXBlcklEIjoiMjMxMS4xODQ5NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2311.18495"},{"id":"eyJwYXBlcklEIjoiMjIwNS4xMDg2OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2205.10868"},{"id":"eyJwYXBlcklEIjoiMjMwNy4wODg3MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2307.08873"},{"id":"eyJwYXBlcklEIjoiMjMwOC4wNjcwMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2308.06703"},{"id":"eyJwYXBlcklEIjoiMjAwNy4wOTU2OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2007.09569"},{"id":"eyJwYXBlcklEIjoiNzI1MDUiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"72505"},{"id":"eyJwYXBlcklEIjoiMjQwNC4xNTUxOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2404.15518"}]}]}}]},"__typename":"paper","authorArray":["Qingfeng Lan","Yangchen Pan","Alona Fyshe","Martha White"]}}],["$","$L18",null,{"container":true,"columns":100,"spacing":{"compact":0,"expanded":2,"large":3},"children":[["$","$L18",null,{"size":{"compact":100,"expanded":100,"large":68},"children":[["$","$7",null,{"children":["$","$L2c",null,{"publisher":"arxiv","paperID":"2002.06487","product":{"paper":"$1b:props:children:props:children:0:props:product","models":"$1b:props:children:props:children:0:props:product:models"},"isWhiteLabelled":false}]}],["$","$7",null,{"children":["$","$L2d",null,{"article":"$L2e","model":"$undefined"}]}]]}],["$","$L18",null,{"size":"grow","children":["$","$L2f",null,{}]}]]}],["$","$7",null,{"children":null}],[["$","audio",null,{"id":"tts"}],["$","$L30",null,{"paperID":"2002.06487","publisher":"arxiv","paperJSON":{"title":"Maxmin Q-learning: Controlling the Estimation Bias of 
Q-learning","paperID":"2002.06487","avgLineHeight":10.96,"imgScale":4,"sections":[{"heading":"ABSTRACT","paragraphs":[[{"text":"Q-learning suffers from overestimation bias, because it approximates the maximum action value using the maximum estimated action value. Algorithms have been proposed to reduce overestimation bias, but we lack an understanding of how bias interacts with performance, and the extent to which existing algorithms mitigate bias. In this paper, we 1) highlight that the effect of overestimation bias on learning efficiency is environment-dependent; 2) propose a generalization of Qlearning, called ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Maxmin Q-learning","element":"span"},{"text":", which provides a parameter to flexibly control bias; 3) show theoretically that there exists a parameter choice for Maxmin Q-learning that leads to unbiased estimation with a lower approximation variance than Q-learning; and 4) prove the convergence of our algorithm in the tabular case, as well as convergence of several previous Q-learning variants, using a novel Generalized Q-learning framework. We empirically verify that our algorithm better controls estimation bias in toy environments, and that it achieves superior performance on several benchmark problems. ","element":"span"},{"text":"1","element":"span"}]]},{"heading":"1 INTRODUCTION","paragraphs":[[{"text":"Q-learning ","element":"span"},{"href":"#id-0","referenceIndex":18,"text":"(Watkins, ","element":"a"},{"href":"#id-0","referenceIndex":18,"text":"1989) ","element":"a"},{"text":"is one of the most popular reinforcement learning algorithms. One of the reasons for this widespread adoption is the simplicity of the update. On each step, the agent updates its action value estimates towards the observed reward and the estimated value of the maximal action in the next state. 
This target represents the highest value the agent thinks it could obtain from the current state and action, given the observed reward.","element":"span"}],[{"text":"Unfortunately, this simple update rule has been shown to suffer from overestimation bias ","element":"span"},{"href":"#id-1","referenceIndex":15,"text":"(Thrun & ","element":"a"},{"href":"#id-1","referenceIndex":15,"text":"Schwartz, ","element":"a"},{"href":"#id-1","referenceIndex":15,"text":"1993; ","element":"a"},{"href":"#id-2","referenceIndex":17,"text":"van Hasselt, ","element":"a"},{"href":"#id-2","referenceIndex":17,"text":"2010)","element":"a"},{"text":". The agent updates with the maximum over action values might be large because an action’s value actually is high, or it can be misleadingly high simply because of the stochasticity or errors in the estimator. With many actions, there is a higher probability that one of the estimates is large simply due to stochasticity and the agent will overestimate the value. This issue is particularly problematic under function approximation, and can significant impede the quality of the learned policy ","element":"span"},{"href":"#id-1","referenceIndex":15,"text":"(Thrun & Schwartz, ","element":"a"},{"href":"#id-1","referenceIndex":15,"text":"1993; ","element":"a"},{"href":"#id-3","referenceIndex":13,"text":"Szita & L˝orincz, ","element":"a"},{"href":"#id-3","referenceIndex":13,"text":"2008; ","element":"a"},{"href":"#id-4","referenceIndex":11,"text":"Strehl et al., ","element":"a"},{"href":"#id-4","referenceIndex":11,"text":"2009) ","element":"a"},{"text":"or even lead to failures of Q-learning ","element":"span"},{"href":"#id-1","referenceIndex":15,"text":"(Thrun & Schwartz, ","element":"a"},{"href":"#id-1","referenceIndex":15,"text":"1993)","element":"a"},{"text":". 
More recently, experiments across several domains suggest that this overestimation problem is common ","element":"span"},{"href":"#id-5","referenceIndex":8,"text":"(Hado van Hasselt et al., ","element":"a"},{"href":"#id-5","referenceIndex":8,"text":"2016)","element":"a"},{"text":".","element":"span"}],[{"text":"Double Q-learning ","element":"span"},{"href":"#id-2","referenceIndex":17,"text":"(van Hasselt, ","element":"a"},{"href":"#id-2","referenceIndex":17,"text":"2010) ","element":"a"},{"text":"is introduced to instead ensure ","element":"span"},{"style":{"fontStyle":"italic"},"text":"under","element":"span"},{"text":"estimation bias. The idea is to maintain two unbiased independent estimators of the action values. The expected action value of estimator one is selected for the maximal action from estimator two, which is guaranteed not to overestimate the true maximum action value. Double DQN ","element":"span"},{"href":"#id-5","referenceIndex":8,"text":"(Hado van Hasselt et al., ","element":"a"},{"href":"#id-5","referenceIndex":8,"text":"2016)","element":"a"},{"text":", the extension of this idea to Q-learning with neural networks, has been shown to significantly improve performance over Q-learning. However, this is not a complete answer to this problem, because trading overestimation bias for underestimation bias is not always desirable, as we show in our experiments.","element":"span"}],[{"text":"Several other methods have been introduced to reduce overestimation bias, without fully moving towards underestimation. Weighted Double Q-learning ","element":"span"},{"href":"#id-6","referenceIndex":21,"text":"(Zhang et al., ","element":"a"},{"href":"#id-6","referenceIndex":21,"text":"2017) ","element":"a"},{"text":"uses a weighted combination of the Double Q-learning estimate, which likely has underestimation bias, and the Q-learning estimate, which likely has overestimation bias. 
Bias-corrected Q-Learning ","element":"span"},{"href":"#id-7","referenceIndex":9,"text":"(Lee et al., ","element":"a"},{"href":"#id-7","referenceIndex":9,"text":"2013) ","element":"a"},{"text":"reduces the overestimation bias through a bias correction term. Ensemble Q-learning and Averaged Q-learning ","element":"span"},{"href":"#id-8","referenceIndex":1,"text":"(Anschel et al., ","element":"a"},{"href":"#id-8","referenceIndex":1,"text":"2017) ","element":"a"},{"text":"take averages of multiple action values, to both reduce the overestimation bias and the estimation variance. However, with a finite number of action-value functions, the average operation in these two algorithms will never completely remove the overestimation bias, as the average of several overestimation biases is always positive. Further, these strategies do not guide how strongly we should correct for overestimation bias, nor how to determine—or control— the level of bias.","element":"span"}],[{"text":"The overestimation bias also appears in the actor-critic setting ","element":"span"},{"href":"#id-9","referenceIndex":6,"text":"(Fujimoto et al., ","element":"a"},{"href":"#id-9","referenceIndex":6,"text":"2018; ","element":"a"},{"href":"#id-10","referenceIndex":7,"text":"Haarnoja ","element":"a"},{"href":"#id-10","referenceIndex":7,"text":"et al., ","element":"a"},{"href":"#id-10","referenceIndex":7,"text":"2018)","element":"a"},{"text":". For example, ","element":"span"},{"href":"#id-9","referenceIndex":6,"text":"Fujimoto et al. ","element":"a"},{"href":"#id-9","referenceIndex":6,"text":"(2018) ","element":"a"},{"text":"propose the Twin Delayed Deep Deterministic policy gradient algorithm (TD3) which reduces the overestimation bias by taking the minimum value between two critics. However, they do not provide a rigorous theoretical analysis for the effect of applying the minimum operator. 
There is also no theoretical guide for choosing the number of estimators such that the overestimation bias can be reduced to 0.","element":"span"}],[{"text":"$31","element":"span"}]]},{"heading":"2 PROBLEM SETTING","paragraphs":[[{"text":"We formalize the problem as a Markov Decision Process (MDP), ","element":"span"},{"style":{"height":16},"width":228.12,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/1-0.png","element":"img","alt":" (S, A, P, r, γ)","inline":true},{"text":", where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"text":"is the state space, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A ","element":"span"},{"text":"is the action space, ","element":"span"},{"style":{"height":16},"width":358,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/1-1.png","element":"img","alt":" P : S×A×S → [0, 1]","inline":true,"padRight":true},{"text":"is the transition probabilities, ","element":"span"},{"style":{"height":12.4},"width":299.33,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/1-2.png","element":"img","alt":" r : S×A×S → R","inline":true,"padRight":true},{"text":"is the reward mapping, and ","element":"span"},{"style":{"height":16},"width":153.22,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/1-3.png","element":"img","alt":" γ ∈ [0, 1]","inline":true,"padRight":true},{"text":"is the discount factor. 
At each time step ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":", the agent observes a state ","element":"span"},{"style":{"height":13.19},"width":117.27,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/1-4.png","element":"img","alt":" St ∈ S","inline":true,"padRight":true},{"text":"and takes an action ","element":"span"},{"style":{"height":13.99},"width":128.73,"height":34.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/1-5.png","element":"img","alt":" At ∈ A","inline":true,"padRight":true},{"text":"and then transitions to a new state ","element":"span"},{"style":{"height":14.79},"width":157.62,"height":36.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/1-6.png","element":"img","alt":" St+1 ∈ S","inline":true,"padRight":true},{"text":"according to the transition probabilities ","element":"span"},{"text":"P ","element":"span"},{"text":"and receives a scalar reward ","element":"span"},{"style":{"height":16},"width":462.18,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/1-7.png","element":"img","alt":" Rt+1 = r(St, At, St+1) ∈ R","inline":true},{"text":". The goal of the agent is to find a policy ","element":"span"},{"style":{"height":16},"width":309.97,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/1-8.png","element":"img","alt":" π : S × A → [0, 1]","inline":true,"padRight":true},{"text":"that maximizes the expected return starting from some initial state.","element":"span"}],[{"text":"Q-learning is an off-policy algorithm which attempts to learn the state-action values ","element":"span"},{"style":{"height":14.8},"width":253.96,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/1-9.png","element":"img","alt":" Q : S ×A → R","inline":true,"padRight":true},{"text":"for the optimal policy. 
It tries to solve for","element":"span"}],[{"style":{"width":"61%"},"width":970,"height":76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/1-10.png","element":"img"}],[{"text":"The optimal policy is to act greedily with respect to these action values: from each ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s ","element":"span"},{"text":"select ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a ","element":"span"},{"text":"from ","element":"span"},{"style":{"height":16.7},"width":348.59,"height":41.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/1-11.png","element":"img","alt":" arg maxa∈A Q∗(s, a)","inline":true},{"text":". ","element":"span"},{"text":"The update rule for an approximation ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Q ","element":"span"},{"text":"for a sampled transition ","element":"span"},{"style":{"height":10.79},"width":264.48,"height":26.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/1-12.png","element":"img","alt":"st, at, rt+1, st+1","inline":true,"padRight":true},{"text":"is:","element":"span"}],[{"style":{"width":"94%"},"width":1503,"height":69,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/1-13.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":6.8},"width":26,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/1-14.png","element":"img","alt":" α","inline":true,"padRight":true},{"text":"is the step-size. The transition can be generated off-policy, from any behaviour that sufficiently covers the state space. 
This algorithm is known to converge in the tabular setting ","element":"span"},{"href":"#id-11","referenceIndex":16,"text":"(Tsitsiklis, ","element":"a"},{"href":"#id-11","referenceIndex":16,"text":"1994)","element":"a"},{"text":", with some limited results for the function approximation setting ","element":"span"},{"href":"#id-12","referenceIndex":10,"text":"(Melo & Ribeiro, ","element":"a"},{"href":"#id-12","referenceIndex":10,"text":"2007)","element":"a"},{"text":".","element":"span"}]]},{"heading":"3 UNDERSTANDING WHEN OVERESTIMATION BIAS HELPS AND HURTS","paragraphs":[[{"text":"In this section, we briefly discuss the estimation bias issue, and empirically show that both overestimation and underestimation bias may improve learning performance, depending on the environment. This motivates our Maxmin Q-learning algorithm described in the next section, which allows us to flexibly control the estimation bias and reduce the estimation variance.","element":"span"}],[{"text":"The overestimation bias occurs since the target ","element":"span"},{"style":{"height":16},"width":343.96,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/2-0.png","element":"img","alt":" maxa′∈A Q(st+1, a′)","inline":true,"padRight":true},{"text":"is used in the Q-learning update. Because ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Q ","element":"span"},{"text":"is an approximation, it is probable that the approximation is higher than the true value for one or more of the actions. The maximum over these estimators, then, is likely to be skewed towards an overestimate. 
For example, even unbiased estimates ","element":"span"},{"style":{"height":16},"width":186.02,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/2-1.png","element":"img","alt":" Q(st+1, a′)","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"height":6.8},"width":35.07,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/2-2.png","element":"img","alt":" a′","inline":true},{"text":", will vary due to stochasticity. ","element":"span"},{"style":{"height":16},"width":540.98,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/2-3.png","element":"img","alt":" Q(st+1, a′) = Q∗(st+1, a′) + ea′","inline":true},{"text":", and for some actions, ","element":"span"},{"style":{"height":9.19},"width":45.84,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/2-4.png","element":"img","alt":" ea′","inline":true,"padRight":true},{"text":"will be positive. As a result, ","element":"span"},{"style":{"height":16},"width":1252.86,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/2-5.png","element":"img","alt":" E[maxa′∈A Q(st+1, a′)] ≥ maxa′∈A E[Q(st+1, a′)] = maxa′∈A Q∗(st+1, a′)","inline":true},{"text":".","element":"span"}],[{"text":"This overestimation bias, however, may not always be detrimental. And, further, in some cases, erring towards an underestimation bias can be harmful. Overestimation bias can help encourage exploration for overestimated actions, whereas underestimation bias might discourage exploration. In particular, we expect more overestimation bias in highly stochastic areas of the world; if those highly stochastic areas correspond to high-value regions, then encouraging exploration there might be beneficial. An underestimation bias might actually prevent an agent from learning that a region is high-value. 
Alternatively, if highly stochastic areas also have low values, overestimation bias might cause an agent to over-explore a low-value region.","element":"span"}],[{"text":"We show this effect in the simple MDP, shown in Figure ","element":"span"},{"href":"#id-13","text":"1. ","element":"a"},{"text":"The MDP for state ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A ","element":"span"},{"text":"has only two actions: Left and Right. It has a deterministic neutral reward for both the Left action and the Right action. The Left action transitions to state ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"where there are eight actions that transition to a terminal state with a highly stochastic reward. The mean of this stochastic reward is ","element":"span"},{"style":{"height":10},"width":24,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/2-6.png","element":"img","alt":" µ","inline":true},{"text":". By selecting ","element":"span"},{"style":{"height":14},"width":100.08,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/2-7.png","element":"img","alt":" µ > 0","inline":true},{"text":", the stochastic region becomes high-value, and we expect overestimation bias to help and underestimation bias to hurt. 
By selecting ","element":"span"},{"style":{"height":14},"width":101.44,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/2-8.png","element":"img","alt":" µ < 0","inline":true},{"text":", the stochastic region becomes low-value, and we expect overestimation bias to hurt and underestimation bias to help.","element":"span"}],[{"style":{"width":"46%"},"width":729,"height":144,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/2-9.png","element":"img"}],[{"text":"Figure 1: A simple episodic MDP, adapted from Figure ","element":"figcaption","subtype":"caption"},{"text":"6","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":".","element":"figcaption","subtype":"caption"},{"text":"5 ","element":"figcaption","subtype":"caption"},{"text":"in ","element":"figcaption","subtype":"caption"},{"href":"#id-14","referenceIndex":12,"text":"Sutton & Barto ","element":"a","subtype":"caption"},{"href":"#id-14","referenceIndex":12,"text":"(2018) ","element":"a","subtype":"caption"},{"text":"which is used to highlight the difference between Double Q-learning and Q-learning. This MDP has two nonterminal states ","element":"figcaption","subtype":"caption"},{"id":"id-13","style":{"fontStyle":"italic"},"text":"A ","element":"figcaption","subtype":"caption"},{"text":"and ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"B","element":"figcaption","subtype":"caption"},{"text":". Every episode starts from ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"A ","element":"figcaption","subtype":"caption"},{"text":"which has two actions: Left and Right. The Right action transitions to a terminal state with reward ","element":"figcaption","subtype":"caption"},{"text":"0","element":"figcaption","subtype":"caption"},{"text":". 
The Left action transitions to state ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"B ","element":"figcaption","subtype":"caption"},{"text":"with reward ","element":"figcaption","subtype":"caption"},{"text":"0","element":"figcaption","subtype":"caption"},{"text":". From state ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"B","element":"figcaption","subtype":"caption"},{"text":", there are 8 actions that all transition to a terminal state with a reward ","element":"figcaption","subtype":"caption"},{"style":{"height":14},"width":86.66,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/2-10.png","element":"img","alt":" µ + ξ","inline":true},{"text":", where ","element":"figcaption","subtype":"caption"},{"style":{"height":14},"width":18,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/2-11.png","element":"img","alt":" ξ","inline":true,"padRight":true},{"text":"is drawn from a uniform distribution ","element":"figcaption","subtype":"caption"},{"style":{"height":16},"width":151.61,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/2-12.png","element":"img","alt":" U(−1, 1)","inline":true},{"text":". 
When ","element":"figcaption","subtype":"caption"},{"style":{"height":14},"width":98.56,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/2-13.png","element":"img","alt":" µ > 0","inline":true},{"text":", the optimal action in state ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"A ","element":"figcaption","subtype":"caption"},{"text":"is Left; when ","element":"figcaption","subtype":"caption"},{"style":{"height":14},"width":97.14,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/2-14.png","element":"img","alt":" µ < 0","inline":true},{"text":", it is Right.","element":"figcaption","subtype":"caption"}],[{"text":"We test Q-learning, Double Q-learning and our new algorithm Maxmin Q-learning in this environment. Maxmin Q-learning (described fully in the next section) uses ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"estimates of the action values in the targets. For ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"= 1","element":"span"},{"text":", it corresponds to Q-learning; otherwise, it progresses from overestimation bias at ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"= 1 ","element":"span"},{"text":"towards underestimation bias with increasing ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N","element":"span"},{"text":". 
In the experiment, we used a discount factor ","element":"span"},{"style":{"height":14.4},"width":103.65,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/2-15.png","element":"img","alt":" γ = 1","inline":true},{"text":"; a replay buffer with size ","element":"span"},{"text":"100","element":"span"},{"text":"; an ","element":"span"},{"style":{"height":0},"width":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/2-16.png","element":"img","alt":" ϵ","inline":true},{"text":"-greedy behaviour with ","element":"span"},{"style":{"height":10.8},"width":127.96,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/2-17.png","element":"img","alt":" ϵ = 0.1","inline":true},{"text":"; tabular action-values, initialized with a Gaussian distribution ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N","element":"span"},{"text":"(0","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"0","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"},{"text":"01)","element":"span"},{"text":"; and a step-size of ","element":"span"},{"text":"0","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"},{"text":"01 ","element":"span"},{"text":"for all algorithms.","element":"span"}],[{"text":"The results in Figure ","element":"span"},{"href":"#id-15","text":"2 ","element":"a"},{"text":"verify our hypotheses for when overestimation and underestimation bias help and hurt. Double Q-learning underestimates too much for ","element":"span"},{"style":{"height":14},"width":133.12,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/2-18.png","element":"img","alt":" µ = +1","inline":true},{"text":", and converges to a suboptimal policy. 
Q-learning learns the optimal policy the fastest, though for all values of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"= 2","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"4","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"6","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"8","element":"span"},{"text":", Maxmin Q-learning does progress towards the optimal policy. All methods get to the optimal policy for ","element":"span"},{"style":{"height":14},"width":141.4,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/2-19.png","element":"img","alt":" µ = −1","inline":true},{"text":", but now Double Q-learning reaches the optimal policy the fastest, and followed by Maxmin Q-learning with larger ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N","element":"span"},{"text":".","element":"span"}]]},{"heading":"4 MAXMIN Q-LEARNING","paragraphs":[[{"text":"In this section, we develop Maxmin Q-learning, a simple generalization of Q-learning designed to control the estimation bias, as well as reduce the estimation variance of action values. The idea is","element":"span"}],[{"style":{"width":"97%"},"width":1540,"height":623,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/3-0.png","element":"img"}],[{"text":"Figure 2: Comparison of three algorithms using the simple MDP in Figure ","element":"figcaption","subtype":"caption"},{"href":"#id-13","text":"1 ","element":"a","subtype":"caption"},{"text":"with different values of ","element":"figcaption","subtype":"caption"},{"style":{"height":10},"width":24,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/3-1.png","element":"img","alt":" µ","inline":true},{"id":"id-15","text":", and thus different expected rewards. 
For ","element":"figcaption","subtype":"caption"},{"style":{"height":14},"width":160.8,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/3-2.png","element":"img","alt":" µ = +0.1","inline":true},{"text":", shown in (a), the optimal ","element":"figcaption","subtype":"caption"},{"style":{"height":0},"width":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/3-3.png","element":"img","alt":" ϵ","inline":true},{"text":"-greedy policy is to take the Left action with ","element":"figcaption","subtype":"caption"},{"text":"95% ","element":"figcaption","subtype":"caption"},{"text":"probability. For ","element":"figcaption","subtype":"caption"},{"style":{"height":14},"width":159.14,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/3-4.png","element":"img","alt":" µ = −0.1","inline":true},{"text":", shown in (b), the optimal policy is to take the Left action with ","element":"figcaption","subtype":"caption"},{"text":"5% ","element":"figcaption","subtype":"caption"},{"text":"probability. The reported distance is the absolute difference between the probability of taking the Left action under the learned policy compared to the optimal ","element":"figcaption","subtype":"caption"},{"style":{"height":0},"width":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/3-5.png","element":"img","alt":" ϵ","inline":true},{"text":"-greedy policy. 
All results were averaged over ","element":"figcaption","subtype":"caption"},{"text":"5","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":", ","element":"figcaption","subtype":"caption"},{"text":"000 ","element":"figcaption","subtype":"caption"},{"text":"runs.","element":"figcaption","subtype":"caption"}],[{"text":"to maintain ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"estimates of the action values, ","element":"span"},{"style":{"height":16.18},"width":42.5,"height":40.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/3-6.png","element":"img","alt":" Qi","inline":true},{"text":", and use the minimum of these estimates in the Q-learning target: ","element":"span"},{"style":{"height":18.66},"width":498.85,"height":46.66,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/3-7.png","element":"img","alt":" maxa′ mini∈{1,...,N} Qi(s′, a′)","inline":true},{"text":". For ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"= 1","element":"span"},{"text":", the update is simply Q-learning, and so likely has overestimation bias. As ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"increases, the overestimation decreases; for some ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N > ","element":"span"},{"text":"1","element":"span"},{"text":", this maxmin estimator switches from an overestimate, in expectation, to an underestimate. We characterize the relationship between ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"and the expected estimation bias below in Theorem ","element":"span"},{"href":"#id-16","text":"1. 
","element":"a"},{"text":"Note that Maxmin Q-learning uses a different mechanism to reduce overestimation bias than Double Q-learning; Maxmin Q-learning with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"= 2 ","element":"span"},{"text":"is not Double Q-learning.","element":"span"}],[{"text":"The full algorithm is summarized in Algorithm ","element":"span"},{"href":"#id-17","text":"1, ","element":"a"},{"text":"and is a simple modification of Q-learning with experience replay. We use random subsamples of the observed data for each of the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"estimators, to make them nearly independent. To do this training online, we keep a replay buffer. On each step, a random estimator ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i ","element":"span"},{"text":"is chosen and updated using a mini-batch from the buffer. Multiple such updates can be performed on each step, just like in experience replay, meaning multiple estimators can be updated per step using different random mini-batches. In our experiments, to better match DQN, we simply do one update per step. Finally, it is also straightforward to incorporate target networks to get Maxmin DQN, by maintaining a target network for each estimator.","element":"span"}],[{"text":"We now characterize the relation between the number of action-value functions used in Maxmin Q-learning and the estimation bias of action values. For compactness, we write ","element":"span"},{"style":{"height":16.92},"width":63.54,"height":42.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/3-8.png","element":"img","alt":" Qisa","inline":true,"padRight":true},{"text":"instead of ","element":"span"},{"style":{"height":16.98},"width":133.72,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/3-9.png","element":"img","alt":"Qi(s, a)","inline":true},{"text":". 
Each ","element":"span"},{"style":{"height":16.92},"width":63.54,"height":42.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/3-10.png","element":"img","alt":" Qisa","inline":true,"padRight":true},{"text":"has random approximation error ","element":"span"},{"style":{"height":16.92},"width":50.59,"height":42.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/3-11.png","element":"img","alt":" eisa","inline":true}],[{"style":{"width":"18%"},"width":293,"height":46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/3-12.png","element":"img"}],[{"text":"We assume that ","element":"span"},{"style":{"height":16.92},"width":50.59,"height":42.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/3-13.png","element":"img","alt":" eisa","inline":true,"padRight":true},{"text":"is a uniform random variable ","element":"span"},{"style":{"height":16},"width":153.41,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/3-14.png","element":"img","alt":" U(−τ, τ)","inline":true,"padRight":true},{"text":"for some ","element":"span"},{"style":{"height":11.6},"width":100.03,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/3-15.png","element":"img","alt":" τ > 0","inline":true},{"text":". The uniform random ","element":"span"},{"text":"assumption was used by ","element":"span"},{"href":"#id-1","referenceIndex":15,"text":"Thrun & Schwartz ","element":"a"},{"href":"#id-1","referenceIndex":15,"text":"(1993) ","element":"a"},{"text":"to demonstrate bias in Q-learning, and reflects that non-negligible positive and negative ","element":"span"},{"style":{"height":16.93},"width":50.59,"height":42.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/3-16.png","element":"img","alt":" eisa","inline":true,"padRight":true},{"text":"are possible. 
Notice that for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"estimators with ","element":"span"},{"style":{"height":9.19},"width":55.96,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/3-17.png","element":"img","alt":" nsa","inline":true,"padRight":true},{"text":"samples, the ","element":"span"},{"style":{"height":6.8},"width":21,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/3-18.png","element":"img","alt":" τ","inline":true,"padRight":true},{"text":"will be proportional to some function of ","element":"span"},{"style":{"height":16},"width":111.94,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/3-19.png","element":"img","alt":" nsa/N","inline":true},{"text":", because the data will be shared amongst the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"estimators. For the general theorem, we use a generic ","element":"span"},{"style":{"height":6.8},"width":21,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/3-20.png","element":"img","alt":" τ","inline":true},{"text":", and in the following corollary provide a specific form for ","element":"span"},{"style":{"height":6.8},"width":21,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/3-21.png","element":"img","alt":" τ","inline":true,"padRight":true},{"text":"in terms of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"and ","element":"span"},{"style":{"height":9.19},"width":55.95,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/3-22.png","element":"img","alt":" nsa","inline":true},{"text":".","element":"span"}],[{"text":"Recall that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M ","element":"span"},{"text":"is the number of actions applicable at state 
","element":"span"},{"style":{"height":6.8},"width":32.68,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/3-23.png","element":"img","alt":" s′","inline":true},{"text":". Define the estimation bias ","element":"span"},{"style":{"height":13.19},"width":87.56,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/3-24.png","element":"img","alt":" ZMN","inline":true,"padRight":true},{"text":"for transition ","element":"span"},{"style":{"height":10},"width":142.43,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/3-25.png","element":"img","alt":" s, a, r, s′","inline":true,"padRight":true},{"text":"to be","element":"span"}],[{"style":{"width":"49%"},"width":786,"height":147,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/3-26.png","element":"img"}],[{"id":"id-17","style":{"width":"100%"},"width":1584,"height":1042,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/4-0.png","element":"img"}],[{"text":"We now show how the expected estimation bias ","element":"span"},{"style":{"height":16},"width":144.59,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/4-1.png","element":"img","alt":" E[ZMN]","inline":true,"padRight":true},{"text":"and the variance of ","element":"span"},{"style":{"height":16.92},"width":91.06,"height":42.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/4-2.png","element":"img","alt":" Qminsa","inline":true,"padRight":true},{"text":"are related to the number of action-value functions ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"in Maxmin Q-learning.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Theorem 1 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Under the conditions stated above and assume all actions share the same true 
action-value,","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"(i) the expected estimation bias is","element":"span"}],[{"id":"id-16","style":{"width":"87%"},"width":1381,"height":101,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/4-3.png","element":"img"}],[{"style":{"height":16},"width":144.59,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/4-4.png","element":"img","alt":"E[ZMN]","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"decreases as ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"style":{"fontStyle":"italic"},"text":"increases: ","element":"span"},{"style":{"height":20.97},"width":369.74,"height":52.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/4-5.png","element":"img","alt":" E[ZM,N=1] = γτ M−1M+1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"height":16.79},"width":343.95,"height":41.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/4-6.png","element":"img","alt":" E[ZM,N→∞] = −γτ","inline":true},{"style":{"fontStyle":"italic"},"text":".","element":"span"}],[{"style":{"width":"67%"},"width":1076,"height":112,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/4-7.png","element":"img"}],[{"style":{"height":16.98},"width":187.07,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/4-8.png","element":"img","alt":"V ar[Qminsa ]","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"decreases as ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"style":{"fontStyle":"italic"},"text":"increases: ","element":"span"},{"style":{"height":22.18},"width":263.7,"height":55.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/4-9.png","element":"img","alt":" V ar[Qminsa ]= τ 
23","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"for N=1 and ","element":"span"},{"style":{"height":16.98},"width":260.31,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/4-10.png","element":"img","alt":" V ar[Qminsa ] = 0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"for ","element":"span"},{"style":{"height":11.2},"width":138.36,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/4-11.png","element":"img","alt":" N → ∞","inline":true},{"style":{"fontStyle":"italic"},"text":".","element":"span"}],[{"text":"Theorem ","element":"span"},{"href":"#id-16","text":"1 ","element":"a"},{"text":"is a generalization of the first lemma in ","element":"span"},{"href":"#id-1","referenceIndex":15,"text":"Thrun & Schwartz ","element":"a"},{"href":"#id-1","referenceIndex":15,"text":"(1993)","element":"a"},{"text":"; we provide the proof in Appendix ","element":"span"},{"href":"#id-18","referenceIndex":22,"text":"A ","element":"a"},{"text":"as well as a visualization of the expected bias for varying ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N","element":"span"},{"text":". This theorem shows that the average estimation bias ","element":"span"},{"style":{"height":16},"width":144.6,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/4-12.png","element":"img","alt":" E[ZMN]","inline":true},{"text":", decreases as ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"increases. Thus, we can control the bias by changing the number of estimators in Maxmin Q-learning. Specifically, the average estimation bias can be reduced from positive to negative as ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"increases. 
Notice that ","element":"span"},{"style":{"height":16},"width":221.18,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/4-13.png","element":"img","alt":" E[ZMN] = 0","inline":true,"padRight":true},{"text":"when ","element":"span"},{"style":{"height":19.37},"width":156.58,"height":48.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/4-14.png","element":"img","alt":" tMN = 12","inline":true},{"text":". This suggests that by choosing ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"such that ","element":"span"},{"style":{"height":19.37},"width":156.58,"height":48.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/4-15.png","element":"img","alt":" tMN ≈ 12","inline":true},{"text":", we can reduce the bias to ","element":"span"},{"text":"near ","element":"span"},{"text":"0","element":"span"},{"text":".","element":"span"}],[{"text":"Furthermore, ","element":"span"},{"style":{"height":16.98},"width":187.07,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/4-16.png","element":"img","alt":" V ar[Qminsa ]","inline":true,"padRight":true},{"text":"decreases as ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"increases. This indicates that we can control the estimation ","element":"span"},{"text":"variance of target action value through ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N","element":"span"},{"text":". We show just this in the following Corollary. The subtlety is that with increasing ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N","element":"span"},{"text":", each estimator will receive less data. 
The fair comparison is to compare the variance of a single estimator that uses all of the data, as compared to the maxmin estimator which shares the samples across ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"estimators. We show that there is an ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"such that the variance is lower, which arises largely due to the fact that the variance of each estimator decreases linearly in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n","element":"span"},{"text":", but the ","element":"span"},{"style":{"height":6.8},"width":21,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/4-17.png","element":"img","alt":" τ","inline":true,"padRight":true},{"text":"parameter for each estimator only decreases at a square root rate in the number of samples.","element":"span"}],[{"id":"id-35","style":{"fontWeight":"bold"},"text":"Corollary 1 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Assuming the ","element":"span"},{"style":{"height":9.19},"width":55.95,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/4-18.png","element":"img","alt":" nsa","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"samples are evenly allocated amongst the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"style":{"fontStyle":"italic"},"text":"estimators, then ","element":"span"},{"style":{"height":6.8},"width":67.88,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/4-19.png","element":"img","alt":" τ =","inline":true},{"style":{"height":19.2},"width":210.77,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/4-20.png","element":"img","alt":"√(3σ2N/nsa)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"where 
","element":"span"},{"style":{"height":13.39},"width":40.2,"height":33.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/4-21.png","element":"img","alt":" σ2","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is the variance of samples for ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"s, a","element":"span"},{"text":") ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and, for ","element":"span"},{"style":{"height":14},"width":63.54,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/4-22.png","element":"img","alt":" Qsa","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"the estimator that uses all","element":"span"}],[{"style":{"width":"97%"},"width":1540,"height":624,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/5-0.png","element":"img"}],[{"text":"Figure 3: Comparison of four algorithms on Mountain Car under different reward variances. The lines in ","element":"figcaption","subtype":"caption"},{"text":"(","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"a","element":"figcaption","subtype":"caption"},{"id":"id-20","text":") ","element":"figcaption","subtype":"caption"},{"text":"show the average number of steps taken in the last episode with one standard error. The lines in ","element":"figcaption","subtype":"caption"},{"text":"(","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"b","element":"figcaption","subtype":"caption"},{"text":") ","element":"figcaption","subtype":"caption"},{"text":"show the number of steps to reach the goal position during training when the reward variance ","element":"figcaption","subtype":"caption"},{"style":{"height":13.38},"width":151.59,"height":33.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/5-1.png","element":"img","alt":" σ2 = 10","inline":true},{"text":". 
All results were averaged across ","element":"figcaption","subtype":"caption"},{"text":"100 ","element":"figcaption","subtype":"caption"},{"text":"runs, with standard errors. Additional experiments with further elevated ","element":"figcaption","subtype":"caption"},{"style":{"height":13.38},"width":40.2,"height":33.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/5-2.png","element":"img","alt":" σ2","inline":true,"padRight":true},{"text":"can be found in Appendix ","element":"figcaption","subtype":"caption"},{"href":"#id-19","referenceIndex":214,"text":"C.2.","element":"a","subtype":"caption"}],[{"style":{"height":9.19},"width":55.95,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/5-3.png","element":"img","alt":"nsa","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"samples for a single estimate,","element":"span"}],[{"style":{"width":"44%"},"width":707,"height":97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/5-4.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Under this uniform random noise assumption, for ","element":"span"},{"style":{"height":16.98},"width":529.77,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/5-5.png","element":"img","alt":" N ≥ 8, V ar[Qminsa ] < V ar[Qsa]","inline":true},{"style":{"fontStyle":"italic"},"text":".","element":"span"}]]},{"heading":"5 EXPERIMENTS","paragraphs":[[{"text":"In this section, we first investigate robustness to reward variance, in a simple environment (Mountain Car) in which we can perform more exhaustive experiments. 
Then, we investigate performance in seven benchmark environments.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Robustness under increasing reward variance in Mountain Car ","element":"span"},{"text":"Mountain Car ","element":"span"},{"href":"#id-14","referenceIndex":12,"text":"(Sutton & ","element":"a"},{"href":"#id-14","referenceIndex":12,"text":"Barto, ","element":"a"},{"href":"#id-14","referenceIndex":12,"text":"2018) ","element":"a"},{"text":"is a classic testbed in Reinforcement Learning, where the agent receives a reward of ","element":"span"},{"style":{"height":10.8},"width":51,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/5-6.png","element":"img","alt":" −1","inline":true,"padRight":true},{"text":"per step with ","element":"span"},{"style":{"height":14.4},"width":110.43,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/5-7.png","element":"img","alt":" γ = 1","inline":true},{"text":", until the car reaches the goal position and the episode ends. In our experiment, we modify the rewards to be stochastic with the same mean value: the reward signal is sampled from a Gaussian distribution ","element":"span"},{"style":{"height":17.38},"width":180.78,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/5-8.png","element":"img","alt":" N(−1, σ2)","inline":true,"padRight":true},{"text":"on each time step. An agent should learn to reach the goal position in as few steps as possible.","element":"span"}],[{"text":"The experimental setup is as follows. We trained each algorithm with ","element":"span"},{"text":"1","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"000 ","element":"span"},{"text":"episodes. The number of steps to reach the goal position in the last training episode was used as the performance measure. The fewer steps, the better performance. 
All experimental results were averaged over ","element":"span"},{"text":"100 ","element":"span"},{"text":"runs. The key algorithm settings included the function approximator, step-sizes, exploration parameter and replay buffer size. All algorithms used ","element":"span"},{"style":{"height":0},"width":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/5-9.png","element":"img","alt":" ϵ","inline":true},{"text":"-greedy with ","element":"span"},{"style":{"height":10.8},"width":135.43,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/5-10.png","element":"img","alt":" ϵ = 0.1","inline":true,"padRight":true},{"text":"and a buffer size of ","element":"span"},{"text":"100","element":"span"},{"text":". For each algorithm, the best step-size was chosen from ","element":"span"},{"style":{"fontStyle":"italic"},"text":"{","element":"span"},{"text":"0","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"},{"text":"005","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"0","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"},{"text":"01","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"0","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"},{"text":"02","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"0","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"},{"text":"04","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"0","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"},{"text":"08","element":"span"},{"style":{"fontStyle":"italic"},"text":"}","element":"span"},{"text":", separately for each reward setting. 
Tile-coding was used to approximate the action-value function, where we used ","element":"span"},{"text":"8 ","element":"span"},{"text":"tilings with each tile covering ","element":"span"},{"text":"1","element":"span"},{"style":{"fontStyle":"italic"},"text":"/","element":"span"},{"text":"8","element":"span"},{"text":"th of the bounded distance in each dimension. For Maxmin Q-learning, we randomly chose one action-value function to update at each step.","element":"span"}],[{"text":"As shown in Figure ","element":"span"},{"href":"#id-20","text":"3, ","element":"a"},{"text":"when the reward variance is small, the performance of Q-learning, Double Q-learning, Averaged Q-learning, and Maxmin Q-learning are comparable. However, as the variance increases, Q-learning, Double Q-learning, and Averaged Q-learning became much less stable than Maxmin Q-learning. In fact, when the variance was very high (","element":"span"},{"style":{"height":11.2},"width":131.79,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/5-11.png","element":"img","alt":"σ = 50","inline":true},{"text":", see Appendix ","element":"span"},{"href":"#id-19","referenceIndex":214,"text":"C.2)","element":"a"},{"text":", Q-learning and Averaged Q-learning failed to reach the goal position in ","element":"span"},{"text":"5","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"000 ","element":"span"},{"text":"steps, and Double Q-learning produced runs ","element":"span"},{"style":{"fontStyle":"italic"},"text":"> ","element":"span"},{"text":"400 ","element":"span"},{"text":"steps, even after many episodes.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Results on Benchmark Environments ","element":"span"},{"text":"To evaluate Maxmin DQN, we choose seven games from Gym ","element":"span"},{"href":"#id-21","referenceIndex":4,"text":"(Brockman et al., ","element":"a"},{"href":"#id-21","referenceIndex":4,"text":"2016)","element":"a"},{"text":", 
PyGame Learning Environment (PLE) ","element":"span"},{"href":"#id-22","referenceIndex":14,"text":"(Tasfi, ","element":"a"},{"href":"#id-22","referenceIndex":14,"text":"2016)","element":"a"},{"text":", and MinAtar ","element":"span"},{"href":"#id-23","referenceIndex":19,"text":"(Young & Tian, ","element":"a"},{"href":"#id-23","referenceIndex":19,"text":"2019)","element":"a"},{"text":": Lunarlander, Catcher, Pixelcopter, Asterix, Seaquest, Breakout, and Space Invaders. For games in MinAtar (i.e. Asterix, Seaquest, Breakout, and Space Invaders), we reused the hyper-parameters and settings of neural networks in ","element":"span"},{"href":"#id-23","referenceIndex":19,"text":"(Young & Tian, ","element":"a"},{"href":"#id-23","referenceIndex":19,"text":"2019)","element":"a"},{"text":". And the step-size was chosen from ","element":"span"},{"style":{"height":17.78},"width":660.67,"height":44.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/6-0.png","element":"img","alt":" [3∗10−3, 10−3, 3∗10−4, 10−4, 3∗10−5]","inline":true},{"text":". For Lunarlander, Catcher, and Pixelcopter, the neural network was a multi-layer perceptron with hidden layers fixed to ","element":"span"},{"text":"[64","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"64]","element":"span"},{"text":". The discount factor was ","element":"span"},{"text":"0","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"},{"text":"99","element":"span"},{"text":". The size of the replay buffer was ","element":"span"},{"text":"10","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"000","element":"span"},{"text":". The weights of neural networks were optimized by RMSprop with gradient clip ","element":"span"},{"text":"5","element":"span"},{"text":". The batch size was ","element":"span"},{"text":"32","element":"span"},{"text":". 
The target network was updated every ","element":"span"},{"text":"200 ","element":"span"},{"text":"frames. ","element":"span"},{"style":{"height":0},"width":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/6-1.png","element":"img","alt":" ϵ","inline":true},{"text":"-greedy was applied as the exploration strategy with ","element":"span"},{"style":{"height":0},"width":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/6-2.png","element":"img","alt":" ϵ","inline":true,"padRight":true},{"text":"decreasing linearly from ","element":"span"},{"text":"1","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"},{"text":"0 ","element":"span"},{"text":"to ","element":"span"},{"text":"0","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"},{"text":"01 ","element":"span"},{"text":"in ","element":"span"},{"text":"1","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"000 ","element":"span"},{"text":"steps. After ","element":"span"},{"text":"1","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"000 ","element":"span"},{"text":"steps, ","element":"span"},{"style":{"height":0},"width":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/6-3.png","element":"img","alt":" ϵ","inline":true,"padRight":true},{"text":"was fixed to ","element":"span"},{"text":"0","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"},{"text":"01","element":"span"},{"text":". For Lunarlander, the best step-size was chosen from ","element":"span"},{"style":{"height":17.78},"width":697.44,"height":44.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/6-4.png","element":"img","alt":" [3 ∗ 10−3, 10−3, 3 ∗ 10−4, 10−4, 3 ∗ 10−5]","inline":true},{"text":". 
For Catcher and Pixelcopter, the best step-size was chosen from ","element":"span"},{"style":{"height":17.78},"width":621.21,"height":44.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/6-5.png","element":"img","alt":" [10−3, 3 ∗ 10−4, 10−4, 3 ∗ 10−5, 10−5]","inline":true},{"text":".","element":"span"}],[{"text":"For both Maxmin DQN and Averaged DQN, the number of target networks ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"was chosen from ","element":"span"},{"text":"[2","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"3","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"4","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"5","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"6","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"7","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"8","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"9]","element":"span"},{"text":". And we randomly chose one action-value function to update at each step. We first trained each algorithm in a game for a certain number of steps. After that, each algorithm was tested by running ","element":"span"},{"text":"100 ","element":"span"},{"text":"test episodes with ","element":"span"},{"style":{"height":0},"width":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/6-6.png","element":"img","alt":" ϵ","inline":true},{"text":"-greedy where ","element":"span"},{"style":{"height":10.8},"width":156.05,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/6-7.png","element":"img","alt":" ϵ = 0.01","inline":true},{"text":". 
Results were averaged over ","element":"span"},{"text":"20 ","element":"span"},{"text":"runs for each algorithm, with learning curves shown for the best hyper-parameter setting (see Appendix ","element":"span"},{"href":"#id-24","referenceIndex":219,"text":"C.3 ","element":"a"},{"text":"for the parameter sensitivity curves).","element":"span"}],[{"text":"We see from Figure ","element":"span"},{"href":"#id-25","text":"4 ","element":"a"},{"text":"that Maxmin DQN performs as well as or better than other algorithms. In environments where final performance is noticeably better—Pixelcopter, Lunarlander and Asterix—the initial learning is slower. A possible explanation for this is that the Maxmin agent more extensively explored early on, promoting better final performance. We additionally show on Pixelcopter and Asterix that for smaller ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N","element":"span"},{"text":", Maxmin DQN learns faster but reaches suboptimal performance—behaving more like Q-learning—and for larger ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"learns more slowly but reaches better final performance.","element":"span"}]]},{"heading":"6 CONVERGENCE ANALYSIS OF MAXMIN Q-LEARNING","paragraphs":[[{"text":"In this section, we show Maxmin Q-learning is convergent in the tabular setting. We do so by providing a more general result for what we call Generalized Q-learning: Q-learning where the bootstrap target uses a function ","element":"span"},{"style":{"fontStyle":"italic"},"text":"G ","element":"span"},{"text":"of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"action values. The main condition on ","element":"span"},{"style":{"fontStyle":"italic"},"text":"G ","element":"span"},{"text":"is that it maintains relative maximum values, as stated in Assumption ","element":"span"},{"href":"#id-26","text":"1. 
","element":"a"},{"text":"We use this more general result to prove Maxmin Q-learning is convergent, and then discuss how it provides convergence results for Q-learning, Ensemble Q-learning, Averaged Q-learning and Historical Best Q-learning as special cases.","element":"span"}],[{"text":"Many variants of Q-learning have been proposed, including Double Q-learning ","element":"span"},{"href":"#id-2","referenceIndex":17,"text":"(van Hasselt, ","element":"a"},{"href":"#id-2","referenceIndex":17,"text":"2010)","element":"a"},{"text":", Weighted Double Q-learning ","element":"span"},{"href":"#id-6","referenceIndex":21,"text":"(Zhang et al., ","element":"a"},{"href":"#id-6","referenceIndex":21,"text":"2017)","element":"a"},{"text":", Ensemble Q-learning ","element":"span"},{"href":"#id-8","referenceIndex":1,"text":"(Anschel et al., ","element":"a"},{"href":"#id-8","referenceIndex":1,"text":"2017)","element":"a"},{"text":", Averaged Q-learning ","element":"span"},{"href":"#id-8","referenceIndex":1,"text":"(Anschel et al., ","element":"a"},{"href":"#id-8","referenceIndex":1,"text":"2017)","element":"a"},{"text":", and Historical Best Q-learning ","element":"span"},{"href":"#id-27","referenceIndex":20,"text":"(Yu et al., ","element":"a"},{"href":"#id-27","referenceIndex":20,"text":"2018)","element":"a"},{"text":". These algorithms differ in their estimate of the one-step bootstrap target. 
To encompass all variants, the target action-value of Generalized Q-learning ","element":"span"},{"style":{"height":13.38},"width":81.79,"height":33.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/6-8.png","element":"img","alt":" Y GQ","inline":true,"padRight":true},{"text":"is defined based on action-value estimates from both dimensions:","element":"span"}],[{"id":"id-30","style":{"width":"63%"},"width":1002,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/6-9.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"is the current time step and the action-value function ","element":"span"},{"style":{"height":17.38},"width":129.29,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/6-10.png","element":"img","alt":" QGQs (t)","inline":true,"padRight":true},{"text":"is a function of ","element":"span"},{"style":{"height":17.38},"width":123.39,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/6-11.png","element":"img","alt":" Q1s(t −","inline":true},{"style":{"height":17.38},"width":850.02,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/6-12.png","element":"img","alt":"K), . . . , Q1s(t − 1), . . . , QNs (t − K), . . . 
, QNs (t − 1)","inline":true},{"text":":","element":"span"}],[{"style":{"width":"75%"},"width":1201,"height":209,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/6-13.png","element":"img"}],[{"text":"For simplicity, the vector ","element":"span"},{"style":{"height":17.39},"width":224.55,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/6-14.png","element":"img","alt":" (QGQsa (t))a∈A","inline":true,"padRight":true},{"text":"is denoted as ","element":"span"},{"style":{"height":17.39},"width":129.29,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/6-15.png","element":"img","alt":" QGQs (t)","inline":true},{"text":", same for ","element":"span"},{"style":{"height":16.99},"width":94.43,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/6-16.png","element":"img","alt":" Qis(t)","inline":true},{"text":". The corresponding ","element":"span"},{"text":"update rule is","element":"span"}],[{"style":{"height":18.18},"width":919.51,"height":45.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/6-17.png","element":"img","alt":"Qisa(t) ← Qisa(t − 1) + αisa(t − 1)(Y GQ − Qisa(t − 1))","inline":true,"padRight":true},{"text":"(4) For different ","element":"span"},{"style":{"fontStyle":"italic"},"text":"G ","element":"span"},{"text":"functions, Generalized Q-learning reduces to different variants of Q-learning, including Q-learning itself. 
For example, Generalized Q-learning can be reduced to Q-learning simply by setting ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K ","element":"span"},{"text":"= 1","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"= 1 ","element":"span"},{"text":"with ","element":"span"},{"style":{"height":16},"width":388.13,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/6-18.png","element":"img","alt":" G(Qs) = maxa∈A Qsa","inline":true},{"text":". Double Q-learning can be specified with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K ","element":"span"},{"text":"= 1","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"= 2","element":"span"},{"text":", and ","element":"span"},{"style":{"height":24.23},"width":533.84,"height":60.57,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/6-19.png","element":"img","alt":" G(Q1s, Q2s) = Q2s,arg maxa′∈A Q1sa′","inline":true,"padRight":true},{"text":".","element":"span"}],[{"style":{"width":"98%"},"width":1560,"height":1383,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/7-0.png","element":"img"}],[{"text":"Figure 4: Learning curves on the seven benchmark environments. The depicted return is averaged over the last ","element":"figcaption","subtype":"caption"},{"id":"id-25","text":"100 ","element":"figcaption","subtype":"caption"},{"text":"episodes, and the curves are smoothed using an exponential average, to match previous reported results ","element":"figcaption","subtype":"caption"},{"href":"#id-23","referenceIndex":19,"text":"(Young & Tian, ","element":"a","subtype":"caption"},{"href":"#id-23","referenceIndex":19,"text":"2019)","element":"a","subtype":"caption"},{"text":". The results were averaged over 20 runs, with the shaded area representing one standard error. 
Plots ","element":"figcaption","subtype":"caption"},{"text":"(","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"h","element":"figcaption","subtype":"caption"},{"text":") ","element":"figcaption","subtype":"caption"},{"text":"and ","element":"figcaption","subtype":"caption"},{"text":"(","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"i","element":"figcaption","subtype":"caption"},{"text":") ","element":"figcaption","subtype":"caption"},{"text":"show the performance of Maxmin DQN on Pixelcopter and Asterix, with different ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"N","element":"figcaption","subtype":"caption"},{"text":", highlighting that larger ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"N ","element":"figcaption","subtype":"caption"},{"text":"seems to result in slower early learning but better final performance in both environments.","element":"figcaption","subtype":"caption"}],[{"text":"We first introduce Assumption ","element":"span"},{"href":"#id-26","text":"1 ","element":"a"},{"text":"for function ","element":"span"},{"style":{"fontStyle":"italic"},"text":"G ","element":"span"},{"text":"in Generalized Q-learning, and then state the theorem. 
The proof can be found in Appendix ","element":"span"},{"href":"#id-28","referenceIndex":63,"text":"B.","element":"a"}],[{"style":{"fontWeight":"bold"},"text":"Assumption 1 (Conditions on ","element":"span"},{"style":{"fontStyle":"italic"},"text":"G","element":"span"},{"style":{"fontWeight":"bold"},"text":") ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Let ","element":"span"},{"style":{"height":13.79},"width":302.77,"height":34.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/8-0.png","element":"img","alt":" G : RnNK ↦ R","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"G","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"Q","element":"span"},{"text":") = ","element":"span"},{"style":{"fontStyle":"italic"},"text":"q ","element":"span"},{"style":{"fontStyle":"italic"},"text":"where ","element":"span"},{"style":{"height":16.99},"width":243.12,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/8-1.png","element":"img","alt":" Q = (Qija ) ∈","inline":true},{"style":{"height":15.79},"width":229.15,"height":39.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/8-2.png","element":"img","alt":"RnNK, a ∈ A","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"height":16},"width":736.91,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/8-3.png","element":"img","alt":" |A| = n, i ∈ {1, . . . , N}, j ∈ {0, . . . 
, K − 1}","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"height":14},"width":96.92,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/8-4.png","element":"img","alt":" q ∈ R","inline":true},{"style":{"fontStyle":"italic"},"text":".","element":"span"}],[{"id":"id-26","style":{"width":"67%"},"width":1070,"height":135,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/8-5.png","element":"img"}],[{"text":"We can verify that Assumption ","element":"span"},{"href":"#id-26","text":"1 ","element":"a"},{"text":"holds for Maxmin Q-learning. Set ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K ","element":"span"},{"text":"= 1 ","element":"span"},{"text":"and set ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"to be a positive integer. Let ","element":"span"},{"style":{"height":17.38},"width":337.79,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/8-6.png","element":"img","alt":" Qs = (Q1s, . . . , QNs )","inline":true,"padRight":true},{"text":"and define ","element":"span"},{"style":{"height":19.06},"width":670.38,"height":47.66,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/8-7.png","element":"img","alt":" GMQ(Qs) = maxa∈A mini∈{1,...,N} Qisa","inline":true},{"text":". It ","element":"span"},{"text":"is easy to check that part (i) of Assumption ","element":"span"},{"href":"#id-26","text":"1 ","element":"a"},{"text":"is satisfied. 
Part (ii) is also satisfied because","element":"span"}],[{"style":{"width":"79%"},"width":1256,"height":71,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/8-8.png","element":"img"}],[{"id":"id-29","style":{"fontWeight":"bold"},"text":"Assumption 2 (Conditions on the step-sizes) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"There exists some (deterministic) constant ","element":"span"},{"style":{"fontStyle":"italic"},"text":"C ","element":"span"},{"style":{"fontStyle":"italic"},"text":"such that for every ","element":"span"},{"style":{"height":16.98},"width":780.96,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/8-9.png","element":"img","alt":" (s, a) ∈ S × A, i ∈ {1, . . . , N}, 0 ≤ αisa(t) ≤ 1","inline":true},{"style":{"fontStyle":"italic"},"text":", and with probability ","element":"span"},{"text":"1","element":"span"},{"style":{"fontStyle":"italic"},"text":",","element":"span"}],[{"style":{"width":"38%"},"width":610,"height":110,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/8-10.png","element":"img"}],[{"id":"id-31","style":{"fontWeight":"bold"},"text":"Theorem 2 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Assume a finite MDP ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"S","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"P","element":"span"},{"style":{"fontStyle":"italic"},"text":", R","element":"span"},{"text":") ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and that Assumption ","element":"span"},{"href":"#id-26","style":{"fontStyle":"italic"},"text":"1 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"href":"#id-29","style":{"fontStyle":"italic"},"text":"2 
","element":"a"},{"style":{"fontStyle":"italic"},"text":"hold. Then the action-value functions in Generalized Q-learning, using the tabular update in Equation ","element":"span"},{"href":"#id-30","text":"(3)","element":"a"},{"style":{"fontStyle":"italic"},"text":", will converge to the optimal action-value function with probability ","element":"span"},{"text":"1","element":"span"},{"style":{"fontStyle":"italic"},"text":", in either of the following cases: (i) ","element":"span"},{"style":{"height":14.4},"width":101.48,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/8-11.png","element":"img","alt":" γ < 1","inline":true},{"style":{"fontStyle":"italic"},"text":", or (ii) ","element":"span"},{"style":{"height":18.52},"width":530.23,"height":46.3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/8-12.png","element":"img","alt":" γ = 1, ∀a ∈ A, Qis1a(t = 0) = 0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"where ","element":"span"},{"style":{"height":9.19},"width":34.68,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/8-13.png","element":"img","alt":" s1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is an absorbing state and all policies are proper.","element":"span"}],[{"text":"As shown above, because the function ","element":"span"},{"style":{"fontStyle":"italic"},"text":"G ","element":"span"},{"text":"for Maxmin Q-learning satisfies Assumption ","element":"span"},{"href":"#id-26","text":"1, ","element":"a"},{"text":"then by Theorem ","element":"span"},{"href":"#id-31","text":"2 ","element":"a"},{"text":"it converges. Next, we apply Theorem ","element":"span"},{"href":"#id-31","text":"2 ","element":"a"},{"text":"to Q-learning and its variants, proving the convergence of these algorithms in the tabular case. 
For Q-learning, set ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K ","element":"span"},{"text":"= 1 ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"= 1","element":"span"},{"text":". Let ","element":"span"},{"style":{"height":17.39},"width":405,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/8-14.png","element":"img","alt":"GQ(Qs) = maxa∈A Qsa","inline":true},{"text":". It is straightforward to check that Assumption ","element":"span"},{"href":"#id-26","text":"1 ","element":"a"},{"text":"holds for function ","element":"span"},{"style":{"height":13.39},"width":56.33,"height":33.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/8-15.png","element":"img","alt":" GQ","inline":true},{"text":". For Ensemble Q-learning, set ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K ","element":"span"},{"text":"= 1 ","element":"span"},{"text":"and set ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"to be a positive integer. Let ","element":"span"},{"style":{"height":17.39},"width":387.09,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/8-16.png","element":"img","alt":" GEQ((Q1s, . . . , QNs )) =","inline":true},{"style":{"height":21.11},"width":355.82,"height":52.78,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/8-17.png","element":"img","alt":"maxa∈A 1N�Ni=1 Qisa","inline":true},{"text":". Easy to check that Assumption ","element":"span"},{"href":"#id-26","text":"1 ","element":"a"},{"text":"is satisfied. 
For Averaged Q-learning, the ","element":"span"},{"text":"proof is similar to Ensemble Q-learning except that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"= 1 ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K ","element":"span"},{"text":"is a positive integer. For Historical Best Q-learning, set ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"= 1 ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K ","element":"span"},{"text":"to be a positive integer. We assume that all auxiliary action-value functions are selected from action-value functions at most ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K ","element":"span"},{"text":"updates ago. Define ","element":"span"},{"style":{"height":13.38},"width":110,"height":33.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/8-18.png","element":"img","alt":" GHBQ","inline":true,"padRight":true},{"text":"to be the largest action-value among ","element":"span"},{"style":{"height":16},"width":469.51,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/8-19.png","element":"img","alt":" Qsa(t − 1), . . . , Qsa(t − K)","inline":true,"padRight":true},{"text":"for state ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":". Assumption ","element":"span"},{"href":"#id-26","text":"1 ","element":"a"},{"text":"is satisfied and the convergence is guaranteed.","element":"span"}]]},{"heading":"7 CONCLUSION","paragraphs":[[{"text":"Overestimation bias is a byproduct of Q-learning, stemming from the selection of a maximal value to estimate the expected maximal value. In practice, overestimation bias leads to poor performance in a variety of settings. 
Though multiple Q-learning variants have been proposed, Maxmin Q-learning is the first solution that allows for a flexible control of bias, allowing for overestimation or underestimation determined by the choice of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"and the environment. We showed theoretically that we can decrease the estimation bias and the estimation variance by choosing an appropriate number ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"of action-value functions. We empirically showed that advantages of Maxmin Q-learning, both on toy problems where we investigated the effect of reward noise and on several benchmark environments. Finally, we introduced a new Generalized Q-learning framework which we used to prove the convergence of Maxmin Q-learning as well as several other Q-learning variants that use ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"action-value estimates.","element":"span"}]]},{"heading":"ACKNOWLEDGMENTS","paragraphs":[[{"text":"We would like to thank Huizhen Yu and Yi Wan for their valuable feedback and helpful discussion.","element":"span"}]]},{"heading":"REFERENCES","paragraphs":[[{"id":"id-8","text":"Oron Anschel, Nir Baram, and Nahum Shimkin. Averaged-DQN: Variance Reduction and Stabi- ","element":"span"},{"text":"lization for Deep Reinforcement Learning. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Machine Learning","element":"span"},{"text":", pp. 176–185, 2017.","element":"span"}],[{"id":"id-43","text":"Dimitri P Bertsekas and John N Tsitsiklis. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Parallel and Distributed Computation: Numerical Methods","element":"span"},{"text":", volume 23. Prentice hall Englewood Cliffs, NJ, 1989.","element":"span"}],[{"id":"id-41","text":"Dimitri P Bertsekas and John N Tsitsiklis. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Neuro-dynamic Programming","element":"span"},{"text":", volume 5. Athena Scien-tific Belmont, MA, 1996.","element":"span"}],[{"id":"id-21","text":"Greg Brockman, Vicki Cheung, Ludwig Pettersson, Jonas Schneider, John Schulman, Jie Tang, and ","element":"span"},{"text":"Wojciech Zaremba. OpenAI Gym. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1606.01540","element":"span"},{"text":", 2016.","element":"span"}],[{"id":"id-33","text":"Herbert Aron David and Haikady Navada Nagaraja. Order Statistics. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Encyclopedia of Statistical Sciences","element":"span"},{"text":", 2004.","element":"span"}],[{"id":"id-9","text":"Scott Fujimoto, Herke Hoof, and David Meger. Addressing function approximation error in actor- ","element":"span"},{"text":"critic methods. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Machine Learning","element":"span"},{"text":", pp. 1587–1596, 2018.","element":"span"}],[{"id":"id-10","text":"Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, and Sergey Levine. Soft actor-critic: Off-policy ","element":"span"},{"text":"maximum entropy deep reinforcement learning with a stochastic actor. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Machine Learning","element":"span"},{"text":", pp. 1861–1870, 2018.","element":"span"}],[{"id":"id-5","text":"Hado Hado van Hasselt, Arthur Guez, and David Silver. Deep Reinforcement Learning with Double ","element":"span"},{"text":"Q-learning. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"AAAI Conference on Artificial Intelligence","element":"span"},{"text":", 2016.","element":"span"}],[{"id":"id-7","text":"Donghun Lee, Boris Defourny, and Warren B. Powell. Bias-corrected Q-learning to Control Max- ","element":"span"},{"text":"operator Bias in Q-learning. 
In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"IEEE Symposium on Adaptive Dynamic Programming and Reinforcement Learning","element":"span"},{"text":", pp. 93–99, 2013.","element":"span"}],[{"id":"id-12","text":"Francisco S Melo and M Isabel Ribeiro. Q-learning with Linear Function Approximation. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Computational Learning Theory","element":"span"},{"text":", pp. 308–322, 2007.","element":"span"}],[{"id":"id-4","text":"Alexander L. Strehl, Lihong Li, and Michael L. Littman. Reinforcement Learning in Finite MDPs: ","element":"span"},{"text":"PAC Analysis. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Machine Learning Research","element":"span"},{"text":", 10(Nov):2413–2444, 2009.","element":"span"}],[{"id":"id-14","text":"Richard S Sutton and Andrew G Barto. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Reinforcement Learning: An Introduction","element":"span"},{"text":". MIT Press, second edition, 2018.","element":"span"}],[{"id":"id-3","text":"Istv´an Szita and Andr´as L˝orincz. The Many Faces of Optimism: A Unifying Approach. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Machine learning","element":"span"},{"text":", pp. 1048–1055. ACM, 2008.","element":"span"}],[{"id":"id-22","text":"Norman ","element":"span"},{"text":"Tasfi. ","element":"span"},{"text":"Pygame ","element":"span"},{"text":"learning ","element":"span"},{"text":"environment. ","element":"span"},{"href":"https://github.com/ntasfi/PyGame-Learning-Environment","text":"https://github.com/ntasfi/ ","element":"a"},{"href":"https://github.com/ntasfi/PyGame-Learning-Environment","text":"PyGame-Learning-Environment","element":"a"},{"text":", 2016.","element":"span"}],[{"id":"id-1","text":"Sebastian Thrun and Anton Schwartz. Issues in Using Function Approximation for Reinforcement ","element":"span"},{"text":"Learning. 
In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Fourth Connectionist Models Summer School","element":"span"},{"text":", 1993.","element":"span"}],[{"id":"id-11","text":"John N Tsitsiklis. Asynchronous Stochastic Approximation and Q-learning. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Machine learning","element":"span"},{"text":", 1994.","element":"span"}],[{"id":"id-2","text":"Hado van Hasselt. Double Q-learning. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":", pp. 2613–2621, 2010.","element":"span"}],[{"id":"id-0","text":"Chris Watkins. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Learning from Delayed Rewards","element":"span"},{"text":". PhD thesis, King’s College, Cambridge, 1989.","element":"span"}],[{"id":"id-23","text":"Kenny Young and Tian Tian. MinAtar: An Atari-inspired Testbed for More Efficient Reinforcement ","element":"span"},{"text":"Learning Experiments. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1903.03176","element":"span"},{"text":", 2019.","element":"span"}],[{"id":"id-27","text":"Wenwu Yu, Rui Wang, Ruiying Li, Jing Gao, and Xiaohui Hu. Historical Best Q-Networks for ","element":"span"},{"text":"Deep Reinforcement Learning. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Tools with Artificial Intelligence","element":"span"},{"text":", pp. 6–11, 2018.","element":"span"}],[{"id":"id-6","text":"Zongzhang Zhang, Zhiyuan Pan, and Mykel J. Kochenderfer. Weighted Double Q-learning. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Joint Conference on Artificial Intelligence","element":"span"},{"text":", pp. 
3455–3461, 2017.","element":"span"}],[{"id":"id-18","text":"A ","element":"span"},{"text":"T","element":"span"},{"text":"HE ","element":"span"},{"text":"P","element":"span"},{"text":"ROOF OF ","element":"span"},{"text":"T","element":"span"},{"text":"HEOREM ","element":"span"},{"href":"#id-16","text":"1","element":"a"}],[{"text":"We first present Lemma ","element":"span"},{"href":"#id-32","referenceIndex":24,"text":"1 ","element":"a"},{"text":"here as a tool to prove Theorem ","element":"span"},{"href":"#id-16","text":"1. ","element":"a"},{"text":"Note that the first three properties in","element":"span"}],[{"id":"id-32","text":"this lemma are well-known results of order statistics ","element":"span"},{"href":"#id-33","referenceIndex":5,"text":"(David & Nagaraja, ","element":"a"},{"href":"#id-33","referenceIndex":5,"text":"2004)","element":"a"},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Lemma 1 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Let ","element":"span"},{"style":{"height":14},"width":199.46,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-0.png","element":"img","alt":" X1, . . . , XN","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"be ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i.i.d. 
random variables from an absolutely continuous distri-","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"bution with probability density function(PDF) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":") ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and cumulative distribution function (CDF)","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"F","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":")","element":"span"},{"style":{"fontStyle":"italic"},"text":". Denote ","element":"span"},{"style":{"height":20.24},"width":200.9,"height":50.59,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-1.png","element":"img","alt":" µ def= E[Xi]","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"height":20.24},"width":407.39,"height":50.59,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-2.png","element":"img","alt":" σ2 def= V ar[Xi] < +∞","inline":true},{"style":{"fontStyle":"italic"},"text":". Set ","element":"span"},{"style":{"height":21.91},"width":440.46,"height":54.78,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-3.png","element":"img","alt":" X1:N def= mini∈{1,...,N}Xi","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and","element":"span"}],[{"style":{"height":21.91},"width":456.26,"height":54.78,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-4.png","element":"img","alt":"XN:N def= maxi∈{1,...,N}Xi","inline":true},{"style":{"fontStyle":"italic"},"text":". 
Denote the PDF and CDF of ","element":"span"},{"style":{"height":13.19},"width":84.92,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-5.png","element":"img","alt":" X1:N","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"as ","element":"span"},{"style":{"height":16},"width":128.92,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-6.png","element":"img","alt":" f1:N(x)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"height":16},"width":135.03,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-7.png","element":"img","alt":" F1:N(x)","inline":true},{"style":{"fontStyle":"italic"},"text":", re-","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"spectively. Similarly, denote the PDF and CDF of ","element":"span"},{"style":{"height":13.19},"width":97.27,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-8.png","element":"img","alt":" XN:N","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"as ","element":"span"},{"style":{"height":16},"width":141.26,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-9.png","element":"img","alt":" fN:N(x)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"height":16},"width":147.38,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-10.png","element":"img","alt":" FN:N(x)","inline":true},{"style":{"fontStyle":"italic"},"text":", respectively.","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"We then have","element":"span"}],[{"style":{"width":"71%"},"width":1129,"height":285,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-11.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"(iv) If 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"X","element":"span"},{"style":{"height":7.6},"width":16,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-12.png","element":"img","alt":"1","inline":true},{"style":{"fontStyle":"italic"},"text":", . . . , X","element":"span"},{"style":{"height":16},"width":168.1,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-13.png","element":"img","alt":"N ∼ U(−","inline":true},{"style":{"fontStyle":"italic"},"text":"τ, τ","element":"span"},{"text":")","element":"span"},{"style":{"fontStyle":"italic"},"text":", we have ","element":"span"},{"style":{"fontStyle":"italic"},"text":"V ar","element":"span"},{"style":{"height":24.98},"width":385.84,"height":62.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-14.png","element":"img","alt":"(X1:N) = 4Nτ 2(N+1)2(N+2)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"V ar","element":"span"},{"style":{"height":16},"width":160,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-15.png","element":"img","alt":"(X1:N+1)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"< ","element":"span"},{"style":{"fontStyle":"italic"},"text":"V ar","element":"span"},{"style":{"height":16},"width":161.21,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-16.png","element":"img","alt":"(X1:N) ≤","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"V ar","element":"span"},{"style":{"height":17.39},"width":200.13,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-17.png","element":"img","alt":"(X1:1) = σ2","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"for any positive integer 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"N","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Proof.","element":"span"}],[{"text":"(i) By the definition of ","element":"span"},{"style":{"height":13.19},"width":84.92,"height":32.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-18.png","element":"img","alt":" X1:N","inline":true},{"text":", we have ","element":"span"},{"style":{"height":14.79},"width":266.74,"height":36.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-19.png","element":"img","alt":" X1:N+1 ≤ X1:N","inline":true},{"text":". Thus ","element":"span"},{"style":{"height":16},"width":377.6,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-20.png","element":"img","alt":" E[X1:N+1] ≤ E[X1:N]","inline":true},{"text":". Since ","element":"span"},{"style":{"height":16},"width":803.92,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-21.png","element":"img","alt":"E[X1:1] = E[X1] = µ, E[X1:N] ≤ E[X1:1] = µ","inline":true},{"text":". The proof of ","element":"span"},{"style":{"height":24.99},"width":396.36,"height":62.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-22.png","element":"img","alt":" µ − (N−1)σ√2N−1 ≤ E[X1:N]","inline":true,"padRight":true},{"text":"can be found in ","element":"span"},{"href":"#id-33","referenceIndex":5,"text":"(David & Nagaraja, ","element":"a"},{"href":"#id-33","referenceIndex":5,"text":"2004, ","element":"a"},{"text":"Chapter 4 Section 4.2).","element":"span"}],[{"text":"(ii) We first consider the cdf of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"X","element":"span"},{"style":{"height":7.6},"width":51.9,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-23.png","element":"img","alt":"1:N","inline":true},{"text":". 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"F","element":"span"},{"style":{"height":16},"width":686.66,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-24.png","element":"img","alt":"1:N(x) := P(X1:N ≤ x) = 1 − P(X1:N","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"> x","element":"span"},{"text":") = ","element":"span"},{"style":{"height":16},"width":165.94,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-25.png","element":"img","alt":"1 − P(X1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"> x, . . . , X","element":"span"},{"style":{"height":7.6},"width":32,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-26.png","element":"img","alt":"M","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"> x","element":"span"},{"style":{"height":16},"width":238.77,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-27.png","element":"img","alt":") = 1 − P(X1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"> x","element":"span"},{"style":{"height":16},"width":181.9,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-28.png","element":"img","alt":") · · · P(XN","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"> x","element":"span"},{"style":{"height":17.39},"width":356.38,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-29.png","element":"img","alt":") = 1 − (1 − F(x))N","inline":true},{"text":". 
Then the pdf of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"X","element":"span"},{"style":{"height":7.6},"width":51.91,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-30.png","element":"img","alt":"1:N","inline":true,"padRight":true},{"text":"is ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"style":{"height":19.64},"width":308.73,"height":49.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-31.png","element":"img","alt":"1:N(x) := dF1:Ndx =","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"Nf","element":"span"},{"style":{"height":17.39},"width":307.47,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-32.png","element":"img","alt":"(x)(1 − F(x))N−1","inline":true},{"text":".","element":"span"}],[{"text":"(iii) Similar to (ii), we first consider cdf of ","element":"span"},{"style":{"height":16},"width":805.42,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-33.png","element":"img","alt":" XN:N. FN:N(x) := P(XN:N ≤ x) = P(X1 ≤","inline":true},{"style":{"height":17.39},"width":1005.84,"height":43.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-34.png","element":"img","alt":"x, . . . , XN ≤ x) = P(X1 ≤ x) · · · P(XM ≤ x) = (F(x))N","inline":true},{"text":". 
Then the pdf of ","element":"span"},{"style":{"height":13.19},"width":97.27,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-35.png","element":"img","alt":" XN:N","inline":true,"padRight":true},{"text":"is ","element":"span"},{"style":{"height":19.64},"width":660.69,"height":49.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-36.png","element":"img","alt":"fN:N(x) := dFN:Ndx = Nf(x)(F(x))N−1","inline":true},{"text":".","element":"span"}],[{"text":"(iv) Since ","element":"span"},{"style":{"height":16},"width":567.9,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-37.png","element":"img","alt":" X1, . . . , XN ∼ Uniform(−τ, τ)","inline":true},{"text":", we have ","element":"span"},{"style":{"height":19.37},"width":283.24,"height":48.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-38.png","element":"img","alt":" F(x) = 12 + x2τ","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":19.37},"width":192.2,"height":48.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-39.png","element":"img","alt":" f(x) = 12τ","inline":true,"padRight":true},{"text":". ","element":"span"},{"style":{"height":24.98},"width":1425.36,"height":62.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-40.png","element":"img","alt":"V ar(X1:N) = E[X1:N2] − E[X1:N]2 = 4τ 2( 2(N+1)(N+2) − 1(N+1)2 ) = 4nτ 2(N+1)2(N+2)","inline":true},{"text":". 
","element":"span"},{"text":"It is easy to check that ","element":"span"},{"style":{"height":17.38},"width":809.38,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-41.png","element":"img","alt":" V ar(X1:N+1) < V ar(X1:N) ≤ V ar(X1:1) = σ2","inline":true,"padRight":true},{"text":"for any positive integer ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N","element":"span"},{"text":".","element":"span"}],[{"style":{"width":"1%"},"width":29,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-42.png","element":"img"}],[{"text":"Next, we prove Theorem ","element":"span"},{"href":"#id-16","text":"1.","element":"a"}],[{"style":{"fontWeight":"bold"},"text":"Proof. ","element":"span"},{"text":"Let ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":") ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":") ","element":"span"},{"text":"be the cdf and pdf of ","element":"span"},{"style":{"height":9.19},"width":50.59,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-43.png","element":"img","alt":" esa","inline":true},{"text":", respectively. 
Similarly, Let ","element":"span"},{"style":{"height":16},"width":104.01,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-44.png","element":"img","alt":" fN(x)","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":16},"width":110.13,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-45.png","element":"img","alt":" FN(x)","inline":true}],[{"text":"be the cdf and pdf of ","element":"span"},{"style":{"height":18.66},"width":282.49,"height":46.66,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-46.png","element":"img","alt":" mini∈{1,...,N} eisa","inline":true},{"text":". Since ","element":"span"},{"style":{"height":9.19},"width":50.59,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-47.png","element":"img","alt":" esa","inline":true,"padRight":true},{"text":"is sampled from ","element":"span"},{"style":{"height":16},"width":288.25,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-48.png","element":"img","alt":" Uniform(−τ, τ)","inline":true},{"text":", it is easy to","element":"span"}],[{"text":"get ","element":"span"},{"style":{"height":19.37},"width":177.68,"height":48.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-49.png","element":"img","alt":" f(x) = 12τ","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":19.37},"width":262.91,"height":48.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-50.png","element":"img","alt":" F(x) = 12 + x2τ","inline":true,"padRight":true},{"text":". 
By Lemma 1, we have ","element":"span"},{"style":{"height":17.39},"width":577.04,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/10-51.png","element":"img","alt":" fN(x) = Nf(x)[1 − F(x)]N−1 =","inline":true}],[{"style":{"fontStyle":"italic"},"text":"N","element":"span"}],[{"style":{"height":19.37},"width":256.25,"height":48.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/11-0.png","element":"img","alt":"2τ ( 12 − x2τ )N−1","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":19.37},"width":743.46,"height":48.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/11-1.png","element":"img","alt":" FN(x) = 1 − (1 − F(x))N = 1 − ( 12 − x2τ )N","inline":true},{"text":". The expectation of ","element":"span"},{"style":{"height":13.19},"width":87.56,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/11-2.png","element":"img","alt":" ZMN","inline":true,"padRight":true},{"text":"is","element":"span"}],[{"style":{"width":"67%"},"width":1062,"height":716,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/11-3.png","element":"img"}],[{"text":"Let ","element":"span"},{"style":{"height":22.17},"width":406.14,"height":55.42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/11-4.png","element":"img","alt":" tMN =� 10 (1 − yN)Mdy","inline":true},{"text":", so that ","element":"span"},{"style":{"height":16},"width":434.8,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/11-5.png","element":"img","alt":" E[ZMN] = γτ[1 − 2tMN]","inline":true},{"text":". 
Substitute ","element":"span"},{"style":{"fontStyle":"italic"},"text":"y ","element":"span"},{"text":"by ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"where ","element":"span"},{"style":{"height":16.59},"width":118.07,"height":41.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/11-6.png","element":"img","alt":" t = yN","inline":true},{"text":",","element":"span"}],[{"text":"then","element":"span"}],[{"style":{"width":"45%"},"width":723,"height":542,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/11-7.png","element":"img"}],[{"text":"Each term in the denominator decreases as ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"increases, because ","element":"span"},{"text":"1","element":"span"},{"style":{"fontStyle":"italic"},"text":"/N ","element":"span"},{"text":"gets smaller. Therefore,","element":"span"}],[{"style":{"height":20.97},"width":257.86,"height":52.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/11-8.png","element":"img","alt":"tM,N=1 = 1M+1","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":15.59},"width":222.44,"height":38.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/11-9.png","element":"img","alt":" tM,N→∞ = 1","inline":true},{"text":". 
Using this, we conclude that ","element":"span"},{"style":{"height":16},"width":144.59,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/11-10.png","element":"img","alt":" E[ZMN]","inline":true,"padRight":true},{"text":"decreases as ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"increases","element":"span"}],[{"text":"and ","element":"span"},{"style":{"height":20.97},"width":369.74,"height":52.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/11-11.png","element":"img","alt":" E[ZM,N=1] = γτ M−1M+1","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":16.79},"width":343.96,"height":41.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/11-12.png","element":"img","alt":" E[ZM,N→∞] = −γτ","inline":true},{"text":".","element":"span"}],[{"text":"By Lemma ","element":"span"},{"href":"#id-32","referenceIndex":24,"text":"1, ","element":"a"},{"text":"the variance of ","element":"span"},{"style":{"height":16.92},"width":91.06,"height":42.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/11-13.png","element":"img","alt":" Qminsa","inline":true,"padRight":true},{"text":"is","element":"span"}],[{"style":{"width":"33%"},"width":534,"height":98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/11-14.png","element":"img"}],[{"style":{"height":16.98},"width":187.07,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/11-15.png","element":"img","alt":"V ar[Qminsa ]","inline":true,"padRight":true},{"text":"decreases as ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"increases. 
In particular, ","element":"span"},{"style":{"height":22.18},"width":276.98,"height":55.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/11-16.png","element":"img","alt":" V ar[Qminsa ] = τ 23","inline":true,"padRight":true},{"text":"for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"= 1 ","element":"span"},{"text":"and ","element":"span"},{"style":{"height":16.98},"width":229.23,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/11-17.png","element":"img","alt":" V ar[Qminsa ] =","inline":true}],[{"text":"0 ","element":"span"},{"text":"for ","element":"span"},{"style":{"height":11.2},"width":138.36,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/11-18.png","element":"img","alt":" N → ∞","inline":true},{"text":".","element":"span"}],[{"text":"The bias-variance trade-off of Maxmin Q-learning is illustrated by the empirical results in Figure ","element":"span"},{"href":"#id-34","text":"5,","element":"a"}],[{"text":"which support Theorem ","element":"span"},{"href":"#id-16","text":"1. ","element":"a"},{"text":"For each ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"can be selected such that the absolute value of the","element":"span"}],[{"text":"expected estimation bias is close to ","element":"span"},{"text":"0 ","element":"span"},{"text":"according to Theorem ","element":"span"},{"href":"#id-16","text":"1. 
","element":"a"},{"text":"As ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M ","element":"span"},{"text":"increases, we can adjust ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"to","element":"span"}],[{"text":"reduce both the estimation variance and the estimation bias.","element":"span"}],[{"text":"Finally, we prove the result of the Corollary.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Corollary ","element":"span"},{"href":"#id-35","style":{"fontWeight":"bold"},"text":"1 ","element":"a"},{"text":"Assuming the ","element":"span"},{"style":{"height":9.19},"width":55.95,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/11-19.png","element":"img","alt":" nsa","inline":true,"padRight":true},{"text":"samples are evenly allocated amongst the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"estimators, then ","element":"span"},{"style":{"height":6.8},"width":68.44,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/11-20.png","element":"img","alt":" τ =","inline":true}],[{"style":{"height":19.2},"width":210.77,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/11-21.png","element":"img","alt":"�3σ2N/nsa","inline":true,"padRight":true},{"text":"where ","element":"span"},{"style":{"height":13.39},"width":40.2,"height":33.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/11-22.png","element":"img","alt":" σ2","inline":true,"padRight":true},{"text":"is the variance of samples for ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"s, a","element":"span"},{"text":") ","element":"span"},{"text":"and, for ","element":"span"},{"style":{"height":14},"width":63.54,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/11-23.png","element":"img","alt":" 
Qsa","inline":true,"padRight":true},{"text":"the estimator that uses all","element":"span"}],[{"style":{"height":9.19},"width":55.95,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/11-24.png","element":"img","alt":"nsa","inline":true,"padRight":true},{"text":"samples for a single estimate,","element":"span"}],[{"style":{"width":"44%"},"width":707,"height":98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/11-25.png","element":"img"}],[{"text":"Under this uniform random noise assumption, for ","element":"span"},{"style":{"height":16.99},"width":529.77,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/11-26.png","element":"img","alt":" N ≥ 8, V ar[Qminsa ] < V ar[Qsa]","inline":true},{"text":".","element":"span"}],[{"style":{"width":"97%"},"width":1540,"height":621,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/12-0.png","element":"img"}],[{"text":"Figure 5: Empirical results of Theorem ","element":"figcaption","subtype":"caption"},{"href":"#id-16","text":"1. ","element":"a","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"M ","element":"figcaption","subtype":"caption"},{"text":"is the number of available actions for some state ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"s","element":"figcaption","subtype":"caption"},{"text":". ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"N ","element":"figcaption","subtype":"caption"},{"text":"is the number of action-value functions in Maxmin Q-learning. 
In Figure ","element":"figcaption","subtype":"caption"},{"href":"#id-34","text":"5 ","element":"a","subtype":"caption"},{"text":"(","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"a","element":"figcaption","subtype":"caption"},{"text":")","element":"figcaption","subtype":"caption"},{"text":", we show a heat map of bias control in Maxmin Q-learning. In Figure ","element":"figcaption","subtype":"caption"},{"href":"#id-34","text":"5 ","element":"a","subtype":"caption"},{"text":"(","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"b","element":"figcaption","subtype":"caption"},{"text":")","element":"figcaption","subtype":"caption"},{"text":", we show how the variance ratio of ","element":"figcaption","subtype":"caption"},{"style":{"height":16.93},"width":91.06,"height":42.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/12-1.png","element":"img","alt":" Qminsa","inline":true,"padRight":true},{"text":"and ","element":"figcaption","subtype":"caption"},{"style":{"height":14},"width":63.54,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/12-2.png","element":"img","alt":"Qsa","inline":true,"padRight":true},{"text":"(i.e. ","element":"figcaption","subtype":"caption"},{"id":"id-34","style":{"height":16.98},"width":367.21,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/12-3.png","element":"img","alt":" V ar[Qminsa ]/V ar[Qsa]","inline":true},{"text":") reduces as ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"N ","element":"figcaption","subtype":"caption"},{"text":"increases. 
For a better comparison, we set ","element":"figcaption","subtype":"caption"},{"style":{"height":14.4},"width":117.92,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/12-4.png","element":"img","alt":" γτ = 1","inline":true},{"text":".","element":"figcaption","subtype":"caption"}],[{"style":{"fontWeight":"bold"},"text":"Proof. ","element":"span"},{"text":"Because ","element":"span"},{"style":{"height":16.93},"width":63.54,"height":42.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/12-5.png","element":"img","alt":" Qisa","inline":true,"padRight":true},{"text":"is a sample mean, its variance is ","element":"span"},{"style":{"height":17.39},"width":151,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/12-6.png","element":"img","alt":" σ2N/nsa","inline":true,"padRight":true},{"text":"where ","element":"span"},{"style":{"height":13.39},"width":40.2,"height":33.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/12-7.png","element":"img","alt":" σ2","inline":true,"padRight":true},{"text":"is the variance of samples","element":"span"}],[{"text":"for ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"s, a","element":"span"},{"text":") ","element":"span"},{"text":"and its mean is ","element":"span"},{"style":{"height":14.93},"width":63.54,"height":37.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/12-8.png","element":"img","alt":" Q∗sa","inline":true,"padRight":true},{"text":"(because it is an unbiased sample average). 
Consequently, ","element":"span"},{"style":{"height":9.19},"width":50.59,"height":22.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/12-9.png","element":"img","alt":" esa","inline":true,"padRight":true},{"text":"has","element":"span"}],[{"text":"mean zero and variance ","element":"span"},{"style":{"height":17.39},"width":151,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/12-10.png","element":"img","alt":" σ2N/nsa","inline":true},{"text":". Because ","element":"span"},{"style":{"height":9.19},"width":50.59,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/12-11.png","element":"img","alt":" esa","inline":true,"padRight":true},{"text":"is a uniform random variable which has variance","element":"span"}],[{"text":"1","element":"span"}],[{"style":{"height":18.88},"width":58.6,"height":47.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/12-12.png","element":"img","alt":"3τ 2","inline":true},{"text":", we know that ","element":"span"},{"style":{"height":19.2},"width":288.18,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/12-13.png","element":"img","alt":" τ =�3σ2N/nsa","inline":true},{"text":". 
Plugging this value into the variance formula in Theorem ","element":"span"},{"href":"#id-16","text":"1,","element":"a"}],[{"text":"we get that","element":"span"}],[{"style":{"width":"43%"},"width":694,"height":323,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/12-14.png","element":"img"}],[{"text":"because ","element":"span"},{"style":{"height":17.38},"width":331.31,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/12-15.png","element":"img","alt":" V ar[Qsa] = σ2/nsa","inline":true,"padRight":true},{"text":"for the sample average ","element":"span"},{"style":{"height":14},"width":63.54,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/12-16.png","element":"img","alt":" Qsa","inline":true,"padRight":true},{"text":"that uses all the samples for one estimator.","element":"span"}],[{"text":"Easy to verify that for ","element":"span"},{"style":{"height":16.98},"width":529.77,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/12-17.png","element":"img","alt":" N ≥ 8, V ar[Qminsa ] < V ar[Qsa]","inline":true},{"text":".","element":"span"}],[{"style":{"width":"1%"},"width":29,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/12-18.png","element":"img"}],[{"id":"id-28","text":"B ","element":"span"},{"text":"T","element":"span"},{"text":"HE ","element":"span"},{"text":"C","element":"span"},{"text":"ONVERGENCE ","element":"span"},{"text":"P","element":"span"},{"text":"ROOF OF ","element":"span"},{"text":"G","element":"span"},{"text":"ENERALIZED ","element":"span"},{"text":"Q-","element":"span"},{"text":"LEARNING","element":"span"}],[{"text":"The convergence proof of Generalized Q-learning is based on ","element":"span"},{"href":"#id-11","referenceIndex":16,"text":"Tsitsiklis ","element":"a"},{"href":"#id-11","referenceIndex":16,"text":"(1994)","element":"a"},{"text":". 
The key steps to","element":"span"}],[{"text":"use this result for Generalized Q-learning include showing that the operator is a contraction and","element":"span"}],[{"text":"verifying the noise conditions. We first show these two steps in Lemma ","element":"span"},{"href":"#id-36","referenceIndex":114,"text":"2 ","element":"a"},{"text":"and Lemma ","element":"span"},{"href":"#id-37","referenceIndex":120,"text":"3. ","element":"a"},{"text":"We then","element":"span"}],[{"text":"use these lemmas to make the standard argument for convergence.","element":"span"}],[{"text":"B.1 ","element":"span"},{"text":"P","element":"span"},{"text":"ROBLEM ","element":"span"},{"text":"S","element":"span"},{"text":"ETTING FOR ","element":"span"},{"text":"G","element":"span"},{"text":"ENERALIZED ","element":"span"},{"text":"Q-","element":"span"},{"text":"LEARNING","element":"span"}],[{"text":"Consider a Markov decision problem defined on a finite state space ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S","element":"span"},{"text":". 
For every state ","element":"span"},{"style":{"height":11.6},"width":93.41,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/12-19.png","element":"img","alt":" s ∈ S","inline":true},{"text":", there is","element":"span"}],[{"text":"a finite set ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A ","element":"span"},{"text":"of possible actions for state ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s ","element":"span"},{"text":"and a set of non-negative scalars ","element":"span"},{"style":{"height":16},"width":360.54,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/12-20.png","element":"img","alt":" pss′(a), a ∈ A, s′ ∈ S","inline":true},{"text":",","element":"span"}],[{"text":"such that ","element":"span"},{"style":{"height":19.18},"width":300.54,"height":47.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/12-21.png","element":"img","alt":"�j∈S pss′(a) = 1","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"height":12.4},"width":106.12,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/12-22.png","element":"img","alt":" a ∈ A","inline":true},{"text":". The scalar ","element":"span"},{"style":{"height":16},"width":115.46,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/12-23.png","element":"img","alt":" pss′(a)","inline":true,"padRight":true},{"text":"is interpreted as the probability of a","element":"span"}],[{"text":"transition to ","element":"span"},{"style":{"height":6.8},"width":32.68,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/12-24.png","element":"img","alt":" s′","inline":true},{"text":", given that the current state is ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s ","element":"span"},{"text":"and action ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a ","element":"span"},{"text":"is applied. 
Furthermore, for every state","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"s ","element":"span"},{"text":"and action ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a","element":"span"},{"text":", there is a random variable ","element":"span"},{"style":{"height":9.19},"width":50.01,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/12-25.png","element":"img","alt":" rsa","inline":true,"padRight":true},{"text":"which represents the reward if action ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a ","element":"span"},{"text":"is applied at","element":"span"}],[{"text":"state ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":". We assume that the variance of ","element":"span"},{"style":{"height":9.19},"width":50.01,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/12-26.png","element":"img","alt":" rsa","inline":true,"padRight":true},{"text":"is finite for every ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s ","element":"span"},{"text":"and ","element":"span"},{"style":{"height":12.4},"width":101.79,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/12-27.png","element":"img","alt":" a ∈ A","inline":true},{"text":".","element":"span"}],[{"text":"A stationary policy is a function ","element":"span"},{"style":{"height":6.8},"width":23,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-0.png","element":"img","alt":" π","inline":true,"padRight":true},{"text":"defined on ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"text":"such that ","element":"span"},{"style":{"height":16},"width":154.55,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-1.png","element":"img","alt":" π(s) ∈ A","inline":true,"padRight":true},{"text":"for all 
","element":"span"},{"style":{"height":11.6},"width":93.4,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-2.png","element":"img","alt":" s ∈ S","inline":true},{"text":". Given a stationary","element":"span"}],[{"text":"policy, we obtain a discrete-time Markov chain ","element":"span"},{"style":{"height":16},"width":91.32,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-3.png","element":"img","alt":" f π(t)","inline":true,"padRight":true},{"text":"with transition probabilities","element":"span"}],[{"style":{"width":"72%"},"width":1145,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-4.png","element":"img"}],[{"text":"Let ","element":"span"},{"style":{"height":16},"width":151.52,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-5.png","element":"img","alt":" γ ∈ [0, 1]","inline":true,"padRight":true},{"text":"be a discount factor. For any stationary policy ","element":"span"},{"style":{"height":6.8},"width":23,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-6.png","element":"img","alt":" π","inline":true,"padRight":true},{"text":"and initial state ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":", the state value ","element":"span"},{"style":{"height":14.74},"width":51.1,"height":36.85,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-7.png","element":"img","alt":" V πs","inline":true}],[{"text":"is defined by","element":"span"}],[{"style":{"width":"73%"},"width":1167,"height":117,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-8.png","element":"img"}],[{"text":"The optimal state value function ","element":"span"},{"style":{"height":10.98},"width":48.1,"height":27.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-9.png","element":"img","alt":" V 
∗","inline":true,"padRight":true},{"text":"is defined by","element":"span"}],[{"style":{"width":"61%"},"width":976,"height":62,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-10.png","element":"img"}],[{"text":"The Markov decision problem is to evaluate the function ","element":"span"},{"style":{"height":10.98},"width":48.1,"height":27.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-11.png","element":"img","alt":" V ∗","inline":true},{"text":". Once this is done, an optimal policy","element":"span"}],[{"text":"is easily determined.","element":"span"}],[{"text":"Markov decision problems are easiest when the discount ","element":"span"},{"style":{"height":10.4},"width":22,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-12.png","element":"img","alt":" γ","inline":true,"padRight":true},{"text":"is strictly smaller than ","element":"span"},{"text":"1","element":"span"},{"text":". For the undis-","element":"span"}],[{"text":"counted case (","element":"span"},{"style":{"height":14.4},"width":96,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-13.png","element":"img","alt":"γ = 1","inline":true},{"text":"), we will assume throughout that there is a reward-free state, say state ","element":"span"},{"text":"1","element":"span"},{"text":", which","element":"span"}],[{"text":"is absorbing; that is, ","element":"span"},{"style":{"height":16},"width":179.87,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-14.png","element":"img","alt":" p11(a) = 1","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":13.19},"width":128.72,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-15.png","element":"img","alt":" r1u = 0","inline":true,"padRight":true},{"text":"for all 
","element":"span"},{"style":{"height":12.4},"width":102.66,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-16.png","element":"img","alt":" a ∈ A","inline":true},{"text":". The objective is then to reach that state","element":"span"}],[{"text":"at maximum expected reward. We say that a stationary policy is proper if the probability of being at","element":"span"}],[{"text":"the absorbing state converges to ","element":"span"},{"text":"1 ","element":"span"},{"text":"as time converges to infinity; otherwise, we say that the policy is","element":"span"}],[{"text":"improper.","element":"span"}],[{"text":"We define the dynamic programming operator ","element":"span"},{"style":{"height":14.59},"width":264.14,"height":36.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-17.png","element":"img","alt":" T : R|S| �→ R|S|","inline":true},{"text":", with components ","element":"span"},{"style":{"height":13.19},"width":34.29,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-18.png","element":"img","alt":" Ti","inline":true},{"text":", by letting","element":"span"}],[{"style":{"width":"71%"},"width":1138,"height":88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-19.png","element":"img"}],[{"text":"It is well known that if ","element":"span"},{"style":{"height":14.4},"width":98.03,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-20.png","element":"img","alt":" γ < 1","inline":true},{"text":", then ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T ","element":"span"},{"text":"is a contraction with respect to the norm ","element":"span"},{"style":{"height":16},"width":101.45,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-21.png","element":"img","alt":" ∥ · ∥∞","inline":true,"padRight":true},{"text":"and 
","element":"span"},{"style":{"height":10.98},"width":48.1,"height":27.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-22.png","element":"img","alt":" V ∗","inline":true,"padRight":true},{"text":"is its","element":"span"}],[{"text":"unique fixed point.","element":"span"}],[{"text":"For Generalized Q-learning algorithm, assume that there are ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"estimators of action-values","element":"span"}],[{"style":{"height":16.59},"width":196.44,"height":41.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-23.png","element":"img","alt":"Q1, . . . , QN","inline":true},{"text":". Let ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"be the cardinality of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n ","element":"span"},{"text":"be the cardinality of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A","element":"span"},{"text":". We use a discrete in-","element":"span"}],[{"text":"dex variable ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"in order to count iterations. Denote ","element":"span"},{"style":{"height":16.98},"width":322.86,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-24.png","element":"img","alt":" Qij(t) = Qi(t + j)","inline":true},{"text":". 
After ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"iterations, we have","element":"span"}],[{"text":"a vector ","element":"span"},{"style":{"height":16},"width":180.05,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-25.png","element":"img","alt":" Q(t) ∈ Rw","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"w ","element":"span"},{"text":"= ","element":"span"},{"style":{"fontStyle":"italic"},"text":"mnNK","element":"span"},{"text":", with components ","element":"span"},{"style":{"height":16.98},"width":651.28,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-26.png","element":"img","alt":" Qijsa(t), (s, a) ∈ S × A, i ∈ {1, . . . , N}","inline":true},{"text":",","element":"span"}],[{"text":"and ","element":"span"},{"style":{"height":16},"width":321.13,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-27.png","element":"img","alt":" j ∈ {0, . . . , K − 1}","inline":true},{"text":".","element":"span"}],[{"text":"By definition, for ","element":"span"},{"style":{"height":16},"width":321.13,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-28.png","element":"img","alt":" j ∈ {1, . . . , K − 1}","inline":true},{"text":", we have","element":"span"}],[{"id":"id-39","style":{"width":"62%"},"width":988,"height":46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-29.png","element":"img"}],[{"text":"For ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j ","element":"span"},{"text":"= 0","element":"span"},{"text":", we have ","element":"span"},{"style":{"height":17.33},"width":182.5,"height":43.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-30.png","element":"img","alt":" Qi0sa = Qisa","inline":true},{"text":". 
And we update according to the formula","element":"span"}],[{"style":{"width":"99%"},"width":1582,"height":176,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-31.png","element":"img"}],[{"text":"Here, each ","element":"span"},{"style":{"height":16.98},"width":105.7,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-32.png","element":"img","alt":" αisa(t)","inline":true,"padRight":true},{"text":"is a nonnegative step-size coefficient which is set to zero for those ","element":"span"},{"style":{"height":16},"width":239.16,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-33.png","element":"img","alt":" (s, a) ∈ S × A","inline":true}],[{"text":"and ","element":"span"},{"style":{"height":16},"width":256.21,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-34.png","element":"img","alt":" i ∈ {1, . . . , N}","inline":true,"padRight":true},{"text":"for which ","element":"span"},{"style":{"height":16.92},"width":63.54,"height":42.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-35.png","element":"img","alt":" Qisa","inline":true,"padRight":true},{"text":"is not to be updated at the current iteration. Furthermore, ","element":"span"},{"style":{"height":9.19},"width":50.02,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-36.png","element":"img","alt":" rsa","inline":true}],[{"text":"is a random sample of the immediate reward if action ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a ","element":"span"},{"text":"is applied at state ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":". 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"s, a","element":"span"},{"text":") ","element":"span"},{"text":"is a random","element":"span"}],[{"text":"successor state which is equal to ","element":"span"},{"style":{"height":6.8},"width":32.68,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-37.png","element":"img","alt":" s′","inline":true,"padRight":true},{"text":"with probability ","element":"span"},{"style":{"height":16},"width":115.47,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-38.png","element":"img","alt":" pss′(a)","inline":true},{"text":". Finally, ","element":"span"},{"style":{"height":17.39},"width":129.29,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-39.png","element":"img","alt":" QGQs (t)","inline":true,"padRight":true},{"text":"is defined as","element":"span"}],[{"style":{"width":"60%"},"width":958,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-40.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"G ","element":"span"},{"text":"is a mapping from ","element":"span"},{"style":{"height":13.39},"width":104.71,"height":33.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-41.png","element":"img","alt":" RnNK","inline":true,"padRight":true},{"text":"to ","element":"span"},{"text":"R","element":"span"},{"text":". It is understood that all random samples that are drawn in","element":"span"}],[{"text":"the course of the algorithm are drawn independently.","element":"span"}],[{"text":"Since for ","element":"span"},{"style":{"height":16},"width":332.9,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-42.png","element":"img","alt":" j ∈ {1, . . . 
, K − 1}","inline":true},{"text":", we just preserve current available action-values, we only focus on","element":"span"}],[{"text":"the case that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j ","element":"span"},{"text":"= 0 ","element":"span"},{"text":"in the sequel. Let ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F ","element":"span"},{"text":"be the mapping from ","element":"span"},{"style":{"height":13.39},"width":132.99,"height":33.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-43.png","element":"img","alt":" RmnNK","inline":true,"padRight":true},{"text":"into ","element":"span"},{"style":{"height":13.39},"width":103.76,"height":33.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-44.png","element":"img","alt":" RmnN","inline":true,"padRight":true},{"text":"with components","element":"span"}],[{"style":{"height":16.93},"width":57.66,"height":42.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-45.png","element":"img","alt":"F isa","inline":true,"padRight":true},{"id":"id-38","text":"defined by","element":"span"}],[{"style":{"width":"66%"},"width":1052,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/13-46.png","element":"img"}],[{"text":"and note that","element":"span"}],[{"style":{"width":"64%"},"width":1021,"height":84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-0.png","element":"img"}],[{"text":"If ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F ","element":"span"},{"style":{"height":16.98},"width":304.26,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-1.png","element":"img","alt":"isa(Q(t)) = Q(t)isa","inline":true},{"text":", we can do ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K ","element":"span"},{"text":"more updates such that 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Q","element":"span"},{"style":{"height":17.38},"width":231.08,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-2.png","element":"img","alt":"(t)ija = Q(t)kla","inline":true,"padRight":true},{"text":", ","element":"span"},{"style":{"height":10.8},"width":23,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-3.png","element":"img","alt":" ∀","inline":true},{"style":{"fontStyle":"italic"},"text":"i, k ","element":"span"},{"style":{"height":16},"width":77.56,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-4.png","element":"img","alt":" ∈ {1","inline":true},{"style":{"fontStyle":"italic"},"text":", . . . , N","element":"span"},{"style":{"fontStyle":"italic"},"text":"}","element":"span"},{"text":",","element":"span"}],[{"style":{"height":10.8},"width":23,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-5.png","element":"img","alt":"∀","inline":true},{"style":{"fontStyle":"italic"},"text":"j, l ","element":"span"},{"style":{"height":16},"width":77.56,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-6.png","element":"img","alt":" ∈ {0","inline":true},{"style":{"fontStyle":"italic"},"text":", . . . 
, K ","element":"span"},{"style":{"height":16},"width":79.78,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-7.png","element":"img","alt":" − 1}","inline":true},{"text":", and ","element":"span"},{"style":{"height":12.4},"width":123.93,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-8.png","element":"img","alt":" ∀a ∈ A","inline":true},{"text":".","element":"span"}],[{"text":"In view of Equation ","element":"span"},{"href":"#id-38","referenceIndex":106,"text":"13, ","element":"a"},{"text":"Equation ","element":"span"},{"href":"#id-39","text":"10 ","element":"a"},{"text":"can be written as","element":"span"}],[{"id":"id-40","style":{"width":"99%"},"width":1582,"height":182,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-9.png","element":"img"}],[{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":") ","element":"span"},{"text":"represents the history of the algorithm during the first ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"iterations. 
The expectation in the","element":"span"}],[{"text":"expression ","element":"span"},{"style":{"height":23.49},"width":307.09,"height":58.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-10.png","element":"img","alt":" E[QGQf(s,a)(t)|F(t)]","inline":true,"padRight":true},{"text":"is with respect to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"s, a","element":"span"},{"text":")","element":"span"},{"text":".","element":"span"}],[{"text":"B.2 ","element":"span"},{"text":"K","element":"span"},{"text":"EY ","element":"span"},{"text":"L","element":"span"},{"text":"EMMAS AND THE ","element":"span"},{"text":"P","element":"span"},{"text":"ROOFS","element":"span"}],[{"id":"id-36","style":{"fontWeight":"bold"},"text":"Lemma 2 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Assume Assumption ","element":"span"},{"href":"#id-26","style":{"fontStyle":"italic"},"text":"1 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"holds for function ","element":"span"},{"style":{"fontStyle":"italic"},"text":"G ","element":"span"},{"style":{"fontStyle":"italic"},"text":"in Generalized Q-learning. Then we have","element":"span"}],[{"style":{"width":"68%"},"width":1086,"height":78,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-11.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Proof. 
","element":"span"},{"text":"Under Assumption ","element":"span"},{"href":"#id-26","text":"1, ","element":"a"},{"text":"the conditional variance of ","element":"span"},{"style":{"height":23.49},"width":116.42,"height":58.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-12.png","element":"img","alt":" QGQf(s,a)","inline":true,"padRight":true},{"text":"given ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":")","element":"span"},{"text":", is bounded","element":"span"}],[{"text":"above ","element":"span"},{"text":"by ","element":"span"},{"text":"the ","element":"span"},{"text":"largest ","element":"span"},{"text":"pos","element":"span"},{"href":"#id-26","text":"sib","element":"a"},{"text":"le ","element":"span"},{"text":"value ","element":"span"},{"text":"that ","element":"span"},{"text":"this ","element":"span"},{"text":"random ","element":"span"},{"text":"variable ","element":"span"},{"text":"could ","element":"span"},{"text":"take, ","element":"span"},{"text":"which ","element":"span"},{"text":"is","element":"span"}],[{"href":"#id-40","style":{"height":22.51},"width":983.93,"height":56.26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-13.png","element":"img","alt":"maxi∈{1,...,N} maxj∈{0,...,K−1} max(s,a)∈S×A |Qisa(t − j)|2","inline":true},{"text":". 
We then take the conditional vari-","element":"span"}],[{"text":"ance of both sides of Equation ","element":"span"},{"href":"#id-40","text":"16, ","element":"a"},{"text":"to obtain","element":"span"}],[{"style":{"width":"83%"},"width":1328,"height":78,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-14.png","element":"img"}],[{"text":"We have assumed here that ","element":"span"},{"style":{"height":9.19},"width":50.02,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-15.png","element":"img","alt":" rsa","inline":true,"padRight":true},{"text":"is independent from ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"s, a","element":"span"},{"text":")","element":"span"},{"text":". If it is not, the right-hand side in the","element":"span"}],[{"id":"id-37","text":"last inequality must be multiplied by ","element":"span"},{"text":"2","element":"span"},{"text":", but the conclusion does not change.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Lemma 3 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is a contraction mapping, in each of the following cases:","element":"span"}],[{"style":{"width":"9%"},"width":158,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-16.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"(ii) ","element":"span"},{"style":{"height":14.4},"width":108.12,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-17.png","element":"img","alt":" γ = 1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"height":18.52},"width":450.88,"height":46.3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-18.png","element":"img","alt":" ∀a ∈ A, Qis1a(t = 0) = 
0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"where ","element":"span"},{"style":{"height":9.19},"width":34.68,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-19.png","element":"img","alt":" s1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is an absorbing state. All policies are ","element":"span"},{"style":{"fontStyle":"italic"},"text":"proper.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Proof. ","element":"span"},{"text":"For discounted problems (","element":"span"},{"style":{"height":14.4},"width":95.98,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-20.png","element":"img","alt":"γ < 1","inline":true},{"text":"), Equation ","element":"span"},{"href":"#id-38","referenceIndex":106,"text":"13 ","element":"a"},{"text":"easily yields ","element":"span"},{"style":{"height":14},"width":116.86,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-21.png","element":"img","alt":" ∀Q, Q′","inline":true},{"text":",","element":"span"}],[{"style":{"width":"73%"},"width":1158,"height":71,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-22.png","element":"img"}],[{"text":"In particular, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F ","element":"span"},{"text":"is a contraction mapping, with respect to the maximum norm ","element":"span"},{"style":{"height":16},"width":100.63,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-23.png","element":"img","alt":" ∥ · ∥∞","inline":true},{"text":".","element":"span"}],[{"text":"For undiscounted problems (","element":"span"},{"style":{"height":14.4},"width":96,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-24.png","element":"img","alt":"γ = 1","inline":true},{"text":"), our assumptions on the absorbing state 
","element":"span"},{"style":{"height":9.19},"width":34.68,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-25.png","element":"img","alt":" s1","inline":true,"padRight":true},{"text":"imply that the update","element":"span"}],[{"text":"equation for ","element":"span"},{"style":{"height":18.52},"width":79.1,"height":46.3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-26.png","element":"img","alt":" Qis1a","inline":true,"padRight":true},{"text":"degenerates to ","element":"span"},{"style":{"height":18.52},"width":361.65,"height":46.3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-27.png","element":"img","alt":" Qis1a(t+1) = Qis1a(t)","inline":true},{"text":", for all ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":". We will be assuming in the sequel,","element":"span"}],[{"text":"that ","element":"span"},{"style":{"height":18.52},"width":79.1,"height":46.3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-28.png","element":"img","alt":" Qis1a","inline":true,"padRight":true},{"text":"is initialized at zero. 
This leads to an equivalent description of the algorithm in which the","element":"span"}],[{"text":"mappings ","element":"span"},{"style":{"height":16.92},"width":57.66,"height":42.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-29.png","element":"img","alt":" F isa","inline":true,"padRight":true},{"text":"of Equation ","element":"span"},{"href":"#id-38","referenceIndex":106,"text":"13 ","element":"a"},{"text":"are replaced by mappings ","element":"span"},{"style":{"height":18.77},"width":57.66,"height":46.93,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-30.png","element":"img","alt":" ˜F isa","inline":true,"padRight":true},{"text":"satisfying ","element":"span"},{"style":{"height":18.77},"width":180.42,"height":46.93,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-31.png","element":"img","alt":" ˜F isa = F isa","inline":true,"padRight":true},{"text":"if ","element":"span"},{"style":{"height":15.2},"width":116.18,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-32.png","element":"img","alt":" s ̸= s1","inline":true,"padRight":true},{"text":"and","element":"span"}],[{"style":{"height":20.36},"width":211.12,"height":50.91,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-33.png","element":"img","alt":"˜F is1a(Q) = 0","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"height":16},"width":368.72,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-34.png","element":"img","alt":" a ∈ A, i ∈ {1, . . . , N}","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":14},"width":128.99,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-35.png","element":"img","alt":" Q ∈ Rn","inline":true},{"text":".","element":"span"}],[{"text":"Let us consider the special case where every policy is proper. 
By Proposition 2.2 in the work","element":"span"}],[{"text":"of ","element":"span"},{"href":"#id-41","referenceIndex":3,"text":"(Bertsekas & Tsitsiklis, ","element":"a"},{"href":"#id-41","referenceIndex":3,"text":"1996)","element":"a"},{"text":", there exists a vector ","element":"span"},{"style":{"fontStyle":"italic"},"text":"v > ","element":"span"},{"text":"0 ","element":"span"},{"text":"such that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T ","element":"span"},{"text":"is a contraction with","element":"span"}],[{"text":"respect to the norm ","element":"span"},{"style":{"height":16},"width":88.08,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-36.png","element":"img","alt":" ∥ · ∥v","inline":true},{"text":". In fact, a close examination of the proof of this Proposition 2.2 shows","element":"span"}],[{"text":"that this proof is easily extended to show that the mapping ","element":"span"},{"style":{"height":14.83},"width":31,"height":37.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-37.png","element":"img","alt":"˜F","inline":true,"padRight":true},{"text":"(with components ","element":"span"},{"style":{"height":18.77},"width":57.66,"height":46.93,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-38.png","element":"img","alt":"˜F isa","inline":true},{"text":") is a contraction","element":"span"}],[{"text":"with respect to the norm ","element":"span"},{"style":{"height":16},"width":83.63,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-39.png","element":"img","alt":" ∥ · ∥z","inline":true},{"text":", where ","element":"span"},{"style":{"height":16.93},"width":140.3,"height":42.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-40.png","element":"img","alt":" zisa = vs","inline":true,"padRight":true},{"text":"for every 
","element":"span"},{"style":{"height":12.4},"width":101.79,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-41.png","element":"img","alt":" a ∈ A","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":16},"width":247.21,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/14-42.png","element":"img","alt":" i ∈ {1, . . . , N}","inline":true},{"text":".","element":"span"}],[{"id":"id-53","text":"B.3 ","element":"span"},{"text":"M","element":"span"},{"text":"ODELS AND ","element":"span"},{"text":"A","element":"span"},{"text":"SSUMPTIONS","element":"span"}],[{"text":"In this section, we describe the algorithmic model to be employed and state some assumptions that","element":"span"}],[{"text":"will be imposed.","element":"span"}],[{"text":"The algorithm consists of noisy updates of a vector ","element":"span"},{"style":{"height":11.6},"width":139.68,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-0.png","element":"img","alt":" x ∈ Rn","inline":true},{"text":", for the purpose of solving a sys-","element":"span"}],[{"text":"tem of equations of the form ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":") = ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":". ","element":"span"},{"text":"Here ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F ","element":"span"},{"text":"is assumed to be a mapping from ","element":"span"},{"style":{"height":10.8},"width":48.78,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-1.png","element":"img","alt":" Rn","inline":true,"padRight":true},{"text":"into","element":"span"}],[{"text":"itself. 
Let ","element":"span"},{"style":{"height":14},"width":376.4,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-2.png","element":"img","alt":" F1, . . . , Fn: Rn �→ R","inline":true,"padRight":true},{"text":"be the corresponding component mappings; that is, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":") =","element":"span"}],[{"style":{"height":16},"width":318.42,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-3.png","element":"img","alt":"(F1(x), . . . , Fn(x))","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"height":11.6},"width":120.26,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-4.png","element":"img","alt":" x ∈ Rn","inline":true},{"text":".","element":"span"}],[{"text":"Let ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"be the set of non-negative integers. We employ a discrete ”time” variable ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":", taking values","element":"span"}],[{"text":"in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N","element":"span"},{"text":". This variable need not have any relation with real time; rather, it is used to index successive","element":"span"}],[{"text":"updates. 
Let ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":") ","element":"span"},{"text":"be the value of the vector ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x ","element":"span"},{"text":"at time ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"and let ","element":"span"},{"style":{"height":16},"width":81.93,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-5.png","element":"img","alt":" xi(t)","inline":true,"padRight":true},{"text":"denote its ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":"th component. Let","element":"span"}],[{"style":{"height":12.99},"width":39.82,"height":32.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-6.png","element":"img","alt":"T i","inline":true,"padRight":true},{"text":"be an infinite subset of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"indicating the set of times at which an update of ","element":"span"},{"style":{"height":9.19},"width":33.78,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-7.png","element":"img","alt":" xi","inline":true,"padRight":true},{"text":"is performed. 
We","element":"span"}],[{"id":"id-44","text":"assume that","element":"span"}],[{"id":"id-42","style":{"width":"63%"},"width":1013,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-8.png","element":"img"}],[{"text":"Regarding the times that ","element":"span"},{"style":{"height":9.19},"width":33.78,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-9.png","element":"img","alt":" xi","inline":true,"padRight":true},{"text":"is updated, we postulate an update equation of the form","element":"span"}],[{"style":{"width":"82%"},"width":1304,"height":46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-10.png","element":"img"}],[{"text":"Here, ","element":"span"},{"style":{"height":16},"width":71.53,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-11.png","element":"img","alt":" α(t)","inline":true,"padRight":true},{"text":"is a step-size parameter belonging to ","element":"span"},{"style":{"height":16},"width":187.28,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-12.png","element":"img","alt":" [0, 1], wi(t)","inline":true,"padRight":true},{"text":"is a noise term, and ","element":"span"},{"style":{"height":16},"width":81.93,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-13.png","element":"img","alt":" xi(t)","inline":true,"padRight":true},{"text":"is a vector of","element":"span"}],[{"text":"possibly outdated components of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":". 
In particular, we assume that","element":"span"}],[{"style":{"width":"72%"},"width":1154,"height":46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-14.png","element":"img"}],[{"text":"where each ","element":"span"},{"style":{"height":19.53},"width":81.09,"height":48.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-15.png","element":"img","alt":" τ ij(t)","inline":true,"padRight":true},{"text":"is an integer satisfying ","element":"span"},{"style":{"height":19.53},"width":246.5,"height":48.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-16.png","element":"img","alt":" 0 ≤ τ ij(t) ≤ t","inline":true},{"text":". If no information is outdated, we have","element":"span"}],[{"style":{"height":19.53},"width":158.32,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-17.png","element":"img","alt":"τ ij(t) = t","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":16.99},"width":213.82,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-18.png","element":"img","alt":" xi(t) = x(t)","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":"; the reader may wish to think primarily of this case. For an","element":"span"}],[{"href":"#id-42","text":"inte","element":"a"},{"text":"rpretation of the general case, see ","element":"span"},{"href":"#id-43","referenceIndex":2,"text":"(Bertsekas & Tsitsiklis, ","element":"a"},{"href":"#id-43","referenceIndex":2,"text":"1989)","element":"a"},{"text":". In order to bring Eqs. 
","element":"span"},{"href":"#id-44","referenceIndex":145,"text":"19 ","element":"a"},{"text":"and","element":"span"}],[{"href":"#id-42","text":"20 ","element":"a"},{"text":"into a unified form, it is convenient to assume that ","element":"span"},{"style":{"height":16},"width":189.54,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-19.png","element":"img","alt":" αi(t), wi(t)","inline":true},{"text":", and ","element":"span"},{"style":{"height":19.53},"width":81.09,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-20.png","element":"img","alt":" τ ij(t)","inline":true,"padRight":true},{"text":"are defined for every ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":",","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"j","element":"span"},{"text":", and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":", but that ","element":"span"},{"style":{"height":16},"width":157.31,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-21.png","element":"img","alt":" αi(t) = 0","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":19.53},"width":147.72,"height":48.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-22.png","element":"img","alt":" τ ij(t) = t","inline":true,"padRight":true},{"text":"for ","element":"span"},{"style":{"height":16.98},"width":102.92,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-23.png","element":"img","alt":" t /∈ T i","inline":true},{"text":".","element":"span"}],[{"text":"We ","element":"span"},{"text":"will ","element":"span"},{"text":"now ","element":"span"},{"text":"continue ","element":"span"},{"text":"with ","element":"span"},{"text":"our ","element":"span"},{"text":"assumptions. 
","element":"span"},{"text":"All ","element":"span"},{"text":"variables ","element":"span"},{"text":"introduced ","element":"span"},{"text":"so ","element":"span"},{"text":"far","element":"span"}],[{"style":{"height":19.53},"width":404.71,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-24.png","element":"img","alt":"(x(t), τ ij(t), αi(t), wi(t))","inline":true,"padRight":true},{"text":"are viewed as random variables defined on a probability space ","element":"span"},{"style":{"height":16},"width":159.3,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-25.png","element":"img","alt":" (Ω, F, P)","inline":true}],[{"text":"and the assumptions deal primarily with the dependencies between these random variables. Our","element":"span"}],[{"text":"assumptions also involve an increasing sequence ","element":"span"},{"style":{"height":16},"width":170.34,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-26.png","element":"img","alt":" {F(t)}∞t=0","inline":true,"padRight":true},{"text":"of subfields of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F","element":"span"},{"text":". 
Intuitively, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":") ","element":"span"},{"text":"is","element":"span"}],[{"text":"meant to represent the history of the algorithm up to, and including the point at which the step-sizes","element":"span"}],[{"style":{"height":16},"width":84.65,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-27.png","element":"img","alt":"αi(t)","inline":true,"padRight":true},{"text":"for the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":"th iteration are selected, but just before the noise term ","element":"span"},{"style":{"height":16},"width":87.69,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-28.png","element":"img","alt":" wi(t)","inline":true,"padRight":true},{"text":"is generated. Also, the","element":"span"}],[{"text":"measure-theoretic terminology that ”a random variable ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Z ","element":"span"},{"text":"is ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":")","element":"span"},{"text":"-measurable” has the intuitive","element":"span"}],[{"text":"meaning that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Z ","element":"span"},{"text":"is completely determined by the history represented by ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":")","element":"span"},{"text":".","element":"span"}],[{"text":"The first assumption, which is the same as the total asynchronism assumption of ","element":"span"},{"href":"#id-43","referenceIndex":2,"text":"Bertsekas & 
Tsit-","element":"a"}],[{"href":"#id-43","referenceIndex":2,"text":"siklis ","element":"a"},{"href":"#id-43","referenceIndex":2,"text":"(1989)","element":"a"},{"text":", guarantees that even though information can be outdated, any old information is even-","element":"span"}],[{"id":"id-46","text":"tually discarded.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Assumption 3 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"For any ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"height":19.53},"width":351.9,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-29.png","element":"img","alt":" j, limt→∞ τ ij(t) = ∞","inline":true},{"style":{"fontStyle":"italic"},"text":", with probability 1.","element":"span"}],[{"id":"id-45","text":"Our next assumption refers to the statistics of the random variables involved in the algorithm.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Assumption 4 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Let ","element":"span"},{"style":{"height":16},"width":170.34,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-30.png","element":"img","alt":" {F(t)}∞t=0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"be an increasing sequence of subfields of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"}],[{"style":{"width":"95%"},"width":1517,"height":451,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/15-31.png","element":"img"}],[{"text":"Assumption ","element":"span"},{"href":"#id-45","referenceIndex":166,"text":"4 ","element":"a"},{"text":"allows for the possibility of deciding whether to update a particular component 
","element":"span"},{"style":{"height":9.19},"width":33.78,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/16-0.png","element":"img","alt":" xi","inline":true,"padRight":true},{"text":"at","element":"span"}],[{"text":"time ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":", based on the past history of the process. In this case, the step-size ","element":"span"},{"style":{"height":16},"width":84.64,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/16-1.png","element":"img","alt":" αi(t)","inline":true,"padRight":true},{"text":"becomes a random","element":"span"}],[{"text":"variable. However, part ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"iii","element":"span"},{"text":") ","element":"span"},{"text":"of the assumption requires that the choice of the components to be","element":"span"}],[{"text":"updated must be made without anticipatory knowledge of the noise variables ","element":"span"},{"style":{"height":9.19},"width":39.53,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/16-2.png","element":"img","alt":" wi","inline":true,"padRight":true},{"text":"that have not yet","element":"span"}],[{"text":"been realized.","element":"span"}],[{"text":"Finally, we introduce a few alternative assumptions on the structure of the iteration mapping ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F","element":"span"},{"text":". 
We","element":"span"}],[{"text":"first need some notation: if ","element":"span"},{"style":{"height":14},"width":160.99,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/16-3.png","element":"img","alt":" x, y ∈ Rn","inline":true},{"text":", the inequality ","element":"span"},{"style":{"height":13.6},"width":97.96,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/16-4.png","element":"img","alt":" x ≤ y","inline":true,"padRight":true},{"text":"is to be interpreted as ","element":"span"},{"style":{"height":13.6},"width":121.76,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/16-5.png","element":"img","alt":" xi ≤ yi","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":".","element":"span"}],[{"text":"Furthermore, for any positive vector ","element":"span"},{"style":{"height":16},"width":272.14,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/16-6.png","element":"img","alt":" v = (v1, . . . 
, vn)","inline":true},{"text":", we define a norm ","element":"span"},{"style":{"height":16},"width":84.63,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/16-7.png","element":"img","alt":" ∥ · ∥v","inline":true,"padRight":true},{"text":"on ","element":"span"},{"style":{"height":10.8},"width":48.78,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/16-8.png","element":"img","alt":" Rn","inline":true,"padRight":true},{"text":"by letting","element":"span"}],[{"style":{"width":"64%"},"width":1017,"height":92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/16-9.png","element":"img"}],[{"text":"Notice that in the special case where all components of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"v ","element":"span"},{"text":"are equal to ","element":"span"},{"style":{"height":16},"width":131.22,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/16-10.png","element":"img","alt":" 1, ∥ · ∥v","inline":true,"padRight":true},{"text":"is the same as the","element":"span"}],[{"id":"id-48","text":"maximum norm ","element":"span"},{"style":{"height":16},"width":100.64,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/16-11.png","element":"img","alt":" ∥ · ∥∞","inline":true},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Assumption 5 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Let ","element":"span"},{"style":{"height":11.2},"width":225.62,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/16-12.png","element":"img","alt":" F : Rn �→ Rn","inline":true},{"style":{"fontStyle":"italic"},"text":".","element":"span"}],[{"style":{"width":"95%"},"width":1517,"height":349,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/16-13.png","element":"img"}],[{"id":"id-49","style":{"fontWeight":"bold"},"text":"Assumption 6 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"There exists a vector ","element":"span"},{"style":{"height":11.78},"width":138.58,"height":29.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/16-14.png","element":"img","alt":" x∗ ∈ Rn","inline":true},{"style":{"fontStyle":"italic"},"text":", a positive vector ","element":"span"},{"style":{"fontStyle":"italic"},"text":"v","element":"span"},{"style":{"fontStyle":"italic"},"text":", and a scalar ","element":"span"},{"style":{"height":16},"width":157.98,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/16-15.png","element":"img","alt":" β ∈ [0, 1)","inline":true},{"style":{"fontStyle":"italic"},"text":", such that","element":"span"}],[{"style":{"width":"71%"},"width":1127,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/16-16.png","element":"img"}],[{"id":"id-47","style":{"fontWeight":"bold"},"text":"Assumption 7 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"There exists a positive vector ","element":"span"},{"style":{"fontStyle":"italic"},"text":"v","element":"span"},{"style":{"fontStyle":"italic"},"text":", a scalar ","element":"span"},{"style":{"height":16},"width":157.98,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/16-17.png","element":"img","alt":" β ∈ [0, 1)","inline":true},{"style":{"fontStyle":"italic"},"text":", and a scalar ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D ","element":"span"},{"style":{"fontStyle":"italic"},"text":"such that","element":"span"}],[{"style":{"width":"68%"},"width":1079,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/16-18.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Assumption 8 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"There exists at least one proper stationary policy. 
Every improper stationary policy","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"yields infinite expected cost for at least one initial state.","element":"span"}],[{"id":"id-50","style":{"fontWeight":"bold"},"text":"Theorem 3 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Let Assumptions ","element":"span"},{"href":"#id-46","referenceIndex":164,"style":{"fontStyle":"italic"},"text":"3, ","element":"a"},{"href":"#id-45","referenceIndex":166,"style":{"fontStyle":"italic"},"text":"4, ","element":"a"},{"href":"#id-29","style":{"fontStyle":"italic"},"text":"2, ","element":"a"},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"href":"#id-47","referenceIndex":180,"style":{"fontStyle":"italic"},"text":"7 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"hold. Then the sequence ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":") ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is bounded with probability","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"1.","element":"span"}],[{"id":"id-51","style":{"fontWeight":"bold"},"text":"Theorem 4 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Let Assumptions ","element":"span"},{"href":"#id-46","referenceIndex":164,"style":{"fontStyle":"italic"},"text":"3, ","element":"a"},{"href":"#id-45","referenceIndex":166,"style":{"fontStyle":"italic"},"text":"4, ","element":"a"},{"href":"#id-29","style":{"fontStyle":"italic"},"text":"2, ","element":"a"},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"href":"#id-48","referenceIndex":177,"style":{"fontStyle":"italic"},"text":"5 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"hold. 
Furthermore, suppose that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":") ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is bounded with","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"probability ","element":"span"},{"text":"1","element":"span"},{"style":{"fontStyle":"italic"},"text":". Then ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":") ","element":"span"},{"style":{"fontStyle":"italic"},"text":"conv","element":"span"},{"href":"#id-46","referenceIndex":164,"style":{"fontStyle":"italic"},"text":"erg","element":"a"},{"href":"#id-45","referenceIndex":166,"style":{"fontStyle":"italic"},"text":"es ","element":"a"},{"href":"#id-29","style":{"fontStyle":"italic"},"text":"to ","element":"a"},{"style":{"height":10.99},"width":38.78,"height":27.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/16-19.png","element":"img","alt":" x∗","inline":true,"padRight":true},{"href":"#id-48","referenceIndex":177,"style":{"fontStyle":"italic"},"text":"wit","element":"a"},{"style":{"fontStyle":"italic"},"text":"h probability ","element":"span"},{"text":"1","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"}],[{"id":"id-52","style":{"fontWeight":"bold"},"text":"Theorem 5 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Let Assumptions ","element":"span"},{"href":"#id-46","referenceIndex":164,"style":{"fontStyle":"italic"},"text":"3, ","element":"a"},{"href":"#id-45","referenceIndex":166,"style":{"fontStyle":"italic"},"text":"4, ","element":"a"},{"href":"#id-29","style":{"fontStyle":"italic"},"text":"2, ","element":"a"},{"style":{"fontStyle":"italic"},"text":"and 
","element":"span"},{"href":"#id-49","referenceIndex":179,"style":{"fontStyle":"italic"},"text":"6 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"hold. Then ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":") ","element":"span"},{"style":{"fontStyle":"italic"},"text":"converges to ","element":"span"},{"style":{"height":10.98},"width":38.78,"height":27.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/16-20.png","element":"img","alt":" x∗","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"with probability ","element":"span"},{"text":"1","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"}],[{"text":"Detailed proofs of Theorems ","element":"span"},{"href":"#id-50","referenceIndex":183,"text":"3, ","element":"a"},{"href":"#id-51","referenceIndex":185,"text":"4, ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-52","referenceIndex":187,"text":"5 ","element":"a"},{"text":"can be found in the work of ","element":"span"},{"href":"#id-43","referenceIndex":2,"text":"Bertsekas & Tsitsiklis ","element":"a"},{"href":"#id-43","referenceIndex":2,"text":"(1989)","element":"a"},{"text":".","element":"span"}],[{"text":"B.4 ","element":"span"},{"text":"P","element":"span"},{"text":"ROOF OF ","element":"span"},{"text":"T","element":"span"},{"text":"HEOREM ","element":"span"},{"text":"2","element":"span"}],[{"text":"We first state Theorem ","element":"span"},{"href":"#id-31","text":"2 ","element":"a"},{"text":"here again and then show the proof.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Theorem 2 ","element":"span"},{"text":"Assume a finite MDP ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"S","element":"span"},{"style":{"fontStyle":"italic"},"text":", 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"A","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"P","element":"span"},{"style":{"fontStyle":"italic"},"text":", R","element":"span"},{"text":") ","element":"span"},{"text":"and that Assumption ","element":"span"},{"href":"#id-26","text":"1 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-29","text":"2 ","element":"a"},{"text":"hold. ","element":"span"},{"text":"Then the","element":"span"}],[{"text":"action-value functions in Generalized Q-learning, using tabular upd","element":"span"},{"href":"#id-26","text":"ate ","element":"a"},{"text":"in ","element":"span"},{"href":"#id-29","text":"Eq","element":"a"},{"text":"uation ","element":"span"},{"href":"#id-30","text":"(3)","element":"a"},{"text":", will","element":"span"}],[{"text":"converge to the optimal action-value function with probability ","element":"span"},{"text":"1","element":"span"},{"text":", in each of the following cases:","element":"span"}],[{"style":{"width":"10%"},"width":160,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/16-21.png","element":"img"}],[{"text":"(ii) ","element":"span"},{"style":{"height":14.4},"width":109.16,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/16-22.png","element":"img","alt":" γ = 1","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":18.52},"width":453.93,"height":46.3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/16-23.png","element":"img","alt":" ∀a ∈ A, Qis1a(t = 0) = 0","inline":true,"padRight":true},{"text":"where ","element":"span"},{"style":{"height":9.19},"width":34.68,"height":22.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/16-24.png","element":"img","alt":" s1","inline":true,"padRight":true},{"text":"is an absorbing state. 
All policies are ","element":"span"},{"text":"proper.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Proof. ","element":"span"},{"text":"We first check Assumptions ","element":"span"},{"href":"#id-46","referenceIndex":164,"text":"3, ","element":"a"},{"href":"#id-45","referenceIndex":166,"text":"4, ","element":"a"},{"href":"#id-29","text":"2, ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-49","referenceIndex":179,"text":"6 ","element":"a"},{"text":"in Section ","element":"span"},{"href":"#id-53","referenceIndex":134,"text":"B.3 ","element":"a"},{"text":"are satisfied. Then we simply apply","element":"span"}],[{"text":"Theorem ","element":"span"},{"href":"#id-52","referenceIndex":187,"text":"5 ","element":"a"},{"text":"to Generalized Q-learning.","element":"span"}],[{"text":"Assumption ","element":"span"},{"href":"#id-46","referenceIndex":164,"text":"3 ","element":"a"},{"text":"is satisfied in the special case where ","element":"span"},{"style":{"height":19.53},"width":147.72,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/17-0.png","element":"img","alt":" τ ij(t) = t","inline":true},{"text":", which is what was implicitly assumed","element":"span"}],[{"text":"in Equation ","element":"span"},{"href":"#id-39","text":"10, ","element":"a"},{"text":"but can be also satisfied even if we allow for outdated information.","element":"span"}],[{"text":"Regarding Assumption ","element":"span"},{"href":"#id-45","referenceIndex":166,"text":"4, ","element":"a"},{"text":"parts ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":") ","element":"span"},{"text":"and ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"ii","element":"span"},{"text":") ","element":"span"},{"text":"of the assumption are then automatically valid. 
Part ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"iii","element":"span"},{"text":")","element":"span"}],[{"text":"is quite natural: in par","element":"span"},{"href":"#id-45","referenceIndex":166,"text":"tic","element":"a"},{"text":"ular, it assumes that the required samples are generated after we decide","element":"span"}],[{"text":"which components to update during the current iteration. Part ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"iv","element":"span"},{"text":") ","element":"span"},{"text":"is automatic from Equation ","element":"span"},{"href":"#id-40","text":"16.","element":"a"}],[{"text":"Part ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"v","element":"span"},{"text":") ","element":"span"},{"text":"is satisfied by Lemma ","element":"span"},{"href":"#id-36","referenceIndex":114,"text":"2.","element":"a"}],[{"text":"Assumption ","element":"span"},{"href":"#id-29","text":"2 ","element":"a"},{"text":"needs to be imposed on the step-sizes employed by the Generalized Q-learning algo-","element":"span"}],[{"text":"rithm. This assumption is standard for stochastic approximation algorithms. In particular, it requires","element":"span"}],[{"text":"that every state-action pair ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"s, a","element":"span"},{"text":") ","element":"span"},{"text":"is simulated an infinite number of times.","element":"span"}],[{"text":"By Lemma ","element":"span"},{"href":"#id-37","referenceIndex":120,"text":"3, ","element":"a"},{"style":{"fontStyle":"italic"},"text":"F ","element":"span"},{"text":"is a contraction mapping. 
Assumption ","element":"span"},{"href":"#id-49","referenceIndex":179,"text":"6 ","element":"a"},{"text":"is satisfied.","element":"span"}],[{"text":"All assumptions required by Theorem ","element":"span"},{"href":"#id-52","referenceIndex":187,"text":"5 ","element":"a"},{"text":"are verified, convergence then follows from Theorem ","element":"span"},{"href":"#id-52","referenceIndex":187,"text":"5.","element":"a"}],[{"style":{"width":"1%"},"width":29,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/17-1.png","element":"img"}],[{"text":"C ","element":"span"},{"text":"A","element":"span"},{"text":"DDITIONAL ","element":"span"},{"text":"E","element":"span"},{"text":"MPIRICAL ","element":"span"},{"text":"R","element":"span"},{"text":"ESULTS","element":"span"}],[{"text":"C.1 ","element":"span"},{"text":"MDP ","element":"span"},{"text":"RESULTS","element":"span"}],[{"text":"Comparison of three algorithms using the simple MDP in Figure ","element":"span"},{"href":"#id-13","text":"1 ","element":"a"},{"text":"with different values of ","element":"span"},{"style":{"height":10},"width":24,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/17-2.png","element":"img","alt":" µ","inline":true,"padRight":true},{"text":"is","element":"span"}],[{"text":"shown in Figure ","element":"span"},{"href":"#id-54","text":"6. 
","element":"a"},{"text":"For ","element":"span"},{"style":{"height":14},"width":160.93,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/17-3.png","element":"img","alt":" µ = +0.1","inline":true},{"text":", the learning curves of action value ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Q","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"A, ","element":"span"},{"text":"Left","element":"span"},{"text":") ","element":"span"},{"text":"are shown in ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"a","element":"span"},{"text":")","element":"span"},{"text":".","element":"span"}],[{"text":"Here, the true a","element":"span"},{"href":"#id-54","text":"ctio","element":"a"},{"text":"n value ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Q","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"A, ","element":"span"},{"text":"Left","element":"span"},{"text":") ","element":"span"},{"text":"is ","element":"span"},{"text":"+0","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"},{"text":"1","element":"span"},{"text":". For ","element":"span"},{"style":{"height":14},"width":165.09,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/17-4.png","element":"img","alt":" µ = −0.1","inline":true},{"text":", the learning curves of action value","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"Q","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"A, ","element":"span"},{"text":"Left","element":"span"},{"text":") ","element":"span"},{"text":"are shown in ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"b","element":"span"},{"text":")","element":"span"},{"text":". 
The true action value ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Q","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"A, ","element":"span"},{"text":"Left","element":"span"},{"text":") ","element":"span"},{"text":"is ","element":"span"},{"style":{"height":10.8},"width":81.99,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/17-5.png","element":"img","alt":" −0.1","inline":true},{"text":". All results were averaged","element":"span"}],[{"text":"over ","element":"span"},{"text":"5","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"000 ","element":"span"},{"text":"runs.","element":"span"}],[{"style":{"width":"92%"},"width":1458,"height":622,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/17-6.png","element":"img"}],[{"id":"id-54","text":"Figure 6: MDP results","element":"figcaption","subtype":"caption"}],[{"id":"id-19","text":"C.2 ","element":"span"},{"text":"M","element":"span"},{"text":"OUNTAIN ","element":"span"},{"text":"C","element":"span"},{"text":"AR RESULTS","element":"span"}],[{"text":"Comparison of four algorithms on Mountain Car under different reward settings is shown in Figure ","element":"span"},{"href":"#id-55","text":"7.","element":"a"}],[{"text":"All experimental results were averaged over ","element":"span"},{"text":"100 ","element":"span"},{"text":"runs. 
Note that for reward variance ","element":"span"},{"style":{"height":13.39},"width":136.83,"height":33.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/17-7.png","element":"img","alt":" σ2 = 50","inline":true},{"text":", both","element":"span"}],[{"text":"Q-learning and Averaged Q-learning fail to reach the goal position in ","element":"span"},{"text":"5","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"000 ","element":"span"},{"text":"steps so there are no","element":"span"}],[{"text":"learning curves shown in Figure ","element":"span"},{"text":"7 (","element":"span"},{"style":{"fontStyle":"italic"},"text":"d","element":"span"},{"text":") ","element":"span"},{"text":"for these two algorithms.","element":"span"}],[{"id":"id-24","text":"C.3 ","element":"span"},{"text":"B","element":"span"},{"text":"ENCHMARK ","element":"span"},{"text":"E","element":"span"},{"text":"NVIRONMENT RESULTS","element":"span"}],[{"text":"The sensitivity analysis results of seven benchmark environment are shown in Figure ","element":"span"},{"href":"#id-56","text":"8.","element":"a"}],[{"style":{"width":"97%"},"width":1540,"height":1231,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/18-0.png","element":"img"}],[{"id":"id-55","text":"Figure 7: Mountain Car results","element":"figcaption","subtype":"caption"}],[{"style":{"width":"93%"},"width":1478,"height":2523,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.06487/images/19-0.png","element":"img"}],[{"id":"id-56","text":"Figure 8: Sensitivity analysis","element":"figcaption","subtype":"caption"}]]}],"_version":"3.3.2"},"paperNode":"$1b:props:children:props:children:0:props:product"}]]]}]}]