1b:["$","$L29",null,{"isWhiteLabelled":false,"children":["$","$Lb",null,{"pt":{"compact":0,"expanded":3},"children":[["$","$L2a",null,{"noStar":true,"publisher":true,"task":true,"params":true,"size":"xl","product":{"id":"eyJwYXBlcklEIjoiMjAwMy4wMTcwNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","updated":"2022-12-04T18:34:02.000Z","paperID":"2003.01704","published":"2020-03-03T18:46:34.000Z","authors":"[\"Aldo Pacchiano\",\"My Phan\",\"Yasin Abbasi-Yadkori\",\"Anup Rao\",\"Julian Zimmert\",\"Tor Lattimore\",\"Csaba Szepesvari\"]","title":"Model Selection in Contextual Stochastic Bandit Problems","scoreTrending":null,"summary":"$2b","lastCheckedForCode":"2022-12-06T06:08:10.384Z","links":[{"id":"eyJ1cmwiOiJodHRwczovL3BhcGVyc3dpdGhjb2RlLmNvbS9wYXBlci9tb2RlbC1zZWxlY3Rpb24taW4tY29udGV4dHVhbC1zdG9jaGFzdGljIn0=","type":"pwc","url":"https://paperswithcode.com/paper/model-selection-in-contextual-stochastic","data":"{\"date\":\"2022-12-06T06:19:52.740Z\"}"}],"reposConnection":{"edges":[]},"models":[],"tags":[{"id":"eyJuYW1lIjoibW9kZWwgc2VsZWN0aW9uIiwidHlwZSI6InRhc2sifQ==","name":"model selection","description":"Model selection involves inputting various machine learning models and their performance metrics, and outputting the best model based on those metrics. 
It's used in real-world scenarios to choose the most effective model for a specific task, such as predicting customer behavior or diagnosing diseases.","scoreTrending":null,"count":{"stars":3548,"papers":1702,"models":1545},"__typename":"Tag"}],"summaries":[],"emailsConnection":{"edges":[{"author":"yasin abbasi yadkori","node":{"id":"eyJhZGRyZXNzIjoieWFzaW4uYWJiYXNpQGdtYWlsLmNvbSJ9","address":"yasin.abbasi@gmail.com","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[],"scholar":[{"thirdPartyID":"-D0EgMIAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiI4NDZkM2JhZi0wMzY1LTQ3MDUtYmQ0OC0yNDJjZDg2MzQ4NTEifQ==","name":"yasin abbasi yadkori","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMTgwNS4wMTY0OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1805.01648"},{"id":"eyJwYXBlcklEIjoiMTYxMC4wODg2NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1610.08865"},{"id":"eyJwYXBlcklEIjoiMTQwMi42NzYzIiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1402.6763"},{"id":"eyJwYXBlcklEIjoiMjAwMy4wMTcwNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2003.01704"},{"id":"eyJwYXBlcklEIjoiMTYxMS4wNjQyNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1611.06426"},{"id":"eyJwYXBlcklEIjoiMTcxMi4wNDY0NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1712.04644"},{"id":"eyJwYXBlcklEIjoiMTQwNi4zOTI2IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1406.3926"},{"id":"eyJwYXBlcklEIjoiMjAwNi4wNTQ5MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.05491"},{"id":"eyJwYXBlcklEIjoiMTgwNS4wOTc5MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1805.09793"},{"id":"eyJwYXBlcklEIjoiMTkwNi4wNTI0NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":
"1906.05247"},{"id":"eyJwYXBlcklEIjoiMTcxMS4wNzk3OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1711.07979"},{"id":"eyJwYXBlcklEIjoiMTkwOC4wNDk3MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1908.04970"},{"id":"eyJwYXBlcklEIjoiMTgwNC4xMDQ4OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1804.10488"},{"id":"eyJwYXBlcklEIjoiMjIwMS4wNjUzMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2201.06532"},{"id":"eyJwYXBlcklEIjoiMjEwNi4wNTM3OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.05378"},{"id":"eyJwYXBlcklEIjoiMjEwMi4xMjYxMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.12611"},{"id":"eyJwYXBlcklEIjoiMjIwMi4xMzAwMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2202.13001"},{"id":"eyJwYXBlcklEIjoiMjEwOC4wNTUzMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2108.05533"},{"id":"eyJwYXBlcklEIjoiMjMwNi4xMzA1MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2306.13053"},{"id":"eyJwYXBlcklEIjoiNzIzMTQiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"72314"}]}]}},{"author":"my phan","node":{"id":"eyJhZGRyZXNzIjoibXlwaGFuQGNzLnVtYXNzLmVkdSJ9","address":"myphan@cs.umass.edu","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[],"scholar":[{"thirdPartyID":"JMybxmkAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiI4ZTM1YjQ4Yi00MDlhLTQ4Y2ItYWRlMi05YjViNTJkNzdkYWQifQ==","name":"my 
phan","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMjAwMy4wMTcwNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2003.01704"},{"id":"eyJwYXBlcklEIjoiMjAwNi4wNTQ5MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.05491"},{"id":"eyJwYXBlcklEIjoiMTkwOC4wNDk3MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1908.04970"}]}]}},{"author":"anup rao","node":{"id":"eyJhZGRyZXNzIjoiYW51cHJhb0BhZG9iZS5jb20ifQ==","address":"anuprao@adobe.com","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[{"name":"Adobe"}],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[],"scholar":[{"thirdPartyID":"pkwXPU0AAAAJ"},{"thirdPartyID":"T2U5sGIAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiIxOWI2NmE1OC02NzhlLTQ0ZjctYWViMy01ZDU2MWY3NmY3YmMifQ==","name":"anup 
rao","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMTUwNy4wMDcxMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1507.00710"},{"id":"eyJwYXBlcklEIjoiMTUwNS4wMDI5MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1505.00290"},{"id":"eyJwYXBlcklEIjoiMTYwNC4wNjk2OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1604.06968"},{"id":"eyJwYXBlcklEIjoiMjAwMy4wMTcwNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2003.01704"},{"id":"eyJwYXBlcklEIjoiMTcxMi4wNDY0NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1712.04644"},{"id":"eyJwYXBlcklEIjoiMjAwOS4xMzU2NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2009.13566"},{"id":"eyJwYXBlcklEIjoiMTgwNS4wOTc5MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1805.09793"},{"id":"eyJwYXBlcklEIjoiMTgwOS4wNzY5NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1809.07697"},{"id":"eyJwYXBlcklEIjoiMjEwMi4xMzE3OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.13179"},{"id":"eyJwYXBlcklEIjoiMjEwNi4wNDI1NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.04254"},{"id":"eyJwYXBlcklEIjoiMTkwNi4wNTA1OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1906.05059"},{"id":"eyJwYXBlcklEIjoiMjMxMS4wNDgxNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2311.04817"},{"id":"eyJwYXBlcklEIjoiMjAxMC4xNDA1OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.14058"},{"id":"eyJwYXBlcklEIjoiMjIxMC4wNjU5NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2210.06594"},{"id":"eyJwYXBlcklEIjoiMjExMS4wMzAzMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2111.03030"},{"id":"eyJwYXBlcklEIjoiMjAxMC4xMTMzMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.11332"},{"id":"eyJwYXBlcklEIjoiMjEwOS4wOTIyMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2109.09222"},{"id":"eyJwYXBlcklEIjoiMjMwNC4wMjI2MSIs
InB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2304.02261"},{"id":"eyJwYXBlcklEIjoiNTM2MTUiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"53615"},{"id":"eyJwYXBlcklEIjoiNzMwODEiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"73081"},{"id":"eyJwYXBlcklEIjoiNzEzMzciLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"71337"},{"id":"eyJwYXBlcklEIjoiMjQwNC4wMTU4OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2404.01588"}]}]}},{"author":"julian zimmert","node":{"id":"eyJhZGRyZXNzIjoiemltbWVydEBnb29nbGUuY29tIn0=","address":"zimmert@google.com","name":"T. Zimmer","avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[{"name":"Google"}],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[],"scholar":[{"thirdPartyID":"isgoo2QAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiIxN2RlNGM4NS05ZGFkLTRjZDAtYTA0ZC1kZGU4ZDJiZjY5ZTUifQ==","name":"julian 
zimmert","github":[],"email":[{"avatar":"https://img.fullcontact.com/static/d763d18481104c7a6aad0637772fa56d_ef3ea10d54de1afd3c0a04726769e052df6126ca2469a12c4bb34307ff034699"}],"authored":[{"id":"eyJwYXBlcklEIjoiMTgwNy4wNzYyMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1807.07623"},{"id":"eyJwYXBlcklEIjoiMjAwMy4wMTcwNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2003.01704"},{"id":"eyJwYXBlcklEIjoiMjEwNy4wNTc0NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2107.05745"},{"id":"eyJwYXBlcklEIjoiMTkwMS4wODc3OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1901.08779"},{"id":"eyJwYXBlcklEIjoiMjEwNy4wMTI2NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2107.01264"},{"id":"eyJwYXBlcklEIjoiMTkwNS4xMTgxNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1905.11817"},{"id":"eyJwYXBlcklEIjoiMjMwMS4xMjk0MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2301.12942"},{"id":"eyJwYXBlcklEIjoiMjMxMC4xMTU1MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2310.11550"},{"id":"eyJwYXBlcklEIjoiMjExMC4xMzI4MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.13282"},{"id":"eyJwYXBlcklEIjoiMjMwMi4wOTQwOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2302.09408"},{"id":"eyJwYXBlcklEIjoiMjMwMi4wOTczOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2302.09739"},{"id":"eyJwYXBlcklEIjoiMjAwMi4xMjAxNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.12014"},{"id":"eyJwYXBlcklEIjoiMTgwNy4wMTQ4OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1807.01488"},{"id":"eyJwYXBlcklEIjoiMjMwOC4xMDY3NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2308.10675"},{"id":"eyJwYXBlcklEIjoiMjMwOS4wMDgxNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2309.00814"},{"id":"eyJwYXBlcklEIjoiMjIwNi4xNDkwNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2206.14906"},{"i
d":"eyJwYXBlcklEIjoiMjQwMS4wMTg1NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2401.01857"},{"id":"eyJwYXBlcklEIjoiMjIwNi4xMDAyMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2206.10022"},{"id":"eyJwYXBlcklEIjoiNTQwNDMiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"54043"},{"id":"eyJwYXBlcklEIjoiNTQ0ODEiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"54481"},{"id":"eyJwYXBlcklEIjoiNzIzOTgiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"72398"},{"id":"eyJwYXBlcklEIjoiNzA0NDEiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"70441"},{"id":"eyJwYXBlcklEIjoiMjQwNS4wNjQ4MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2405.06480"}]}]}},{"author":"aldo pacchiano","node":{"id":"eyJhZGRyZXNzIjoicGFjY2hpYW5vQGJlcmtlbGV5LmVkdSJ9","address":"pacchiano@berkeley.edu","name":"Aldo Pacchiano","avatar":"https://img.fullcontact.com/static/ae175a1814a6b280786226314efbc745_af36f75456a4d096355bf998a4b513c33e6e40c7543ab72df517afc2c9f1467c","linkedin":"https://www.linkedin.com/in/aldo-pacchiano-49573150","bio":"Mathematics, Computer Science and Writing","site":"http://www.aldopacchiano.com/","override":null,"membership":[{"name":"UC 
Berkeley"}],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[{"avatar":"https://avatars.githubusercontent.com/u/1836751?v=4","username":"pacchiano"}],"scholar":[{"thirdPartyID":"no_BfYgAAAAJ"}],"twitter":[{"avatar":null,"username":null}],"location":[{"formatted":"Boston, MA, USA"}],"owner":[{"id":"eyJ1aWQiOiI1NDcxMDFmNC00ZmI2LTQ5MjgtYTI5ZC00ZmFmMjJlOTM4NzcifQ==","name":"aldo 
pacchiano","github":[],"email":[{"avatar":"https://img.fullcontact.com/static/ae175a1814a6b280786226314efbc745_af36f75456a4d096355bf998a4b513c33e6e40c7543ab72df517afc2c9f1467c"}],"authored":[{"id":"eyJwYXBlcklEIjoiMjAwMy4wMTcwNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2003.01704"},{"id":"eyJwYXBlcklEIjoiMTkxMC4wMTIxNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1910.01215"},{"id":"eyJwYXBlcklEIjoiMjAxMi4xMzA0NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2012.13045"},{"id":"eyJwYXBlcklEIjoiMTkwNy4xMjA1OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1907.12059"},{"id":"eyJwYXBlcklEIjoiMjAwMi4wMDYzMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.00632"},{"id":"eyJwYXBlcklEIjoiMjAwNi4wNTQ5MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.05491"},{"id":"eyJwYXBlcklEIjoiMTkwMy4wMjk5MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1903.02993"},{"id":"eyJwYXBlcklEIjoiMjAwNi4xMDE4NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.10185"},{"id":"eyJwYXBlcklEIjoiMjAwNi4xMTkxMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.11911"},{"id":"eyJwYXBlcklEIjoiMTgwMi4wOTczMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1802.09732"},{"id":"eyJwYXBlcklEIjoiMjIxMS4wNDk3NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2211.04974"},{"id":"eyJwYXBlcklEIjoiMjAwMi4wMjY5MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.02693"},{"id":"eyJwYXBlcklEIjoiMjEwNS4xMDU5MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2105.10590"},{"id":"eyJwYXBlcklEIjoiMjEwNS4xNDM2MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2105.14363"},{"id":"eyJwYXBlcklEIjoiMjEwMi4wMzc2NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.03765"},{"id":"eyJwYXBlcklEIjoiMjIxMC4wOTU3OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2210.09579"},{
"id":"eyJwYXBlcklEIjoiMTkwNS4xMjY2NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1905.12667"},{"id":"eyJwYXBlcklEIjoiMjAwMi4xMDAwMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.10002"},{"id":"eyJwYXBlcklEIjoiMTkwMy4wNDI2OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1903.04268"},{"id":"eyJwYXBlcklEIjoiMjIwNi4xMjQ0MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2206.12441"},{"id":"eyJwYXBlcklEIjoiMjEwMS4wMjA4NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2101.02084"},{"id":"eyJwYXBlcklEIjoiMTgwMi4wMzk3NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1802.03976"},{"id":"eyJwYXBlcklEIjoiMjAwNi4wNDg1OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.04858"},{"id":"eyJwYXBlcklEIjoiMjIwNy4xMjgwNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2207.12805"},{"id":"eyJwYXBlcklEIjoiMjEwMy4wOTc1NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2103.09756"},{"id":"eyJwYXBlcklEIjoiMjExMS4wMjk5NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2111.02994"},{"id":"eyJwYXBlcklEIjoiMjMwNi4wMjg2OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2306.02869"},{"id":"eyJwYXBlcklEIjoiMTUwMi4wNTA5MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1502.05090"},{"id":"eyJwYXBlcklEIjoiMTgxMS4wODM5MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1811.08393"},{"id":"eyJwYXBlcklEIjoiMjIxMS4xNDQ2OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2211.14469"},{"id":"eyJwYXBlcklEIjoiMjIwNi4xNDkxMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2206.14912"},{"id":"eyJwYXBlcklEIjoiMjMwNi4wNjE4NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2306.06184"},{"id":"eyJwYXBlcklEIjoiMjMwOC4wODA1MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2308.08051"},{"id":"eyJwYXBlcklEIjoiNTI4MzgiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"ne
urips","paperID":"52838"},{"id":"eyJwYXBlcklEIjoiNTI5NjMiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"52963"}]}]}},{"author":"tor lattimore","node":{"id":"eyJhZGRyZXNzIjoibGF0dGltb3JlQGdvb2dsZS5jb20ifQ==","address":"lattimore@google.com","name":"Lattimore","avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[{"name":"Google"}],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[{"avatar":"https://avatars.githubusercontent.com/u/52031?v=4","username":"tor"}],"scholar":[{"thirdPartyID":"fkDxJxcAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiIwY2Y4ZGI3Ny1jYmJkLTRkZjEtYmM5Mi00MzdkODdmYWI3ZGQifQ==","name":"tor 
lattimore","github":[],"email":[{"avatar":"https://img.fullcontact.com/static/ea5cf50d46d28c6f10ed8f67ce65c9f4_bc425be420c212397a24da634658ce48eab3d56d5154ac91a76ce6b7f796a393"}],"authored":[{"id":"eyJwYXBlcklEIjoiMTcwMy4wNzcxMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1703.07710"},{"id":"eyJwYXBlcklEIjoiMTYwNS4wODk4OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1605.08988"},{"id":"eyJwYXBlcklEIjoiMTkxMS4wNzY3NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1911.07676"},{"id":"eyJwYXBlcklEIjoiMTUwNy4wNzg4MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1507.07880"},{"id":"eyJwYXBlcklEIjoiMTkwMi4xMDczMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1902.10730"},{"id":"eyJwYXBlcklEIjoiMTYwNi4wMzIwMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1606.03203"},{"id":"eyJwYXBlcklEIjoiMjAwMy4wMTcwNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2003.01704"},{"id":"eyJwYXBlcklEIjoiMTkwMS4wMjIzMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1901.02230"},{"id":"eyJwYXBlcklEIjoiMTYwNS4wNzQxNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1605.07416"},{"id":"eyJwYXBlcklEIjoiMTYwMi4wNDI4MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1602.04282"},{"id":"eyJwYXBlcklEIjoiMTMwOC40ODI4IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1308.4828"},{"id":"eyJwYXBlcklEIjoiMTYwMy4wODY2MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1603.08661"},{"id":"eyJwYXBlcklEIjoiMTYwMi4wNzkwNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1602.07905"},{"id":"eyJwYXBlcklEIjoiMTQxMS4yOTE5IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1411.2919"},{"id":"eyJwYXBlcklEIjoiMTcxMi4wMTg5NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1712.01897"},{"id":"eyJwYXBlcklEIjoiMTUxMS4wMDA0OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1511.00048"},{"i
d":"eyJwYXBlcklEIjoiMTkwNS4xMTgxNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1905.11817"},{"id":"eyJwYXBlcklEIjoiMjAwNi4wMDQ3NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.00475"},{"id":"eyJwYXBlcklEIjoiMjAxMS4wNDAyMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2011.04020"},{"id":"eyJwYXBlcklEIjoiMjAwOS4xMjIyOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2009.12228"},{"id":"eyJwYXBlcklEIjoiMTcwMi4wMzA0MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1702.03040"},{"id":"eyJwYXBlcklEIjoiMjQwMi4wNjUzNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2402.06535"},{"id":"eyJwYXBlcklEIjoiMjEwNy4wMjI2NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2107.02266"},{"id":"eyJwYXBlcklEIjoiMTgxMC4wMjU2NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1810.02567"},{"id":"eyJwYXBlcklEIjoiMjAxMS4wNDAxOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2011.04019"},{"id":"eyJwYXBlcklEIjoiMjAxMS4wNDAxOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2011.04018"},{"id":"eyJwYXBlcklEIjoiMjEwNS4xNDI2NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2105.14267"},{"id":"eyJwYXBlcklEIjoiMTkwNy4xMzA2MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1907.13062"},{"id":"eyJwYXBlcklEIjoiMjIwNi4wNDY0MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2206.04640"},{"id":"eyJwYXBlcklEIjoiMjEwNi4wMTY2MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.01660"},{"id":"eyJwYXBlcklEIjoiMjQwNi4wNjUwNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2406.06506"},{"id":"eyJwYXBlcklEIjoiMjAwNi4wNTE0NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.05145"},{"id":"eyJwYXBlcklEIjoiMTgwNi4wNTgxOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1806.05819"},{"id":"eyJwYXBlcklEIjoiMTgxMS4xMDkyOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"
arxiv","paperID":"1811.10928"},{"id":"eyJwYXBlcklEIjoiMjIwNS4xMzE3MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2205.13170"},{"id":"eyJwYXBlcklEIjoiMjMwMi4wMzY4MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2302.03683"},{"id":"eyJwYXBlcklEIjoiMTkwMy4wNzg5MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1903.07890"},{"id":"eyJwYXBlcklEIjoiMjAwNi4wNTk2NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.05964"},{"id":"eyJwYXBlcklEIjoiMTkwNi4wMzI0MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1906.03242"},{"id":"eyJwYXBlcklEIjoiMjMwNi4xMzA1MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2306.13053"},{"id":"eyJwYXBlcklEIjoiMjExMC4xNTY4OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.15688"},{"id":"eyJwYXBlcklEIjoiMjMwNS4xMTkwOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2305.11908"},{"id":"eyJwYXBlcklEIjoiNTUyNTUiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"55255"},{"id":"eyJwYXBlcklEIjoiNzI1NjkiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"72569"},{"id":"eyJwYXBlcklEIjoiNzIzMTQiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"72314"},{"id":"eyJwYXBlcklEIjoiMjMxMS4xMzI5NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2311.13294"},{"id":"eyJwYXBlcklEIjoiMTMwNy4wMTI3IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1307.0127"}]}]}},{"author":"yasin abbasi 
yadkori","node":{"id":"eyJhZGRyZXNzIjoieWFka29yaUBnb29nbGUuY29tIn0=","address":"yadkori@google.com","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[{"avatar":"https://avatars.githubusercontent.com/u/2314113?v=4","username":"yasin-abbasi"}],"scholar":[{"thirdPartyID":"-D0EgMIAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiI4NDZkM2JhZi0wMzY1LTQ3MDUtYmQ0OC0yNDJjZDg2MzQ4NTEifQ==","name":"yasin abbasi yadkori","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMTgwNS4wMTY0OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1805.01648"},{"id":"eyJwYXBlcklEIjoiMTYxMC4wODg2NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1610.08865"},{"id":"eyJwYXBlcklEIjoiMTQwMi42NzYzIiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1402.6763"},{"id":"eyJwYXBlcklEIjoiMjAwMy4wMTcwNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2003.01704"},{"id":"eyJwYXBlcklEIjoiMTYxMS4wNjQyNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1611.06426"},{"id":"eyJwYXBlcklEIjoiMTcxMi4wNDY0NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1712.04644"},{"id":"eyJwYXBlcklEIjoiMTQwNi4zOTI2IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1406.3926"},{"id":"eyJwYXBlcklEIjoiMjAwNi4wNTQ5MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.05491"},{"id":"eyJwYXBlcklEIjoiMTgwNS4wOTc5MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1805.09793"},{"id":"eyJwYXBlcklEIjoiMTkwNi4wNTI0NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1906.05247"},{"id":"eyJwYXBlcklEIjoiMTcxMS4wNzk3OSIsInB1Ymxpc2hlciI6Im
FyeGl2In0=","publisher":"arxiv","paperID":"1711.07979"},{"id":"eyJwYXBlcklEIjoiMTkwOC4wNDk3MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1908.04970"},{"id":"eyJwYXBlcklEIjoiMTgwNC4xMDQ4OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1804.10488"},{"id":"eyJwYXBlcklEIjoiMjIwMS4wNjUzMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2201.06532"},{"id":"eyJwYXBlcklEIjoiMjEwNi4wNTM3OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.05378"},{"id":"eyJwYXBlcklEIjoiMjEwMi4xMjYxMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.12611"},{"id":"eyJwYXBlcklEIjoiMjIwMi4xMzAwMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2202.13001"},{"id":"eyJwYXBlcklEIjoiMjEwOC4wNTUzMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2108.05533"},{"id":"eyJwYXBlcklEIjoiMjMwNi4xMzA1MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2306.13053"},{"id":"eyJwYXBlcklEIjoiNzIzMTQiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"72314"}]}]}},{"author":"csaba szepesvari","node":{"id":"eyJhZGRyZXNzIjoic3plcGlAZ29vZ2xlLmNvbSJ9","address":"szepi@google.com","name":"I. 
Szep","avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[{"name":"Google"}],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[{"avatar":"https://avatars.githubusercontent.com/u/5859662?v=4","username":"Szepi"}],"scholar":[{"thirdPartyID":"zvC19mQAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiJjMTQ3OGVhOS1lMGE0LTQ0OTItYmQ3NC05OGYzMjg2NDg0OGMifQ==","name":"csaba 
szepesvari","github":[],"email":[{"avatar":"https://img.fullcontact.com/static/3815043e4a99934b97e032ddda88716a_75693adf86ab1193578b0a685e2a98a85e80afbb155fa7e13da9cfd211350616"}],"authored":[{"id":"eyJwYXBlcklEIjoiMTUxMS4wMzAzNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1511.03034"},{"id":"eyJwYXBlcklEIjoiMTMwNi4wNjg2IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1306.0686"},{"id":"eyJwYXBlcklEIjoiMTUwNy4wNDIwOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1507.04208"},{"id":"eyJwYXBlcklEIjoiMTQxMC4wOTQ5IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1410.0949"},{"id":"eyJwYXBlcklEIjoiMTYxMC4wOTQ5MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1610.09491"},{"id":"eyJwYXBlcklEIjoiMTUwNi4wMjYzMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1506.02632"},{"id":"eyJwYXBlcklEIjoiMjAwNi4wMTEwNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.01107"},{"id":"eyJwYXBlcklEIjoiMjAwMy4wMTcwNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2003.01704"},{"id":"eyJwYXBlcklEIjoiMTYwOC4wMzAyMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1608.03023"},{"id":"eyJwYXBlcklEIjoiMTUwNy4wMDA2NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1507.00066"},{"id":"eyJwYXBlcklEIjoiMjAxMi4wODUwNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2012.08507"},{"id":"eyJwYXBlcklEIjoiMTYwMi4wNDI4MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1602.04282"},{"id":"eyJwYXBlcklEIjoiMTcwMy4wNjUxMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1703.06513"},{"id":"eyJwYXBlcklEIjoiMTcxMi4wNDY0NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1712.04644"},{"id":"eyJwYXBlcklEIjoiMTYwOS4wNjM4NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1609.06385"},{"id":"eyJwYXBlcklEIjoiMTUxMC4wODEwOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1510.08108"},{"
id":"eyJwYXBlcklEIjoiMTQwNi4zOTI2IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1406.3926"},{"id":"eyJwYXBlcklEIjoiMjAwNi4xMzA1NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.13057"},{"id":"eyJwYXBlcklEIjoiMTgxMi4wMTY0NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1812.01647"},{"id":"eyJwYXBlcklEIjoiMTkwOC4wNzM4MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1908.07380"},{"id":"eyJwYXBlcklEIjoiMTcwOS4wMjcyNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1709.02726"},{"id":"eyJwYXBlcklEIjoiMTQwOS4zNjUzIiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1409.3653"},{"id":"eyJwYXBlcklEIjoiMTkwMi4xMDA4OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1902.10089"},{"id":"eyJwYXBlcklEIjoiMTgwNy4wMDc1NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1807.00755"},{"id":"eyJwYXBlcklEIjoiMjAwNy4wMjE1MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2007.02151"},{"id":"eyJwYXBlcklEIjoiMjIwNC4wODk2NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2204.08967"},{"id":"eyJwYXBlcklEIjoiMTcwNi4wNTE5OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1706.05198"},{"id":"eyJwYXBlcklEIjoiMTkwMy4wMjM4MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1903.02380"},{"id":"eyJwYXBlcklEIjoiMjAwNy4xMjkxMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2007.12911"},{"id":"eyJwYXBlcklEIjoiMjIwOS4xNDk5NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2209.14997"},{"id":"eyJwYXBlcklEIjoiMjAwNy4wNjE4NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2007.06184"},{"id":"eyJwYXBlcklEIjoiMTcwMi4wMzA0MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1702.03040"},{"id":"eyJwYXBlcklEIjoiMTkwNC4xMTYwOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1904.11608"},{"id":"eyJwYXBlcklEIjoiMjEwMi4wODYwNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"a
rxiv","paperID":"2102.08607"},{"id":"eyJwYXBlcklEIjoiMTcwOS4wNDA3MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1709.04073"},{"id":"eyJwYXBlcklEIjoiMjAxMC4xMjM1MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.12353"},{"id":"eyJwYXBlcklEIjoiMTkwMS4wNDY3NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1901.04676"},{"id":"eyJwYXBlcklEIjoiMTgxMC4wMjU2NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1810.02567"},{"id":"eyJwYXBlcklEIjoiMjAxMS4wNDAxOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2011.04019"},{"id":"eyJwYXBlcklEIjoiMjAxMS4wNDAxOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2011.04018"},{"id":"eyJwYXBlcklEIjoiMjEwMi4xMjYxMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.12611"},{"id":"eyJwYXBlcklEIjoiMjIwNi4wMTMxNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2206.01315"},{"id":"eyJwYXBlcklEIjoiMjEwNi4wOTk3MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.09973"},{"id":"eyJwYXBlcklEIjoiMjExMS4xMTQ4NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2111.11485"},{"id":"eyJwYXBlcklEIjoiMjExMC4xNTU3MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.15572"},{"id":"eyJwYXBlcklEIjoiMjIwNi4wNjI3MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2206.06270"},{"id":"eyJwYXBlcklEIjoiMTgwNi4wNTgxOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1806.05819"},{"id":"eyJwYXBlcklEIjoiMjMwNS4xMTAzMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2305.11032"},{"id":"eyJwYXBlcklEIjoiMjEwOC4wNTUzMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2108.05533"},{"id":"eyJwYXBlcklEIjoiMjMxMS4wNzU2NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2311.07565"},{"id":"eyJwYXBlcklEIjoiMTcwNi4wNTgwMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1706.05801"},{"id":"eyJwYXBlcklEIjoiMjQwMi4xNzIzNSIsInB1Ymxpc2
hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2402.17235"},{"id":"eyJwYXBlcklEIjoiMjMxMS4wODM3NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2311.08376"},{"id":"eyJwYXBlcklEIjoiMjIxMC4xNjkxMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2210.16913"},{"id":"eyJwYXBlcklEIjoiMjMwNi4xMzA1MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2306.13053"},{"id":"eyJwYXBlcklEIjoiMjQwMy4xMDM3OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2403.10379"},{"id":"eyJwYXBlcklEIjoiMjIwNi4wMjA5MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2206.02092"},{"id":"eyJwYXBlcklEIjoiMjMwMS4wNjI3NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2301.06276"},{"id":"eyJwYXBlcklEIjoiMTUwNi4wMjkwMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1506.02903"},{"id":"eyJwYXBlcklEIjoiMjMwMi4xMjk0MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2302.12940"},{"id":"eyJwYXBlcklEIjoiNTQwMzAiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"54030"},{"id":"eyJwYXBlcklEIjoiNTI5MTciLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"52917"},{"id":"eyJwYXBlcklEIjoiNTM3ODQiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"53784"},{"id":"eyJwYXBlcklEIjoiNTI5MjEiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"52921"},{"id":"eyJwYXBlcklEIjoiNzAyNjUiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"70265"},{"id":"eyJwYXBlcklEIjoiNzE0NjMiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"71463"},{"id":"eyJwYXBlcklEIjoiNjk4ODciLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"69887"},{"id":"eyJwYXBlcklEIjoiNzIzMTQiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"72314"},{"id":"eyJwYXBlcklEIjoiMjQwNi4xODUyOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2406.18529"}]}]}}]},"__typename":"paper","authorArray":["Aldo Pacchiano","My 
Phan","Yasin Abbasi-Yadkori","Anup Rao","Julian Zimmert","Tor Lattimore","Csaba Szepesvari"]}}],["$","$L18",null,{"container":true,"columns":100,"spacing":{"compact":0,"expanded":2,"large":3},"children":[["$","$L18",null,{"size":{"compact":100,"expanded":100,"large":68},"children":[["$","$7",null,{"children":["$","$L2c",null,{"publisher":"arxiv","paperID":"2003.01704","product":{"paper":"$1b:props:children:props:children:0:props:product","models":"$1b:props:children:props:children:0:props:product:models"},"isWhiteLabelled":false}]}],["$","$7",null,{"children":["$","$L2d",null,{"article":"$L2e","model":"$undefined"}]}]]}],["$","$L18",null,{"size":"grow","children":["$","$L2f",null,{}]}]]}],["$","$7",null,{"children":null}],[["$","audio",null,{"id":"tts"}],["$","$L30",null,{"paperID":"2003.01704","publisher":"arxiv","paperJSON":{"title":"Model Selection in Contextual Stochastic Bandit Problems","paperID":"2003.01704","avgLineHeight":13.56,"imgScale":4,"sections":[{"heading":"Abstract","paragraphs":[[{"id":"id-71","text":"We study bandit model selection in stochastic environments. Our approach relies ","element":"span"},{"text":"on a meta-algorithm that selects between candidate base algorithms. We develop a meta-algorithm-base algorithm abstraction that can work with general classes of base algorithms and different type of adversarial meta-algorithms. Our methods rely on a novel and generic smoothing transformation for bandit algorithms that permits us to obtain optimal ","element":"span"},{"style":{"height":18.3},"width":109.3,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/0-0.png","element":"img","alt":" O(√T","inline":true},{"text":") model selection guarantees for stochastic contextual bandit problems as long as the optimal base algorithm satisfies a high probability regret guarantee. 
We show through a lower bound that even when one of the base algorithms has ","element":"span"},{"style":{"fontStyle":"italic"},"text":"O","element":"span"},{"text":"(log ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T","element":"span"},{"text":") regret, in general it is impossible to get better than Ω(","element":"span"},{"style":{"height":16},"width":62.21,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/0-1.png","element":"img","alt":"√T","inline":true},{"text":") regret in model selection, even asymptotically. Using our techniques, we address model selection in a variety of problems such as misspecified linear contextual bandits [","element":"span"},{"href":"#id-0","referenceIndex":20,"text":"LSW20","element":"a"},{"text":"], linear bandit with unknown dimension [","element":"span"},{"href":"#id-1","referenceIndex":15,"text":"FKL19","element":"a"},{"text":"] and reinforcement learning with unknown feature maps. Our algorithm requires the knowledge of the optimal base regret to adjust the meta-algorithm learning rate. We show that without such prior knowledge any meta-algorithm can suffer a regret larger than the optimal base regret.","element":"span"}]]},{"heading":"1 Introduction","paragraphs":[[{"text":"Bandit algorithms have been applied in a variety of decision making and personalization problems in industry. There are many specialized algorithms each designed to perform well in specific environments. 
For example, algorithms are designed to exploit low variance [","element":"span"},{"href":"#id-2","referenceIndex":6,"text":"AMS09","element":"a"},{"text":"], extra context information, linear reward structure [","element":"span"},{"href":"#id-3","referenceIndex":13,"text":"DHK08","element":"a"},{"text":"; ","element":"span"},{"href":"#id-4","referenceIndex":22,"text":"Li+10","element":"a"},{"text":"; ","element":"span"},{"href":"#id-5","referenceIndex":2,"text":"AYPS11","element":"a"},{"text":"], sparsity [","element":"span"},{"href":"#id-6","referenceIndex":3,"text":"AYPS12","element":"a"},{"text":"; ","element":"span"},{"href":"#id-7","referenceIndex":10,"text":"CM12","element":"a"},{"text":"], etc. The exact properties of the current environment however might not be known in advance, and we might not know which algorithm is going to perform best.","element":"span"}],[{"text":"Model selection in contextual bandits aims to solve this problem. More formally, the learner is tasked to solve a bandit problem for which the appropriate bandit algorithm to use is not known in advance. Despite this limitation, the learner does have access to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M ","element":"span"},{"text":"different algorithms ","element":"span"},{"style":{"height":20.02},"width":140.69,"height":50.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/1-0.png","element":"img","alt":" {Bi}Mi=1","inline":true},{"text":", one of which ","element":"span"},{"style":{"height":16.61},"width":52.19,"height":41.53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/1-1.png","element":"img","alt":" Bi⋆ ","inline":true,"padRight":true},{"text":"is promised to be adequate for the problem ","element":"span"},{"text":"the learner wishes to solve. 
We use regret to measure the learner’s performance","element":"span"},{"style":{"height":8.4},"width":17,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/1-2.png","element":"img","alt":"1","inline":true},{"text":". The problem’s objective is to design algorithms to minimize regret.","element":"span"}],[{"text":"The algorithms we develop in this work follow the template of [","element":"span"},{"href":"#id-8","referenceIndex":4,"text":"Aga+17","element":"a"},{"text":"] where a ‘meta-algorithm’ algorithm is placed on top of a couple of ‘base’ algorithms (in this case ","element":"span"},{"style":{"height":20.02},"width":171.26,"height":50.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/1-3.png","element":"img","alt":" {Bi}Mi=1).","inline":true,"padRight":true},{"text":"At the beginning of each round the meta-algorithm selects which base algorithm to ‘listen to’ during that time-step effectively treating the base algorithms as arms to be pulled by the meta-algorithm. The difficulty in using existing algorithms such as UCB or EXP3 [","element":"span"},{"href":"#id-9","referenceIndex":8,"text":"BC12","element":"a"},{"text":"] as a meta-algorithm lies in the non-stationary nature of the rewards collected by a learning base algorithm. The meta-algorithm needs to be sufficiently smart to recognize when a base algorithm is simply performing poorly because it still in the early stages of learning from the case where poor performance is the result of model misspecification.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Adapted and misspecified algorithms ","element":"span"},{"text":"We say that an algorithm is ","element":"span"},{"style":{"fontStyle":"italic"},"text":"adapted ","element":"span"},{"text":"to the environment at hand if it satisfies a valid regret guarantee. Let’s illustrate this with an example in the setting of linear bandits with finitely many arms. 
In this problem the learner has access to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K ","element":"span"},{"text":"arms. Each arm ","element":"span"},{"style":{"height":17.2},"width":120.24,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/1-4.png","element":"img","alt":" i ∈ [K","inline":true},{"text":"] is associated with a feature vector ","element":"span"},{"style":{"height":18.33},"width":150.09,"height":45.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/1-5.png","element":"img","alt":" zi ∈ Rd,","inline":true,"padRight":true},{"text":"and the reward of arm ","element":"span"},{"style":{"height":17.6},"width":122.37,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/1-6.png","element":"img","alt":" i ∈ [K","inline":true},{"text":"] follows a linear model of the form ","element":"span"},{"style":{"height":17.6},"width":306.04,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/1-7.png","element":"img","alt":" ri = ⟨zi, θ⋆⟩ + ξi","inline":true,"padRight":true},{"text":"where ","element":"span"},{"style":{"height":16.4},"width":31.09,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/1-8.png","element":"img","alt":"ξi","inline":true,"padRight":true},{"text":"is conditionally zero mean and ","element":"span"},{"style":{"height":15.02},"width":36.48,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/1-9.png","element":"img","alt":" θ⋆","inline":true,"padRight":true},{"text":"is an unknown parameter. 
An algorithm such as LinUCB [","element":"span"},{"href":"#id-10","referenceIndex":11,"text":"Chu+11","element":"a"},{"text":"] achieves a regret guarantee of order ","element":"span"},{"style":{"height":12.8},"width":35,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/1-10.png","element":"img","alt":"�O","inline":true},{"text":"(","element":"span"}],[{"text":"logarithmic factors in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T","element":"span"},{"text":". In contrast, the UCB algorithm [","element":"span"},{"href":"#id-11","referenceIndex":7,"text":"ACBF02","element":"a"},{"text":"] yields a regret guarantee of order ","element":"span"},{"style":{"height":19.98},"width":160.41,"height":49.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/1-11.png","element":"img","alt":"�O(√KT","inline":true},{"text":"). In this case, both algorithms are well adapted to the problem of linear bandits with finitely many actions, but LinUCB’s regret guarantee may be substantially smaller than UCB’s regret upper bound if ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d ","element":"span"},{"text":"is much smaller than ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K","element":"span"},{"text":". If an algorithm is not well adapted, we say it is ","element":"span"},{"style":{"fontStyle":"italic"},"text":"misspecified","element":"span"},{"text":". For the sake of exposition let’s assume we are in a similar setting as above, where the learner has access to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K ","element":"span"},{"text":"arms each of which is associated with a feature vector ","element":"span"},{"style":{"height":17.75},"width":136.76,"height":44.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/1-12.png","element":"img","alt":" zi ∈ Rd","inline":true},{"text":". 
Instead of assuming a linear model as before, let’s instead ","element":"span"},{"id":"id-80","text":"assume that ","element":"span"},{"style":{"height":20.84},"width":275.94,"height":52.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/2-0.png","element":"img","alt":" ri = (⟨zi, θ⋆⟩)2","inline":true,"padRight":true},{"text":"+ ","element":"span"},{"style":{"height":16.4},"width":31.09,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/2-1.png","element":"img","alt":" ξi","inline":true,"padRight":true},{"text":"is quadratic. In this case, there is no reason to believe LinUCB can yield a valid regret guarantee since the underlying linearity assumption of LinUCB is violated. We say that in this case LinUCB is misspecified. Consider an instance of LinUCB that instead uses matrix features of the form ","element":"span"},{"style":{"height":12.89},"width":72.03,"height":32.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/2-2.png","element":"img","alt":" ziz⊤i","inline":true,"padRight":true},{"text":". In this case the quadratic ","element":"span"},{"text":"reward is again a linear function of the feature vectors since (","element":"span"},{"style":{"height":21.33},"width":572.14,"height":53.33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/2-3.png","element":"img","alt":"⟨zi, θ⋆⟩)2 = ⟨ziz⊤i , θ⋆θ⊤⋆ ⟩. 
Thus","inline":true,"padRight":true},{"text":"this version of LinUCB with quadratic features is adapted.","element":"span"}],[{"text":"We will assume that all algorithms ","element":"span"},{"style":{"height":17.2},"width":249.46,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/2-4.png","element":"img","alt":" Bi for i ∈ [M","inline":true},{"text":"] are associated with a putative regret guarantee ","element":"span"},{"style":{"height":17.6},"width":115.73,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/2-5.png","element":"img","alt":" Ui(t, δ","inline":true},{"text":") that is ","element":"span"},{"style":{"fontStyle":"italic"},"text":"known ","element":"span"},{"text":"by the learner and holding with probability 1 ","element":"span"},{"style":{"height":12.8},"width":63.64,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/2-6.png","element":"img","alt":" − δ","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"height":17.2},"width":583.75,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/2-7.png","element":"img","alt":"t ∈ [N] if algorithm i is adapted","inline":true,"padRight":true},{"text":"to the environment at hand. 
If the learner knew the identity of the best adapted algorithm ","element":"span"},{"style":{"height":14.62},"width":31.04,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/2-8.png","element":"img","alt":" i⋆","inline":true},{"text":", it would be able to incur regret of order ","element":"span"},{"style":{"height":17.6},"width":146.42,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/2-9.png","element":"img","alt":" Ui⋆(T, δ","inline":true},{"text":") by playing ","element":"span"},{"style":{"height":16.61},"width":52.2,"height":41.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/2-10.png","element":"img","alt":" Bi⋆","inline":true,"padRight":true},{"text":"for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T ","element":"span"},{"text":"time-steps. The learner’s objective in the model selection problem is to design a procedure that would allow a learner to incur in regret that is competitive with the regret upper bound ","element":"span"},{"style":{"height":17.2},"width":132.38,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/2-11.png","element":"img","alt":" Ui⋆(t, δ","inline":true},{"text":") of the best adapted algorithm among those in ","element":"span"},{"style":{"height":20.02},"width":300.11,"height":50.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/2-12.png","element":"img","alt":" {Bi}Mi=1, so that","inline":true,"padRight":true},{"text":"the regret incurred by the learner up to time ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T ","element":"span"},{"text":"scales as a function of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T","element":"span"},{"text":", the parameters defining ","element":"span"},{"style":{"height":16.61},"width":52.19,"height":41.53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/2-13.png","element":"img","alt":" 
Bi⋆","inline":true,"padRight":true},{"text":"(and therefore ","element":"span"},{"style":{"height":17.6},"width":133.07,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/2-14.png","element":"img","alt":" Ui⋆(t, δ","inline":true},{"text":")) and possibly ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M","element":"span"},{"text":". From now on we will refer to each of the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M ","element":"span"},{"text":"algorithms in ","element":"span"},{"style":{"height":16.61},"width":52.19,"height":41.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/2-15.png","element":"img","alt":" Bi⋆","inline":true,"padRight":true},{"text":"as a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"base algorithm","element":"span"},{"text":". We will alert the reader if we have a specific set of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M ","element":"span"},{"text":"algorithms in mind. In any other case, when we talk about the set of base algorithms we simply mean a set of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M ","element":"span"},{"text":"algorithms the learner is model selecting from.","element":"span"}],[{"text":"The authors of [","element":"span"},{"href":"#id-12","referenceIndex":24,"text":"OM11","element":"a"},{"text":"] were perhaps the first to address the bandit model-selection problem, with a variant of an EXP4 meta-algorithm that works with UCB or EXP3 base algorithms. These results are improved by [","element":"span"},{"href":"#id-8","referenceIndex":4,"text":"Aga+17","element":"a"},{"text":"] via the CORRAL algorithm. The CORRAL algorithm follows the meta-algorithm-base template that we discussed at the beginning of this section. It makes use of a CORRAL meta-algorithm based on a LogBarrier Online Mirror Descent algorithm controlling which of the base algorithms to play at any given round. 
Let ","element":"span"},{"style":{"height":15.6},"width":261.95,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/2-16.png","element":"img","alt":" pt be the M−","inline":true},{"text":"dimensional probability distribution over the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M ","element":"span"},{"text":"base algorithms given by the CORRAL meta-algorithm. The learner will sample an algorithm index ","element":"span"},{"style":{"height":17.6},"width":152.87,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/2-17.png","element":"img","alt":" jt ∈ [M","inline":true},{"text":"] with ","element":"span"},{"style":{"height":16},"width":133.31,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/2-18.png","element":"img","alt":" jt ∼ pt","inline":true},{"text":". and play the action prescribed by ","element":"span"},{"style":{"height":17.42},"width":53.36,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/2-19.png","element":"img","alt":" Bjt","inline":true,"padRight":true},{"text":"to collect a reward ","element":"span"},{"style":{"height":10.62},"width":31.68,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/2-20.png","element":"img","alt":"rt","inline":true},{"text":". 
All algorithms ","element":"span"},{"style":{"height":20.03},"width":140.69,"height":50.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/2-21.png","element":"img","alt":" {Bi}Mi=1","inline":true,"padRight":true},{"text":"are then updated using an importance weighted version of ","element":"span"},{"style":{"height":10.62},"width":31.69,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/2-22.png","element":"img","alt":" rt","inline":true,"padRight":true},{"text":"regardless of whether they were selected by the meta-algorithm or not.","element":"span"}],[{"text":"Unfortunately, this means that in order to use a base algorithm in CORRAL, it needs to be compatible with this importance weighting modification of the rewards. For example, to use UCB as a base, we would need to manually re-derive UCB’s confidence intervals and modify its regret analysis to be compatible with importance weighted feedback. The authors show that a base algorithm can be safely combined with the CORRAL meta-algorithm to yield model selection guarantees provided it satisfies a stability condition (see Definition 3 in [","element":"span"},{"href":"#id-8","referenceIndex":4,"text":"Aga+17","element":"a"},{"text":"]). Verifying that an algorithm satisfies such stability condition is a cumbersome process that requires a detailed analysis of the algorithm’s internal workings. In this work we instead focus on devising a black-box procedure that can solve the model selection problem for a general class of stochastic contextual bandit algorithms. 
This work introduced the first black-box method for model selection in stochastic contextual bandits, and it has been followed by many others that have expanded and refined these results; most","element":"span"}],[{"id":"id-59","style":{"width":"71%"},"width":1228,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/3-0.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Contributions. ","element":"span"},{"text":"We focus on the problem of bandit model-selection in stochastic environments. Our contributions are as follows:","element":"span"}],[{"text":"• ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A new model selection algorithm","element":"span"},{"text":". We introduce Stochastic CORRAL, a two step per round algorithm and an accompanying base “smoothing” wrapper that can be shown to satisfy model selection guarantees when combined with any set of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M ","element":"span"},{"text":"stochastic contextual bandit algorithms such that at least one of them is adapted and satisfies a high probability regret guarantee. We also show model selection regret guarantees for Stochastic CORRAL with two distinct adversarial meta-algorithms, CORRAL [","element":"span"},{"href":"#id-8","referenceIndex":4,"text":"Aga+17","element":"a"},{"text":"] and EXP3.P [","element":"span"},{"href":"#id-9","referenceIndex":8,"text":"BC12","element":"a"},{"text":"]. 
Our approach is more general than that of the original CORRAL algorithm [","element":"span"},{"href":"#id-8","referenceIndex":4,"text":"Aga+17","element":"a"},{"text":"] because instead of requiring each base algorithm to be individually modified to satisfy a certain stability condition, our version of the CORRAL algorithm provides the algorithm designer with a generic black-box wrapper that allows to do model selection over any set of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M ","element":"span"},{"text":"base algorithm with high probability regret guarantees. Stochastic CORRAL has another important difference with respect to the original CORRAL algorithm: instead of importance weighted feedback, the unadulterated reward ","element":"span"},{"style":{"height":10.62},"width":31.69,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/3-1.png","element":"img","alt":" rt","inline":true,"padRight":true},{"text":"is sent to algorithm ","element":"span"},{"style":{"height":17.42},"width":53.36,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/3-2.png","element":"img","alt":" Bjt","inline":true},{"text":", and only this algorithm is allowed to update its internal state at round ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":". The main consequence of these properties of Stochastic CORRAL is that our model selection strategy can be used with almost any base algorithm developed for stochastic environments. 
When the learner has knowledge of the function ","element":"span"},{"style":{"height":17.6},"width":133.07,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/3-3.png","element":"img","alt":" Ui⋆(t, δ","inline":true},{"text":") but not of ","element":"span"},{"style":{"height":14.62},"width":31.03,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/3-4.png","element":"img","alt":" i⋆","inline":true,"padRight":true},{"text":"(for example when all the putative upper bounds ","element":"span"},{"style":{"height":17.6},"width":115.76,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/3-5.png","element":"img","alt":" Ui(t, δ","inline":true},{"text":") are the same), using the CORRAL meta-algorithm achieves optimal regret guarantees. When the optimal target regret is unknown sometimes using a an EXP3.P meta-algorithm can achieve better performance.","element":"span"}],[{"text":"• ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A versatile algorithm. ","element":"span"},{"text":"We demonstrate the generality and effectiveness of our method by showing how it seamlessly improves existing results or addresses open questions in a variety of problems. We show applications in adapting to the misspecification level in contextual linear bandits [","element":"span"},{"href":"#id-0","referenceIndex":20,"text":"LSW20","element":"a"},{"text":"], adapting to the unknown dimension in nested linear bandit classes [","element":"span"},{"href":"#id-1","referenceIndex":15,"text":"FKL19","element":"a"},{"text":"], tuning the data-dependent exploration rate of bandit algorithms, and choosing feature maps in reinforcement learning. Moreover, our meta-algorithm can simultaneously perform different types of model selection. For example, we show how to choose both the unknown dimension and the unknown mis-specification error at the same time. 
This is in contrast to algorithms that specialize in a specific type of model selection such as detecting the unknown dimension [","element":"span"},{"href":"#id-1","referenceIndex":15,"text":"FKL19","element":"a"},{"text":"].","element":"span"}],[{"text":"• ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Lower bounds. ","element":"span"},{"text":"In the stochastic domain, an important question is whether a model selection procedure can inherit the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"O","element":"span"},{"text":"(log ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T","element":"span"},{"text":") regret of a fast stochastic base algorithm. We show a lower bound for the model selection problem that scales as Ω(","element":"span"},{"style":{"height":17.6},"width":67.36,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/3-6.png","element":"img","alt":"√T","inline":true},{"text":"), which implies that our result is minimax optimal. Recall the CORRAL meta-algorithm achieves an oracle optimal model selection regret guarantee provided the algorithm has access to","element":"span"}],[{"id":"id-70","style":{"height":17.6},"width":132.81,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-0.png","element":"img","alt":"Ui⋆(t, δ","inline":true},{"text":"). This begs the question of whether this is an unavoidable statistical limitation of model selection or just a property of the CORRAL meta-algorithm. 
We show this condition is unavoidable in general: there are problems where the regret of the best base algorithm scales as ","element":"span"},{"style":{"height":17.2},"width":101.9,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-1.png","element":"img","alt":" O(T x","inline":true},{"text":") for an unknown ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":", and the regret of any meta-algorithm that is unaware of the value of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x ","element":"span"},{"text":"scales as Ω(","element":"span"},{"style":{"height":17.6},"width":267.82,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-2.png","element":"img","alt":"T y) for y > x.","inline":true}]]},{"heading":"2 Problem Statement","paragraphs":[[{"text":"We use the notation ","element":"span"},{"style":{"height":15.02},"width":37.4,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-3.png","element":"img","alt":" δa","inline":true,"padRight":true},{"text":"to write the delta distribution at ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a","element":"span"},{"text":". For an integer ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n","element":"span"},{"text":", we use [","element":"span"},{"style":{"fontStyle":"italic"},"text":"n","element":"span"},{"text":"] to denote the set ","element":"span"},{"style":{"fontStyle":"italic"},"text":"{","element":"span"},{"text":"1","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"2","element":"span"},{"style":{"fontStyle":"italic"},"text":", . . . , n","element":"span"},{"style":{"fontStyle":"italic"},"text":"}","element":"span"},{"text":". We consider the following formulation of contextual stochastic bandits. 
At the beginning of each time-step ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":", the learner observes a context ","element":"span"},{"style":{"height":15.42},"width":46.84,"height":38.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-4.png","element":"img","alt":" At","inline":true,"padRight":true},{"text":"that corresponds to a subset of an ‘action-set’ ","element":"span"},{"text":"A","element":"span"},{"text":". After this the learner will select an action ","element":"span"},{"style":{"height":15.42},"width":141.96,"height":38.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-5.png","element":"img","alt":"at ∈ At","inline":true,"padRight":true},{"text":"and then collect a reward ","element":"span"},{"style":{"height":17.6},"width":244.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-6.png","element":"img","alt":" rt = f(At, at","inline":true},{"text":") + ","element":"span"},{"style":{"height":16.4},"width":31.09,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-7.png","element":"img","alt":" ξt","inline":true},{"text":", a noisy quantity that will depend on the context ","element":"span"},{"style":{"height":15.42},"width":46.84,"height":38.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-8.png","element":"img","alt":" At","inline":true},{"text":", and the learner’s action ","element":"span"},{"style":{"height":10.62},"width":35.07,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-9.png","element":"img","alt":" at","inline":true},{"text":", a reward function ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"and a 
1","element":"span"},{"style":{"height":4.8},"width":34,"height":12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-10.png","element":"img","alt":"−","inline":true},{"text":"subGaussian conditionally zero-mean noise random variable ","element":"span"},{"style":{"height":16.4},"width":31.1,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-11.png","element":"img","alt":" ξt","inline":true},{"text":". In this work we will restrict ourselves to the case where context sets ","element":"span"},{"style":{"height":15.42},"width":46.84,"height":38.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-12.png","element":"img","alt":" At","inline":true,"padRight":true},{"text":"are all subsets of a context generating set ","element":"span"},{"text":"A","element":"span"},{"text":". This is in fact a very general scenario that captures all types of contextual bandit problems ranging from the case of changing linear contexts with linear rewards, to more general contexts and reward sets studied in works such as [","element":"span"},{"href":"#id-13","referenceIndex":14,"text":"FR20","element":"a"},{"text":"]. For simplicity we will assume the contexts ","element":"span"},{"style":{"height":15.42},"width":46.84,"height":38.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-13.png","element":"img","alt":" At","inline":true,"padRight":true},{"text":"are made of the subset of available actions to the learner at time ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":". Our formulation allows for the action set to vary in size from round to round and even to be infinite. 
For example, the finite linear bandit setting (where ","element":"span"},{"style":{"height":17.75},"width":247.14,"height":44.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-14.png","element":"img","alt":" At = A ⊂ Rd","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":") fits in this setting. Similarly it is easy to see the linear contextual bandit problem with i.i.d. contexts and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K ","element":"span"},{"text":"actions can also be written as an instance of our formulation. In the linear contextual bandit problem with ","element":"span"},{"style":{"height":14.62},"width":49.06,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-15.png","element":"img","alt":" Kt","inline":true,"padRight":true},{"text":"actions (where ","element":"span"},{"style":{"height":14.62},"width":49.06,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-16.png","element":"img","alt":" Kt","inline":true,"padRight":true},{"text":"may be infinite) the learner is presented at time ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"with ","element":"span"},{"style":{"height":14.62},"width":49.06,"height":36.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-17.png","element":"img","alt":" Kt","inline":true,"padRight":true},{"text":"action-vectors ","element":"span"},{"style":{"height":21.62},"width":228.52,"height":54.05,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-18.png","element":"img","alt":" A = {ai}Kti=1","inline":true,"padRight":true},{"text":"with ","element":"span"},{"style":{"height":17.75},"width":139.44,"height":44.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-19.png","element":"img","alt":" ai ∈ 
Rd","inline":true,"padRight":true},{"text":"and the (random) ","element":"span"},{"text":"return ","element":"span"},{"style":{"height":10.62},"width":37.69,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-20.png","element":"img","alt":" ra","inline":true,"padRight":true},{"text":"of any action ","element":"span"},{"style":{"height":17.6},"width":743.79,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-21.png","element":"img","alt":" a ∈ A satisfies ra = ⟨a, θ⋆⟩+ξ. The K−","inline":true},{"text":"action setting where contexts ","element":"span"},{"style":{"height":14.62},"width":128.52,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-22.png","element":"img","alt":"xt ∈ X","inline":true,"padRight":true},{"text":"(an abstract context set) and the learner has access to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K ","element":"span"},{"text":"actions per round can also be formulated in this way by defining ","element":"span"},{"style":{"height":17.6},"width":234.71,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-23.png","element":"img","alt":" A = X × [K","inline":true},{"text":"] and defining ","element":"span"},{"style":{"height":14.7},"width":55.66,"height":36.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-24.png","element":"img","alt":" DS","inline":true,"padRight":true},{"text":"to be a distribution over subsets of the form ","element":"span"},{"style":{"height":20.03},"width":435.48,"height":50.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-25.png","element":"img","alt":" {(x, i)}Ki=1 with x ∈ X.","inline":true}],[{"text":"In this work we focus on the setting of stochastic i.i.d. contexts. 
Let ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"text":"be the set of all subsets of ","element":"span"},{"text":"A ","element":"span"},{"text":"and let ","element":"span"},{"style":{"height":14.7},"width":55.66,"height":36.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-26.png","element":"img","alt":" DS","inline":true,"padRight":true},{"text":"be a distribution over ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S","element":"span"},{"text":". We assume all contexts ","element":"span"},{"style":{"height":21.4},"width":200.16,"height":53.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-27.png","element":"img","alt":" Ati.i.d.∼ DS","inline":true,"padRight":true},{"text":"and that ","element":"span"},{"style":{"height":16.4},"width":278.11,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-28.png","element":"img","alt":" f : S × A → R","inline":true},{"text":". For an arbitrary subset ","element":"span"},{"style":{"height":14},"width":125.63,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-29.png","element":"img","alt":" A ⊂ A","inline":true,"padRight":true},{"text":"we denote the space of distributions over ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A ","element":"span"},{"text":"as ∆","element":"span"},{"style":{"height":10},"width":28,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-30.png","element":"img","alt":"A","inline":true},{"text":". 
A policy ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-31.png","element":"img","alt":" π","inline":true,"padRight":true},{"text":"is a mapping that for any subset ","element":"span"},{"style":{"height":14},"width":125.02,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-32.png","element":"img","alt":" A ⊂ A","inline":true,"padRight":true},{"text":"in the support of ","element":"span"},{"style":{"height":14.7},"width":55.66,"height":36.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-33.png","element":"img","alt":"DS","inline":true,"padRight":true},{"text":"outputs an element of ∆","element":"span"},{"style":{"height":10},"width":28,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-34.png","element":"img","alt":"A","inline":true},{"text":". Let’s denote by Π the space of all policies with domain in Support(","element":"span"},{"style":{"height":14.7},"width":55.66,"height":36.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-35.png","element":"img","alt":"DS","inline":true},{"text":"). We abuse notation and denote ","element":"span"},{"style":{"height":19.95},"width":519.25,"height":49.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-36.png","element":"img","alt":" f(A, π) = Ea∼π(X) [f(A, a)]","inline":true,"padRight":true},{"text":". Notice that in this case ","element":"span"},{"style":{"height":17.6},"width":602.97,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-37.png","element":"img","alt":" f(A, a) = f(A, δa) for all a ∈ A","inline":true},{"text":". 
We will generally omit the ","element":"span"},{"style":{"height":15.5},"width":148.73,"height":38.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-38.png","element":"img","alt":" A ∼ DS","inline":true,"padRight":true},{"text":"dependence from our expectation notation in the future.","element":"span"}],[{"text":"In a contextual bandit problem the learner chooses policy ","element":"span"},{"style":{"height":10.22},"width":36.87,"height":25.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-39.png","element":"img","alt":" πt","inline":true,"padRight":true},{"text":"at time ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":", which takes context set ","element":"span"},{"style":{"height":15.42},"width":131.4,"height":38.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-40.png","element":"img","alt":" At ∈ S","inline":true,"padRight":true},{"text":"as an input and outputs a distribution over ","element":"span"},{"style":{"height":15.42},"width":46.84,"height":38.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/4-41.png","element":"img","alt":" At","inline":true},{"text":". 
The learner then selects","element":"span"}],[{"id":"id-61","text":"an action ","element":"span"},{"style":{"height":17.6},"width":198.38,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/5-0.png","element":"img","alt":" at ∼ πt(At","inline":true},{"text":") and receives a reward ","element":"span"},{"style":{"height":17.6},"width":613.4,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/5-1.png","element":"img","alt":" rt such that rt = f(At, δat) + ξt.","inline":true,"padRight":true},{"text":"We are interested in designing an algorithm with small regret, defined as","element":"span"}],[{"id":"id-17","style":{"width":"74%"},"width":1286,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/5-2.png","element":"img"}],[{"text":"If for example ","element":"span"},{"style":{"height":20.8},"width":762.96,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/5-3.png","element":"img","alt":" Ui(T, δ) = cdi√T log(1/δ) for all i ∈ [M","inline":true},{"text":"] we would like our algorithm to satisfy a regret guarantee of the form ","element":"span"},{"style":{"height":24.5},"width":1007.81,"height":61.24,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/5-4.png","element":"img","alt":" R(T) ≤ O(Mαdβi⋆√T log(1/δ)) for some α ≥ 0, β ≥ 1","inline":true,"padRight":true},{"text":"and where ","element":"span"},{"style":{"height":14.62},"width":31.03,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/5-5.png","element":"img","alt":" i⋆","inline":true,"padRight":true},{"text":"is the index of the best performing adapted base algorithm ","element":"span"},{"style":{"height":16.61},"width":52.19,"height":41.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/5-6.png","element":"img","alt":" Bi⋆","inline":true},{"text":". 
Crucially, we want to avoid this guarantee to depend on other ","element":"span"},{"style":{"height":16.61},"width":145.93,"height":41.53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/5-7.png","element":"img","alt":" di > di⋆","inline":true,"padRight":true},{"text":"(if any). From now on we will refer to the policy maximizing the right hand side of the equation above as ","element":"span"},{"style":{"height":12.73},"width":43.44,"height":31.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/5-8.png","element":"img","alt":" π∗","inline":true},{"text":". For simplicity we will also make the following assumption regarding the range of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":",","element":"span"}],[{"id":"id-25","style":{"fontWeight":"bold"},"text":"Assumption 2.1 ","element":"span"},{"text":"(Bounded Expected Rewards)","element":"span"},{"style":{"fontWeight":"bold"},"text":". ","element":"span"},{"style":{"fontStyle":"italic"},"text":"The absolute value of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is bounded by ","element":"span"},{"text":"1","element":"span"},{"style":{"fontStyle":"italic"},"text":",","element":"span"}],[{"style":{"width":"20%"},"width":346,"height":75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/5-9.png","element":"img"}],[{"text":"Throughout this work we assume the base algorithms we want to model select from satisfy a high probability regret bound whenever they are well adapted to their environment. 
We make this more precise in definition ","element":"span"},{"href":"#id-14","text":"2.1","element":"a"},{"text":",","element":"span"}],[{"id":"id-14","style":{"fontWeight":"bold"},"text":"Definition 2.1 ","element":"span"},{"text":"((","element":"span"},{"style":{"height":17.6},"width":169.98,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/5-10.png","element":"img","alt":"U, δ, T)−","inline":true},{"text":"Boundedness)","element":"span"},{"style":{"fontWeight":"bold"},"text":". ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Let ","element":"span"},{"style":{"height":12.4},"width":155.12,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/5-11.png","element":"img","alt":" U : R ×","inline":true,"padRight":true},{"text":"[0","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"1] ","element":"span"},{"style":{"height":14.73},"width":116.92,"height":36.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/5-12.png","element":"img","alt":" → R+","inline":true},{"style":{"fontStyle":"italic"},"text":". 
We say an adapted algorithm ","element":"span"},{"style":{"height":17.2},"width":273.04,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/5-13.png","element":"img","alt":" B is (U, δ, T)−","inline":true},{"style":{"fontStyle":"italic"},"text":"bounded if with probability at least ","element":"span"},{"text":"1","element":"span"},{"style":{"height":12.8},"width":58.52,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/5-14.png","element":"img","alt":"−δ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and for all rounds ","element":"span"},{"style":{"height":17.2},"width":178.18,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/5-15.png","element":"img","alt":" t ∈ [1, T],","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"its cumulative pseudo-regret is bounded above by ","element":"span"},{"style":{"height":21.2},"width":828.58,"height":53.01,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/5-16.png","element":"img","alt":" U(t, δ): ∑tl=1 f(Al, π∗)−f(Al, πl) ≤ U(t, δ).","inline":true}],[{"text":"We assume that for all ","element":"span"},{"style":{"height":17.2},"width":126.25,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/5-17.png","element":"img","alt":" i ∈ [M","inline":true},{"text":"] the base algorithm ","element":"span"},{"style":{"height":17.2},"width":245.17,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/5-18.png","element":"img","alt":" Bi is (Ui, δ, T","inline":true},{"text":")-bounded for a function ","element":"span"},{"style":{"height":14.62},"width":41.79,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/5-19.png","element":"img","alt":"Ui","inline":true,"padRight":true},{"text":"known to the 
learner","element":"span"},{"style":{"height":8.4},"width":17,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/5-20.png","element":"img","alt":"2","inline":true},{"text":". For example in the Multi Armed Bandit Problem with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K ","element":"span"},{"text":"arms the UCB algorithm is (","element":"span"},{"style":{"height":20.8},"width":444.99,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/5-21.png","element":"img","alt":"c√KT log(T/δ), δ, T)−","inline":true},{"text":"bounded for some universal constant ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c > ","element":"span"},{"text":"0.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Original CORRAL","element":"span"}],[{"text":"We start by reproducing the pseudo-code of CORRAL [","element":"span"},{"href":"#id-8","referenceIndex":4,"text":"Aga+17","element":"a"},{"text":"] (see Algorithm ","element":"span"},{"href":"#id-15","text":"1","element":"a"},{"text":") as it will prove helpful in our discussion of our main algorithm: Stochastic CORRAL. As we have explained in the previous section we assume there are ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M ","element":"span"},{"text":"candidate ","element":"span"},{"style":{"fontStyle":"italic"},"text":"base ","element":"span"},{"text":"algorithms and a meta-algorithm which we denote as ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M","element":"span"},{"text":". 
At time-step ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"the CORRAL meta-algorithm ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M ","element":"span"},{"text":"selects one of the base algorithms in ","element":"span"},{"style":{"height":20.02},"width":140.69,"height":50.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/5-22.png","element":"img","alt":" {Bi}Mi=1","inline":true,"padRight":true},{"text":"according to a distribution ","element":"span"},{"style":{"height":16.4},"width":164.33,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/5-23.png","element":"img","alt":" pt ∈ ∆M","inline":true,"padRight":true},{"text":"by ","element":"span"},{"text":"sampling an index ","element":"span"},{"style":{"height":16},"width":132.52,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/5-24.png","element":"img","alt":" jt ∼ pt","inline":true},{"text":". The learner plays action ","element":"span"},{"style":{"height":18.22},"width":243.42,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/5-25.png","element":"img","alt":" at ∼ πt,jt(At","inline":true},{"text":") and receives reward ","element":"span"},{"style":{"height":17.2},"width":352.6,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/5-26.png","element":"img","alt":"rt = f(At, δat) + ξt","inline":true},{"text":". 
An importance weighted version of ","element":"span"},{"style":{"height":10.62},"width":31.69,"height":26.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/5-27.png","element":"img","alt":" rt","inline":true,"padRight":true},{"text":"is sent out to all base algorithms, after which all of them update their internal state.","element":"span"}],[{"id":"id-15","style":{"width":"99%"},"width":1714,"height":2118,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/6-0.png","element":"img"}]]},{"heading":"3 The Stochastic CORRAL Algorithm","paragraphs":[[{"text":"In order to better describe the feedback structure of Stochastic CORRAL we abstract the meta-algorithm-base interaction template discussed in Section ","element":"span"},{"text":"1 ","element":"span"},{"text":"into Algorithms ","element":"span"},{"href":"#id-16","text":"4 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-16","text":"5","element":"a"},{"text":". As we have mentioned before, one crucial difference between Stochastic CORRAL and CORRAL is that in Stochastic CORRAL only the state of the base algorithm whose action was selected is modified. In contrast in the CORRAL algorithm all the base algorithms’ states are updated during every step.","element":"span"}],[{"text":"To make this description more precise we introduce some notation. 
Each base algorithm ","element":"span"},{"style":{"height":17.42},"width":43.66,"height":43.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/7-0.png","element":"img","alt":"Bj","inline":true,"padRight":true},{"text":"maintains a counter ","element":"span"},{"style":{"height":13.02},"width":57.1,"height":32.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/7-1.png","element":"img","alt":" st,j","inline":true,"padRight":true},{"text":"that keeps track of the number of times it has been updated up to time ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":". We will say algorithm ","element":"span"},{"style":{"height":17.42},"width":43.66,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/7-2.png","element":"img","alt":" Bj","inline":true,"padRight":true},{"text":"is in ‘state’ ","element":"span"},{"style":{"height":13.02},"width":57.1,"height":32.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/7-3.png","element":"img","alt":" st,j","inline":true,"padRight":true},{"text":"at time ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":". For any base algorithm ","element":"span"},{"style":{"height":17.42},"width":137.59,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/7-4.png","element":"img","alt":"Bj, πs,j","inline":true,"padRight":true},{"text":"is the policy ","element":"span"},{"style":{"height":17.42},"width":43.66,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/7-5.png","element":"img","alt":" Bj","inline":true,"padRight":true},{"text":"uses at state ","element":"span"},{"style":{"height":15.02},"width":220.57,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/7-6.png","element":"img","alt":" s. 
If t1 < t2","inline":true,"padRight":true},{"text":"are two consecutive times when base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j ","element":"span"},{"text":"is chosen by the meta-algorithm, then base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j ","element":"span"},{"text":"proposed policy ","element":"span"},{"style":{"height":15.03},"width":117.64,"height":37.58,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/7-7.png","element":"img","alt":" πst1,j,j","inline":true,"padRight":true},{"text":"at time ","element":"span"},{"style":{"height":13.82},"width":32.76,"height":34.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/7-8.png","element":"img","alt":" t1","inline":true,"padRight":true},{"text":"and policy ","element":"span"},{"style":{"height":15.03},"width":117.64,"height":37.58,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/7-9.png","element":"img","alt":"πst2,j,j","inline":true,"padRight":true},{"text":"during all times ","element":"span"},{"style":{"height":17.42},"width":680.01,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/7-10.png","element":"img","alt":" t1 + 1, . . . , t2 where st2,j = st1,j + 1.","inline":true}],[{"id":"id-16","style":{"width":"99%"},"width":1714,"height":1096,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/7-11.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Regret Decomposition. ","element":"span"},{"text":"Let’s introduce the regret decomposition we will make use of to prove our regret guarantees. This is a similar decomposition as the one appearing in the proofs of Theorem 4,5 and 7 of [","element":"span"},{"href":"#id-8","referenceIndex":4,"text":"Aga+17","element":"a"},{"text":"]. 
We split the regret ","element":"span"},{"style":{"fontStyle":"italic"},"text":"R","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"T","element":"span"},{"text":") of Equation ","element":"span"},{"href":"#id-17","text":"1 ","element":"a"},{"text":"into two","element":"span"}],[{"text":"terms (I and II) by adding and subtracting terms ","element":"span"},{"style":{"height":21.7},"width":387.28,"height":54.24,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/8-0.png","element":"img","alt":" {f(At, πst,i⋆,i⋆)}Tt=1 :","inline":true}],[{"id":"id-39","style":{"width":"92%"},"width":1592,"height":342,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/8-1.png","element":"img"}],[{"text":"Term I is the regret of the meta-algorithm with respect to the optimal base ","element":"span"},{"style":{"height":16.61},"width":261.12,"height":41.53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/8-2.png","element":"img","alt":" Bi⋆, and term","inline":true,"padRight":true},{"text":"II is the regret of the optimal base with respect to the optimal policy ","element":"span"},{"style":{"height":12.73},"width":43.44,"height":31.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/8-3.png","element":"img","alt":" π∗","inline":true},{"text":". Analysis of term I is largely based on the adversarial regret guarantees of the Log-Barrier-OMD in CORRAL and of the EXP3.P algorithm.","element":"span"}],[{"text":"In order to bound term II we will have to modify the feedback structure of Algorithms ","element":"span"},{"href":"#id-16","text":"4","element":"a"}],[{"text":"and ","element":"span"},{"href":"#id-16","text":"5","element":"a"},{"text":". 
In Algorithm ","element":"span"},{"href":"#id-18","text":"8 ","element":"a"},{"text":"from Section ","element":"span"},{"text":"4 ","element":"span"},{"text":"we introduce a smoothing procedure that allows any (","element":"span"},{"style":{"height":15.6},"width":118.14,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/8-4.png","element":"img","alt":"U, δ, T","inline":true},{"text":")-bounded algorithm to be transformed into a ‘smoothed’ version of itself such that its conditional expected instantaneous regret is bounded with high probability during every even step. We name this procedure ‘smoothing’ because it is based on playing uniformly from the set of previously played policies during the smoothed algorithm’s odd steps. We provide more details in Section ","element":"span"},{"text":"4","element":"span"},{"text":". For now, the main property we are to use from this discussion is that by smoothing a (","element":"span"},{"style":{"height":15.6},"width":118.1,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/8-5.png","element":"img","alt":"U, δ, T","inline":true},{"text":")-bounded algorithm it is possible to ensure the conditional expected instantaneous regret of the smoothed algorithm is bounded above by","element":"span"}],[{"style":{"width":"99%"},"width":1710,"height":50,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/8-6.png","element":"img"}],[{"text":"function of ","element":"span"},{"style":{"height":17.6},"width":272.24,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/8-7.png","element":"img","alt":" ℓ) when U(ℓ, δ","inline":true},{"text":") is concave in ","element":"span"},{"style":{"height":12.8},"width":18,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/8-8.png","element":"img","alt":" ℓ","inline":true},{"text":". 
In Stochastic CORRAL the smoothing of base algorithms takes the place of the stability condition required by the CORRAL algorithm in [","element":"span"},{"href":"#id-8","referenceIndex":4,"text":"Aga+17","element":"a"},{"text":"].","element":"span"}],[{"text":"Let’s sketch some intuition behind why this decreasing instantaneous regret condition can help us bound term II. For all ","element":"span"},{"style":{"height":17.6},"width":126.66,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/8-9.png","element":"img","alt":" i ∈ [M","inline":true},{"text":"] let ","element":"span"},{"style":{"height":19.91},"width":230.04,"height":49.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/8-10.png","element":"img","alt":" {pi1, . . . , piT }","inline":true,"padRight":true},{"text":"be the (random) probabilities ","element":"span"},{"text":"used by the Stochastic CORRAL meta-algorithm ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M ","element":"span"},{"text":"(an adversarial meta-algorithm) to choose base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i ","element":"span"},{"text":"during round ","element":"span"},{"style":{"height":22.42},"width":390.32,"height":56.04,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/8-11.png","element":"img","alt":" t and let pi = mint pit","inline":true},{"text":". 
Let’s focus on the optimal algorithm ","element":"span"},{"style":{"height":14.62},"width":115.45,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/8-12.png","element":"img","alt":" i⋆ and","inline":true,"padRight":true},{"text":"assume ","element":"span"},{"style":{"height":17.2},"width":120.52,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/8-13.png","element":"img","alt":" U⋆(t, δ","inline":true},{"text":") is concave in ","element":"span"},{"style":{"height":24.22},"width":266.46,"height":60.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/8-14.png","element":"img","alt":" t. Since U⋆(t,δ)/t","inline":true,"padRight":true},{"text":"is decreasing, term II is the largest when base ","element":"span"},{"style":{"height":14.62},"width":31.03,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/8-15.png","element":"img","alt":" i⋆","inline":true,"padRight":true},{"text":"is selected the least often. For the sake of the argument let’s assume that ","element":"span"},{"style":{"height":25.28},"width":218.68,"height":63.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/8-16.png","element":"img","alt":" pi⋆t = pi⋆ ∀t","inline":true},{"text":". 
In ","element":"span"},{"text":"this case base ","element":"span"},{"style":{"height":14.62},"width":31.03,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/8-17.png","element":"img","alt":" i⋆","inline":true,"padRight":true},{"text":"will be played roughly ","element":"span"},{"style":{"height":21.28},"width":77.05,"height":53.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/8-18.png","element":"img","alt":" Tpi⋆","inline":true,"padRight":true},{"text":"times, and will repeat its decisions in intervals of length ","element":"span"},{"style":{"height":29.03},"width":39.7,"height":72.58,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/8-19.png","element":"img","alt":"1/pi⋆ ","inline":true,"padRight":true},{"text":", resulting in the following bound:","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Lemma 3.1 ","element":"span"},{"text":"(informal)","element":"span"},{"style":{"fontWeight":"bold"},"text":". 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"If the regret of the optimal base is ","element":"span"},{"text":"(","element":"span"},{"style":{"height":17.2},"width":154.69,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/8-20.png","element":"img","alt":"U∗, T, δ)","inline":true},{"style":{"fontStyle":"italic"},"text":"-bounded, then we have that","element":"span"}],[{"style":{"width":"61%"},"width":1047,"height":117,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/8-21.png","element":"img"}],[{"text":"We demonstrate the effectiveness of our smoothing transformation by deriving regret bounds with two meta-algorithms: the Log-Barrier-OMD algorithm in CORRAL (introduced ","element":"span"},{"id":"id-62","text":"by [","element":"span"},{"href":"#id-8","referenceIndex":4,"text":"Aga+17","element":"a"},{"text":"]) which we will henceforth refer to as the CORRAL and EXP3.P (Theorem 3.3 in [","element":"span"},{"href":"#id-9","referenceIndex":8,"text":"BC12","element":"a"},{"text":"]) with forced exploration. The latter is a simple algorithm that ensures each base is picked with at least a (horizon dependent) constant probability ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p","element":"span"},{"text":". We now state an informal version of our main result, Theorem ","element":"span"},{"href":"#id-19","text":"4.11","element":"a"},{"text":".","element":"span"}],[{"id":"id-38","style":{"fontWeight":"bold"},"text":"Theorem 3.2 ","element":"span"},{"text":"(informal version of Theorem ","element":"span"},{"href":"#id-19","text":"4.11","element":"a"},{"text":")","element":"span"},{"style":{"fontWeight":"bold"},"text":". 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"If ","element":"span"},{"style":{"height":17.6},"width":414.27,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/9-0.png","element":"img","alt":" U∗(T, δ) = O(c(δ) T α","inline":true},{"text":") ","element":"span"},{"style":{"fontStyle":"italic"},"text":"for some function ","element":"span"},{"style":{"height":12.4},"width":192.32,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/9-1.png","element":"img","alt":" c : R → R","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and constant ","element":"span"},{"style":{"height":10.4},"width":70.56,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/9-2.png","element":"img","alt":" α ∈","inline":true,"padRight":true},{"text":"[1","element":"span"},{"style":{"fontStyle":"italic"},"text":"/","element":"span"},{"text":"2","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"1) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"height":15.02},"width":45.66,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/9-3.png","element":"img","alt":" B∗","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is ","element":"span"},{"text":"(","element":"span"},{"style":{"height":15.6},"width":136.66,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/9-4.png","element":"img","alt":"U⋆, T, δ","inline":true},{"text":")","element":"span"},{"style":{"fontStyle":"italic"},"text":"-bounded, the regrets of Stochastic CORRAL with an EXP3.P and CORRAL meta-algorithms are:","element":"span"}],[{"style":{"width":"74%"},"width":1276,"height":228,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/9-5.png","element":"img"}],[{"text":"The CORRAL meta-algorithm achieves optimal regret when 
","element":"span"},{"style":{"height":8.4},"width":28,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/9-6.png","element":"img","alt":" α","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":17.6},"width":56.19,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/9-7.png","element":"img","alt":" c(δ","inline":true},{"text":") are known.","element":"span"}],[{"text":"When ","element":"span"},{"style":{"height":17.2},"width":55.52,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/9-8.png","element":"img","alt":" c(δ","inline":true},{"text":") is unknown and ","element":"span"},{"style":{"height":17.2},"width":162.39,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/9-9.png","element":"img","alt":" c(δ) > T","inline":true}],[{"text":"an EXP3.P meta-algorithm achieves better regret because ","element":"span"},{"style":{"height":31.6},"width":579.74,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/9-10.png","element":"img","alt":"�O�T 12−α c(δ)�< �O�T αc(δ)1α�","inline":true},{"text":". We complement this result with a couple of lower bounds.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Lower bounds. 
","element":"span"},{"text":"Theorem ","element":"span"},{"href":"#id-20","text":"5.3 ","element":"a"},{"text":"in Section ","element":"span"},{"text":"5 ","element":"span"},{"text":"shows that if the regret of the best base is ","element":"span"},{"style":{"height":17.6},"width":102.36,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/9-11.png","element":"img","alt":"O(T x","inline":true},{"text":"), in the worst case a meta-algorithm that does not know ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x ","element":"span"},{"text":"can have regret Ω(","element":"span"},{"style":{"height":12},"width":48.56,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/9-12.png","element":"img","alt":"T y","inline":true},{"text":") with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"y > x","element":"span"},{"text":". Theorem ","element":"span"},{"href":"#id-21","text":"5.2 ","element":"a"},{"text":"shows that in general it is impossible for any meta-algorithm to achieve a regret better than Ω(","element":"span"},{"style":{"height":17.6},"width":67.36,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/9-13.png","element":"img","alt":"√T","inline":true},{"text":") even when the best base has regret ","element":"span"},{"style":{"fontStyle":"italic"},"text":"O","element":"span"},{"text":"(log(","element":"span"},{"style":{"fontStyle":"italic"},"text":"T","element":"span"},{"text":")). 
When the regret of the best base is ","element":"span"},{"style":{"height":19.98},"width":118.92,"height":49.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/9-14.png","element":"img","alt":" O(√T","inline":true},{"text":"), CORRAL with our smoothing achieves the optimal ","element":"span"},{"style":{"height":19.98},"width":274.65,"height":49.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/9-15.png","element":"img","alt":"O(√T) regret.","inline":true}],[{"text":"The detailed description of the aforementioned smoothing procedure, its properties and the regret analysis are postponed to Section ","element":"span"},{"text":"4","element":"span"},{"text":". We also show some applications of our model selection results in Section ","element":"span"},{"text":"6","element":"span"},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Meta-Algorithms","element":"span"}],[{"text":"We review the adversarial bandit algorithms used as a Meta-Algorithm in Algorithm ","element":"span"},{"href":"#id-16","text":"4","element":"a"},{"text":".","element":"span"}],[{"id":"id-68","style":{"fontWeight":"bold"},"text":"CORRAL Meta-Algorithm","element":"span"}],[{"text":"We reproduce the CORRAL meta-algorithm below.","element":"span"}],[{"style":{"width":"99%"},"width":1714,"height":505,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/10-0.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"EXP3.P Meta-Algorithm","element":"span"}],[{"text":"We reproduce the EXP3.P algorithm (Figure 3.1 in [","element":"span"},{"href":"#id-22","referenceIndex":9,"text":"BS12","element":"a"},{"text":"]) below. 
In this formulation we use ","element":"span"},{"style":{"height":19.51},"width":499.14,"height":48.78,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/10-1.png","element":"img","alt":" η = 1, γ = 2βk and p = γk.","inline":true}],[{"style":{"width":"99%"},"width":1714,"height":623,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/10-2.png","element":"img"}]]},{"heading":"4 Smoothed Algorithm and Regret Analysis","paragraphs":[[{"style":{"fontWeight":"bold"},"text":"Non-increasing Instantaneous Regret","element":"span"}],[{"text":"We introduce a “smoothing” procedure (Algorithm ","element":"span"},{"href":"#id-18","text":"8","element":"a"},{"text":") which, given a (","element":"span"},{"style":{"height":17.6},"width":169.98,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/10-3.png","element":"img","alt":"U, δ, T)−","inline":true},{"text":"bounded algorithm ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"constructs a smoothed algorithm ","element":"span"},{"style":{"height":12.8},"width":30.93,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/10-4.png","element":"img","alt":"B̃","inline":true,"padRight":true},{"text":"with the property that for some time-steps its conditional expected instantaneous regret has a decreasing upper bound. For ease of presentation and instead of making use of odd and even time-steps in the definition of ","element":"span"},{"style":{"height":12.8},"width":30.93,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/10-5.png","element":"img","alt":"B̃","inline":true,"padRight":true},{"text":"we assume each round ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"is split in two types of steps (Step 1 and Step 2). 
We will denote objects pertaining to the ","element":"span"},{"style":{"height":11.6},"width":49.76,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/10-6.png","element":"img","alt":" t−","inline":true},{"text":"th round step ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i ","element":"span"},{"text":"using a subscript ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"and a superscript (","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":"). The construction of ","element":"span"},{"style":{"height":12.8},"width":30.93,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/10-7.png","element":"img","alt":"�B","inline":true,"padRight":true},{"text":"is simple. The smoothed algorithm maintains an internal copy of the","element":"span"}],[{"style":{"width":"1%"},"width":22,"height":2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/11-0.png","element":"img"}],[{"text":"original algorithm ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B","element":"span"},{"text":". During step 1 of round ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":", ","element":"span"},{"style":{"height":12.8},"width":30.94,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/11-1.png","element":"img","alt":"�B","inline":true,"padRight":true},{"text":"will play the action suggested by its internal copy of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B","element":"span"},{"text":". 
During step 2 of round ","element":"span"},{"style":{"height":15.6},"width":73.31,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/11-2.png","element":"img","alt":" t, �B","inline":true,"padRight":true},{"text":"will instead sample uniformly from the set of policies previously played by the copy of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"maintained by ","element":"span"},{"style":{"height":12.8},"width":30.93,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/11-3.png","element":"img","alt":"�B","inline":true,"padRight":true},{"text":"during steps of type 1 from round 1 to round ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":".","element":"span"}],[{"text":"Let’s define step 2 more formally. If algorithm ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"is at state ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s ","element":"span"},{"text":"during round ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":", at step 2 of the corresponding time-step the smoothed strategy will pick an index ","element":"span"},{"style":{"fontStyle":"italic"},"text":"q ","element":"span"},{"text":"in [1","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"2","element":"span"},{"style":{"fontStyle":"italic"},"text":", .., s","element":"span"},{"text":"] uniformly at random, and will then re-play the policy ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"used during step 1 of round ","element":"span"},{"style":{"fontStyle":"italic"},"text":"q","element":"span"},{"text":". 
Since ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"is (","element":"span"},{"style":{"height":15.6},"width":118.14,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/11-4.png","element":"img","alt":"U, δ, T","inline":true},{"text":")-bounded we will show in Lemma ","element":"span"},{"href":"#id-23","text":"4.2 ","element":"a"},{"text":"that the expected instantaneous regret of step 2 at round ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s ","element":"span"},{"text":"is at most ","element":"span"},{"style":{"height":17.6},"width":172.2,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/11-5.png","element":"img","alt":" U(s, δ)/s","inline":true,"padRight":true},{"text":"with high probability.","element":"span"}],[{"id":"id-18","style":{"width":"99%"},"width":1714,"height":1058,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/11-6.png","element":"img"}],[{"text":"It is easy to see that if algorithm ","element":"span"},{"style":{"height":12.8},"width":30.94,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/11-7.png","element":"img","alt":"�B","inline":true,"padRight":true},{"text":"has been played ","element":"span"},{"style":{"height":12.8},"width":18,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/11-8.png","element":"img","alt":" ℓ","inline":true,"padRight":true},{"text":"times (including step 1 and 2 plays), the internal counter of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"equals ","element":"span"},{"style":{"height":17.6},"width":40.2,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/11-9.png","element":"img","alt":" ℓ/","inline":true},{"text":"2. We will make use of this internal counter when we connect a smoothed algorithm with the Stochastic CORRAL meta-algorithm. 
We now introduce the definition of (","element":"span"},{"style":{"height":20.33},"width":218.17,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/11-10.png","element":"img","alt":"U, δ, T (2))−","inline":true},{"text":"Smoothness which in short corresponds to algorithms that satisfy a high probability conditional expected regret upper bound during steps of type 2.","element":"span"}],[{"id":"id-24","style":{"fontWeight":"bold"},"text":"Definition 4.1 ","element":"span"},{"text":"((","element":"span"},{"style":{"height":20.33},"width":218.55,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/11-11.png","element":"img","alt":"U, δ, T (2))−","inline":true},{"text":"Smoothness)","element":"span"},{"style":{"fontWeight":"bold"},"text":". ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Let ","element":"span"},{"style":{"height":12.4},"width":148.12,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/11-12.png","element":"img","alt":" U : R ×","inline":true,"padRight":true},{"text":"[0","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"1] ","element":"span"},{"style":{"height":14.73},"width":114,"height":36.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/11-13.png","element":"img","alt":" → R+","inline":true},{"style":{"fontStyle":"italic"},"text":". 
We say a smoothed algorithm ","element":"span"},{"style":{"height":12.8},"width":30.93,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/11-14.png","element":"img","alt":"�B","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is ","element":"span"},{"text":"(","element":"span"},{"style":{"height":20.33},"width":218.47,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/11-15.png","element":"img","alt":"U, δ, T (2))−","inline":true},{"style":{"fontStyle":"italic"},"text":"smooth if with probability ","element":"span"},{"text":"1 ","element":"span"},{"style":{"height":12.8},"width":63.62,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/11-16.png","element":"img","alt":" − δ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and for all rounds ","element":"span"},{"style":{"height":17.6},"width":112.4,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/11-17.png","element":"img","alt":" t ∈ [T","inline":true},{"text":"]","element":"span"},{"style":{"fontStyle":"italic"},"text":", the conditional expected instantaneous regret of Step 2 is bounded above by ","element":"span"},{"style":{"height":17.6},"width":176.27,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/11-18.png","element":"img","alt":" U(t, δ)/t:","inline":true}],[{"id":"id-93","style":{"width":"97%"},"width":1666,"height":93,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/11-19.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Where ","element":"span"},{"style":{"height":31.6},"width":1267.88,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-0.png","element":"img","alt":"�Ft−1 = σ�{A(i)ℓ , �π(i)ℓ , r(i)ℓ , a(i)ℓ }ℓ∈[t−1],i∈{1,2}, ∪{A(1)ℓ , �π(1)ℓ , r(i)ℓ , a(1)ℓ }�","inline":true},{"style":{"fontStyle":"italic"},"text":"is the sigma algebra 
generated by all contexts, rewards, policies and actions up to time ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"style":{"fontStyle":"italic"},"text":"step 1.","element":"span"}],[{"style":{"width":"1%"},"width":23,"height":3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-1.png","element":"img"}],[{"text":"During all steps of type 2 algorithm ","element":"span"},{"style":{"height":12.8},"width":30.93,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-2.png","element":"img","alt":"�B","inline":true,"padRight":true},{"text":"replays the policies it has used when confronted with contexts ","element":"span"},{"href":"#id-23","style":{"height":23.8},"width":525,"height":59.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-3.png","element":"img","alt":" A(1)1 , ..., A(1)s . In Lemma 4.2","inline":true,"padRight":true},{"text":"we will use the fact that all contexts are assumed ","element":"span"},{"text":"to be generated as i.i.d. samples from the same context generating distribution ","element":"span"},{"style":{"height":14.7},"width":216.77,"height":36.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-4.png","element":"img","alt":" DS to show","inline":true,"padRight":true},{"text":"that ","element":"span"},{"style":{"height":20.33},"width":475.17,"height":50.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-5.png","element":"img","alt":"�B is (U, δ, T (2))−smooth.","inline":true}],[{"text":"With this objective in mind let’s analyze a slightly more general setting. 
Let ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"be a (","element":"span"},{"style":{"height":17.6},"width":169.73,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-6.png","element":"img","alt":"U, δ, T)−","inline":true},{"text":"bounded algorithm playing in an environment where the high probability regret upper bound ","element":"span"},{"style":{"fontStyle":"italic"},"text":"U ","element":"span"},{"text":"holds. Let’s assume that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"has been played for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"time-steps during which it has encountered i.i.d. generated contexts ","element":"span"},{"style":{"height":16},"width":204.86,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-7.png","element":"img","alt":" A1, · · · , At","inline":true,"padRight":true},{"text":"and has played actions sampled from policies ","element":"span"},{"style":{"height":10.8},"width":184.92,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-8.png","element":"img","alt":" π1, · · · , πt","inline":true},{"text":". 
Similar to the definition of ","element":"span"},{"style":{"height":15.02},"width":86.94,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-9.png","element":"img","alt":"�Ft−1","inline":true,"padRight":true},{"text":"in Definition ","element":"span"},{"href":"#id-24","text":"4.1","element":"a"},{"text":", let’s define ","element":"span"},{"style":{"height":21.69},"width":617.84,"height":54.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-10.png","element":"img","alt":"Ft−1 = σ�{Aℓ, �πℓ, rℓ, aℓ}ℓ∈[t−1]�","inline":true},{"text":", the sigma algebra generated by all contexts, rewards, policies and actions up to time ","element":"span"},{"style":{"height":11.6},"width":57.94,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-11.png","element":"img","alt":" t −","inline":true,"padRight":true},{"text":"1. We define the “expected replay regret” ","element":"span"},{"style":{"height":17.6},"width":271.41,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-12.png","element":"img","alt":" Replay(t|Ft−1)","inline":true,"padRight":true},{"text":"as:","element":"span"}],[{"style":{"width":"79%"},"width":1357,"height":119,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-13.png","element":"img"}],[{"text":"Where ","element":"span"},{"style":{"height":17.48},"width":208.86,"height":43.69,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-14.png","element":"img","alt":" A′1, · · · , A′t","inline":true,"padRight":true},{"text":"are i.i.d. 
contexts from ","element":"span"},{"style":{"height":14.7},"width":55.66,"height":36.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-15.png","element":"img","alt":" DS","inline":true,"padRight":true},{"text":"all of them conditionally independent from ","element":"span"},{"style":{"height":15.02},"width":43.36,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-16.png","element":"img","alt":"Ft","inline":true},{"text":". It is easy to see that the conditional instantaneous regret of a smoothed algorithm ","element":"span"},{"style":{"height":12.8},"width":30.93,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-17.png","element":"img","alt":"�B","inline":true,"padRight":true},{"text":"during round ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"step 2 equals the expected replay regret ","element":"span"},{"style":{"height":17.6},"width":438.67,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-18.png","element":"img","alt":" Replay(t| �Ft−1) of the B","inline":true,"padRight":true},{"text":"copy inside ","element":"span"},{"style":{"height":12.8},"width":41.99,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-19.png","element":"img","alt":"�B.","inline":true}],[{"text":"As a first step in proving that ","element":"span"},{"style":{"height":20.34},"width":318.23,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-20.png","element":"img","alt":"�B is (U, δ, T (2))−","inline":true},{"text":"smooth in Lemma ","element":"span"},{"href":"#id-23","text":"4.2 ","element":"a"},{"text":"we show the replay regret of a (","element":"span"},{"style":{"height":15.6},"width":118.14,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-21.png","element":"img","alt":"U, δ, T","inline":true},{"text":")-bounded 
algorithm satisfies a high probability upper bound.","element":"span"}],[{"style":{"width":"9%"},"width":158,"height":15,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-22.png","element":"img"}],[{"id":"id-23","style":{"height":17.6},"width":600.69,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-23.png","element":"img","alt":"Lemma 4.2. If B is (U, δ, T)−","inline":true},{"style":{"fontStyle":"italic"},"text":"bounded with ","element":"span"},{"style":{"height":17.6},"width":204.86,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-24.png","element":"img","alt":" U(t, δ) > 8","inline":true}],[{"style":{"fontStyle":"italic"},"text":"Assumption ","element":"span"},{"href":"#id-25","style":{"fontStyle":"italic"},"text":"2.1","element":"a"},{"style":{"fontStyle":"italic"},"text":", then with probability at least ","element":"span"},{"text":"1 ","element":"span"},{"style":{"height":17.2},"width":335.44,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-25.png","element":"img","alt":" − δ for all t ∈ [T]","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"the expected replay regret of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"style":{"fontStyle":"italic"},"text":"satisfies:","element":"span"}],[{"style":{"width":"34%"},"width":593,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-26.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Furthermore, if ","element":"span"},{"style":{"height":25.5},"width":743.48,"height":63.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-27.png","element":"img","alt":" δ ≤ 1√T then Replay(t|Ft−1) ≤ 5U(t, δ).","inline":true}],[{"style":{"fontStyle":"italic"},"text":"Proof. 
","element":"span"},{"text":"Let’s condition on the event ","element":"span"},{"style":{"height":15.02},"width":40.03,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-28.png","element":"img","alt":" E1","inline":true,"padRight":true},{"text":"that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B","element":"span"},{"text":"’s plays satisfy the high probability regret guarantee given by ","element":"span"},{"style":{"fontStyle":"italic"},"text":"U","element":"span"},{"text":":","element":"span"}],[{"id":"id-26","style":{"width":"68%"},"width":1175,"height":127,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-29.png","element":"img"}],[{"text":"For all ","element":"span"},{"style":{"height":17.6},"width":112.15,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-30.png","element":"img","alt":" t ∈ [T","inline":true},{"text":"] and where ","element":"span"},{"style":{"height":16},"width":204.86,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-31.png","element":"img","alt":" A1, · · · , At","inline":true,"padRight":true},{"text":"are the contexts algorithm ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"encountered up to time ","element":"span"},{"style":{"height":17.6},"width":436.24,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-32.png","element":"img","alt":"t. 
Since B is (U, δ, T)−","inline":true},{"text":"bounded it must be the case that ","element":"span"},{"style":{"height":17.6},"width":276.22,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-33.png","element":"img","alt":" P (E1) ≥ 1 − δ.","inline":true}],[{"text":"Let ","element":"span"},{"style":{"height":17.48},"width":208.86,"height":43.69,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-34.png","element":"img","alt":" A′1, · · · , A′t ","inline":true,"padRight":true},{"text":"be a collection of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"fresh i.i.d. contexts from ","element":"span"},{"style":{"height":14.7},"width":55.66,"height":36.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-35.png","element":"img","alt":" DS ","inline":true,"padRight":true},{"text":"independent from ","element":"span"},{"style":{"height":15.02},"width":135.06,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-36.png","element":"img","alt":" Ft. 
We","inline":true,"padRight":true},{"text":"now use martingale concentration arguments to show that ","element":"span"},{"style":{"height":21.36},"width":619.56,"height":53.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/12-37.png","element":"img","alt":"∑_{l=1}^{t} f(A_l, π∗) ≈ ∑_{l=1}^{t} f(A′_l, π∗)","inline":true}],[{"style":{"width":"99%"},"width":1712,"height":292,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/13-0.png","element":"img"}],[{"text":"Since by assumption max","element":"span"},{"style":{"height":18.53},"width":299.33,"height":46.33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/13-1.png","element":"img","alt":"A′,π |f(A′, π)| ≤","inline":true,"padRight":true},{"text":"1 each term in ","element":"span"},{"style":{"height":20.05},"width":318.92,"height":50.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/13-2.png","element":"img","alt":" {M^1_l} and {M^2_l}","inline":true,"padRight":true},{"text":"is bounded and ","element":"span"},{"text":"satisfies max","element":"span"},{"style":{"height":20.8},"width":477.86,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/13-3.png","element":"img","alt":"(|M^1_l|, |M^2_l|) ≤ 2 for all t","inline":true},{"text":". 
A simple use of Azuma-Hoeffding yields:","element":"span"}],[{"style":{"width":"70%"},"width":1213,"height":137,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/13-4.png","element":"img"}],[{"text":"Summing over all ","element":"span"},{"style":{"height":17.6},"width":356.72,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/13-5.png","element":"img","alt":" t, and all i ∈ {1, 2}","inline":true},{"text":", using the fact that ","element":"span"},{"style":{"height":23.32},"width":195.36,"height":58.3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/13-6.png","element":"img","alt":"�Tt=1 1t2 <","inline":true,"padRight":true},{"text":"2 and the union bound ","element":"span"},{"text":"implies that for all ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":", with probability 1 ","element":"span"},{"style":{"height":15.6},"width":76.68,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/13-7.png","element":"img","alt":" − δ,","inline":true}],[{"id":"id-27","style":{"width":"73%"},"width":1266,"height":359,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/13-8.png","element":"img"}],[{"text":"Denote this event as ","element":"span"},{"style":{"height":15.02},"width":40.03,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/13-9.png","element":"img","alt":" E2","inline":true},{"text":". We shall proceed to upper bound the replay regret. Let’s condition on ","element":"span"},{"style":{"height":15.02},"width":130.47,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/13-10.png","element":"img","alt":" E1 ∩ E2","inline":true},{"text":". 
The following sequence of inequalities holds,","element":"span"}],[{"style":{"width":"90%"},"width":1553,"height":437,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/13-11.png","element":"img"}],[{"text":"For all ","element":"span"},{"style":{"height":17.2},"width":111.97,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/13-12.png","element":"img","alt":" t ∈ [T","inline":true},{"text":"]. Inequality (","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":") follows by the triangle inequality while (","element":"span"},{"style":{"fontStyle":"italic"},"text":"ii","element":"span"},{"text":") is a consequence of conditioning on ","element":"span"},{"style":{"height":15.02},"width":133.14,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/13-13.png","element":"img","alt":" E1 ∩ E2","inline":true,"padRight":true},{"text":"and invoking inequalities ","element":"span"},{"href":"#id-26","text":"5","element":"a"},{"text":", ","element":"span"},{"href":"#id-27","text":"6 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-27","text":"6","element":"a"},{"text":". 
We conclude that with probability at least 1 ","element":"span"},{"style":{"height":12.8},"width":85.46,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/13-14.png","element":"img","alt":" − 2δ","inline":true,"padRight":true},{"text":"and for all ","element":"span"},{"style":{"height":17.6},"width":136.9,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/13-15.png","element":"img","alt":" t ∈ [T],","inline":true}],[{"style":{"width":"58%"},"width":999,"height":135,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/13-16.png","element":"img"}],[{"style":{"width":"99%"},"width":1711,"height":228,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/14-0.png","element":"img"}],[{"text":"It is easy to see that in case ","element":"span"},{"style":{"height":25.5},"width":743.79,"height":63.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/14-1.png","element":"img","alt":" δ ≤ 1√T then Replay(t|Ft−1) ≤ 5U(t, δ).","inline":true}],[{"style":{"width":"1%"},"width":22,"height":2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/14-2.png","element":"img"}],[{"text":"In Propositon ","element":"span"},{"href":"#id-28","text":"4.3 ","element":"a"},{"text":"we show that if ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"is bounded, then ","element":"span"},{"style":{"height":12.8},"width":30.94,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/14-3.png","element":"img","alt":"�B","inline":true,"padRight":true},{"text":"is both bounded and smooth. 
We will then show that several algorithms such as UCB, LinUCB, ","element":"span"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/14-4.png","element":"img","alt":" ϵ","inline":true},{"text":"-greedy and EXP3 are (","element":"span"},{"style":{"height":15.6},"width":118.14,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/14-5.png","element":"img","alt":"U, δ, T","inline":true},{"text":")-bounded for appropriate functions ","element":"span"},{"style":{"fontStyle":"italic"},"text":"U","element":"span"},{"text":". By Proposition ","element":"span"},{"href":"#id-28","text":"4.3 ","element":"a"},{"text":"we will then conclude the smoothed versions of these algorithms are smooth.","element":"span"}],[{"style":{"width":"9%"},"width":158,"height":14,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/14-6.png","element":"img"}],[{"id":"id-28","style":{"fontWeight":"bold"},"text":"Proposition 4.3. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"If ","element":"span"},{"style":{"height":17.6},"width":173.8,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/14-7.png","element":"img","alt":" U(t, δ) >","inline":true,"padRight":true},{"text":"8","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is ","element":"span"},{"text":"(","element":"span"},{"style":{"height":17.6},"width":169.98,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/14-8.png","element":"img","alt":"U, δ, T)−","inline":true},{"style":{"fontStyle":"italic"},"text":"bounded, then ","element":"span"},{"style":{"height":12.8},"width":30.94,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/14-9.png","element":"img","alt":"�B","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is ","element":"span"},{"text":"(5","element":"span"},{"style":{"height":20.34},"width":218.56,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/14-10.png","element":"img","alt":"U, δ, T (2))−","inline":true},{"style":{"fontStyle":"italic"},"text":"smooth and with probability at least","element":"span"}],[{"style":{"width":"78%"},"width":1348,"height":167,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/14-11.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"for all ","element":"span"},{"style":{"height":17.6},"width":137.9,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/14-12.png","element":"img","alt":" t ∈ [T].","inline":true}],[{"style":{"fontStyle":"italic"},"text":"Proof. 
","element":"span"},{"text":"Let ","element":"span"},{"style":{"height":15.02},"width":40.03,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/14-13.png","element":"img","alt":" E1","inline":true,"padRight":true},{"text":"denote the event that ","element":"span"},{"style":{"height":12.8},"width":30.94,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/14-14.png","element":"img","alt":"�B","inline":true},{"text":"’s plays during steps of type 1 satisfy the high probability regret guarantee given by ","element":"span"},{"style":{"fontStyle":"italic"},"text":"U","element":"span"},{"text":":","element":"span"}],[{"id":"id-31","style":{"width":"71%"},"width":1225,"height":129,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/14-15.png","element":"img"}],[{"text":"for all ","element":"span"},{"style":{"height":17.6},"width":112.74,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/14-16.png","element":"img","alt":" t ∈ [T","inline":true},{"text":"]. 
Since the conditional instantaneous regret of Step 2 of round ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"equals the average replay regret of the type 1 steps up to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":", Lemma ","element":"span"},{"href":"#id-23","text":"4.2 ","element":"a"},{"text":"implies that whenever ","element":"span"},{"style":{"height":15.02},"width":153.41,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/14-17.png","element":"img","alt":" E2 holds","inline":true,"padRight":true},{"text":"(see definition for ","element":"span"},{"style":{"height":15.02},"width":40.03,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/14-18.png","element":"img","alt":" E2","inline":true,"padRight":true},{"text":"in the proof of Lemma ","element":"span"},{"href":"#id-23","text":"4.2","element":"a"},{"text":") which occurs with probability at least 1","element":"span"},{"style":{"height":15.6},"width":73.66,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/14-19.png","element":"img","alt":"− δ,","inline":true,"padRight":true},{"text":"the conditional expected instantaneous regret satisfies: ","element":"span"},{"style":{"height":23.43},"width":445.73,"height":58.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/14-20.png","element":"img","alt":" E[f(A′, π∗) − f(A′, π(2)t","inline":true,"padRight":true},{"text":")","element":"span"},{"style":{"height":17.6},"width":160.92,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/14-21.png","element":"img","alt":"| �Ft−1] ≤","inline":true}],[{"style":{"width":"71%"},"width":1225,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/14-22.png","element":"img"}],[{"text":"It is easy to see that if we condition on 
","element":"span"},{"style":{"height":15.02},"width":130.48,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/14-23.png","element":"img","alt":" E1 ∩ E2","inline":true,"padRight":true},{"text":"the conditional expected instantaneous regret of steps of type 2 satisfy,","element":"span"}],[{"id":"id-30","style":{"width":"85%"},"width":1470,"height":183,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/14-24.png","element":"img"}],[{"text":"For all ","element":"span"},{"style":{"height":17.2},"width":111.97,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/14-25.png","element":"img","alt":" t ∈ [T","inline":true},{"text":"]. We now show the regret incurred by ","element":"span"},{"style":{"height":12.8},"width":30.94,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/14-26.png","element":"img","alt":"�B","inline":true,"padRight":true},{"text":"satisfies a high probability upper bound. 
To bound the regret accrued during time-steps of type 2, consider the following","element":"span"}],[{"text":"Martingale difference sequences,","element":"span"}],[{"style":{"width":"51%"},"width":876,"height":196,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/15-0.png","element":"img"}],[{"text":"As a result of Assumption ","element":"span"},{"href":"#id-25","text":"2.1","element":"a"},{"text":", ","element":"span"},{"style":{"height":20.05},"width":134.67,"height":50.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/15-1.png","element":"img","alt":" |Mil | ≤","inline":true,"padRight":true},{"text":"2 for all ","element":"span"},{"style":{"height":17.6},"width":183.47,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/15-2.png","element":"img","alt":" i ∈ {1, 2}","inline":true,"padRight":true},{"text":"and therefore a simple use of ","element":"span"},{"text":"Azuma-Hoeffding’s inequality,","element":"span"}],[{"style":{"width":"70%"},"width":1213,"height":137,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/15-3.png","element":"img"}],[{"text":"Summing over all ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":", applying the union bound, using the fact that ","element":"span"},{"style":{"height":23.32},"width":195.36,"height":58.3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/15-4.png","element":"img","alt":"�Tt=1 1t2 <","inline":true,"padRight":true},{"text":"2 implies that ","element":"span"},{"text":"for all ","element":"span"},{"style":{"height":17.6},"width":112.21,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/15-5.png","element":"img","alt":" t ∈ [T","inline":true},{"text":"], with probability 1 ","element":"span"},{"style":{"height":15.6},"width":76.68,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/15-6.png","element":"img","alt":" − 
δ,","inline":true}],[{"id":"id-29","style":{"width":"94%"},"width":1626,"height":204,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/15-7.png","element":"img"}],[{"text":"Let’s denote as ","element":"span"},{"style":{"height":15.02},"width":40.03,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/15-8.png","element":"img","alt":" E3","inline":true,"padRight":true},{"text":"the event where Equation ","element":"span"},{"href":"#id-29","text":"11 ","element":"a"},{"text":"holds. If ","element":"span"},{"style":{"height":15.02},"width":130.02,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/15-9.png","element":"img","alt":" E2 ∩ E3","inline":true,"padRight":true},{"text":"occur, then combining the upper bounds in ","element":"span"},{"href":"#id-30","text":"10 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-29","text":"11 ","element":"a"},{"text":"we conclude that,","element":"span"}],[{"style":{"width":"50%"},"width":872,"height":129,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/15-10.png","element":"img"}],[{"text":"combining this last observation with Equation ","element":"span"},{"href":"#id-31","text":"9","element":"a"},{"text":", we conclude that for all ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"with probability at least 1 ","element":"span"},{"style":{"height":15.6},"width":98.5,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/15-11.png","element":"img","alt":" − 3δ,","inline":true}],[{"style":{"width":"57%"},"width":992,"height":138,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/15-12.png","element":"img"}],[{"text":"For all ","element":"span"},{"style":{"height":17.6},"width":112.21,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/15-13.png","element":"img","alt":" t ∈ [T","inline":true},{"text":"]. 
The result follows.","element":"span"}],[{"text":"It remains to show how to adapt the feedback structure of the Stochastic CORRAL meta-algorithm to deal with the two step nature of smoothed algorithms. We reproduce the full pseudo-code of the Stochastic CORRAL meta-algorithm adapted to smoothed","element":"span"}],[{"id":"id-60","text":"algorithms below,","element":"span"}],[{"id":"id-32","style":{"width":"99%"},"width":1714,"height":522,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/16-0.png","element":"img"}],[{"text":"For reasons that have to do with the analysis, Algorithm ","element":"span"},{"href":"#id-32","text":"9 ","element":"a"},{"text":"has a few extra features not present in the meta-algorithm-base template of Algorithm ","element":"span"},{"href":"#id-16","text":"4","element":"a"},{"text":". First, whenever the smooth stochastic corral meta-algorithm selects an algorithm ","element":"span"},{"style":{"height":16},"width":29.97,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/16-1.png","element":"img","alt":" jt","inline":true,"padRight":true},{"text":"it plays it for two steps, thus coinciding with ","element":"span"},{"style":{"height":17.42},"width":53.36,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/16-2.png","element":"img","alt":"�Bjt","inline":true},{"text":"’s two time step structure. 
Second, it updates its distribution ","element":"span"},{"style":{"height":11.6},"width":33.96,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/16-3.png","element":"img","alt":" pt","inline":true,"padRight":true},{"text":"using the feedback 2","element":"span"},{"style":{"height":23.82},"width":248.83,"height":59.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/16-4.png","element":"img","alt":"r(2)t − bjt(st,jt","inline":true},{"text":") instead of using the sum ","element":"span"},{"style":{"height":23.43},"width":180.91,"height":58.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/16-5.png","element":"img","alt":" r(1)t + r(2)t ","inline":true,"padRight":true},{"text":". Most notably, the update ","element":"span"},{"text":"makes use of a bias adjustment to the reward signal that is not present in the original. The reason behind this modification will become clearer in the regret analysis.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Applications of Proposition ","element":"span"},{"href":"#id-28","style":{"fontWeight":"bold"},"text":"4.3","element":"a"}],[{"text":"We now show the smoothed versions of several algorithms satisfy Definition ","element":"span"},{"href":"#id-24","text":"4.1 ","element":"a"},{"text":"by showing they are (","element":"span"},{"style":{"height":17.6},"width":169.98,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/16-6.png","element":"img","alt":"U, δ, T)−","inline":true},{"text":"bounded for an appropriate upper bound function ","element":"span"},{"style":{"fontStyle":"italic"},"text":"U","element":"span"},{"text":". 
We focus on algorithms for the ","element":"span"},{"style":{"height":12.8},"width":58.09,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/16-7.png","element":"img","alt":" k−","inline":true},{"text":"armed bandit setting and the contextual linear bandit setting.","element":"span"}],[{"id":"id-45","style":{"fontWeight":"bold"},"text":"Lemma 4.4 ","element":"span"},{"text":"(Theorem 3 in [","element":"span"},{"href":"#id-5","referenceIndex":2,"text":"AYPS11","element":"a"},{"text":"])","element":"span"},{"style":{"fontWeight":"bold"},"text":". ","element":"span"},{"style":{"fontStyle":"italic"},"text":"In the case of changing and potentially infinite contexts of dimension ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d","element":"span"},{"style":{"fontStyle":"italic"},"text":", LinUCB is ","element":"span"},{"text":"(","element":"span"},{"style":{"height":17.6},"width":135.67,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/16-8.png","element":"img","alt":"U, δ, T)","inline":true},{"style":{"fontStyle":"italic"},"text":"-bounded with ","element":"span"},{"style":{"height":19.38},"width":501.4,"height":48.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/16-9.png","element":"img","alt":" U(t, δ) = O(d√t log(1/δ)).","inline":true}],[{"id":"id-46","style":{"fontWeight":"bold"},"text":"Lemma 4.5 ","element":"span"},{"text":"(Theorem 1 in [","element":"span"},{"href":"#id-10","referenceIndex":11,"text":"Chu+11","element":"a"},{"text":"])","element":"span"},{"style":{"fontWeight":"bold"},"text":". 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"In the case of finite linear contexts of size ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and dimension ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d","element":"span"},{"style":{"fontStyle":"italic"},"text":", LinUCB is ","element":"span"},{"text":"(","element":"span"},{"style":{"height":17.6},"width":135.67,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/16-10.png","element":"img","alt":"U, δ, T)","inline":true},{"style":{"fontStyle":"italic"},"text":"-bounded with ","element":"span"},{"style":{"height":20.08},"width":683.3,"height":50.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/16-11.png","element":"img","alt":" U(t, δ) = O(√dt log3(kT log(T)/δ)).","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Lemma 4.6 ","element":"span"},{"text":"(Theorem 1 in [","element":"span"},{"href":"#id-33","referenceIndex":28,"text":"Sel+13","element":"a"},{"text":"])","element":"span"},{"style":{"height":12.8},"width":221.61,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/16-12.png","element":"img","alt":". In the k−","inline":true},{"style":{"fontStyle":"italic"},"text":"armed adversarial bandit setting Exp3 is ","element":"span"},{"text":"(","element":"span"},{"style":{"height":17.6},"width":169.64,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/16-13.png","element":"img","alt":"U, δ, T)−","inline":true},{"style":{"fontStyle":"italic"},"text":"bounded where ","element":"span"},{"style":{"height":21.69},"width":451.71,"height":54.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/16-14.png","element":"img","alt":" U(t, δ) = O(√tk log tkδ ).","inline":true}],[{"id":"id-43","style":{"fontWeight":"bold"},"text":"Lemma 4.7. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"In the stochastic ","element":"span"},{"style":{"height":12.8},"width":58.09,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/16-15.png","element":"img","alt":" k−","inline":true},{"style":{"fontStyle":"italic"},"text":"armed bandit problem, if we assume the noise ","element":"span"},{"style":{"height":16.4},"width":31.1,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/16-16.png","element":"img","alt":" ξt","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is conditionally 1-sub-Gaussian, UCB is ","element":"span"},{"text":"(","element":"span"},{"style":{"height":17.6},"width":135.67,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/16-17.png","element":"img","alt":"U, δ, T)","inline":true},{"style":{"fontStyle":"italic"},"text":"-bounded with ","element":"span"},{"style":{"height":21.69},"width":451.71,"height":54.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/16-18.png","element":"img","alt":" U(t, δ) = O(√tk log tkδ ).","inline":true}],[{"style":{"width":"101%"},"width":1740,"height":183,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/16-19.png","element":"img"}],[{"text":"For the remainder of this section we focus on showing that in the stochastic ","element":"span"},{"style":{"height":12.8},"width":58.09,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/16-20.png","element":"img","alt":" k−","inline":true},{"text":"armed bandit problem, the ","element":"span"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/16-21.png","element":"img","alt":" ϵ","inline":true},{"text":"-greedy algorithm (Algorithm 1.2 of [","element":"span"},{"href":"#id-34","referenceIndex":30,"text":"Sli19","element":"a"},{"text":"]) is 
(","element":"span"},{"style":{"height":17.2},"width":169.4,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/16-22.png","element":"img","alt":"U, T, δ)−","inline":true},{"text":"bounded. At ","element":"span"},{"id":"id-66","text":"time ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"the ","element":"span"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/17-0.png","element":"img","alt":" ϵ","inline":true},{"text":"-greedy algorithm selects with probability ","element":"span"},{"style":{"height":17.6},"width":251.96,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/17-1.png","element":"img","alt":" ϵt = min(c/t,","inline":true,"padRight":true},{"text":"1) an arm uniformly at random, and with probability 1 ","element":"span"},{"style":{"height":10.22},"width":73.32,"height":25.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/17-2.png","element":"img","alt":" − ϵt","inline":true,"padRight":true},{"text":"it selects the arm whose empirical estimate of the mean is largest so far. 
Let’s introduce some notation: we will denote by ","element":"span"},{"style":{"height":12},"width":193.76,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/17-3.png","element":"img","alt":" µ1, · · · , µk","inline":true,"padRight":true},{"text":"the unknown means of the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K ","element":"span"},{"text":"arms use the name ","element":"span"},{"style":{"height":26.41},"width":64.7,"height":66.03,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/17-4.png","element":"img","alt":" �µ(t)j","inline":true,"padRight":true},{"text":"to denote the empirical estimate of the mean of arm ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j ","element":"span"},{"text":"after using ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"samples.","element":"span"}],[{"text":"Without loss of generality let ","element":"span"},{"style":{"height":12},"width":43.29,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/17-5.png","element":"img","alt":" µ1","inline":true,"padRight":true},{"text":"be the optimal arm. We denote the sub-optimality gaps as ∆","element":"span"},{"style":{"height":13.02},"width":218.52,"height":32.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/17-6.png","element":"img","alt":"j = µ1 − µj","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"height":17.6},"width":110.79,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/17-7.png","element":"img","alt":" j ∈ [k","inline":true},{"text":"]. Let ∆","element":"span"},{"style":{"height":6},"width":17,"height":15,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/17-8.png","element":"img","alt":"∗","inline":true,"padRight":true},{"text":"be the smallest nonzero gap. 
We follow the discussion in [","element":"span"},{"href":"#id-11","referenceIndex":7,"text":"ACBF02","element":"a"},{"text":"] and start by showing that under the right assumptions, and for a horizon of size ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T","element":"span"},{"text":", the algorithm satisfies a high probability regret bound for all ","element":"span"},{"style":{"height":14.4},"width":210.63,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/17-9.png","element":"img","alt":" t ≤ T. The","inline":true,"padRight":true},{"text":"objective of this section is to prove the following Lemma:","element":"span"}],[{"id":"id-36","style":{"fontWeight":"bold"},"text":"Lemma 4.8. ","element":"span"},{"style":{"height":29.16},"width":355.76,"height":72.91,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/17-10.png","element":"img","alt":" If c = 10K log(T 3/γ)∆2∗","inline":true}],[{"style":{"fontStyle":"italic"},"text":"with ","element":"span"},{"style":{"height":19.22},"width":363.41,"height":48.04,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/17-11.png","element":"img","alt":" ϵt = ct is (U, δ, T)−","inline":true},{"style":{"fontStyle":"italic"},"text":"bounded for ","element":"span"},{"style":{"height":25.98},"width":258.44,"height":64.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/17-12.png","element":"img","alt":" δ ≤ ∆2∗T 3 where","inline":true}],[{"style":{"width":"52%"},"width":905,"height":158,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/17-13.png","element":"img"}],[{"style":{"height":22.06},"width":542.35,"height":55.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/17-14.png","element":"img","alt":"Proof. 
Let E(t) = 12k�tl=1 ϵl","inline":true,"padRight":true},{"text":"and denote by ","element":"span"},{"style":{"height":17.82},"width":75.66,"height":44.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/17-15.png","element":"img","alt":" Tj(t","inline":true},{"text":") the random variable denoting the number ","element":"span"},{"text":"of times arm ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j ","element":"span"},{"text":"was selected up to time ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":". We start by analyzing the probability that a suboptimal arm ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j > ","element":"span"},{"text":"1 is selected at time ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":":","element":"span"}],[{"style":{"width":"84%"},"width":1454,"height":81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/17-16.png","element":"img"}],[{"text":"Let’s bound the second term.","element":"span"}],[{"style":{"width":"81%"},"width":1398,"height":105,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/17-17.png","element":"img"}],[{"text":"The analysis of these two terms is the same. Denote by ","element":"span"},{"style":{"height":22.43},"width":91.9,"height":56.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/17-18.png","element":"img","alt":" T Rj (t","inline":true},{"text":") the number of times arm","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"j ","element":"span"},{"id":"id-67","text":"was played as a result of a random epsilon greedy move. 
We have:","element":"span"}],[{"style":{"width":"89%"},"width":1535,"height":861,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/18-0.png","element":"img"}],[{"text":"Inequality ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a ","element":"span"},{"text":"is a consequence of the Azuma-Hoeffding inequality bound. Inequality ","element":"span"},{"style":{"fontStyle":"italic"},"text":"b ","element":"span"},{"text":"follows because ","element":"span"},{"style":{"height":21.63},"width":610.74,"height":54.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/18-1.png","element":"img","alt":"�∞l=E+1 exp(−αl) ≤ 1a exp(−αE","inline":true},{"text":"). Term (1) corresponds to the probability ","element":"span"},{"text":"that within the interval [1","element":"span"},{"style":{"height":14.4},"width":120.26,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/18-2.png","element":"img","alt":", · · · , t","inline":true},{"text":"], the number of greedy pulls to arm ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j ","element":"span"},{"text":"is at most half its expectation. Term (2) is already ”small”. Lets proceed to bound (1). 
Let ","element":"span"},{"style":{"height":17.6},"width":306.15,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/18-3.png","element":"img","alt":" ϵt = min(c/t, 1).","inline":true,"padRight":true},{"text":"with ","element":"span"},{"style":{"height":29.16},"width":291.02,"height":72.91,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/18-4.png","element":"img","alt":" c = 10k log(T 3/γ)∆2∗","inline":true,"padRight":true},{"text":"for some ","element":"span"},{"style":{"height":13.2},"width":67.77,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/18-5.png","element":"img","alt":" γ ∈","inline":true,"padRight":true},{"text":"(0","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"1) satisfying ","element":"span"},{"style":{"height":22.02},"width":140.56,"height":55.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/18-6.png","element":"img","alt":" γ ≤ ∆2j","inline":true},{"text":". We’ll show that under these ","element":"span"},{"text":"assumptions we can lower bound ","element":"span"},{"style":{"height":29.16},"width":460.48,"height":72.91,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/18-7.png","element":"img","alt":" E(t). 
If t ≥ 10k log(T 3/γ)∆2∗ :","inline":true}],[{"style":{"width":"64%"},"width":1103,"height":387,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/18-8.png","element":"img"}],[{"text":"By Bernstein’s inequality (see derivation of equation (13) in [","element":"span"},{"href":"#id-11","referenceIndex":7,"text":"ACBF02","element":"a"},{"text":"]) we can upper bound ","element":"span"},{"style":{"height":22.42},"width":120.96,"height":56.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/18-9.png","element":"img","alt":"T Rj (t):","inline":true}],[{"id":"id-35","style":{"width":"68%"},"width":1180,"height":50,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/18-10.png","element":"img"}],[{"text":"Hence for ","element":"span"},{"style":{"height":29.17},"width":300.9,"height":72.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/18-11.png","element":"img","alt":" t ≥ 10k log(T 3/γ)∆2∗ :","inline":true}],[{"style":{"width":"31%"},"width":542,"height":97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/18-12.png","element":"img"}],[{"style":{"width":"77%"},"width":1331,"height":202,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/19-0.png","element":"img"}],[{"text":"Now we proceed with term (2):","element":"span"}],[{"style":{"width":"60%"},"width":1044,"height":502,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/19-1.png","element":"img"}],[{"text":"The first inequality (","element":"span"},{"style":{"fontStyle":"italic"},"text":"a","element":"span"},{"text":") follows because ","element":"span"},{"style":{"height":29.16},"width":316.51,"height":72.91,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/19-2.png","element":"img","alt":" E(t) ≥ 5 log(T 3/γ)∆2∗","inline":true,"padRight":true},{"text":". 
Inequality (","element":"span"},{"style":{"fontStyle":"italic"},"text":"b","element":"span"},{"text":") follows because by the assumption ","element":"span"},{"style":{"height":14.8},"width":71.14,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/19-3.png","element":"img","alt":" γ ≤","inline":true}],[{"text":"By applying the union bound over all arms ","element":"span"},{"style":{"height":29.16},"width":704.78,"height":72.91,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/19-4.png","element":"img","alt":" j ̸= 1 and time-steps t ≥ 10k log(T 3/γ)∆2∗","inline":true,"padRight":true},{"text":", we conclude that the probability of choosing a sub-optimal arm ","element":"span"},{"style":{"height":16},"width":66.6,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/19-5.png","element":"img","alt":" j ≥","inline":true,"padRight":true},{"text":"2 at any time time ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"for ","element":"span"},{"style":{"height":29.16},"width":286.63,"height":72.91,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/19-6.png","element":"img","alt":"t ≥ 10k log(T 3/γ)∆2∗","inline":true,"padRight":true},{"text":"as a ","element":"span"},{"style":{"fontWeight":"bold"},"text":"greedy choice ","element":"span"},{"text":"is upper bounded by ","element":"span"},{"style":{"height":23.17},"width":148.96,"height":57.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/19-7.png","element":"img","alt":"kγT 2 ≤ kγT","inline":true,"padRight":true},{"text":". 
In other words after","element":"span"}],[{"style":{"height":29.16},"width":292.75,"height":72.91,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/19-8.png","element":"img","alt":"t ≥ 10k log(T 3/γ)∆2∗","inline":true,"padRight":true},{"text":"rounds, with probability 1 ","element":"span"},{"style":{"height":22.71},"width":87.67,"height":56.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/19-9.png","element":"img","alt":" − kγT","inline":true,"padRight":true},{"text":"sub-optimal arms are only chosen as a ","element":"span"},{"text":"result of random epsilon greedy move (occurring with probability ","element":"span"},{"style":{"height":17.6},"width":60.91,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/19-10.png","element":"img","alt":" ϵt).","inline":true}],[{"text":"A similar argument as the one that gave us Equation ","element":"span"},{"href":"#id-35","text":"13 ","element":"a"},{"text":"can be used to upper bound the probability that ","element":"span"},{"style":{"height":22.42},"width":92.23,"height":56.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/19-11.png","element":"img","alt":" T Rj (t","inline":true},{"text":") be much larger than its mean:","element":"span"}],[{"style":{"width":"39%"},"width":676,"height":57,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/19-12.png","element":"img"}],[{"text":"Using this and the union bound we see that with probability more than 1 ","element":"span"},{"style":{"height":22.71},"width":87.01,"height":56.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/19-13.png","element":"img","alt":" − kγT","inline":true,"padRight":true},{"text":"and for ","element":"span"},{"text":"all ","element":"span"},{"style":{"height":17.6},"width":120.18,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/19-14.png","element":"img","alt":" t ∈ [T","inline":true},{"text":"] 
and arms ","element":"span"},{"style":{"height":22.42},"width":217.71,"height":56.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/19-15.png","element":"img","alt":" j ∈ [k], T Rj","inline":true,"padRight":true},{"text":"(","element":"span"},{"style":{"height":17.6},"width":189.27,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/19-16.png","element":"img","alt":"t) ≤ 3E(t","inline":true},{"text":"). Combining this with the observation that ","element":"span"},{"text":"after ","element":"span"},{"style":{"height":29.16},"width":285.95,"height":72.91,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/19-17.png","element":"img","alt":" t ≥ 10k log(T 3/γ)∆2∗","inline":true,"padRight":true},{"text":"and with probability 1 ","element":"span"},{"style":{"height":22.71},"width":86.3,"height":56.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/19-18.png","element":"img","alt":" − kγT","inline":true,"padRight":true},{"text":"over all ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"simultaneously regret is only ","element":"span"},{"text":"incurred by random exploration pulls (and not greedy actions), we can conclude that with probability at least 1 ","element":"span"},{"style":{"height":22.71},"width":104.07,"height":56.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/19-19.png","element":"img","alt":" − 2kγT","inline":true,"padRight":true},{"text":"simultaneously for all ","element":"span"},{"style":{"height":29.17},"width":290.08,"height":72.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/19-20.png","element":"img","alt":" t ≥ 10k log(T 3/γ)∆2∗","inline":true,"padRight":true},{"text":"the regret incurred is","element":"span"}],[{"style":{"width":"71%"},"width":1232,"height":237,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/19-21.png","element":"img"}],[{"text":"Term 
(","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":") is a crude upper bound on the regret incurred in the first ","element":"span"},{"style":{"height":29.17},"width":436.02,"height":72.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/20-0.png","element":"img","alt":"10k log(T 3/γ)∆2∗ rounds and","inline":true,"padRight":true},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"ii","element":"span"},{"text":") is an upper bound for the regret incurred in the subsequent rounds.","element":"span"}],[{"text":"Since ","element":"span"},{"style":{"height":29.17},"width":454.12,"height":72.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/20-1.png","element":"img","alt":" E(t) ≤ 20k log(T 3/γ)∆2∗ log(t","inline":true},{"text":") we conclude that with probability 1 ","element":"span"},{"style":{"height":22.71},"width":354.63,"height":56.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/20-2.png","element":"img","alt":" − 2kγT for all t ≤ T","inline":true,"padRight":true},{"text":"the cumulative regret of epsilon greedy is upper bounded by","element":"span"}],[{"style":{"width":"51%"},"width":882,"height":158,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/20-3.png","element":"img"}],[{"text":"the result follows by identifying ","element":"span"},{"style":{"height":19.13},"width":188.55,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/20-4.png","element":"img","alt":" δ = γ/T 3.","inline":true}],[{"style":{"width":"1%"},"width":30,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/20-5.png","element":"img"}],[{"text":"Lemma ","element":"span"},{"href":"#id-36","text":"4.8 ","element":"a"},{"text":"gives us an instance dependent upper bound for the 
","element":"span"},{"style":{"height":8},"width":51.71,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/20-6.png","element":"img","alt":" ϵ−","inline":true},{"text":"greedy algorithm. We now show the instance-independent high probability regret bound for ","element":"span"},{"style":{"height":16.4},"width":169.22,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/20-7.png","element":"img","alt":" ϵ-greedy:","inline":true}],[{"id":"id-49","style":{"fontWeight":"bold"},"text":"Lemma 4.9. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"If ","element":"span"},{"style":{"height":31.69},"width":235.06,"height":79.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/20-8.png","element":"img","alt":" c =10k log( 1δ )∆2∗","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":", then ","element":"span"},{"style":{"height":8},"width":51.71,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/20-9.png","element":"img","alt":" ϵ−","inline":true},{"style":{"fontStyle":"italic"},"text":"greedy with ","element":"span"},{"style":{"height":18.49},"width":111.01,"height":46.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/20-10.png","element":"img","alt":" ϵt = ct","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is ","element":"span"},{"text":"(","element":"span"},{"style":{"height":17.6},"width":169.98,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/20-11.png","element":"img","alt":"δ, U, T)−","inline":true},{"style":{"fontStyle":"italic"},"text":"bounded for ","element":"span"},{"style":{"height":25.98},"width":127.66,"height":64.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/20-12.png","element":"img","alt":" δ ≤ ∆2∗T 
3","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and:","element":"span"}],[{"style":{"height":31.6},"width":704.43,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/20-13.png","element":"img","alt":"1. U(t, δ) = 16�log( 1δ)t when k = 2.","inline":true}],[{"style":{"height":36.05},"width":1091.85,"height":90.13,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/20-14.png","element":"img","alt":"2. U(t, δ) = 20�k log( 1δ)��Kj=2 ∆j��1/3t2/3 when k > 2.","inline":true}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"Let ∆ be some arbitrary gap value. Let ","element":"span"},{"style":{"fontStyle":"italic"},"text":"R","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":") denote the expected regret up to round ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":". We recycle the notation from the proof of Lemma ","element":"span"},{"href":"#id-36","text":"4.8","element":"a"},{"text":", recall ","element":"span"},{"style":{"height":19.14},"width":188.55,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/20-15.png","element":"img","alt":" δ = γ/T 3.","inline":true}],[{"id":"id-37","style":{"width":"93%"},"width":1609,"height":584,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/20-16.png","element":"img"}],[{"text":"When ","element":"span"},{"style":{"height":16},"width":297.6,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/21-0.png","element":"img","alt":" k = 2, ∆2 = ∆∗","inline":true,"padRight":true},{"text":"and therefore (assuming ∆ ","element":"span"},{"style":{"height":17.6},"width":130.33,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/21-1.png","element":"img","alt":" < 
∆2):","inline":true}],[{"style":{"width":"99%"},"width":1712,"height":587,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/21-2.png","element":"img"}],[{"text":"second inequality B is satisfied for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T ","element":"span"},{"text":"large enough. We choose this expression for simplicity of exposition.","element":"span"}],[{"text":"When ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k > ","element":"span"},{"text":"2 notice that we can arrive to a bound similar to ","element":"span"},{"href":"#id-37","text":"14","element":"a"},{"text":":","element":"span"}],[{"style":{"width":"100%"},"width":1718,"height":879,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/21-3.png","element":"img"}],[{"text":"The inequality ","element":"span"},{"style":{"height":16.4},"width":20,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/21-4.png","element":"img","alt":" ξ","inline":true,"padRight":true},{"text":"is true for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T ","element":"span"},{"text":"large enough. We choose this expression for simplicity of exposition.","element":"span"}],[{"style":{"width":"1%"},"width":30,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/21-5.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Regret Analysis","element":"span"}],[{"text":"In this section we go back to sketch the proof of Theorem ","element":"span"},{"href":"#id-38","text":"3.2 ","element":"a"},{"text":"by explaining how to bound terms I and II in the regret decomposition of Equation ","element":"span"},{"href":"#id-39","text":"2","element":"a"},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Bounding Term I. 
","element":"span"},{"text":"Recall that Algorithm ","element":"span"},{"href":"#id-32","text":"9 ","element":"a"},{"text":"only sends the smoothed reward of Step 2 to the meta-algorithm while the base plays and incurs regrets from both Step 1 and Step 2. We show in Section ","element":"span"},{"text":"A ","element":"span"},{"text":"that this does not affect the regret of the meta-algorithm significantly.","element":"span"}],[{"style":{"width":"99%"},"width":1711,"height":93,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/22-0.png","element":"img"}],[{"text":"with exploration rate ","element":"span"},{"style":{"height":24},"width":612.51,"height":60,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/22-1.png","element":"img","alt":" p, E [I] < O(√MT + 1p + MTp).","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Bounding Term II. ","element":"span"},{"text":"This quantity is the regret of all the policies proposed by the optimal base ","element":"span"},{"style":{"height":14.62},"width":31.04,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/22-2.png","element":"img","alt":" i⋆","inline":true},{"text":", even during steps when it was not selected by the meta-algorithm. Recall the internal state of any algorithm (including ","element":"span"},{"style":{"height":14.62},"width":31.03,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/22-3.png","element":"img","alt":" i⋆","inline":true},{"text":") is only updated when selected and played by ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M","element":"span"},{"text":". 
We assume the smoothed base algorithm ","element":"span"},{"style":{"height":16.61},"width":52.19,"height":41.53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/22-4.png","element":"img","alt":"�Bi⋆","inline":true,"padRight":true},{"text":"satisfies the smoothness and boundedness properties of Definitions ","element":"span"},{"href":"#id-14","text":"2.1 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-24","text":"4.1","element":"a"},{"text":". For the purpose of the analysis we declare that when a smoothed base repeats its policy while not played, it repeats its subsequent Step 2 policy (Algorithm ","element":"span"},{"href":"#id-18","text":"8","element":"a"},{"text":"). This will become clearer in Section ","element":"span"},{"text":"A","element":"span"},{"text":". Since we select ","element":"span"},{"style":{"height":16.61},"width":52.2,"height":41.53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/22-5.png","element":"img","alt":"�Bi⋆","inline":true,"padRight":true},{"text":"with probability at least ","element":"span"},{"style":{"height":17.28},"width":45.48,"height":43.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/22-6.png","element":"img","alt":" pi⋆","inline":true,"padRight":true},{"text":"it will be updated at most every 1","element":"span"},{"style":{"height":20.88},"width":55.78,"height":52.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/22-7.png","element":"img","alt":"/pi","inline":true,"padRight":true},{"text":"time-steps and the regret upper bound will be roughly ","element":"span"},{"style":{"height":29.03},"width":290.3,"height":72.58,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/22-8.png","element":"img","alt":"1pi⋆ Ui⋆(Tpi⋆, δ).","inline":true}],[{"id":"id-91","style":{"fontWeight":"bold"},"text":"Theorem 4.10. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"We have that ","element":"span"},{"style":{"height":32.74},"width":970.1,"height":81.85,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/22-9.png","element":"img","alt":" E [II] ≤ O�E�1pi Ui(Tpi, δ) log T�+ δT(log T + 1)�","inline":true},{"style":{"fontStyle":"italic"},"text":". Here, the expectation is over the random variable ","element":"span"},{"style":{"height":15.68},"width":33.96,"height":39.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/22-10.png","element":"img","alt":" pi","inline":true},{"style":{"fontStyle":"italic"},"text":". If ","element":"span"},{"style":{"height":17.6},"width":286.52,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/22-11.png","element":"img","alt":" U(t, δ) = tαc(δ","inline":true},{"text":") ","element":"span"},{"style":{"fontStyle":"italic"},"text":"for some ","element":"span"},{"style":{"height":10.4},"width":72.49,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/22-12.png","element":"img","alt":" α ∈","inline":true,"padRight":true},{"text":"[1","element":"span"},{"style":{"fontStyle":"italic"},"text":"/","element":"span"},{"text":"2","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"1) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"then, ","element":"span"},{"style":{"height":17.6},"width":186.52,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/22-13.png","element":"img","alt":" E [II] ≤ �O","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Total Regret. 
","element":"span"},{"text":"Adding Term I and Term II gives us the following worst-case model selection regret bound for the CORRAL meta-algorithm (maximized over ","element":"span"},{"style":{"height":17.28},"width":45.48,"height":43.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/22-14.png","element":"img","alt":" pi⋆","inline":true,"padRight":true},{"text":"and with a chosen ","element":"span"},{"style":{"height":12},"width":22,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/22-15.png","element":"img","alt":" η","inline":true},{"text":") and the EXP3.P meta-algorithm (with a chosen ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p","element":"span"},{"text":"):","element":"span"}],[{"id":"id-19","style":{"fontWeight":"bold"},"text":"Theorem 4.11. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"If a base algorithm is ","element":"span"},{"text":"(","element":"span"},{"style":{"height":15.6},"width":118.1,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/22-16.png","element":"img","alt":"U, δ, T","inline":true},{"text":")","element":"span"},{"style":{"fontStyle":"italic"},"text":"-bounded for ","element":"span"},{"style":{"height":17.6},"width":313.13,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/22-17.png","element":"img","alt":" U(T, δ) = T αc(δ","inline":true},{"text":") ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and some ","element":"span"},{"style":{"height":10.4},"width":75.59,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/22-18.png","element":"img","alt":"α ∈","inline":true,"padRight":true},{"text":"[1","element":"span"},{"style":{"fontStyle":"italic"},"text":"/","element":"span"},{"text":"2","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"1) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and the choice of 
","element":"span"},{"style":{"height":17.6},"width":167.77,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/22-19.png","element":"img","alt":" δ = 1/T","inline":true},{"style":{"fontStyle":"italic"},"text":", the regret of the Smooth Stochastic CORRAL (Algorithm ","element":"span"},{"href":"#id-32","style":{"fontStyle":"italic"},"text":"9","element":"a"},{"style":{"fontStyle":"italic"},"text":") where ","element":"span"},{"style":{"height":24.83},"width":259.57,"height":62.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/22-20.png","element":"img","alt":" bj(s) = Uj(s,δ)s","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is upper bounded by :","element":"span"}]]},{"heading":"5 Lower Bound","paragraphs":[[{"text":"In stochastic environments, algorithms such as UCB can achieve logarithmic regret bounds. Our model selection procedure however has a ","element":"span"},{"style":{"height":19.98},"width":119.16,"height":49.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/22-21.png","element":"img","alt":" O(√T","inline":true},{"text":") overall regret. In this section, we show that in general it is impossible to obtain a regret better than Ω(","element":"span"},{"style":{"height":17.6},"width":67.36,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/22-22.png","element":"img","alt":"√T","inline":true},{"text":") even when the optimal base algorithm has 0 regret. In order to formalize this statement, let’s define a model selection problem formally.","element":"span"}],[{"id":"id-76","style":{"width":"87%"},"width":1494,"height":298,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/23-0.png","element":"img"}],[{"text":"Table 1: Comparison of model selection guarantees for Stochastic CORRAL between the EXP3.P and CORRAL meta-algorithm. The top row shows the general regret guarantees. 
The middle row shows the regret guarantees when ","element":"figcaption","subtype":"caption"},{"style":{"height":17.2},"width":182.44,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/23-1.png","element":"img","alt":" α and c(δ","inline":true},{"text":") are known. The bottom row shows the regret guarantees when ","element":"figcaption","subtype":"caption"},{"style":{"height":8.4},"width":28,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/23-2.png","element":"img","alt":" α","inline":true,"padRight":true},{"text":"is known and ","element":"figcaption","subtype":"caption"},{"style":{"height":17.6},"width":55.85,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/23-3.png","element":"img","alt":" c(δ","inline":true},{"text":") is unknown.","element":"figcaption","subtype":"caption"}],[{"style":{"fontWeight":"bold"},"text":"Definition 5.1 ","element":"span"},{"text":"(Model Selection Problem)","element":"span"},{"style":{"fontWeight":"bold"},"text":". 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"We call a tuple ","element":"span"},{"text":"(","element":"span"},{"style":{"height":20.02},"width":255.39,"height":50.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/23-4.png","element":"img","alt":"{Bi}Mi=1, Env)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"a model selection ","element":"span"},{"style":{"fontStyle":"italic"},"text":"problem where ","element":"span"},{"style":{"height":20.02},"width":140.69,"height":50.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/23-5.png","element":"img","alt":" {Bi}Mi=1 ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is a set of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M ","element":"span"},{"style":{"fontStyle":"italic"},"text":"base algorithms and ","element":"span"},{"text":"Env ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is a bandit environment","element":"span"},{"style":{"height":14.73},"width":31.93,"height":36.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/23-6.png","element":"img","alt":"4.","inline":true}],[{"id":"id-21","style":{"height":13.2},"width":502.86,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/23-7.png","element":"img","alt":"Theorem 5.2. Let T ∈ N","inline":true},{"style":{"fontStyle":"italic"},"text":". 
For any model selection algorithm there exists a corresponding model selection problem ","element":"span"},{"text":"(","element":"span"},{"style":{"height":17.6},"width":270.97,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/23-8.png","element":"img","alt":"{B1, B2}, Env)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"such the regret of this model selection algorithm is lower bounded by ","element":"span"},{"style":{"height":31.6},"width":365.15,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/23-9.png","element":"img","alt":" R(T) = Ω� √Tlog(T)�.","inline":true}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"Consider a stochastic 2-arm bandit problem where the best arm has expected reward 1","element":"span"},{"style":{"fontStyle":"italic"},"text":"/","element":"span"},{"text":"2 and the second best arm has expected reward 1","element":"span"},{"style":{"fontStyle":"italic"},"text":"/","element":"span"},{"text":"4. We construct base algorithms ","element":"span"},{"style":{"height":15.6},"width":112.64,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/23-10.png","element":"img","alt":" B1, B2","inline":true,"padRight":true},{"text":"as follows. ","element":"span"},{"style":{"height":15.02},"width":45.66,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/23-11.png","element":"img","alt":" B1","inline":true,"padRight":true},{"text":"always chooses the optimal arm and its expected instantaneous reward is 1","element":"span"},{"style":{"height":17.6},"width":120.43,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/23-12.png","element":"img","alt":"/2. 
B2","inline":true,"padRight":true},{"text":"chooses the second best arm at time step ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"with probability ","element":"span"},{"style":{"height":26.2},"width":410.4,"height":65.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/23-13.png","element":"img","alt":"4c√t+2 log(t+2) (c will be","inline":true,"padRight":true},{"text":"specified later), and chooses the best arm otherwise. The expected reward at time step ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"of ","element":"span"},{"style":{"height":26.2},"width":422.41,"height":65.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/23-14.png","element":"img","alt":"B2 is 12 − c√t+2 log(t+2).","inline":true}],[{"text":"Let ","element":"span"},{"style":{"height":12.8},"width":49.73,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/23-15.png","element":"img","alt":" A∗ ","inline":true,"padRight":true},{"text":"be uniformly sampled from ","element":"span"},{"style":{"fontStyle":"italic"},"text":"{","element":"span"},{"text":"1","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"2","element":"span"},{"style":{"fontStyle":"italic"},"text":"}","element":"span"},{"text":". Consider two environments ","element":"span"},{"style":{"height":15.02},"width":318.9,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/23-16.png","element":"img","alt":" ν1 and ν2 for the","inline":true,"padRight":true},{"text":"meta-algorithm, each made up of two base algorithms ","element":"span"},{"style":{"height":15.6},"width":700.34,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/23-17.png","element":"img","alt":"�B1, �B2. 
Under ν1, �B1 and �B2 are both","inline":true,"padRight":true},{"text":"instantiations of ","element":"span"},{"style":{"height":15.02},"width":45.66,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/23-18.png","element":"img","alt":" B1","inline":true},{"text":". Under ","element":"span"},{"style":{"height":10.62},"width":38.56,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/23-19.png","element":"img","alt":" ν2","inline":true},{"text":", ","element":"span"},{"style":{"height":15.1},"width":69.04,"height":37.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/23-20.png","element":"img","alt":"�BA∗","inline":true},{"text":", where ","element":"span"},{"style":{"height":12.8},"width":49.73,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/23-21.png","element":"img","alt":" A∗","inline":true,"padRight":true},{"text":"is a uniformly sampled index in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"{","element":"span"},{"text":"1","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"2","element":"span"},{"style":{"fontStyle":"italic"},"text":"}","element":"span"},{"text":", is a copy of ","element":"span"},{"style":{"height":15.1},"width":259.3,"height":37.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/23-22.png","element":"img","alt":" B1 and �B3−A∗","inline":true,"padRight":true},{"text":"is a copy of ","element":"span"},{"style":{"height":15.02},"width":59.59,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/23-23.png","element":"img","alt":" B2.","inline":true}],[{"text":"Let ","element":"span"},{"style":{"height":15.2},"width":108.66,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/23-24.png","element":"img","alt":" P1, P2","inline":true,"padRight":true},{"text":"denote the probability measures induced by 
interaction of the meta-algorithm with ","element":"span"},{"style":{"height":14.22},"width":170.34,"height":35.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/23-25.png","element":"img","alt":" ν1 and ν2","inline":true,"padRight":true},{"text":"respectively. Let ","element":"span"},{"style":{"height":16.7},"width":65.03,"height":41.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/23-26.png","element":"img","alt":"�BAt","inline":true,"padRight":true},{"text":"denote the base algorithm chosen by the meta-algorithm at time ","element":"span"},{"style":{"height":21.29},"width":691.15,"height":53.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/23-27.png","element":"img","alt":" t. We have P1(At ≠ A∗) = 1/2 for all t","inline":true},{"text":", since the learner has no information available ","element":"span"},{"text":"to identify which algorithm is considered optimal. By Pinsker’s inequality we have","element":"span"}],[{"style":{"width":"50%"},"width":870,"height":105,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/23-28.png","element":"img"}],[{"text":"By the divergence decomposition [see ","element":"span"},{"href":"#id-40","referenceIndex":19,"text":"LS20","element":"a"},{"text":", proof of Lemma 15.1 for the decomposition","element":"span"}],[{"text":"technique] and using that for ∆ ","element":"span"},{"href":"#id-41","style":{"height":21.29},"width":955.38,"height":53.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-0.png","element":"img","alt":" < 1/4 : KL(1/2, 1/2 − ∆) ≤ 3∆^2 (Lemma B.3), we have","inline":true}],[{"style":{"width":"55%"},"width":954,"height":267,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-1.png","element":"img"}],[{"text":"Picking ","element":"span"},{"style":{"height":31.6},"width":131.06,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-2.png","element":"img","alt":" 
c =�","inline":true}],[{"text":"bounded by","element":"span"}],[{"style":{"width":"76%"},"width":1319,"height":358,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-3.png","element":"img"}],[{"text":"CORRAL needs knowledge of the best base’s regret to achieve the same regret. The following lower bound shows that this requirement is unavoidable:","element":"span"}],[{"id":"id-20","style":{"fontWeight":"bold"},"text":"Theorem 5.3. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Let ","element":"span"},{"text":"Alg ","element":"span"},{"style":{"fontStyle":"italic"},"text":"be a model selection algorithm. There exists a model selection problem with two base algorithms where the best base has regret ","element":"span"},{"style":{"height":17.6},"width":502.13,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-4.png","element":"img","alt":"�O(T x) for some 0 < x < 1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"such that if ","element":"span"},{"text":"Alg ","element":"span"},{"style":{"fontStyle":"italic"},"text":"has no knowledge of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x ","element":"span"},{"style":{"fontStyle":"italic"},"text":"nor of the reward of the best arm, then there exists a potentially different model selection problem where the best base also has regret ","element":"span"},{"style":{"height":17.2},"width":195.53,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-5.png","element":"img","alt":"�O(T x) but","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"the model selection regret guarantee of ","element":"span"},{"text":"Alg ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is lower bounded by 
","element":"span"},{"text":"Ω(","element":"span"},{"style":{"height":17.6},"width":297.88,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-6.png","element":"img","alt":"T y) with y > x.","inline":true}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"Let the set of arms be ","element":"span"},{"style":{"height":17.6},"width":208.59,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-7.png","element":"img","alt":" {a1, a2, a3}","inline":true},{"text":". Let ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"y ","element":"span"},{"text":"be such that 0 ","element":"span"},{"style":{"height":15.2},"width":205.27,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-8.png","element":"img","alt":" < x < y ≤","inline":true,"padRight":true},{"text":"1. Let ","element":"span"},{"style":{"height":15.94},"width":346.47,"height":39.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-9.png","element":"img","alt":"∆ = T x−1+(y−x)/2","inline":true},{"text":". 
Define two environments ","element":"span"},{"style":{"height":15.02},"width":40.03,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-10.png","element":"img","alt":" E1","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":15.02},"width":40.03,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-11.png","element":"img","alt":" E2","inline":true,"padRight":true},{"text":"with reward vectors ","element":"span"},{"style":{"fontStyle":"italic"},"text":"{","element":"span"},{"text":"1","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"1","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"0","element":"span"},{"style":{"fontStyle":"italic"},"text":"} ","element":"span"},{"text":"and ","element":"span"},{"style":{"height":17.6},"width":521.56,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-12.png","element":"img","alt":"{1 + ∆, 1, 0} for {a1, a2, a3}","inline":true},{"text":", respectively. 
Let ","element":"span"},{"style":{"height":15.02},"width":190.51,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-13.png","element":"img","alt":" B1 and B2","inline":true,"padRight":true},{"text":"be two base algorithms defined by the following fixed policies when running alone in ","element":"span"},{"style":{"height":15.02},"width":163.92,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-14.png","element":"img","alt":" E1 or E2:","inline":true}],[{"style":{"width":"71%"},"width":1231,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-15.png","element":"img"}],[{"text":"We also construct base ","element":"span"},{"style":{"height":17.08},"width":45.98,"height":42.69,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-16.png","element":"img","alt":" B′2 ","inline":true,"padRight":true},{"text":"defined as follows. Let ","element":"span"},{"style":{"height":17.6},"width":450.57,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-17.png","element":"img","alt":" c2 > 0 and ϵ2 = (y−x)/","inline":true},{"text":"4 be two constants. 
","element":"span"},{"text":"Base ","element":"span"},{"style":{"height":17.08},"width":45.99,"height":42.69,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-18.png","element":"img","alt":" B′2","inline":true,"padRight":true},{"text":"mimics base ","element":"span"},{"style":{"height":15.02},"width":45.66,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-19.png","element":"img","alt":" B2","inline":true,"padRight":true},{"text":"when ","element":"span"},{"style":{"height":17.35},"width":306.51,"height":43.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-20.png","element":"img","alt":" t ≤ c2T x−y+1+ϵ2","inline":true},{"text":", and picks arm ","element":"span"},{"style":{"height":10.62},"width":40.07,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-21.png","element":"img","alt":" a1","inline":true,"padRight":true},{"text":"when ","element":"span"},{"style":{"height":17.35},"width":306.5,"height":43.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-22.png","element":"img","alt":" t > c2T x−y+1+ϵ2","inline":true},{"text":". 
","element":"span"},{"text":"The instantaneous rewards of ","element":"span"},{"style":{"height":14.62},"width":50.1,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-23.png","element":"img","alt":" B1","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":14.62},"width":50.1,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-24.png","element":"img","alt":" B2","inline":true,"padRight":true},{"text":"when running alone are ","element":"span"},{"style":{"height":19.05},"width":290.88,"height":47.62,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-25.png","element":"img","alt":" r1t = 1 − T x−1","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":19.05},"width":576.95,"height":47.62,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-26.png","element":"img","alt":"r2t = 1−T y−1 for all 1 ≤ t ≤ T","inline":true},{"text":". Next, consider model selection with base algorithms ","element":"span"},{"style":{"height":15.02},"width":129.26,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-27.png","element":"img","alt":" B1 and","inline":true},{"style":{"height":15.02},"width":451.02,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-28.png","element":"img","alt":"B2 in E1. 
Let T1 and T2","inline":true,"padRight":true},{"text":"be the number of rounds that ","element":"span"},{"style":{"height":15.02},"width":192.64,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-29.png","element":"img","alt":" B1 and B2","inline":true,"padRight":true},{"text":"are chosen, respectively.","element":"span"}],[{"text":"First, assume case (1): There exist constants ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c > ","element":"span"},{"text":"0, ","element":"span"},{"style":{"height":10.4},"width":63.84,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-30.png","element":"img","alt":" ϵ >","inline":true,"padRight":true},{"text":"0, ","element":"span"},{"style":{"height":13.2},"width":63.08,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-31.png","element":"img","alt":" p ∈","inline":true,"padRight":true},{"text":"(0","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"1), and ","element":"span"},{"style":{"height":14.62},"width":90.55,"height":36.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-32.png","element":"img","alt":" T0 >","inline":true,"padRight":true},{"text":"0 such that with probability at least ","element":"span"},{"style":{"height":18.33},"width":638.41,"height":45.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/24-33.png","element":"img","alt":" p, T2 ≥ cT x−y+1+ϵ for all T > T0.","inline":true}],[{"id":"id-79","text":"The regret of base ","element":"span"},{"style":{"height":15.02},"width":45.66,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-0.png","element":"img","alt":" B1","inline":true,"padRight":true},{"text":"when running alone for 
","element":"span"},{"style":{"height":14.73},"width":497.38,"height":36.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-1.png","element":"img","alt":" T rounds is T · T x−1 = T x","inline":true},{"text":". The regret of the model selection method is at least","element":"span"}],[{"style":{"width":"54%"},"width":939,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-2.png","element":"img"}],[{"text":"Given that the inequality holds for any ","element":"span"},{"style":{"height":14.62},"width":133.32,"height":36.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-3.png","element":"img","alt":" T > T0","inline":true},{"text":", it proves the statement of the lemma in case (1).","element":"span"}],[{"text":"Next, we assume the complement of case (1): For all constants ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c > ","element":"span"},{"text":"0, ","element":"span"},{"style":{"height":10.4},"width":63.85,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-4.png","element":"img","alt":" ϵ >","inline":true,"padRight":true},{"text":"0, ","element":"span"},{"style":{"height":13.2},"width":63.07,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-5.png","element":"img","alt":" p ∈","inline":true,"padRight":true},{"text":"(0","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"1), and ","element":"span"},{"style":{"height":14.62},"width":90.55,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-6.png","element":"img","alt":" T0 >","inline":true,"padRight":true},{"text":"0, with probability at least ","element":"span"},{"style":{"height":18.33},"width":687.13,"height":45.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-7.png","element":"img","alt":" p, T2 < cT x−y+1+ϵ for some T > T0.","inline":true}],[{"text":"Let 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"T ","element":"span"},{"text":"be any such time horizon. Consider model selection with base algorithms ","element":"span"},{"style":{"height":15.02},"width":131.4,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-8.png","element":"img","alt":" B1 and","inline":true},{"style":{"height":17.08},"width":45.99,"height":42.7,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-9.png","element":"img","alt":"B′2 ","inline":true,"padRight":true},{"text":"in environment ","element":"span"},{"style":{"height":15.02},"width":151.76,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-10.png","element":"img","alt":" E2 for T","inline":true,"padRight":true},{"text":"rounds. Let ","element":"span"},{"style":{"height":16.68},"width":188.5,"height":41.7,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-11.png","element":"img","alt":" T ′1 and T ′2 ","inline":true,"padRight":true},{"text":"be the number of rounds that ","element":"span"},{"style":{"height":17.08},"width":190.09,"height":42.7,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-12.png","element":"img","alt":" B1 and B′2","inline":true,"padRight":true},{"text":"are chosen. 
Note that ","element":"span"},{"style":{"height":17.08},"width":193.04,"height":42.69,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-13.png","element":"img","alt":" B2 and B′2 ","inline":true,"padRight":true},{"text":"behave the same for ","element":"span"},{"style":{"height":17.35},"width":218.3,"height":43.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-14.png","element":"img","alt":" c2T x−y+1+ϵ","inline":true,"padRight":true},{"text":"time steps, and that ","element":"span"},{"style":{"height":15.02},"width":45.66,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-15.png","element":"img","alt":" B1","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":15.02},"width":45.66,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-16.png","element":"img","alt":" B2","inline":true,"padRight":true},{"text":"never choose action ","element":"span"},{"style":{"height":10.62},"width":40.07,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-17.png","element":"img","alt":" a1","inline":true},{"text":". 
Therefore for the first ","element":"span"},{"style":{"height":17.35},"width":231,"height":43.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-18.png","element":"img","alt":" c2T x−y+1+ϵ2","inline":true,"padRight":true},{"text":"time steps, the model selection strategy that selects between ","element":"span"},{"style":{"height":15.02},"width":45.66,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-19.png","element":"img","alt":" B1","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":17.08},"width":45.99,"height":42.7,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-20.png","element":"img","alt":" B′2","inline":true,"padRight":true},{"text":"in ","element":"span"},{"style":{"height":15.02},"width":40.03,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-21.png","element":"img","alt":" E2","inline":true,"padRight":true},{"text":"behaves the same as when it runs ","element":"span"},{"style":{"height":15.02},"width":45.66,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-22.png","element":"img","alt":"B1","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":15.02},"width":45.66,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-23.png","element":"img","alt":" B2","inline":true,"padRight":true},{"text":"in ","element":"span"},{"style":{"height":15.02},"width":40.03,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-24.png","element":"img","alt":" E1","inline":true},{"text":". 
Therefore with probability ","element":"span"},{"style":{"height":19.41},"width":525.42,"height":48.53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-25.png","element":"img","alt":" p > 1/2, T ′2 < c2T x−y+1+ϵ2","inline":true},{"text":", which implies ","element":"span"},{"style":{"height":17.88},"width":188.59,"height":44.69,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-26.png","element":"img","alt":"T ′1 > T/2.","inline":true}],[{"style":{"width":"95%"},"width":1644,"height":143,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-27.png","element":"img"}],[{"text":"Given that with probability ","element":"span"},{"style":{"height":17.88},"width":327.26,"height":44.69,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-28.png","element":"img","alt":" p > 1/2, T ′1 > T/","inline":true},{"text":"2, the regret of the learner is lower bounded ","element":"span"},{"text":"as,","element":"span"}],[{"style":{"width":"61%"},"width":1053,"height":83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-29.png","element":"img"}],[{"text":"which is larger than the regret of ","element":"span"},{"style":{"height":17.08},"width":45.99,"height":42.69,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-30.png","element":"img","alt":" B′2 ","inline":true,"padRight":true},{"text":"running alone because ","element":"span"},{"style":{"height":22.31},"width":210.39,"height":55.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-31.png","element":"img","alt":" 3x+y4 < x+y2 ","inline":true,"padRight":true},{"text":". 
The statement of ","element":"span"},{"text":"the lemma follows given that for any ","element":"span"},{"style":{"height":14.62},"width":42.5,"height":36.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-32.png","element":"img","alt":" T0","inline":true,"padRight":true},{"text":"there exists ","element":"span"},{"style":{"height":14.62},"width":135.13,"height":36.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-33.png","element":"img","alt":" T > T0","inline":true,"padRight":true},{"text":"so that the model selection fails.","element":"span"}]]},{"heading":"6 Applications of Stochastic CORRAL","paragraphs":[[{"style":{"fontWeight":"bold"},"text":"Misspecified Contextual Linear Bandit","element":"span"}],[{"text":"We consider model selection in the misspecified linear bandit problem. The learner selects an action ","element":"span"},{"style":{"height":15.42},"width":137.47,"height":38.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-34.png","element":"img","alt":" at ∈ At","inline":true,"padRight":true},{"text":"and receives a reward ","element":"span"},{"style":{"height":19.54},"width":943.74,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-35.png","element":"img","alt":" rt such that |E[rt] − a⊤t θ| ≤ ϵ∗ where θ ∈ Rd is an","inline":true,"padRight":true},{"text":"unknown parameter vector and ","element":"span"},{"style":{"height":10.22},"width":34.71,"height":25.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-36.png","element":"img","alt":" ϵ∗","inline":true,"padRight":true},{"text":"is the misspecification error. 
For this problem, [","element":"span"},{"href":"#id-42","referenceIndex":32,"text":"Zan+20","element":"a"},{"text":"] and [","element":"span"},{"href":"#id-0","referenceIndex":20,"text":"LSW20","element":"a"},{"text":"] present variants of LinUCB that achieve a high probability ","element":"span"},{"style":{"height":20.08},"width":338.47,"height":50.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-37.png","element":"img","alt":"�O(d√T + ϵ∗√dT)","inline":true,"padRight":true},{"text":"regret bound. Both algorithms require knowledge of ","element":"span"},{"href":"#id-0","referenceIndex":20,"style":{"height":17.6},"width":294.43,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-38.png","element":"img","alt":" ϵ∗, but [LSW20","inline":true},{"text":"] show a regret bound of the same order without the knowledge of ","element":"span"},{"style":{"height":10.22},"width":34.71,"height":25.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-39.png","element":"img","alt":" ϵ∗","inline":true,"padRight":true},{"text":"for the version of the problem with a fixed action set ","element":"span"},{"style":{"height":15.42},"width":153,"height":38.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-40.png","element":"img","alt":" At = A","inline":true},{"text":". Their method relies on G-optimal design, which does not work for contextual settings. 
It is an open question whether it is possible to achieve the above regret without knowing ","element":"span"},{"style":{"height":10.22},"width":34.71,"height":25.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/25-41.png","element":"img","alt":" ϵ∗","inline":true,"padRight":true},{"text":"for problems with changing action sets.","element":"span"}],[{"id":"id-77","text":"In this section, we show a ","element":"span"},{"style":{"height":20.08},"width":323.42,"height":50.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/26-0.png","element":"img","alt":"�O(d√T + ϵ∗√dT","inline":true},{"text":") regret bound for linear bandit problems with changing action sets without knowing ","element":"span"},{"style":{"height":10.22},"width":34.72,"height":25.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/26-1.png","element":"img","alt":" ϵ∗","inline":true},{"text":". For problems with fixed action sets, we show an improved regret that matches the lower bound of [","element":"span"},{"href":"#id-40","referenceIndex":19,"text":"LS20","element":"a"},{"text":"].","element":"span"}],[{"text":"Given a constant ","element":"span"},{"style":{"fontStyle":"italic"},"text":"E ","element":"span"},{"text":"so that ","element":"span"},{"style":{"height":17.6},"width":153.11,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/26-2.png","element":"img","alt":" |ϵ∗| ≤ E","inline":true},{"text":", we divide the interval [1","element":"span"},{"style":{"fontStyle":"italic"},"text":", E","element":"span"},{"text":"] into an exponential grid ","element":"span"},{"style":{"height":20.34},"width":717.41,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/26-3.png","element":"img","alt":" G = [1, 2, 2^2, ..., 2^{log(E)}]. 
We use log(E","inline":true},{"text":") modified LinUCB bases, from either [","element":"span"},{"href":"#id-42","referenceIndex":32,"text":"Zan+20","element":"a"},{"text":"] or [","element":"span"},{"href":"#id-0","referenceIndex":20,"text":"LSW20","element":"a"},{"text":"], with each base algorithm instantiated with a value of ","element":"span"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/26-4.png","element":"img","alt":" ϵ","inline":true,"padRight":true},{"text":"in the grid.","element":"span"}],[{"id":"id-47","style":{"fontWeight":"bold"},"text":"Theorem 6.1. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"For the misspecified linear bandit problem described above, the regret of Stochastic CORRAL with a CORRAL meta-algorithm using learning rate ","element":"span"},{"style":{"height":25.5},"width":162.96,"height":63.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/26-5.png","element":"img","alt":" η = 1√Td","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"LinUCB base algorithms with target misspecification level ","element":"span"},{"style":{"height":14.8},"width":116.56,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/26-6.png","element":"img","alt":" ϵ ∈ G","inline":true},{"style":{"fontStyle":"italic"},"text":", is upper bounded by","element":"span"}],[{"style":{"height":20.08},"width":327.87,"height":50.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/26-7.png","element":"img","alt":"O(d√T + ϵ∗√dT","inline":true},{"text":")","element":"span"},{"style":{"fontStyle":"italic"},"text":". 
In the case of a fixed action linear bandit problem with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arms and ","element":"span"},{"style":{"height":17.6},"width":141.66,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/26-8.png","element":"img","alt":"√k > d","inline":true},{"style":{"fontStyle":"italic"},"text":", the regret of Stochastic CORRAL with a CORRAL meta-algorithm using learning rate ","element":"span"},{"style":{"height":25.5},"width":171.31,"height":63.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/26-9.png","element":"img","alt":" η = 1√Td","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"applied to a set of base algorithms consisting of one UCB base and one ","element":"span"},{"style":{"fontStyle":"italic"},"text":"G-optimal base algorithm [","element":"span"},{"href":"#id-0","referenceIndex":20,"style":{"fontStyle":"italic"},"text":"LSW20","element":"a"},{"style":{"fontStyle":"italic"},"text":"] is upper bounded by ","element":"span"},{"style":{"height":31.6},"width":627.08,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/26-10.png","element":"img","alt":"�O�min�kd√T, d√T + ϵ∗√dT��.","inline":true}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"From Lemma ","element":"span"},{"href":"#id-43","text":"4.7","element":"a"},{"text":", for UCB, ","element":"span"},{"style":{"height":21.69},"width":456.14,"height":54.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/26-11.png","element":"img","alt":" U(T, δ) = O(√Tk log Tkδ ","inline":true,"padRight":true},{"text":"). 
Therefore from Theorem ","element":"span"},{"href":"#id-19","text":"4.11","element":"a"},{"text":", ","element":"span"},{"text":"running CORRAL with smooth UCB results in the following regret bound:","element":"span"}],[{"style":{"width":"100%"},"width":1716,"height":734,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/26-12.png","element":"img"}],[{"text":"Maximizing over ","element":"span"},{"style":{"height":12},"width":23,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/26-13.png","element":"img","alt":" ρ","inline":true,"padRight":true},{"text":"results in a regret guarantee of the form ","element":"span"},{"style":{"height":31.6},"width":567.76,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/26-14.png","element":"img","alt":"�O�√T + 1η + Td2η + ϵ√dT�.","inline":true,"padRight":true},{"text":"For the misspecified linear bandit problem we use ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M ","element":"span"},{"text":"= ","element":"span"},{"style":{"fontStyle":"italic"},"text":"O","element":"span"},{"text":"(log(","element":"span"},{"style":{"fontStyle":"italic"},"text":"T","element":"span"},{"text":")) LinUCB bases with ","element":"span"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/26-15.png","element":"img","alt":" ϵ","inline":true,"padRight":true},{"text":"defined in the grid, and choose ","element":"span"},{"style":{"height":25.5},"width":156.2,"height":63.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/26-16.png","element":"img","alt":" η = 1√Td","inline":true},{"text":". 
The resulting regret for Stochastic CORRAL is of","element":"span"}],[{"text":"the form ","element":"span"},{"style":{"height":31.6},"width":359.66,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/26-17.png","element":"img","alt":"�O�√Td + ϵ√dT�.","inline":true}],[{"text":"When the action sets are fixed, by the choice of ","element":"span"},{"style":{"height":25.5},"width":169.52,"height":63.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/26-18.png","element":"img","alt":" η = 1√Td","inline":true},{"text":", the regret of Stochastic ","element":"span"},{"text":"CORRAL with a CORRAL meta-algorithm over one UCB and one G-optimal base equals:","element":"span"}],[{"style":{"width":"47%"},"width":810,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/26-19.png","element":"img"}],[{"id":"id-78","text":"If","element":"span"},{"style":{"height":17.6},"width":141.66,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/27-0.png","element":"img","alt":"√k > d","inline":true},{"text":", the above expression becomes ","element":"span"},{"style":{"height":31.6},"width":595.08,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/27-1.png","element":"img","alt":"�O�min�√T kd,√Td + ϵ√dT��","inline":true}],[{"text":"Observe that in the case of a fixed action linear bandit problem, the regret upper bound we achieve for Stochastic CORRAL with a CORRAL meta-algorithm and a learning rate of ","element":"span"},{"style":{"height":25.5},"width":157.38,"height":63.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/27-2.png","element":"img","alt":"η = 1√Td","inline":true,"padRight":true},{"text":"is of the form ","element":"span"},{"style":{"height":31.6},"width":614.02,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/27-3.png","element":"img","alt":" �O�min�kd√T, d√T + 
ϵ∗√dT��","inline":true},{"text":". The product of the terms inside","element":"span"}],[{"text":"the minimum is of order ","element":"span"},{"style":{"height":17.2},"width":107.72,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/27-4.png","element":"img","alt":"�O(kT","inline":true},{"text":"). This result matches the following lower bound that shows that it is impossible to achieve ","element":"span"},{"style":{"height":20.08},"width":696.05,"height":50.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/27-5.png","element":"img","alt":"�O(min(√kT, d√T + ϵ∗√dT)) regret:","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Lemma 6.2 ","element":"span"},{"text":"(Implied by Theorem 24.4 in [","element":"span"},{"href":"#id-40","referenceIndex":19,"text":"LS20","element":"a"},{"text":"])","element":"span"},{"style":{"fontWeight":"bold"},"text":". ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Let ","element":"span"},{"style":{"height":17.6},"width":102.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/27-6.png","element":"img","alt":" Rν(T","inline":true},{"text":") ","element":"span"},{"style":{"fontStyle":"italic"},"text":"denote the cumulative regret at time ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T ","element":"span"},{"style":{"fontStyle":"italic"},"text":"on environment ","element":"span"},{"style":{"height":8},"width":24,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/27-7.png","element":"img","alt":" ν","inline":true},{"style":{"fontStyle":"italic"},"text":". 
For any algorithm, there exists a ","element":"span"},{"text":"1","element":"span"},{"style":{"fontStyle":"italic"},"text":"-dimensional linear bandit environment ","element":"span"},{"style":{"height":10.62},"width":38.56,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/27-8.png","element":"img","alt":" ν1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k","element":"span"},{"style":{"fontStyle":"italic"},"text":"-armed bandit environment ","element":"span"},{"style":{"height":10.62},"width":38.56,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/27-9.png","element":"img","alt":" ν2","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"such that ","element":"span"},{"style":{"height":17.6},"width":346.09,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/27-10.png","element":"img","alt":" Rν1(T) · Rν2(T) ≥","inline":true},{"style":{"height":19.13},"width":243.34,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/27-11.png","element":"img","alt":"T(k − 1)e−2.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Experiment (Figure ","element":"span"},{"href":"#id-44","style":{"fontWeight":"bold"},"text":"1","element":"a"},{"style":{"fontWeight":"bold"},"text":"). ","element":"span"},{"text":"Let ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d ","element":"span"},{"text":"= 2. 
Consider a contextual bandit problem with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"= 50 arms, where each arm ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j ","element":"span"},{"text":"has an associated vector ","element":"span"},{"style":{"height":20.15},"width":146.35,"height":50.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/27-12.png","element":"img","alt":" aj ∈ Rd","inline":true,"padRight":true},{"text":"sampled uniformly at random from [0","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"1]","element":"span"},{"style":{"height":8.8},"width":18,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/27-13.png","element":"img","alt":"d","inline":true},{"text":". We consider two cases: (1) For a ","element":"span"},{"style":{"height":15.94},"width":124.62,"height":39.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/27-14.png","element":"img","alt":" θ ∈ Rd","inline":true,"padRight":true},{"text":"sampled uniformly at random from [0","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"1]","element":"span"},{"style":{"height":8.8},"width":18,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/27-15.png","element":"img","alt":"d","inline":true},{"text":", reward of arm ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j ","element":"span"},{"text":"at time ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"is ","element":"span"},{"style":{"height":19.69},"width":164.7,"height":49.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/27-16.png","element":"img","alt":" a⊤j θ + ηt","inline":true},{"text":", where 
","element":"span"},{"style":{"height":17.6},"width":195.26,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/27-17.png","element":"img","alt":" ηt ∼ N(0,","inline":true,"padRight":true},{"text":"1), and (2) There are ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"parameters ","element":"span"},{"style":{"height":17.82},"width":233.72,"height":44.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/27-18.png","element":"img","alt":" µj for j ∈ [k","inline":true},{"text":"] all sampled uniformly at random from [0","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"10], so that the reward of arm ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j ","element":"span"},{"text":"at time ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"is sampled from ","element":"span"},{"style":{"height":17.82},"width":112.29,"height":44.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/27-19.png","element":"img","alt":" N(µj,","inline":true,"padRight":true},{"text":"1). We use CORRAL with learning rate ","element":"span"},{"style":{"height":25.5},"width":156.23,"height":63.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/27-20.png","element":"img","alt":" η = 2√Td","inline":true,"padRight":true},{"text":"and UCB and LinUCB as base algorithm. In case (1) LinUCB performs better while in case (2) UCB performs better. Each experiment is repeated 500 times.","element":"span"}],[{"style":{"width":"93%"},"width":1608,"height":658,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/27-21.png","element":"img"}],[{"id":"id-44","text":"Figure 1: CORRAL with UCB and LinUCB bases. 
Shaded regions denote the standard ","element":"figcaption","subtype":"caption"},{"text":"deviations.","element":"figcaption","subtype":"caption"}],[{"id":"id-72","style":{"fontWeight":"bold"},"text":"Contextual Bandits with Unknown Dimension","element":"span"}],[{"text":"We consider model selection in the nested contextual linear bandit problem studied by [","element":"span"},{"href":"#id-1","referenceIndex":15,"text":"FKL19","element":"a"},{"text":"]. In this problem the context space ","element":"span"},{"style":{"height":16.33},"width":159.42,"height":40.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-0.png","element":"img","alt":" A ⊂ RD","inline":true},{"text":". Each action is a ","element":"span"},{"style":{"height":12},"width":71.34,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-1.png","element":"img","alt":" D−","inline":true},{"text":"dimensional vector and each context ","element":"span"},{"style":{"height":15.42},"width":46.84,"height":38.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-2.png","element":"img","alt":" At","inline":true,"padRight":true},{"text":"is a subset of ","element":"span"},{"style":{"height":15.13},"width":59.52,"height":37.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-3.png","element":"img","alt":" RD","inline":true},{"text":". The unknown parameter vector ","element":"span"},{"style":{"height":17.75},"width":234.06,"height":44.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-4.png","element":"img","alt":" θ∗ ∈ RD but","inline":true,"padRight":true},{"text":"only its first ","element":"span"},{"style":{"height":15.02},"width":39.71,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-5.png","element":"img","alt":" d∗","inline":true,"padRight":true},{"text":"coordinates are nonzero. 
Here, ","element":"span"},{"style":{"height":15.02},"width":39.71,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-6.png","element":"img","alt":" d∗","inline":true,"padRight":true},{"text":"is unknown and possibly much smaller than ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D","element":"span"},{"text":". We assume access to a family of LinUCB algorithms ","element":"span"},{"style":{"height":20.02},"width":140.69,"height":50.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-7.png","element":"img","alt":" {Bi}Mi=1","inline":true,"padRight":true},{"text":"with increasing ","element":"span"},{"text":"dimensionality ","element":"span"},{"style":{"height":15.02},"width":34.71,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-8.png","element":"img","alt":" di","inline":true},{"text":". Algorithm ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i ","element":"span"},{"text":"is designed to ’believe’ the unknown parameter vector ","element":"span"},{"style":{"height":15.02},"width":37.48,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-9.png","element":"img","alt":" θ∗","inline":true,"padRight":true},{"text":"has only nonzero entries in the first ","element":"span"},{"style":{"height":15.02},"width":34.71,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-10.png","element":"img","alt":" di","inline":true,"padRight":true},{"text":"entries. In [","element":"span"},{"href":"#id-1","referenceIndex":15,"text":"FKL19","element":"a"},{"text":"] the authors consider the special case when ","element":"span"},{"style":{"height":17.6},"width":414.52,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-11.png","element":"img","alt":" |At| = k < ∞ for all t","inline":true},{"text":". 
In order to obtain their model selection guarantees they require a lower bound on the average eigenvalues of the covariance matrices of all actions. In contrast, we do not require any such structural assumptions on the context. We provide the first sublinear regret for this problem when the action set is infinite. Further, we have no eigenvalue assumptions and our regret does not scale with the number of actions ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k","element":"span"},{"text":".","element":"span"}],[{"text":"We use LinUCB with each value of ","element":"span"},{"style":{"height":13.2},"width":71.13,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-12.png","element":"img","alt":" d ∈","inline":true,"padRight":true},{"text":"[1","element":"span"},{"style":{"height":19.13},"width":298.29,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-13.png","element":"img","alt":", 2, 22, ..., 2log(D)","inline":true},{"text":"] as a base algorithm for CORRAL and EXP3.P. 
We also consider the case when both the optimal dimension ","element":"span"},{"style":{"height":15.02},"width":39.71,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-14.png","element":"img","alt":" d∗","inline":true,"padRight":true},{"text":"and the misspecification ","element":"span"},{"style":{"height":10.22},"width":34.71,"height":25.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-15.png","element":"img","alt":" ϵ∗","inline":true,"padRight":true},{"text":"are unknown: we use ","element":"span"},{"style":{"height":17.6},"width":374.85,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-16.png","element":"img","alt":" M = log(E) · log(D","inline":true},{"text":") modified LinUCB bases (see the discussion on Misspecified Contextual Linear Bandits above) for each value of (","element":"span"},{"style":{"height":15.6},"width":95.74,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-17.png","element":"img","alt":"ϵ∗, d∗","inline":true},{"text":") in the grid [1","element":"span"},{"style":{"height":20.33},"width":719.69,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-18.png","element":"img","alt":", 2, 22, ..., 2log(E)] × [1, 2, 22, ..., 2log(D)].","inline":true}],[{"text":"From Lemma ","element":"span"},{"href":"#id-45","text":"4.4 ","element":"a"},{"text":"and Lemma ","element":"span"},{"href":"#id-46","text":"4.5","element":"a"},{"text":", for linear contextual bandit, LinUCB is (","element":"span"},{"style":{"height":15.6},"width":118.14,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-19.png","element":"img","alt":"U, δ, T","inline":true},{"text":")-bounded with ","element":"span"},{"style":{"height":19.38},"width":451.8,"height":48.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-20.png","element":"img","alt":" U(t, δ) = O(d√t 
log(1/δ","inline":true},{"text":")) for infinite action sets ","element":"span"},{"style":{"height":12.4},"width":68.55,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-21.png","element":"img","alt":" U−","inline":true},{"text":"bounded with ","element":"span"},{"style":{"height":17.2},"width":169.35,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-22.png","element":"img","alt":" U(t, δ) =","inline":true},{"style":{"height":20.08},"width":452.51,"height":50.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-23.png","element":"img","alt":"O(√dt log3(kT log(T)/δ","inline":true},{"text":")) for finite action sets. Choose ","element":"span"},{"style":{"height":17.6},"width":154.02,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-24.png","element":"img","alt":" δ = 1/T","inline":true,"padRight":true},{"text":"and ignore the log factor, ","element":"span"},{"style":{"height":19.38},"width":309.85,"height":48.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-25.png","element":"img","alt":"U(t, δ) = �O(d√t","inline":true},{"text":") for infinite action sets and ","element":"span"},{"style":{"height":20.08},"width":309.88,"height":50.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-26.png","element":"img","alt":" U(t, δ) = �O(√dt","inline":true},{"text":") for finite action sets. 
Then ","element":"span"},{"style":{"height":17.6},"width":254.26,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-27.png","element":"img","alt":"U(t) = c(δ)tα","inline":true,"padRight":true},{"text":"with ","element":"span"},{"style":{"height":17.6},"width":130.19,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-28.png","element":"img","alt":" α = 1/","inline":true},{"text":"2 and ","element":"span"},{"style":{"height":17.6},"width":206.69,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-29.png","element":"img","alt":" c(δ) = �O(d","inline":true},{"text":") for infinite action sets, and ","element":"span"},{"style":{"height":20.08},"width":243.05,"height":50.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-30.png","element":"img","alt":" c(δ) = �O(√d","inline":true},{"text":") for finite action sets.","element":"span"}],[{"text":"Now consider the misspecified linear contextual bandit problem with unknown ","element":"span"},{"style":{"height":15.02},"width":184.7,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-31.png","element":"img","alt":" d∗ and ϵ∗.","inline":true,"padRight":true},{"text":"We use the smoothed LinUCB bases [","element":"span"},{"href":"#id-0","referenceIndex":20,"text":"LSW20","element":"a"},{"text":"; ","element":"span"},{"href":"#id-42","referenceIndex":32,"text":"Zan+20","element":"a"},{"text":"]. 
Using the calculation in the proof of Theorem ","element":"span"},{"href":"#id-47","text":"6.1 ","element":"a"},{"text":"in Section ","element":"span"},{"text":"6","element":"span"},{"text":", using CORRAL with a smooth LinUCB base with parameters (","element":"span"},{"style":{"height":15.6},"width":60.16,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-32.png","element":"img","alt":"d, ϵ","inline":true},{"text":") in the grids results in ","element":"span"},{"style":{"height":31.6},"width":432.98,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-33.png","element":"img","alt":"�O�1η + Td2η + ϵ√dT�","inline":true},{"text":"regret. Since ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d ","element":"span"},{"text":"is unknown, choosing","element":"span"}],[{"style":{"height":19.98},"width":198.58,"height":49.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-34.png","element":"img","alt":"η = 1/√T","inline":true,"padRight":true},{"text":"yields the regret ","element":"span"},{"style":{"height":31.6},"width":365.08,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-35.png","element":"img","alt":"�O�√Td2∗ + ϵ√dT�","inline":true},{"text":". 
Using EXP3.P with a smooth LinUCB base with parameters (","element":"span"},{"style":{"height":15.6},"width":60.16,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-36.png","element":"img","alt":"d, ϵ","inline":true},{"text":") in the grids results in:","element":"span"}],[{"style":{"width":"63%"},"width":1092,"height":373,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-37.png","element":"img"}],[{"text":"Since ","element":"span"},{"style":{"height":15.02},"width":39.72,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-38.png","element":"img","alt":" d∗","inline":true,"padRight":true},{"text":"is unknown, choosing ","element":"span"},{"style":{"height":19.54},"width":196.57,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-39.png","element":"img","alt":" p = T^{−1/3}","inline":true,"padRight":true},{"text":"yields a ","element":"span"},{"style":{"height":22.89},"width":334.72,"height":57.24,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/28-40.png","element":"img","alt":"Õ(T^{2/3} d∗ + ϵ∗√dT","inline":true},{"text":") regret bound. We","element":"span"}],[{"id":"id-73","text":"summarize our results in the following table:","element":"span"}],[{"style":{"width":"90%"},"width":1555,"height":549,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/29-0.png","element":"img"}],[{"text":"We study model selection in the setting of non-parametric contextual bandits. [","element":"span"},{"href":"#id-48","referenceIndex":16,"text":"GJ18","element":"a"},{"text":"] consider non-parametric stochastic contextual bandits. 
At time ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"and given a context ","element":"span"},{"style":{"height":18.34},"width":242.47,"height":45.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/29-1.png","element":"img","alt":" xt ∈ RD, the","inline":true,"padRight":true},{"text":"learner selects arm ","element":"span"},{"style":{"height":17.6},"width":136.34,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/29-2.png","element":"img","alt":" at ∈ [k","inline":true},{"text":"] and observes the reward ","element":"span"},{"style":{"height":17.6},"width":136.99,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/29-3.png","element":"img","alt":" f(at, xt","inline":true},{"text":") + ","element":"span"},{"style":{"height":16.4},"width":31.09,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/29-4.png","element":"img","alt":" ξt","inline":true},{"text":", where ","element":"span"},{"style":{"height":16.4},"width":31.09,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/29-5.png","element":"img","alt":" ξt","inline":true,"padRight":true},{"text":"is a 1-sub-Gaussian random variable and for all ","element":"span"},{"style":{"height":17.6},"width":118.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/29-6.png","element":"img","alt":" a ∈ [k","inline":true},{"text":"], the reward function ","element":"span"},{"style":{"height":17.6},"width":97.83,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/29-7.png","element":"img","alt":" f(a, ·","inline":true},{"text":") is ","element":"span"},{"style":{"height":12},"width":63.7,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/29-8.png","element":"img","alt":" L−","inline":true},{"text":"lipschitz in the context 
","element":"span"},{"style":{"height":15.93},"width":140.66,"height":39.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/29-9.png","element":"img","alt":" x ∈ RD","inline":true},{"text":". It is assumed that the contexts arrive in an IID fashion. [","element":"span"},{"href":"#id-48","referenceIndex":16,"text":"GJ18","element":"a"},{"text":"] obtain a ","element":"span"},{"style":{"height":31.81},"width":189.28,"height":79.53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/29-10.png","element":"img","alt":"�O�T1+d2+d�","inline":true},{"text":"regret for this problem. Similar to [","element":"span"},{"href":"#id-1","referenceIndex":15,"text":"FKL19","element":"a"},{"text":"], we assume that only the first ","element":"span"},{"style":{"height":15.02},"width":39.71,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/29-11.png","element":"img","alt":" d∗","inline":true,"padRight":true},{"text":"context features are relevant for an unknown ","element":"span"},{"style":{"height":15.02},"width":139.69,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/29-12.png","element":"img","alt":" d∗ < D","inline":true},{"text":". It is important to find ","element":"span"},{"style":{"height":15.02},"width":39.71,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/29-13.png","element":"img","alt":" d∗","inline":true,"padRight":true},{"text":"because ","element":"span"},{"style":{"height":22.12},"width":289.62,"height":55.29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/29-14.png","element":"img","alt":" T1+d∗2+d∗ ≪ T1+D2+D","inline":true,"padRight":true},{"text":". 
Stochastic CORRAL can successfully adapt to this unknown quantity: we can initialize a smoothed copy of Algorithm 2 of [","element":"span"},{"href":"#id-48","referenceIndex":16,"text":"GJ18","element":"a"},{"text":"] for each value of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d ","element":"span"},{"text":"in the grid [","element":"span"},{"style":{"height":19.14},"width":359.38,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/29-15.png","element":"img","alt":"b0, b1, b2, ..., blogb(D)","inline":true},{"text":"] for some ","element":"span"},{"style":{"fontStyle":"italic"},"text":"b > ","element":"span"},{"text":"1 and perform model selection with CORRAL and EXP3.P with these base algorithms.","element":"span"}],[{"style":{"width":"68%"},"width":1176,"height":149,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/29-16.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Tuning the Exploration Rate of ","element":"span"},{"style":{"height":8.8},"width":19,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/29-17.png","element":"img","alt":" ϵ","inline":true},{"style":{"fontWeight":"bold"},"text":"-greedy","element":"span"}],[{"text":"We study the problem of selecting the optimal scaling for the exploration probability in the ","element":"span"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/29-18.png","element":"img","alt":" ϵ","inline":true},{"text":"-greedy algorithm. 
Recall that for a given positive constant ","element":"span"},{"style":{"height":15.6},"width":138.45,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/29-19.png","element":"img","alt":" c, the ϵ","inline":true},{"text":"-greedy algorithm pulls the arm with the largest empirical average reward with probability 1 ","element":"span"},{"style":{"height":17.6},"width":103.32,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/29-20.png","element":"img","alt":" − c/t","inline":true},{"text":", and otherwise pulls an arm uniformly at random. Let ","element":"span"},{"style":{"height":17.6},"width":146.16,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/29-21.png","element":"img","alt":" ϵt = c/t","inline":true},{"text":". It can be shown that the optimal value for ","element":"span"},{"style":{"height":25.34},"width":509.54,"height":63.34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/29-22.png","element":"img","alt":" ϵt is min{1, 5k∆2∗t} where ∆∗","inline":true,"padRight":true},{"text":"is the smallest gap between the optimal arm and the ","element":"span"},{"text":"sub-optimal arms [","element":"span"},{"href":"#id-40","referenceIndex":19,"text":"LS20","element":"a"},{"text":"]. With this exploration rate, the regret scales as ","element":"span"},{"style":{"height":19.98},"width":333.46,"height":49.95,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/29-23.png","element":"img","alt":"�O(√T) for k = 2.","inline":true,"padRight":true},{"text":"We would like to find the optimal value of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"text":"without the knowledge of ∆","element":"span"},{"style":{"height":6},"width":17,"height":15,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/29-24.png","element":"img","alt":"∗","inline":true},{"text":". 
In this discussion we show it is possible to obtain such a result by applying CORRAL to a set of ","element":"span"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/29-25.png","element":"img","alt":" ϵ","inline":true},{"text":"-greedy base algorithms each instantiated with a ","element":"span"},{"style":{"height":20.33},"width":455.74,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/29-26.png","element":"img","alt":" c in [1, 2, 2^2, ..., 2^{log(kT)}].","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Theorem 6.3. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"The regret of CORRAL using smoothed ","element":"span"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/29-27.png","element":"img","alt":" ϵ","inline":true},{"style":{"fontStyle":"italic"},"text":"-greedy base algorithms defined on the grid is bounded by ","element":"span"},{"style":{"height":20.33},"width":397.01,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/29-28.png","element":"img","alt":"Õ(T^{1/2}) when k = 2.","inline":true}],[{"id":"id-75","style":{"width":"48%"},"width":836,"height":584,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/30-0.png","element":"img"}],[{"id":"id-50","text":"Figure 2: CORRAL with ","element":"figcaption","subtype":"caption"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/30-1.png","element":"img","alt":" ϵ","inline":true},{"text":"-Greedy bases with different exploration rates. ","element":"figcaption","subtype":"caption"},{"style":{"height":8.4},"width":17,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/30-2.png","element":"img","alt":"5","inline":true}],[{"style":{"fontStyle":"italic"},"text":"Proof. 
","element":"span"},{"text":"From Lemma ","element":"span"},{"href":"#id-49","text":"4.9","element":"a"},{"text":", we lower bound the smallest gap by 1","element":"span"},{"style":{"fontStyle":"italic"},"text":"/T ","element":"span"},{"text":"(because the gaps smaller than 1","element":"span"},{"style":{"fontStyle":"italic"},"text":"/T ","element":"span"},{"text":"will cause constant regret in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T ","element":"span"},{"text":"time steps) and choose ","element":"span"},{"style":{"height":19.13},"width":172.26,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/30-3.png","element":"img","alt":" δ = 1/T^5","inline":true},{"text":". From Theorem ","element":"span"},{"href":"#id-19","text":"4.11","element":"a"},{"text":", the regret is ","element":"span"},{"style":{"height":20.34},"width":134.24,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/30-4.png","element":"img","alt":"Õ(T^{2/3}","inline":true},{"text":") when ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k > ","element":"span"},{"text":"2 and ","element":"span"},{"style":{"height":20.34},"width":134.24,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/30-5.png","element":"img","alt":"Õ(T^{1/2}","inline":true},{"text":") when ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"= 2 with the base running alone.","element":"span"}],[{"text":"Next we show that the best value of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"text":"in the exponential grid gives a regret that is within a constant factor of the regret above where we know the smallest non-zero gap ∆","element":"span"},{"style":{"height":6},"width":17,"height":15,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/30-6.png","element":"img","alt":"∗","inline":true},{"text":". 
An exploration rate can be at most ","element":"span"},{"style":{"height":25.34},"width":304.45,"height":63.34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/30-7.png","element":"img","alt":" kT. Since 5K∆2∗ >","inline":true,"padRight":true},{"text":"1, we need to search only in the ","element":"span"},{"text":"interval [1","element":"span"},{"style":{"fontStyle":"italic"},"text":", KT","element":"span"},{"text":"]. Let ","element":"span"},{"style":{"height":10.62},"width":35.88,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/30-8.png","element":"img","alt":" c1","inline":true,"padRight":true},{"text":"be the element in the exponential grid such that ","element":"span"},{"style":{"height":14.95},"width":262.62,"height":37.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/30-9.png","element":"img","alt":" c1 ≤ c∗ ≤ 2c1","inline":true},{"text":". Then 2","element":"span"},{"style":{"height":16},"width":370.18,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/30-10.png","element":"img","alt":"c1 = γc∗ where γ <","inline":true,"padRight":true},{"text":"2 is a constant, and therefore using 2","element":"span"},{"style":{"height":15.94},"width":156.78,"height":39.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/30-11.png","element":"img","alt":"c1 = γc∗ ","inline":true,"padRight":true},{"text":"will give a regret up to a constant factor of the optimal regret.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Experiment (Figure ","element":"span"},{"href":"#id-50","style":{"fontWeight":"bold"},"text":"2","element":"a"},{"style":{"fontWeight":"bold"},"text":"). ","element":"span"},{"text":"Let there be two Bernoulli arms with means 0","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"},{"text":"5 and 0","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"},{"text":"45. 
We use 18 ","element":"span"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/30-12.png","element":"img","alt":" ϵ","inline":true},{"text":"-greedy base algorithms differing in their choice of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"text":"in the exploration rate ","element":"span"},{"style":{"height":19.98},"width":904.43,"height":49.95,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/30-13.png","element":"img","alt":"ϵt = c/t. We take T = 50, 000, η = 20/√T and ϵ","inline":true},{"text":"’s to lie on a geometric grid in [1","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"2","element":"span"},{"style":{"fontStyle":"italic"},"text":"T","element":"span"},{"text":"]","element":"span"},{"style":{"fontStyle":"italic"},"text":". ","element":"span"},{"text":"Each experiment is repeated 50 times.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Reinforcement Learning","element":"span"}],[{"text":"We can instantiate Stochastic CORRAL model selection regret guarantees to the episodic linear MDP setting of [","element":"span"},{"href":"#id-51","referenceIndex":18,"text":"Jin+20","element":"a"},{"text":"], again with nested feature classes of doubling dimension just as in the case of the Contextual Bandits with Unknown Dimension. 
Let’s formally define a Linear MDP,","element":"span"}],[{"id":"id-81","style":{"width":"43%"},"width":740,"height":552,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-0.png","element":"img"}],[{"text":"Figure 3: ","element":"figcaption","subtype":"caption"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-1.png","element":"img","alt":" ϵ","inline":true},{"id":"id-54","text":"-Greedy vs UCRL2 vs PSRL in the River Swim environment [","element":"figcaption","subtype":"caption"},{"href":"#id-52","referenceIndex":31,"text":"SL08","element":"a","subtype":"caption"},{"text":"].","element":"figcaption","subtype":"caption"}],[{"style":{"fontWeight":"bold"},"text":"Definition 6.4 ","element":"span"},{"text":"(Linear MDP ( Assumption A in [","element":"span"},{"href":"#id-51","referenceIndex":18,"text":"Jin+20","element":"a"},{"text":"]))","element":"span"},{"style":{"fontWeight":"bold"},"text":". ","element":"span"},{"style":{"fontStyle":"italic"},"text":"An episodic MDP (Denoted by the tuple ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"S","element":"span"},{"style":{"fontStyle":"italic"},"text":", A, H, ","element":"span"},{"text":"P","element":"span"},{"style":{"fontStyle":"italic"},"text":", r","element":"span"},{"text":")","element":"span"},{"style":{"fontStyle":"italic"},"text":") is a linear MDP with a feature map ","element":"span"},{"text":"Φ : ","element":"span"},{"style":{"height":15.53},"width":242.72,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-2.png","element":"img","alt":" S × A → Rd","inline":true},{"style":{"fontStyle":"italic"},"text":", if for any ","element":"span"},{"style":{"height":17.6},"width":129.68,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-3.png","element":"img","alt":" h ∈ [H","inline":true},{"text":"] 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"there exist ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d ","element":"span"},{"style":{"fontStyle":"italic"},"text":"unknown (signed) measures ","element":"span"},{"style":{"height":24.44},"width":373.61,"height":61.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-4.png","element":"img","alt":" µh = (µ(1)h , · · · , µ(d)h","inline":true,"padRight":true},{"text":") ","element":"span"},{"style":{"fontStyle":"italic"},"text":"over ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and an ","element":"span"},{"style":{"fontStyle":"italic"},"text":"unknown vector ","element":"span"},{"style":{"height":17.98},"width":148.87,"height":44.95,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-5.png","element":"img","alt":" θh ∈ Rd","inline":true},{"style":{"fontStyle":"italic"},"text":", such that for any ","element":"span"},{"text":"(","element":"span"},{"style":{"height":17.6},"width":440.4,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-6.png","element":"img","alt":"s, a) ∈ S × A, we have,","inline":true}],[{"style":{"width":"61%"},"width":1049,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-7.png","element":"img"}],[{"text":"The value function for a linear MDP also satisfies a linear parametrization,","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Proposition 6.5 ","element":"span"},{"text":"(Proposition 2.3 from [","element":"span"},{"href":"#id-51","referenceIndex":18,"text":"Jin+20","element":"a"},{"text":"])","element":"span"},{"style":{"fontWeight":"bold"},"text":". 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"For a linear MDP, and for any policy ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-8.png","element":"img","alt":"π","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"there exist ","element":"span"},{"style":{"height":12.8},"width":56.71,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-9.png","element":"img","alt":" d−","inline":true},{"style":{"fontStyle":"italic"},"text":"dimensional weights ","element":"span"},{"style":{"height":19.95},"width":193.95,"height":49.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-10.png","element":"img","alt":" {wπh}h∈[H]","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"such that for any ","element":"span"},{"text":"(","element":"span"},{"style":{"height":17.6},"width":402.23,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-11.png","element":"img","alt":"s, a, h) ∈ S × A × [H","inline":true},{"text":"] ","element":"span"},{"style":{"fontStyle":"italic"},"text":"we have that the value function of policy ","element":"span"},{"style":{"height":18.51},"width":672.33,"height":46.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-12.png","element":"img","alt":" π satisfies Qπh(s, a) = ⟨Φ(s, a), wπh⟩.","inline":true}],[{"text":"For the purpose of studying model selection in the setting of linear MDPs we assume access to ","element":"span"},{"style":{"height":12},"width":71.34,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-13.png","element":"img","alt":" D−","inline":true},{"text":"dimensional feature maps Φ : ","element":"span"},{"style":{"height":15.54},"width":245.18,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-14.png","element":"img","alt":" S × A → 
RD","inline":true},{"text":". For all policies ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-15.png","element":"img","alt":" π","inline":true,"padRight":true},{"text":"the unknown parameters ","element":"span"},{"style":{"height":19.95},"width":193.96,"height":49.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-16.png","element":"img","alt":" {wπh}h∈[H]","inline":true,"padRight":true},{"text":"are all assumed to have unknown coordinates only in their first ","element":"span"},{"style":{"height":15.02},"width":39.71,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-17.png","element":"img","alt":"d∗","inline":true,"padRight":true},{"text":"dimensions. We assume access to a family of LSVI-UCB (Algorithm 1 of [","element":"span"},{"href":"#id-51","referenceIndex":18,"text":"Jin+20","element":"a"},{"text":"]) algorithms ","element":"span"},{"style":{"height":20.02},"width":140.69,"height":50.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-18.png","element":"img","alt":" {Bi}Mi=1","inline":true,"padRight":true},{"text":"with increasing dimensionality ","element":"span"},{"style":{"height":15.02},"width":34.71,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-19.png","element":"img","alt":" di","inline":true},{"text":". 
Algorithm ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i ","element":"span"},{"text":"is designed to ‘believe’ ","element":"span"},{"text":"the unknown parameter vectors ","element":"span"},{"style":{"height":19.95},"width":193.96,"height":49.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-20.png","element":"img","alt":" {wπh}h∈[H]","inline":true,"padRight":true},{"text":"has only nonzero entries in the first ","element":"span"},{"style":{"height":15.02},"width":34.71,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-21.png","element":"img","alt":" di","inline":true,"padRight":true},{"text":"entries ","element":"span"},{"text":"for all policies ","element":"span"},{"style":{"height":8},"width":38.44,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-22.png","element":"img","alt":" π.","inline":true}],[{"id":"id-53","style":{"fontWeight":"bold"},"text":"Theorem 6.6. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Let ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M ","element":"span"},{"text":"= (","element":"span"},{"style":{"fontStyle":"italic"},"text":"S","element":"span"},{"style":{"fontStyle":"italic"},"text":", A, H, ","element":"span"},{"text":"P","element":"span"},{"style":{"fontStyle":"italic"},"text":", r","element":"span"},{"text":") ","element":"span"},{"style":{"fontStyle":"italic"},"text":"be a linear MDP parametrized by a feature map ","element":"span"},{"style":{"height":20.02},"width":697.29,"height":50.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-23.png","element":"img","alt":"{Φ : S × A → RD}. 
Let {Φi(s, a)}Mi=1 ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"be the family of nested feature maps such that ","element":"span"},{"text":"Φ","element":"span"},{"style":{"height":17.2},"width":110.07,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-24.png","element":"img","alt":"i(s, a)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"corresponds to the top ","element":"span"},{"style":{"height":15.02},"width":34.71,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-25.png","element":"img","alt":" di","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"entries of ","element":"span"},{"text":"Φ(","element":"span"},{"style":{"fontStyle":"italic"},"text":"s, a","element":"span"},{"text":")","element":"span"},{"style":{"fontStyle":"italic"},"text":". Assume that for all policies ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-26.png","element":"img","alt":" π","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"the unknown parameters ","element":"span"},{"style":{"height":19.95},"width":193.96,"height":49.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-27.png","element":"img","alt":" {wπh}h∈[H]","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"have nonzero coordinates only in their first ","element":"span"},{"style":{"height":15.02},"width":39.71,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-28.png","element":"img","alt":" d∗","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"dimensions and that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"there exists an index 
","element":"span"},{"style":{"height":14.62},"width":32.03,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-29.png","element":"img","alt":" i∗","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"such that ","element":"span"},{"style":{"height":15.02},"width":287.62,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-30.png","element":"img","alt":" d∗ ≤ di ≤ 2d∗","inline":true},{"style":{"fontStyle":"italic"},"text":". Selecting among different smoothed LSVI-UCB base algorithms corresponding to the feature maps ","element":"span"},{"style":{"height":20.02},"width":144.17,"height":50.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-31.png","element":"img","alt":" {Φi}Mi=1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"using Stochastic ","element":"span"},{"style":{"fontStyle":"italic"},"text":"CORRAL with a CORRAL meta-algorithm and ","element":"span"},{"style":{"height":26.19},"width":295.52,"height":65.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-32.png","element":"img","alt":" η = M1/2T 1/2d3/2H3/2","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"satisfies a regret guarantee:","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"R","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"T","element":"span"},{"text":") ","element":"span"},{"style":{"height":31.6},"width":369.81,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/31-33.png","element":"img","alt":" ≤ �O�√Md3H3T�.","inline":true}],[{"id":"id-74","style":{"fontStyle":"italic"},"text":"Proof of Theorem ","element":"span"},{"href":"#id-53","style":{"fontStyle":"italic"},"text":"6.6","element":"a"},{"style":{"fontStyle":"italic"},"text":". 
","element":"span"},{"text":"When well specified the LSVI-UCB algorithm [","element":"span"},{"href":"#id-51","referenceIndex":18,"text":"Jin+20","element":"a"},{"text":"] satisfies the high probability bound ","element":"span"},{"style":{"height":20.65},"width":221.02,"height":51.62,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/32-0.png","element":"img","alt":"�O(√d3H3T","inline":true},{"text":") where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"H ","element":"span"},{"text":"is the length of each episode. The result then follows from Theorem ","element":"span"},{"href":"#id-19","text":"4.11 ","element":"a"},{"text":"by setting the CORRAL meta-algorithm learning rate as ","element":"span"},{"style":{"height":26.19},"width":314.58,"height":65.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/32-1.png","element":"img","alt":"η = M1/2T 1/2d3/2H3/2 .","inline":true}],[{"text":"We also observe that in practice, smoothing RL algorithms such as UCRL and PSRL and using a CORRAL meta-algorithm on top of them can lead to improved performance. In Figure ","element":"span"},{"href":"#id-54","text":"3","element":"a"},{"text":", we present results for the model selection problem among distinct RL algorithms in the River Swim environment [","element":"span"},{"href":"#id-52","referenceIndex":31,"text":"SL08","element":"a"},{"text":"]. 
We use three different bases, ","element":"span"},{"style":{"height":8},"width":51.72,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/32-2.png","element":"img","alt":" ϵ−","inline":true},{"text":"greedy ","element":"span"},{"style":{"height":16},"width":68.5,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/32-3.png","element":"img","alt":"Q−","inline":true},{"text":"learning with ","element":"span"},{"style":{"height":8},"width":87.75,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/32-4.png","element":"img","alt":" ϵ = .","inline":true},{"text":"1, Posterior Sampling Reinforcement Learning (PSRL), as described in [","element":"span"},{"href":"#id-55","referenceIndex":25,"text":"OVR17","element":"a"},{"text":"] and UCRL2 as described in [","element":"span"},{"href":"#id-56","referenceIndex":17,"text":"JOA10","element":"a"},{"text":"]. The implementation of these algorithms and the environment is taken from TabulaRL (","element":"span"},{"href":"https://github.com/iosband/TabulaRL","style":{"fontFamily":"monospace"},"text":"https://github.com/iosband/TabulaRL","element":"a"},{"text":"), a popular benchmark suite for tabular reinforcement learning problems. Smooth CORRAL uses a CORRAL meta-algorithm with a learning rate ","element":"span"},{"style":{"height":25.5},"width":149.76,"height":63.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/32-5.png","element":"img","alt":" η = 15√T","inline":true,"padRight":true},{"text":", all base algorithms are ","element":"span"},{"text":"smoothed using Algorithm ","element":"span"},{"href":"#id-18","text":"8","element":"a"},{"text":". The curves for UCRL2, PSRL and ","element":"span"},{"style":{"height":8},"width":51.72,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/32-6.png","element":"img","alt":" ϵ−","inline":true},{"text":"greedy are all of their un-smoothed versions. 
Each experiment was repeated 10 times and we have reported the mean cumulative regret and shaded a region around them corresponding to ","element":"span"},{"style":{"height":11.6},"width":45.94,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/32-7.png","element":"img","alt":" ±.","inline":true},{"text":"3 the standard deviation across these 10 runs.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Generalized Linear Bandits with Unknown Link Function","element":"span"}],[{"text":"[","element":"span"},{"href":"#id-57","referenceIndex":23,"text":"LLZ17","element":"a"},{"text":"] study the generalized linear bandit model for the stochastic ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k","element":"span"},{"text":"-armed contextual bandit problem. In round ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"and given context ","element":"span"},{"style":{"height":17.75},"width":186.36,"height":44.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/32-8.png","element":"img","alt":" xt ∈ Rd×k","inline":true},{"text":", the learner chooses arm ","element":"span"},{"style":{"height":14.62},"width":27.03,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/32-9.png","element":"img","alt":" it","inline":true,"padRight":true},{"text":"and observes reward ","element":"span"},{"style":{"height":20.49},"width":256.83,"height":51.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/32-10.png","element":"img","alt":" rt = µ(x⊤t,itθ∗","inline":true},{"text":") + ","element":"span"},{"style":{"height":16.4},"width":31.09,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/32-11.png","element":"img","alt":" ξt","inline":true,"padRight":true},{"text":"where 
","element":"span"},{"style":{"height":15.93},"width":151.74,"height":39.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/32-12.png","element":"img","alt":" θ∗ ∈ Rd","inline":true,"padRight":true},{"text":"is an unknown parameter vector, ","element":"span"},{"style":{"height":16.4},"width":31.1,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/32-13.png","element":"img","alt":" ξt","inline":true,"padRight":true},{"text":"is a conditionally zero-mean random variable and ","element":"span"},{"style":{"height":16},"width":205.84,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/32-14.png","element":"img","alt":" µ : R → R","inline":true,"padRight":true},{"text":"is called the link function. [","element":"span"},{"href":"#id-57","referenceIndex":23,"text":"LLZ17","element":"a"},{"text":"] obtain the high probability regret bound ","element":"span"},{"style":{"height":20.08},"width":141.23,"height":50.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/32-15.png","element":"img","alt":"�O(√dT","inline":true},{"text":") where the link function is known. Suppose we have a set of link functions ","element":"span"},{"text":"L ","element":"span"},{"text":"that contains the true link function ","element":"span"},{"style":{"height":12},"width":26,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/32-16.png","element":"img","alt":" µ","inline":true},{"text":". Since the target regret ","element":"span"},{"style":{"height":20.08},"width":141.73,"height":50.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/32-17.png","element":"img","alt":"�O(√dT","inline":true},{"text":") is known, we can run CORRAL with the algorithm in [","element":"span"},{"href":"#id-57","referenceIndex":23,"text":"LLZ17","element":"a"},{"text":"] with each link function in the set as a base algorithm. 
From Theorem ","element":"span"},{"href":"#id-19","text":"4.11","element":"a"},{"text":", CORRAL will achieve regret ","element":"span"},{"style":{"height":20.8},"width":231.68,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/32-18.png","element":"img","alt":"Õ(√(|L|dT)).","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Bandits with Heavy Tail","element":"span"}],[{"text":"[","element":"span"},{"href":"#id-58","referenceIndex":29,"text":"Sha+18","element":"a"},{"text":"] study the linear stochastic bandit problem with heavy tail. If the reward distribution has finite moment of order 1 + ","element":"span"},{"style":{"height":10.22},"width":34.72,"height":25.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/32-19.png","element":"img","alt":" ϵ∗","inline":true},{"text":", [","element":"span"},{"href":"#id-58","referenceIndex":29,"text":"Sha+18","element":"a"},{"text":"] obtain the high probability regret bound ","element":"span"},{"style":{"height":31.6},"width":203.7,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/32-20.png","element":"img","alt":"Õ(T^{1/(1+ϵ∗)})","inline":true},{"text":". 
We consider the problem when ","element":"span"},{"style":{"height":12.22},"width":80.62,"height":30.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/32-21.png","element":"img","alt":" ϵ∗ ∈","inline":true,"padRight":true},{"text":"(0","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"1] is unknown with a known lower bound ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L ","element":"span"},{"text":"where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L ","element":"span"},{"text":"is a conservative estimate and ","element":"span"},{"style":{"height":10.22},"width":34.71,"height":25.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/32-22.png","element":"img","alt":" ϵ∗","inline":true,"padRight":true},{"text":"could be much larger than ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L","element":"span"},{"text":". To the best of our knowledge, we provide the first result when ","element":"span"},{"style":{"height":10.22},"width":34.71,"height":25.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/32-23.png","element":"img","alt":" ϵ∗","inline":true,"padRight":true},{"text":"is unknown. 
We use the algorithms in [","element":"span"},{"href":"#id-58","referenceIndex":29,"text":"Sha+18","element":"a"},{"text":"] with value of ","element":"span"},{"style":{"height":10.22},"width":34.71,"height":25.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/32-24.png","element":"img","alt":" ϵ∗","inline":true,"padRight":true},{"text":"in the grid [","element":"span"},{"style":{"height":19.14},"width":296.87,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/32-25.png","element":"img","alt":"blogb(L), ..., b1, b0","inline":true},{"text":"] for some 0 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"< b < ","element":"span"},{"text":"1 as base algorithms with ","element":"span"},{"style":{"height":19.93},"width":189.72,"height":49.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/32-26.png","element":"img","alt":" η = T −1/2 ","inline":true,"padRight":true},{"text":"for CORRAL. A direct application of Theorem ","element":"span"},{"href":"#id-19","text":"4.11 ","element":"a"},{"text":"yields ","element":"span"},{"id":"id-63","text":"regret ","element":"span"},{"style":{"height":21},"width":247.34,"height":52.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/33-0.png","element":"img","alt":"�O�T 1−0.5bϵ∗�","inline":true},{"text":". 
When ","element":"span"},{"style":{"height":21},"width":1031.34,"height":52.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/33-1.png","element":"img","alt":" ϵ∗ = 1 (as in the case of finite variance), �O�T 1−0.5bϵ∗�","inline":true},{"text":"is close to ","element":"span"},{"style":{"height":20.8},"width":306.09,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/33-2.png","element":"img","alt":"�O�T 0.5�when b","inline":true,"padRight":true},{"text":"is close to 1.","element":"span"}]]},{"heading":"7 Conclusion","paragraphs":[[{"text":"In this work we introduced the Stochastic CORRAL algorithm that successfully combines an EXP3 or CORRAL adversarial meta-algorithm with a wide variety of stochastic base algorithms for contextual bandits and reinforcement learning. We improve the results of the original CORRAL approach [","element":"span"},{"href":"#id-8","referenceIndex":4,"text":"Aga+17","element":"a"},{"text":"] that requires the base algorithms to satisfy a stability condition not often fulfilled by even the simplest stochastic bandit algorithms such as UCB and OFUL. Our approach can make use of the input base algorithms in a fully blackbox fashion without the need of reproving regret bounds for the component base algorithms. This versatility has allowed us to crack several open problems ranging from algorithms that adapt to the misspecification level in linear contextual bandits to the effective dimension in non-parametric problems.","element":"span"}]]},{"heading":"References","paragraphs":[[{"text":"[AYPP20] ","element":"span"},{"text":"Yasin Abbasi-Yadkori, Aldo Pacchiano, and My Phan. “Regret Balancing for Bandit and RL Model Selection”. 
In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:2006.05491 ","element":"span"},{"text":"(2020) (Cited on page ","element":"span"},{"href":"#id-59","text":"4","element":"a"},{"text":").","element":"span"}],[{"id":"id-5","text":"[AYPS11] ","element":"span"},{"text":"Yasin Abbasi-Yadkori, Dávid Pál, and Csaba Szepesvári. “Improved Algorithms for Linear Stochastic Bandits”. In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":". 2011, pp. 2312–2320 (Cited on pages ","element":"span"},{"text":"2","element":"span"},{"text":", ","element":"span"},{"href":"#id-60","text":"17","element":"a"},{"text":").","element":"span"}],[{"id":"id-6","text":"[AYPS12] ","element":"span"},{"text":"Yasin Abbasi-Yadkori, David Pál, and Csaba Szepesvári. “Online-to-Confidence-Set Conversions and Application to Sparse Stochastic Bandits”. In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Artificial Intelligence and Statistics","element":"span"},{"text":". PMLR. 2012 (Cited on page ","element":"span"},{"text":"2","element":"span"},{"text":").","element":"span"}],[{"id":"id-8","text":"[Aga+17] ","element":"span"},{"text":"Alekh Agarwal, Haipeng Luo, Behnam Neyshabur, and Robert E Schapire. “Corralling a Band of Bandit Algorithms”. In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Conference on Learning Theory","element":"span"},{"text":". PMLR. 2017, pp. 
12–38 (Cited on pages ","element":"span"},{"text":"2","element":"span"},{"text":"–","element":"span"},{"href":"#id-59","text":"4","element":"a"},{"text":", ","element":"span"},{"href":"#id-61","text":"6","element":"a"},{"text":", ","element":"span"},{"text":"8","element":"span"},{"text":"–","element":"span"},{"href":"#id-62","text":"10","element":"a"},{"text":", ","element":"span"},{"href":"#id-63","text":"34","element":"a"},{"text":", ","element":"span"},{"href":"#id-64","text":"41","element":"a"},{"text":", ","element":"span"},{"href":"#id-65","text":"49","element":"a"},{"text":").","element":"span"}],[{"text":"[AMM21] ","element":"span"},{"text":"Raman Arora, Teodor Vanislavov Marinov, and Mehryar Mohri. “Corralling Stochastic Bandit Algorithms”. In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Artificial Intelligence and Statistics","element":"span"},{"text":". PMLR. 2021, pp. 2116–2124 (Cited on page ","element":"span"},{"href":"#id-59","text":"4","element":"a"},{"text":").","element":"span"}],[{"id":"id-2","text":"[AMS09] ","element":"span"},{"text":"Jean-Yves Audibert, Rémi Munos, and Csaba Szepesvári. “Exploration-Exploitation Tradeoff Using Variance Estimates in Multi-Armed Bandits”. In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Theoretical Computer Science ","element":"span"},{"text":"410.19 (2009), pp. 1876–1902 (Cited on page ","element":"span"},{"text":"2","element":"span"},{"text":").","element":"span"}],[{"id":"id-11","text":"[ACBF02] ","element":"span"},{"text":"Peter Auer, Nicolo Cesa-Bianchi, and Paul Fischer. “Finite-Time Analysis of the Multiarmed Bandit Problem”. In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Machine learning ","element":"span"},{"text":"47.2 (2002), pp. 
235–256 (Cited on pages ","element":"span"},{"text":"2","element":"span"},{"text":", ","element":"span"},{"href":"#id-66","text":"18","element":"a"},{"text":", ","element":"span"},{"href":"#id-67","text":"19","element":"a"},{"text":").","element":"span"}],[{"id":"id-9","text":"[BC12] ","element":"span"},{"text":"Sébastien Bubeck and Nicolò Cesa-Bianchi. “Regret Analysis of Stochastic and Nonstochastic Multi-Armed Bandit Problems”. In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"CoRR ","element":"span"},{"text":"abs/1204.5721 (2012). arXiv: ","element":"span"},{"href":"https://arxiv.org/abs/1204.5721","style":{"fontFamily":"monospace"},"text":"1204.5721 ","element":"a"},{"text":"(Cited on pages ","element":"span"},{"text":"2","element":"span"},{"text":", ","element":"span"},{"href":"#id-59","text":"4","element":"a"},{"text":", ","element":"span"},{"href":"#id-62","text":"10","element":"a"},{"text":").","element":"span"}],[{"id":"id-22","text":"[BS12] ","element":"span"},{"text":"Sébastien Bubeck and Aleksandrs Slivkins. “The Best of Both Worlds: Stochastic and Adversarial Bandits”. In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Conference on Learning Theory","element":"span"},{"text":". PMLR. 2012, pp. 42–1 (Cited on pages ","element":"span"},{"href":"#id-68","text":"11","element":"a"},{"text":", ","element":"span"},{"href":"#id-69","text":"42","element":"a"},{"text":").","element":"span"}],[{"id":"id-7","text":"[CM12] ","element":"span"},{"text":"Alexandra Carpentier and Rémi Munos. “Bandit Theory Meets Compressed Sensing for High Dimensional Stochastic Linear Bandit”. In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Artificial Intelligence and Statistics","element":"span"},{"text":". PMLR. 2012, pp. 
190–198 (Cited on page ","element":"span"},{"text":"2","element":"span"},{"text":").","element":"span"}],[{"id":"id-10","text":"[Chu+11] ","element":"span"},{"text":"Wei Chu, Lihong Li, Lev Reyzin, and Robert Schapire. “Contextual Bandits with Linear Payoff Functions”. In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Artificial Intelligence and Statistics","element":"span"},{"text":". PMLR, 2011, pp. 208–214 (Cited on pages ","element":"span"},{"text":"2","element":"span"},{"text":", ","element":"span"},{"href":"#id-60","text":"17","element":"a"},{"text":").","element":"span"}],[{"text":"[Cut+21] ","element":"span"},{"text":"Ashok Cutkosky, Christoph Dann, Abhimanyu Das, Claudio Gentile, Aldo Pacchiano, and Manish Purohit. “Dynamic Balancing for Model Selection in Bandits and RL”. In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Machine Learning","element":"span"},{"text":". PMLR. 2021, pp. 2276–2285 (Cited on page ","element":"span"},{"href":"#id-59","text":"4","element":"a"},{"text":").","element":"span"}],[{"id":"id-3","text":"[DHK08] ","element":"span"},{"text":"Varsha Dani, Thomas P. Hayes, and Sham M. Kakade. “Stochastic Linear Optimization under Bandit Feedback”. In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Conference on Learning Theory","element":"span"},{"text":". PMLR. 2008 (Cited on page ","element":"span"},{"text":"2","element":"span"},{"text":").","element":"span"}],[{"id":"id-13","text":"[FR20] ","element":"span"},{"text":"Dylan Foster and Alexander Rakhlin. “Beyond UCB: Optimal and Efficient Contextual Bandits with Regression Oracles”. In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Machine Learning","element":"span"},{"text":". PMLR. 2020, pp. 
3199–3210 (Cited on page ","element":"span"},{"href":"#id-70","text":"5","element":"a"},{"text":").","element":"span"}],[{"id":"id-1","text":"[FKL19] ","element":"span"},{"text":"Dylan J Foster, Akshay Krishnamurthy, and Haipeng Luo. “Model Selection for Contextual Bandits”. In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":". 2019, pp. 14741–14752 (Cited on pages ","element":"span"},{"href":"#id-71","text":"1","element":"a"},{"text":", ","element":"span"},{"href":"#id-59","text":"4","element":"a"},{"text":", ","element":"span"},{"href":"#id-72","text":"29","element":"a"},{"text":", ","element":"span"},{"href":"#id-73","text":"30","element":"a"},{"text":").","element":"span"}],[{"id":"id-48","text":"[GJ18] ","element":"span"},{"text":"Melody Guan and Heinrich Jiang. “Nonparametric Stochastic Contextual Bandits”. In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"AAAI Conference on Artificial Intelligence","element":"span"},{"text":". Vol. 32. 1. 2018 (Cited on page ","element":"span"},{"href":"#id-73","text":"30","element":"a"},{"text":").","element":"span"}],[{"id":"id-56","text":"[JOA10] ","element":"span"},{"text":"Thomas Jaksch, Ronald Ortner, and Peter Auer. “Near-Optimal Regret Bounds for Reinforcement Learning”. In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Machine Learning Research ","element":"span"},{"text":"11.Apr (2010), pp. 1563–1600 (Cited on page ","element":"span"},{"href":"#id-74","text":"33","element":"a"},{"text":").","element":"span"}],[{"id":"id-51","text":"[Jin+20] ","element":"span"},{"text":"Chi Jin, Zhuoran Yang, Zhaoran Wang, and Michael I Jordan. “Provably Efficient Reinforcement Learning with Linear Function Approximation”. In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Conference on Learning Theory","element":"span"},{"text":". PMLR. 2020, pp. 
2137–2143 (Cited on pages ","element":"span"},{"href":"#id-75","text":"31","element":"a"},{"text":"– ","element":"span"},{"href":"#id-74","text":"33","element":"a"},{"text":").","element":"span"}],[{"id":"id-40","text":"[LS20] ","element":"span"},{"text":"Tor Lattimore and Csaba Szepesv´ari. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Bandit Algorithms","element":"span"},{"text":". Cambridge University Press, 2020 (Cited on pages ","element":"span"},{"href":"#id-76","text":"24","element":"a"},{"text":", ","element":"span"},{"href":"#id-77","text":"27","element":"a"},{"text":", ","element":"span"},{"href":"#id-78","text":"28","element":"a"},{"text":", ","element":"span"},{"href":"#id-73","text":"30","element":"a"},{"text":").","element":"span"}],[{"id":"id-0","text":"[LSW20] ","element":"span"},{"text":"Tor Lattimore, Csaba Szepesvari, and Gellert Weisz. “Learning with Good Feature Representations in Bandits and in RL with a Generative Model”. In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Machine Learning","element":"span"},{"text":". PMLR. 2020, pp. 5662–5670 (Cited on pages ","element":"span"},{"href":"#id-71","text":"1","element":"a"},{"text":", ","element":"span"},{"href":"#id-59","text":"4","element":"a"},{"text":", ","element":"span"},{"href":"#id-79","text":"26","element":"a"},{"text":", ","element":"span"},{"href":"#id-77","text":"27","element":"a"},{"text":", ","element":"span"},{"href":"#id-72","text":"29","element":"a"},{"text":").","element":"span"}],[{"text":"[Lee+21] ","element":"span"},{"text":"Jonathan Lee, Aldo Pacchiano, Vidya Muthukumar, Weihao Kong, and Emma Brunskill. “Online Model Selection for Reinforcement Learning with Function Approximation”. In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Artificial Intelligence and Statistics","element":"span"},{"text":". PMLR. 2021, pp. 
3340–3348 (Cited on page ","element":"span"},{"href":"#id-59","text":"4","element":"a"},{"text":").","element":"span"}],[{"id":"id-4","text":"[Li+10] ","element":"span"},{"text":"Lihong Li, Wei Chu, John Langford, and Robert E Schapire. “A Contextual Bandit Approach to Personalized News Article Recommendation”. In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International conference on World wide web","element":"span"},{"text":". 2010, pp. 661–670 (Cited on page ","element":"span"},{"text":"2","element":"span"},{"text":").","element":"span"}],[{"id":"id-57","text":"[LLZ17] ","element":"span"},{"text":"Lihong Li, Yu Lu, and Dengyong Zhou. “Provably Optimal Algorithms for Generalized Linear Contextual Bandits”. In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Machine Learning","element":"span"},{"text":". PMLR. 2017, pp. 2071–2080 (Cited on page ","element":"span"},{"href":"#id-74","text":"33","element":"a"},{"text":").","element":"span"}],[{"id":"id-12","text":"[OM11] ","element":"span"},{"text":"Maillard Odalric and R´emi Munos. “Adaptive Bandits: Towards the Best History-Dependent Strategy”. In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Artificial Intelligence and Statistics","element":"span"},{"text":". PMLR. 2011, pp. 570–578 (Cited on page ","element":"span"},{"href":"#id-80","text":"3","element":"a"},{"text":").","element":"span"}],[{"id":"id-55","text":"[OVR17] ","element":"span"},{"text":"Ian Osband and Benjamin Van Roy. “Why is Posterior Sampling Better than Optimism for Reinforcement Learning?” In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Machine Learning","element":"span"},{"text":". PMLR. 2017, pp. 
2701–2710 (Cited on page ","element":"span"},{"href":"#id-74","text":"33","element":"a"},{"text":").","element":"span"}],[{"text":"[PDG22] ","element":"span"},{"text":"Aldo Pacchiano, Christoph Dann, and Claudio Gentile. “Best of Both Worlds Model Selection”. In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems ","element":"span"},{"text":"(2022) (Cited on page ","element":"span"},{"href":"#id-59","text":"4","element":"a"},{"text":").","element":"span"}],[{"text":"[Pac+20] ","element":"span"},{"text":"Aldo Pacchiano, Christoph Dann, Claudio Gentile, and Peter Bartlett. “Regret Bound Balancing and Elimination for Model Selection in Bandits and RL”. In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:2012.13045 ","element":"span"},{"text":"(2020) (Cited on page ","element":"span"},{"href":"#id-59","text":"4","element":"a"},{"text":").","element":"span"}],[{"id":"id-33","text":"[Sel+13] ","element":"span"},{"text":"Yevgeny Seldin, Csaba Szepesvari, Peter Auer, and Yasin Abbasi-Yadkori. “Evaluation and Analysis of the Performance of the EXP3 Algorithm in Stochastic Environments”. In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"European Workshop on Reinforcement Learning","element":"span"},{"text":". 2013 (Cited on page ","element":"span"},{"href":"#id-60","text":"17","element":"a"},{"text":").","element":"span"}],[{"id":"id-58","text":"[Sha+18] ","element":"span"},{"text":"Han Shao, Xiaotian Yu, Irwin King, and Michael R Lyu. “Almost Optimal Algorithms for Linear Stochastic Bandits with Heavy-Tailed Payoffs”. In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":". Ed. by S. Bengio, H. Wallach, H. Larochelle, K. Grauman, N. Cesa-Bianchi, and R. Garnett. 2018, pp. 
8420–8429 (Cited on page ","element":"span"},{"href":"#id-74","text":"33","element":"a"},{"text":").","element":"span"}],[{"id":"id-34","text":"[Sli19] ","element":"span"},{"text":"Aleksandrs Slivkins. “Introduction to Multi-Armed Bandits”. In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1904.07272 ","element":"span"},{"text":"(2019) (Cited on page ","element":"span"},{"href":"#id-60","text":"17","element":"a"},{"text":").","element":"span"}],[{"id":"id-52","text":"[SL08] ","element":"span"},{"text":"Alexander L Strehl and Michael L Littman. “An Analysis of Model-Based Interval Estimation for Markov Decision Processes”. In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Computer and System Sciences ","element":"span"},{"text":"74.8 (2008), pp. 1309–1331 (Cited on pages ","element":"span"},{"href":"#id-81","text":"32","element":"a"},{"text":", ","element":"span"},{"href":"#id-74","text":"33","element":"a"},{"text":").","element":"span"}],[{"id":"id-42","text":"[Zan+20] ","element":"span"},{"text":"Andrea Zanette, Alessandro Lazaric, Mykel Kochenderfer, and Emma Brunskill. “Learning Near Optimal Policies with Low Inherent Bellman Error”. In: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Machine Learning","element":"span"},{"text":". PMLR. 2020, pp. 10978–10989 (Cited on pages ","element":"span"},{"href":"#id-79","text":"26","element":"a"},{"text":", ","element":"span"},{"href":"#id-77","text":"27","element":"a"},{"text":", ","element":"span"},{"href":"#id-72","text":"29","element":"a"},{"text":").","element":"span"}]]},{"heading":"A Omitted proofs of Section 3","paragraphs":[[{"style":{"fontWeight":"bold"},"text":"Bounding term ","element":"span"},{"text":"I","element":"span"}],[{"text":"When the base algorithms are not chosen, they repeat their step 2’s policy to ensure that the conditional instantaneous regret is decreasing. 
To ensure the decreasing conditional instantaneous regret serves its purpose, when the base algorithms are chosen by the meta-algorithm, we only send step 2’s rewards to the meta-algorithm as feedback signals. This is to ensure that the sequence of rewards the meta-algorithm is competing against satisfies the decreasing instantaneous regret condition. However, since the bases play and incur regrets from both step 1 and step 2 when they are chosen, we must account for the difference between the reward of step 1 and step 2 (that the bases incur when they play the arms), and 2 times the reward of step 2 (what the bases send to the meta-algorithm as feedback signals).","element":"span"}],[{"text":"Since we assume all base algorithms to be smoothed and satisfy a two step feedback structure, we also denote by ","element":"span"},{"style":{"height":23.42},"width":68.15,"height":58.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/38-0.png","element":"img","alt":" π(j)t","inline":true,"padRight":true},{"text":"as the policy used by the meta-algorithm during round ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":", step ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j","element":"span"},{"text":". Term I, the regret of the meta-algorithm with respect to base ","element":"span"},{"style":{"height":14.62},"width":31.03,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/38-1.png","element":"img","alt":" i⋆","inline":true,"padRight":true},{"text":"can be written as:","element":"span"}],[{"style":{"width":"75%"},"width":1296,"height":158,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/38-2.png","element":"img"}],[{"text":"The reader should keep in mind the meta-algorithm is updated only using the reward of Step 2 of base algorithms even though the bases play both step 1 and 2. 
Let ","element":"span"},{"style":{"height":14.62},"width":41.1,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/38-3.png","element":"img","alt":" Ti","inline":true,"padRight":true},{"text":"be the random subset of rounds when ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M ","element":"span"},{"text":"chooses base ","element":"span"},{"style":{"height":15.02},"width":40.66,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/38-4.png","element":"img","alt":"B̃i","inline":true},{"text":", (","element":"span"},{"style":{"height":14.62},"width":109.55,"height":36.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/38-5.png","element":"img","alt":"it = i","inline":true},{"text":") for all ","element":"span"},{"style":{"height":17.6},"width":133.16,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/38-6.png","element":"img","alt":" i ∈ [M","inline":true},{"text":"]. 
Adding and subtracting terms ","element":"span"},{"style":{"height":23.48},"width":330.43,"height":58.69,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/38-7.png","element":"img","alt":" {f(A(1)t , π(2)t )}Tt=1 ","inline":true,"padRight":true},{"text":"we see that:","element":"span"}],[{"style":{"width":"86%"},"width":1490,"height":848,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/39-0.png","element":"img"}],[{"text":"Equality (","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":") holds because term I","element":"span"},{"style":{"height":8.4},"width":17,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/39-1.png","element":"img","alt":"0","inline":true,"padRight":true},{"text":"equals zero (recall for all ","element":"span"},{"style":{"height":16.21},"width":541.92,"height":40.53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/39-2.png","element":"img","alt":" t ∈ Ti⋆ algorithm i⋆ is chosen","inline":true,"padRight":true},{"text":"by the meta-algorithm) and therefore I","element":"span"},{"style":{"height":16.68},"width":118.74,"height":41.69,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/39-3.png","element":"img","alt":"0 = I′0","inline":true,"padRight":true},{"text":"and in all steps ","element":"span"},{"style":{"height":18.48},"width":129.59,"height":46.21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/39-4.png","element":"img","alt":" t ∈ Tci⋆","inline":true},{"text":", base ","element":"span"},{"style":{"height":14.62},"width":31.03,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/39-5.png","element":"img","alt":" i⋆","inline":true,"padRight":true},{"text":"repeated ","element":"span"},{"text":"a policy of Step 2 so that 
I","element":"span"},{"style":{"height":16.68},"width":114.45,"height":41.69,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/39-6.png","element":"img","alt":"1 = I′1","inline":true},{"text":". Equality (","element":"span"},{"style":{"fontStyle":"italic"},"text":"ii","element":"span"},{"text":") follows by adding and subtracting term ","element":"span"},{"text":"I","element":"span"},{"style":{"height":8.8},"width":26,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/39-7.png","element":"img","alt":"B","inline":true},{"text":". Term ","element":"span"},{"style":{"height":17.6},"width":103.6,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/39-8.png","element":"img","alt":" E [IA]","inline":true,"padRight":true},{"text":"is the regret of the meta-algorithm with respect to base ","element":"span"},{"style":{"height":14.62},"width":31.03,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/39-9.png","element":"img","alt":" i⋆","inline":true},{"text":". Term ","element":"span"},{"style":{"height":17.6},"width":105.36,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/39-10.png","element":"img","alt":" E [IB]","inline":true,"padRight":true},{"text":"accounts for the difference between the rewards of Step 1 and Step 2 (that the bases incur) and 2 times the rewards of Step 2 (that the bases send to the meta-algorithm). 
We now focus on bounding ","element":"span"},{"style":{"height":17.6},"width":320.6,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/39-11.png","element":"img","alt":" E [IA] and E [IB].","inline":true}],[{"style":{"height":16},"width":531.82,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/39-12.png","element":"img","alt":"Biased step 2’s rewards.","inline":true,"padRight":true},{"text":"We set the bias functions to ","element":"span"},{"style":{"height":24.22},"width":253.96,"height":60.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/39-13.png","element":"img","alt":" bj(s) = U(s,δ)/s","inline":true,"padRight":true},{"text":"in Algorithm ","element":"span"},{"href":"#id-32","text":"9","element":"a"},{"text":". This will become useful to control ","element":"span"},{"style":{"height":17.6},"width":105.36,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/39-14.png","element":"img","alt":" E [IB]","inline":true},{"text":". 
","element":"span"},{"text":"Instead of sending the meta-algorithm the unadulterated 2","element":"span"},{"style":{"height":26.41},"width":64,"height":66.02,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/39-15.png","element":"img","alt":"r(2)t,j","inline":true,"padRight":true},{"text":"feedback, at all time step ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":", all bases will send the following modified ","element":"span"},{"text":"feedback:","element":"span"}],[{"id":"id-82","style":{"width":"62%"},"width":1070,"height":175,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/39-16.png","element":"img"}],[{"text":"This reward satisfies:","element":"span"}],[{"style":{"width":"55%"},"width":952,"height":106,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/39-17.png","element":"img"}],[{"text":"Define the modified rewards ","element":"span"},{"style":{"height":26.41},"width":876.95,"height":66.03,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/39-18.png","element":"img","alt":"�f(A, π(2)t,j ) = f(A, π(2)t,j ) − bj(st,j) for all j ∈ [M","inline":true},{"text":"] and context ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A","element":"span"},{"text":". 
","element":"span"},{"text":"Let’s write I","element":"span"},{"style":{"height":14.7},"width":122.46,"height":36.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/39-19.png","element":"img","alt":"A + IB","inline":true,"padRight":true},{"text":"in terms of these ","element":"span"},{"style":{"height":16.4},"width":38.06,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/39-20.png","element":"img","alt":"�f.","inline":true}],[{"style":{"width":"75%"},"width":1291,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/40-0.png","element":"img"}],[{"id":"id-64","text":"I","element":"span"},{"style":{"height":14.7},"width":171.69,"height":36.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/40-1.png","element":"img","alt":"A + IB =","inline":true}],[{"style":{"width":"86%"},"width":1476,"height":436,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/40-2.png","element":"img"}],[{"text":"Where inequality (","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":") holds because ","element":"span"},{"style":{"height":22},"width":101.63,"height":55.01,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/40-3.png","element":"img","alt":"�Tt=1","inline":true,"padRight":true},{"text":"2","element":"span"},{"style":{"height":24.2},"width":489.42,"height":60.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/40-4.png","element":"img","alt":"b(st,jt) − �t∈Tci⋆ b(st,jt) ≤","inline":true,"padRight":true},{"text":"0. In the coming","element":"span"}],[{"text":"discussion we’ll show that this modification allows us to control term ","element":"span"},{"style":{"height":14.7},"width":46,"height":36.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/40-5.png","element":"img","alt":"�IB","inline":true},{"text":". 
In the following two sections we will control ","element":"span"},{"style":{"height":32.4},"width":121.1,"height":81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/40-6.png","element":"img","alt":" E��IA�","inline":true},{"text":"and ","element":"span"},{"style":{"height":32.4},"width":122.84,"height":81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/40-7.png","element":"img","alt":" E��IB�","inline":true},{"text":". We will control ","element":"span"},{"style":{"height":32.4},"width":121.09,"height":81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/40-8.png","element":"img","alt":" E��IA�","inline":true},{"text":"by using standard arguments from the adversarial bandits literature. ","element":"span"},{"text":"We will also show that with high probability ","element":"span"},{"style":{"height":33.2},"width":517.85,"height":83.01,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/40-9.png","element":"img","alt":" E��IB�≤ 8�MT log( 4TMδ )","inline":true},{"text":". The use of the biased rewards ","element":"span"},{"style":{"height":26.41},"width":63.99,"height":66.03,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/40-10.png","element":"img","alt":" �r(2)t,j","inline":true,"padRight":true},{"text":"allows us to ","element":"span"},{"text":"ensure the collected reward during steps of type 1 plus the bias terms vs the collected reward of steps of type 2 is close to zero. Without these bias terms, bounding term I","element":"span"},{"style":{"height":10.8},"width":122.34,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/40-11.png","element":"img","alt":"B may","inline":true,"padRight":true},{"text":"prove problematic since the rewards of steps of type 1 may be smaller than the collected rewards of steps of type 2. 
In this case ","element":"span"},{"style":{"height":17.6},"width":83.05,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/40-12.png","element":"img","alt":" E[IB","inline":true},{"text":"] may give rise to a regret term dependent on the putative regret upper bounds of all algorithms ","element":"span"},{"style":{"height":17.6},"width":131.92,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/40-13.png","element":"img","alt":" j ∈ [M","inline":true},{"text":"] and not only on ","element":"span"},{"style":{"height":17.6},"width":164.24,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/40-14.png","element":"img","alt":" U⋆(T, δ).","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Bounding term ","element":"span"},{"style":{"height":34.8},"width":129.69,"height":87,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/40-15.png","element":"img","alt":" E[ĨA]","inline":true}],[{"text":"Let’s start by noting that after taking expectations,","element":"span"}],[{"id":"id-83","style":{"width":"75%"},"width":1293,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/40-16.png","element":"img"}],[{"text":"The modification of the bases’ rewards in Equation ","element":"span"},{"href":"#id-82","text":"16 ","element":"a"},{"text":"modifies both the bases’ rewards as well as the comparator. 
Since both meta-algorithms CORRAL and EXP3.P are ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k","element":"span"},{"text":"-armed bandit adversarial algorithms, their worst-case performance guarantees hold for this biased pseudo-reward sequence.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"CORRAL Meta-Algorithm","element":"span"}],[{"style":{"width":"1%"},"width":23,"height":2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/40-17.png","element":"img"}],[{"text":"We can bound Equation ","element":"span"},{"href":"#id-83","text":"18 ","element":"a"},{"text":"using Lemma 13 from [","element":"span"},{"href":"#id-8","referenceIndex":4,"text":"Aga+17","element":"a"},{"text":"]. Indeed, in term ","element":"span"},{"style":{"height":14.7},"width":45,"height":36.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/40-18.png","element":"img","alt":"ĨA","inline":true},{"text":", the policy choice for all base algorithms ","element":"span"},{"style":{"height":19.81},"width":177.55,"height":49.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/40-19.png","element":"img","alt":" {B̃m}Mm=1","inline":true,"padRight":true},{"text":"during any round ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"is chosen before the value of ","element":"span"},{"style":{"height":14.62},"width":27.03,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/41-0.png","element":"img","alt":"it","inline":true,"padRight":true},{"id":"id-69","text":"is revealed. 
This ensures the estimates ","element":"span"},{"style":{"height":36.97},"width":71.3,"height":92.42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/41-1.png","element":"img","alt":"2r(2)tpitt","inline":true,"padRight":true},{"text":"and 0 for all ","element":"span"},{"style":{"height":16.8},"width":109.85,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/41-2.png","element":"img","alt":" i ̸= it","inline":true,"padRight":true},{"text":"are indeed unbiased estimators of the base algorithm’s rewards. We conclude:","element":"span"}],[{"style":{"width":"42%"},"width":724,"height":164,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/41-3.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"EXP3.P Meta-Algorithm","element":"span"}],[{"text":"Since ","element":"span"},{"style":{"height":17.6},"width":103.61,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/41-4.png","element":"img","alt":" E [IA]","inline":true,"padRight":true},{"text":"is the regret of base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i ","element":"span"},{"text":"with respect to the meta-algorithm, it can be upper bounded by the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k","element":"span"},{"text":"-armed bandit regret of the meta-algorithm with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M ","element":"span"},{"text":"arms. 
","element":"span"},{"text":"Choose ","element":"span"},{"style":{"height":16.8},"width":277.77,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/41-5.png","element":"img","alt":"η = 1, γ = 2kβ","inline":true,"padRight":true},{"text":"in Theorem 3.3 in [","element":"span"},{"href":"#id-22","referenceIndex":9,"text":"BS12","element":"a"},{"text":"], we have that if ","element":"span"},{"style":{"height":21.29},"width":119.85,"height":53.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/41-6.png","element":"img","alt":" p ≤ 12k","inline":true},{"text":", the regret of EXP3.P:","element":"span"}],[{"style":{"width":"35%"},"width":605,"height":109,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/41-7.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Bounding ","element":"span"},{"style":{"height":34.8},"width":131.44,"height":87,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/41-8.png","element":"img","alt":" E��IB�","inline":true}],[{"text":"Notice that:","element":"span"}],[{"style":{"width":"80%"},"width":1386,"height":690,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/41-9.png","element":"img"}],[{"text":"Substituting the definition of ","element":"span"},{"style":{"height":23.43},"width":212.41,"height":58.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/41-10.png","element":"img","alt":"�f(A(2)t , π(2)t","inline":true,"padRight":true},{"text":") and ","element":"span"},{"style":{"height":18.22},"width":110.67,"height":45.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/41-11.png","element":"img","alt":" bj(st,j","inline":true},{"text":") back into the expectation for 
","element":"span"},{"style":{"height":32.4},"width":307.96,"height":81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/41-12.png","element":"img","alt":"E��IB�becomes:","inline":true}],[{"id":"id-85","style":{"width":"98%"},"width":1691,"height":486,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/42-0.png","element":"img"}],[{"text":"Equality (1) follows by noting ","element":"span"},{"style":{"height":18.48},"width":268.87,"height":46.21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/42-1.png","element":"img","alt":" Tci⋆ = ∪j̸=i⋆Tj","inline":true},{"text":". Inequality (2) follows because by Lemma ","element":"span"},{"href":"#id-84","text":"B.1","element":"a"},{"text":", we have ","element":"span"},{"style":{"height":24.22},"width":464.03,"height":60.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/42-2.png","element":"img","alt":" Uj(sT,j, δ) ≤ �sT,js=1U(s,δ)s","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"height":17.6},"width":132,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/42-3.png","element":"img","alt":" j ∈ [M","inline":true},{"text":"]. 
If the ","element":"span"},{"style":{"height":16},"width":54.47,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/42-4.png","element":"img","alt":" j−","inline":true},{"text":"th algorithm was adapted to the environment, then with high probability it satisfies the following bound:","element":"span"}],[{"id":"id-86","style":{"width":"93%"},"width":1603,"height":304,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/42-5.png","element":"img"}],[{"text":"Inequality (","element":"span"},{"style":{"fontStyle":"italic"},"text":"A","element":"span"},{"text":") follows because by definition ","element":"span"},{"style":{"height":23.42},"width":642.93,"height":58.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/42-6.png","element":"img","alt":" f(A(2)t , π∗) ≥ f(A(2)t , π(2)t ) and (B","inline":true},{"text":") because if ","element":"span"},{"style":{"height":17.42},"width":43.66,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/42-7.png","element":"img","alt":"Bj","inline":true,"padRight":true},{"text":"is adapted to the environment it satisfies a high-probability regret bound. Let Adapt = ","element":"span"},{"style":{"height":17.6},"width":155.79,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/42-8.png","element":"img","alt":"{j ∈ [M","inline":true},{"text":"] s.t. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j ","element":"span"},{"text":"is adapted ","element":"span"},{"style":{"fontStyle":"italic"},"text":"}","element":"span"},{"text":". 
Let’s rewrite the upper bound for ","element":"span"},{"style":{"height":32.4},"width":122.84,"height":81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/42-9.png","element":"img","alt":" E��IB�","inline":true},{"text":"from Equation ","element":"span"},{"href":"#id-85","text":"19","element":"a"}],[{"text":"as a sum of terms corresponding to base algorithms ","element":"span"},{"style":{"height":17.6},"width":592.19,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/42-10.png","element":"img","alt":" j ∈ Adapt and j ∈ [M]\\Adapt.","inline":true}],[{"style":{"width":"99%"},"width":1704,"height":346,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/42-11.png","element":"img"}],[{"text":"Equation ","element":"span"},{"href":"#id-86","text":"20 ","element":"a"},{"text":"implies that with probability at least 1 ","element":"span"},{"style":{"height":17.6},"width":228.2,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/42-12.png","element":"img","alt":" − |Adapt| δ,","inline":true}],[{"style":{"width":"99%"},"width":1711,"height":114,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/42-13.png","element":"img"}],[{"text":"We are left with controlling the component of the upper bound of ","element":"span"},{"style":{"height":32.4},"width":122.84,"height":81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/42-14.png","element":"img","alt":" E��IB�","inline":true},{"text":"that runs over misspecified algorithms. When ","element":"span"},{"style":{"height":17.42},"width":43.66,"height":43.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/42-15.png","element":"img","alt":" Bj","inline":true,"padRight":true},{"text":"is not adapted, Equation ","element":"span"},{"href":"#id-86","text":"20 ","element":"a"},{"text":"may or may not hold. 
In ","element":"span"},{"text":"order to ensure we are able to control ","element":"span"},{"style":{"height":32.4},"width":122.84,"height":81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/43-0.png","element":"img","alt":" E��IB�","inline":true},{"text":"we will make sure that algorithms that violate Equation ","element":"span"},{"href":"#id-86","text":"20 ","element":"a"},{"text":"by a large margin are dropped by the meta-algorithm. Since it is impossible to compute the terms ","element":"span"},{"style":{"height":23.42},"width":1050.46,"height":58.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/43-1.png","element":"img","alt":" f(A(2)t , π(2)t ) − f(A(2)t , π∗) and f(A(1)t , π∗) − f(A(1)t , π(1)t","inline":true,"padRight":true},{"text":") directly, we instead rely on the following test:","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Base Test. ","element":"span"},{"text":"Let ","element":"span"},{"style":{"height":18.22},"width":76.68,"height":45.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/43-2.png","element":"img","alt":" Tj(l","inline":true},{"text":") be the set of time indices in [","element":"span"},{"style":{"fontStyle":"italic"},"text":"l","element":"span"},{"text":"] when the meta-algorithm chose to play base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j","element":"span"},{"text":". 
We drop base ","element":"span"},{"style":{"height":17.42},"width":43.66,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/43-3.png","element":"img","alt":"�Bj","inline":true,"padRight":true},{"text":"if at any point during the history of the algorithm,","element":"span"}],[{"id":"id-89","style":{"width":"81%"},"width":1394,"height":143,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/43-4.png","element":"img"}],[{"text":"Let’s start by showing that with high probability ","element":"span"},{"style":{"height":27.9},"width":343.87,"height":69.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/43-5.png","element":"img","alt":"�t∈Tj(l) r(2)t,j − r(1)t,j ","inline":true,"padRight":true},{"text":"is a good estimator ","element":"span"},{"text":"of ","element":"span"},{"style":{"height":27.9},"width":1495.13,"height":69.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/43-6.png","element":"img","alt":"�t∈Tj(l) f(A(2)t , π(2)t,j ) − f(A(2)t , π∗) + f(A(1)t , π∗) − f(A(1)t , π(1)t,j ) for all j ∈ [M].","inline":true}],[{"text":"As a simple consequence of the Azuma-Hoeffding martingale bound and Assumption ","element":"span"},{"href":"#id-25","text":"2.1","element":"a"},{"text":", with probability at least 1 ","element":"span"},{"style":{"height":17.6},"width":130.04,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/43-7.png","element":"img","alt":" − δ/M","inline":true,"padRight":true},{"text":"and for all ","element":"span"},{"style":{"height":17.6},"width":114.64,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/43-8.png","element":"img","alt":" ℓ ∈ [T","inline":true},{"text":"] and for any ","element":"span"},{"style":{"height":17.6},"width":157.14,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/43-9.png","element":"img","alt":" j ∈ 
[M]:","inline":true}],[{"id":"id-87","style":{"width":"90%"},"width":1548,"height":335,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/43-10.png","element":"img"}],[{"text":"Combining Equation ","element":"span"},{"href":"#id-87","text":"22 ","element":"a"},{"text":"and Equation ","element":"span"},{"href":"#id-87","text":"23 ","element":"a"},{"text":"we get, with probability at least 1 ","element":"span"},{"style":{"height":22.49},"width":83.89,"height":56.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/43-11.png","element":"img","alt":" − δM","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"height":17.6},"width":110.33,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/43-12.png","element":"img","alt":"l ∈ [T","inline":true},{"text":"] and for any ","element":"span"},{"style":{"height":17.6},"width":157.14,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/43-13.png","element":"img","alt":" j ∈ [M]:","inline":true}],[{"id":"id-88","style":{"width":"102%"},"width":1757,"height":501,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/43-14.png","element":"img"}],[{"text":"Equation ","element":"span"},{"href":"#id-86","text":"20 ","element":"a"},{"text":"holds for all ","element":"span"},{"style":{"height":16.8},"width":193.87,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/43-15.png","element":"img","alt":" j ∈ Adapt","inline":true,"padRight":true},{"text":"with probability at least 1 ","element":"span"},{"style":{"height":17.6},"width":215.14,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/43-16.png","element":"img","alt":" − |Adapt| δ","inline":true},{"text":". 
Combining this result with Equation ","element":"span"},{"href":"#id-88","text":"24 ","element":"a"},{"text":"we conclude that with probability at least 1 ","element":"span"},{"style":{"height":21.29},"width":353.43,"height":53.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/43-17.png","element":"img","alt":" − |Adapt|(1 + 1M )δ","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"height":16},"width":61.59,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/43-18.png","element":"img","alt":" j ∈","inline":true,"padRight":true},{"text":"Adapt and all ","element":"span"},{"style":{"height":17.6},"width":139.33,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/43-19.png","element":"img","alt":" ℓ ∈ [T],","inline":true}],[{"style":{"width":"63%"},"width":1089,"height":146,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/43-20.png","element":"img"}],[{"text":"for all ","element":"span"},{"style":{"height":17.6},"width":114.63,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/44-0.png","element":"img","alt":" ℓ ∈ [T","inline":true},{"text":"]. 
Thus with high probability no well adapted algorithm will be eliminated.","element":"span"}],[{"text":"Let’s now show that for all ","element":"span"},{"style":{"height":17.6},"width":287.43,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/44-1.png","element":"img","alt":" j ∈ [M]\\Adapt","inline":true,"padRight":true},{"text":"the contribution of ","element":"span"},{"style":{"height":17.42},"width":43.66,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/44-2.png","element":"img","alt":"�Bj","inline":true,"padRight":true},{"text":"to ","element":"span"},{"style":{"height":32.4},"width":122.84,"height":81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/44-3.png","element":"img","alt":" E��IB�","inline":true},{"text":"while the test of Equation ","element":"span"},{"href":"#id-89","text":"21 ","element":"a"},{"text":"has not been triggered is small. If Equation ","element":"span"},{"href":"#id-89","text":"21 ","element":"a"},{"text":"holds for algorithm ","element":"span"},{"style":{"height":17.82},"width":355.7,"height":44.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/44-4.png","element":"img","alt":"j ∈ [M] (even if �Bj","inline":true,"padRight":true},{"text":"is not adapted), then Equation ","element":"span"},{"href":"#id-88","text":"24 ","element":"a"},{"text":"implies that with probability at least 1 ","element":"span"},{"style":{"height":22.49},"width":101.13,"height":56.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/44-5.png","element":"img","alt":" − δM :","inline":true}],[{"style":{"width":"101%"},"width":1744,"height":810,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/44-6.png","element":"img"}],[{"text":"The last inequality holds because 
","element":"span"},{"style":{"height":23.54},"width":412.6,"height":58.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/44-7.png","element":"img","alt":"�j̸=i⋆�|Tj| ≤√TM","inline":true},{"text":". And therefore,","element":"span"}],[{"style":{"width":"59%"},"width":1029,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/44-8.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Bounding term ","element":"span"},{"text":"II","element":"span"}],[{"text":"Recall term II equals:","element":"span"}],[{"style":{"width":"71%"},"width":1233,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/44-9.png","element":"img"}],[{"text":"We use ","element":"span"},{"style":{"height":19.05},"width":38.2,"height":47.62,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/44-10.png","element":"img","alt":" nit ","inline":true,"padRight":true},{"text":"to denote the number of rounds base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i ","element":"span"},{"text":"is chosen up to time ","element":"span"},{"style":{"height":17.2},"width":308.71,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/44-11.png","element":"img","alt":" t for all i ∈ [M].","inline":true,"padRight":true},{"text":"Let ","element":"span"},{"style":{"height":16.44},"width":47.65,"height":41.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/44-12.png","element":"img","alt":" tl,i","inline":true,"padRight":true},{"text":"be the round index of the ","element":"span"},{"style":{"height":12.8},"width":47.88,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/44-13.png","element":"img","alt":" l−","inline":true},{"text":"th time the meta-algorithm chooses algorithm 
","element":"span"},{"style":{"height":15.02},"width":183.97,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/44-14.png","element":"img","alt":" Bi and let","inline":true},{"style":{"height":21.95},"width":1188.6,"height":54.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/44-15.png","element":"img","alt":"bl,i = tl,i − tl−1,i with t0,i = 0 and tniT +1,i = T + 1. Let Ti ⊂ [T","inline":true},{"text":"] be the set of rounds where ","element":"span"},{"text":"base ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i ","element":"span"},{"text":"is chosen and ","element":"span"},{"style":{"height":18.09},"width":225.28,"height":45.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/44-16.png","element":"img","alt":" Tci = [T]\\Ti","inline":true},{"text":". For ","element":"span"},{"style":{"height":17.6},"width":132.29,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/44-17.png","element":"img","alt":" S ⊂ [T","inline":true},{"text":"] and ","element":"span"},{"style":{"height":17.6},"width":183,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/44-18.png","element":"img","alt":" j ∈ {1, 2}","inline":true},{"text":", we define the regret of the ","element":"span"},{"style":{"height":12.4},"width":49.03,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/44-19.png","element":"img","alt":"i−","inline":true},{"text":"th base algorithm during Step ","element":"span"},{"style":{"height":26.41},"width":1105.85,"height":66.02,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/44-20.png","element":"img","alt":" j of rounds S as R(j)i (S) = �t∈S f(A(j)t , π∗)−f(A(j)t , π(j)t,i ).","inline":true}],[{"text":"The following decomposition of ","element":"span"},{"text":"E ","element":"span"},{"text":"[II] 
holds:","element":"span"}],[{"id":"id-94","style":{"width":"82%"},"width":1411,"height":184,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/45-0.png","element":"img"}],[{"style":{"height":25.61},"width":76.56,"height":64.02,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/45-1.png","element":"img","alt":"R(1)i⋆","inline":true,"padRight":true},{"text":"(","element":"span"},{"style":{"height":16.21},"width":52.63,"height":40.53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/45-2.png","element":"img","alt":"Ti⋆","inline":true},{"text":") consists of the regret when base ","element":"span"},{"style":{"height":14.62},"width":31.03,"height":36.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/45-3.png","element":"img","alt":" i⋆","inline":true,"padRight":true},{"text":"was updated in step 1 while the remaining 3 ","element":"span"},{"text":"terms consists of the regret when the policies are reused by step 2.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Biased step ","element":"span"},{"text":"2","element":"span"},{"style":{"fontWeight":"bold"},"text":"’s rewards","element":"span"}],[{"text":"Note that we modified the rewards of step 2 as defined in Equation ","element":"span"},{"href":"#id-82","text":"16","element":"a"},{"text":", both when the base is chosen and not chosen. We now analyze the effect of this modification:","element":"span"}],[{"style":{"width":"94%"},"width":1625,"height":1484,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/45-4.png","element":"img"}],[{"text":"We provided a bound for term I-modified at the beginning of Section ","element":"span"},{"text":"A","element":"span"},{"text":". In this section we concern ourselves with II","element":"span"},{"style":{"height":4.8},"width":34,"height":12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/46-0.png","element":"img","alt":"−","inline":true},{"text":"modified. 
Notice its expectation can be written as:","element":"span"}],[{"style":{"width":"76%"},"width":1310,"height":158,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/46-1.png","element":"img"}],[{"text":"Now the second part of this sum is easy to deal with as it can be incorporated into the bound of ","element":"span"},{"text":"E ","element":"span"},{"text":"[II] by slightly modifying the bound given by Equation ","element":"span"},{"href":"#id-90","text":"28 ","element":"a"},{"text":"below and changing 2","element":"span"},{"style":{"height":15.24},"width":224.78,"height":38.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/46-2.png","element":"img","alt":"bl − 1 to 2bl","inline":true,"padRight":true},{"text":"+ 1. The rest of the argument remains the same.","element":"span"}],[{"style":{"width":"43%"},"width":742,"height":61,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/46-3.png","element":"img"}],[{"text":"From this section onward we drop the subscript ","element":"span"},{"style":{"height":14.62},"width":31.03,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/46-4.png","element":"img","alt":" i⋆","inline":true,"padRight":true},{"text":"whenever clear to simplify the notations. In this section we show an upper bound for Term II when there is a value ","element":"span"},{"style":{"height":22.48},"width":296.76,"height":56.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/46-5.png","element":"img","alt":" pi⋆ ∈ (0, 1) that","inline":true,"padRight":true},{"text":"lower bounds ","element":"span"},{"style":{"height":21.5},"width":191.08,"height":53.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/46-6.png","element":"img","alt":" pi1, · · · , pi⋆T ","inline":true,"padRight":true},{"text":"with probability 1. 
We then use the restarting trick to extend the ","element":"span"},{"text":"proof to the case when ","element":"span"},{"style":{"height":15.68},"width":33.95,"height":39.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/46-7.png","element":"img","alt":" pi","inline":true,"padRight":true},{"text":"is random in Theorem ","element":"span"},{"href":"#id-91","text":"4.10","element":"a"}],[{"id":"id-92","style":{"fontWeight":"bold"},"text":"Lemma A.1 ","element":"span"},{"text":"(Fixed ","element":"span"},{"style":{"height":22.48},"width":84.06,"height":56.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/46-8.png","element":"img","alt":" pi⋆).","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"Let ","element":"span"},{"style":{"height":18.88},"width":104.31,"height":47.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/46-9.png","element":"img","alt":" pi⋆ ∈","inline":true,"padRight":true},{"text":"(0","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"1) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"be such that ","element":"span"},{"style":{"height":25.56},"width":466.54,"height":63.9,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/46-10.png","element":"img","alt":"1ρi⋆ = pi⋆ ≤ pi⋆1 , · · · , pi⋆T","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"probability one, then, ","element":"span"},{"style":{"height":17.6},"width":678.59,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/46-11.png","element":"img","alt":" E [II] ≤ 4ρi⋆ Ui(T/ρi⋆, δ) log T + δT.","inline":true}],[{"href":"#id-92","style":{"height":17.6},"width":968.84,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/46-12.png","element":"img","alt":"Proof of Lemma A.1. 
Since E [II] ≤ E [1{E}II]+δT","inline":true},{"text":", we focus on bounding ","element":"span"},{"text":"E ","element":"span"},{"text":"[","element":"span"},{"style":{"fontWeight":"bold"},"text":"1","element":"span"},{"style":{"fontStyle":"italic"},"text":"{E}","element":"span"},{"text":"II]. since base ","element":"span"},{"style":{"height":32.4},"width":1179.01,"height":81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/46-13.png","element":"img","alt":" i is (U, T, δ)−bounded, E�R(1)i⋆ (Ti)1(E)�≤ E�Ui⋆(δ, ni⋆T )1(E)�","inline":true},{"text":". We proceed to bound the regret corresponding to the remaining terms in II","element":"span"},{"style":{"height":10.22},"width":30.93,"height":25.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/46-14.png","element":"img","alt":"0:","inline":true}],[{"id":"id-90","style":{"width":"78%"},"width":1342,"height":341,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/46-15.png","element":"img"}],[{"text":"The multiplier 2","element":"span"},{"style":{"height":15.24},"width":75.03,"height":38.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/46-16.png","element":"img","alt":"bl −","inline":true,"padRight":true},{"text":"1 arises because the policies proposed by the base algorithm during the rounds it is not selected by ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M ","element":"span"},{"text":"satisfy ","element":"span"},{"style":{"height":26.41},"width":372.97,"height":66.02,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/46-17.png","element":"img","alt":" π(1)t,i⋆ = π(2)t,i⋆ = π(2)tl,i","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"height":21.62},"width":137.52,"height":54.04,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/46-18.png","element":"img","alt":" l ≤ nTi⋆","inline":true,"padRight":true},{"text":"+ 1 and 
","element":"span"},{"style":{"height":14.04},"width":143.8,"height":35.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/46-19.png","element":"img","alt":"t = tl−1","inline":true,"padRight":true},{"text":"+ 1","element":"span"},{"style":{"height":14.4},"width":176.11,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/46-20.png","element":"img","alt":", · · · , tl −","inline":true,"padRight":true},{"text":"1. The factorization is a result of conditional independence between ","element":"span"},{"style":{"height":32.4},"width":268.62,"height":81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/46-21.png","element":"img","alt":"E�r(2)tl,i⋆|Ftl−1�","inline":true},{"text":"and ","element":"span"},{"style":{"height":20.8},"width":210.7,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/46-22.png","element":"img","alt":" E�bl|Ftl−1�","inline":true},{"text":"where ","element":"span"},{"style":{"height":17.27},"width":91.23,"height":43.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/46-23.png","element":"img","alt":" Ftl−1","inline":true,"padRight":true},{"text":"already includes algorithm ","element":"span"},{"style":{"height":16.21},"width":56.63,"height":40.53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/46-24.png","element":"img","alt":"�Bi⋆","inline":true,"padRight":true},{"text":"update right after round ","element":"span"},{"style":{"height":14.04},"width":69.59,"height":35.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/46-25.png","element":"img","alt":" tl−1","inline":true},{"text":". 
The inequality holds because ","element":"span"},{"style":{"height":22.49},"width":426.57,"height":56.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/46-26.png","element":"img","alt":"�Bi⋆ is (Ui⋆, δ2M , T (2))−","inline":true},{"text":"smooth and therefore ","element":"span"},{"text":"satisfies Equation ","element":"span"},{"href":"#id-93","text":"3 ","element":"a"},{"text":"on event ","element":"span"},{"style":{"fontStyle":"italic"},"text":"E","element":"span"},{"text":". Recall that as a consequence of Equation ","element":"span"},{"href":"#id-94","text":"27 ","element":"a"},{"text":"we have","element":"span"}],[{"style":{"width":"45%"},"width":783,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/46-27.png","element":"img"}],[{"text":"The first term is bounded by ","element":"span"},{"style":{"height":32.4},"width":355.89,"height":81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/47-0.png","element":"img","alt":" E�Ui⋆(ni⋆T , δ)1(E)�","inline":true},{"text":"while the second term satisfies the bound","element":"span"}],[{"text":"in (","element":"span"},{"href":"#id-90","text":"28","element":"a"},{"text":"). Let ","element":"span"},{"href":"#id-84","style":{"height":24.57},"width":1379.78,"height":61.42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/47-1.png","element":"img","alt":" ul = Ui⋆(l,δ/2M)l . 
By Lemma B.1, �tl=1 ul ≥ Ui⋆(t, δ/M) for all t, and so,","inline":true}],[{"id":"id-95","style":{"width":"70%"},"width":1201,"height":161,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/47-2.png","element":"img"}],[{"text":"By (","element":"span"},{"href":"#id-90","text":"28","element":"a"},{"text":") and (","element":"span"},{"href":"#id-95","text":"29","element":"a"},{"text":"),","element":"span"}],[{"style":{"width":"58%"},"width":1005,"height":161,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/47-3.png","element":"img"}],[{"text":"Let ","element":"span"},{"style":{"height":17.6},"width":193.43,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/47-4.png","element":"img","alt":" al = E [bl]","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"fontStyle":"italic"},"text":"l","element":"span"},{"text":". Consider a meta-algorithm that uses ","element":"span"},{"style":{"height":17.28},"width":45.48,"height":43.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/47-5.png","element":"img","alt":" pi⋆","inline":true,"padRight":true},{"text":"instead of ","element":"span"},{"style":{"height":20.63},"width":45.49,"height":51.57,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/47-6.png","element":"img","alt":" pi⋆t","inline":true,"padRight":true},{"text":". 
In this ","element":"span"},{"text":"new process let ","element":"span"},{"style":{"height":16.51},"width":31.75,"height":41.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/47-7.png","element":"img","alt":" t′l","inline":true,"padRight":true},{"text":"be the corresponding rounds when the base is selected, ¯","element":"span"},{"style":{"height":21.5},"width":50.19,"height":53.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/47-8.png","element":"img","alt":"ni⋆T","inline":true,"padRight":true},{"text":"be the total ","element":"span"},{"text":"number of rounds the base is selected, and ","element":"span"},{"style":{"height":20.8},"width":321.88,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/47-9.png","element":"img","alt":" cl = E�t′l − t′l−1�","inline":true},{"text":". Since ","element":"span"},{"style":{"height":25.28},"width":162.44,"height":63.21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/47-10.png","element":"img","alt":" pi⋆ ≤ pi⋆t","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"it ","element":"span"},{"text":"holds that ","element":"span"},{"style":{"height":22.53},"width":346.9,"height":56.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/47-11.png","element":"img","alt":"�jl=1 al ≤ �jl=1 cl","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j","element":"span"},{"text":". 
If we use the same coin flips used to generate ","element":"span"},{"style":{"height":14.04},"width":25.76,"height":35.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/47-12.png","element":"img","alt":" tl","inline":true,"padRight":true},{"text":"to generate ","element":"span"},{"style":{"height":16.52},"width":31.76,"height":41.29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/47-13.png","element":"img","alt":" t′l","inline":true},{"text":", we observe that ","element":"span"},{"style":{"height":16.52},"width":117.42,"height":41.29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/47-14.png","element":"img","alt":" t′l ⊂ tl","inline":true,"padRight":true},{"text":"and ¯","element":"span"},{"style":{"height":21.5},"width":170.6,"height":53.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/47-15.png","element":"img","alt":"ni⋆T ≤ ni⋆T","inline":true,"padRight":true},{"text":". Let ","element":"span"},{"style":{"height":16.4},"width":158.15,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/47-16.png","element":"img","alt":" f : R →","inline":true,"padRight":true},{"text":"[0","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"1] be a decreasing ","element":"span"},{"text":"function such that for integer ","element":"span"},{"style":{"height":17.6},"width":266.01,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/47-17.png","element":"img","alt":" i⋆, f(i⋆) = ui⋆","inline":true},{"text":". 
Then ","element":"span"},{"style":{"height":28.87},"width":219.6,"height":72.18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/47-18.png","element":"img","alt":"�ni⋆T +1l=1 alul","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":28.87},"width":215.42,"height":72.18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/47-19.png","element":"img","alt":"�¯ni⋆T +1l=1 clul","inline":true,"padRight":true},{"text":"are two estimates of integral","element":"span"},{"style":{"height":23.94},"width":195.45,"height":59.86,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/47-20.png","element":"img","alt":"� T0 f(x)dx","inline":true},{"text":". Given that ","element":"span"},{"style":{"height":17.71},"width":249.03,"height":44.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/47-21.png","element":"img","alt":" t′l ⊂ tl and ul","inline":true,"padRight":true},{"text":"is a decreasing sequence in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"l","element":"span"},{"text":",","element":"span"}],[{"style":{"width":"46%"},"width":805,"height":148,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/47-22.png","element":"img"}],[{"text":"and thus","element":"span"}],[{"style":{"width":"60%"},"width":1045,"height":136,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/47-23.png","element":"img"}],[{"text":"We proceed to upper bound the right hand side of this inequality:","element":"span"}],[{"style":{"width":"56%"},"width":971,"height":226,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/47-24.png","element":"img"}],[{"text":"The first inequality holds because ","element":"span"},{"style":{"height":29.03},"width":328.58,"height":72.58,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/47-25.png","element":"img","alt":" E�t′l − t′l−1�≤ 1pi⋆","inline":true,"padRight":true},{"text":"and the 
second inequality follows by ","element":"span"},{"text":"concavity of ","element":"span"},{"style":{"height":17.6},"width":132.72,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/47-26.png","element":"img","alt":" Ui⋆(t, δ","inline":true},{"text":") as a function of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":". The proof follows.","element":"span"}],[{"id":"id-65","style":{"fontWeight":"bold"},"text":"Proof of Theorem ","element":"span"},{"href":"#id-91","style":{"fontWeight":"bold"},"text":"4.10","element":"a"}],[{"text":"We use the restarting trick to extend Lemma ","element":"span"},{"href":"#id-92","text":"A.1 ","element":"a"},{"text":"to the case when the lower bound ","element":"span"},{"style":{"height":17.28},"width":45.49,"height":43.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/48-0.png","element":"img","alt":" pi⋆","inline":true,"padRight":true},{"text":"is random (more specifically the algorithm (CORRAL) will maintain a lower bound that in the end will satisfy ","element":"span"},{"style":{"height":25.28},"width":250.64,"height":63.21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/48-1.png","element":"img","alt":" pi⋆ ≈ mint pi⋆t","inline":true,"padRight":true},{"text":") in Theorem ","element":"span"},{"href":"#id-91","text":"4.10","element":"a"},{"text":". 
We restate the theorem statement ","element":"span"},{"text":"here for convenience.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Theorem A.2 ","element":"span"},{"text":"(Theorem ","element":"span"},{"href":"#id-91","text":"4.10 ","element":"a"},{"text":")","element":"span"},{"style":{"fontWeight":"bold"},"text":".","element":"span"}],[{"style":{"width":"59%"},"width":1017,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/48-2.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Here, the expectation is over the random variable ","element":"span"},{"style":{"height":28.06},"width":263.68,"height":70.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/48-3.png","element":"img","alt":" ρi⋆ = maxt 1pi⋆t","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":". If ","element":"span"},{"style":{"height":17.6},"width":284.19,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/48-4.png","element":"img","alt":" U(t, δ) = tαc(δ","inline":true},{"text":") ","element":"span"},{"style":{"fontStyle":"italic"},"text":"for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"some ","element":"span"},{"style":{"height":24.25},"width":1259.88,"height":60.62,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/48-5.png","element":"img","alt":" α ∈ [1/2, 1) then, E [II] ≤ 4 21−α21−α−1T αc(δ)E�ρ1−αi �+ δT(log T + 1).","inline":true}],[{"style":{"fontWeight":"bold"},"text":"R","element":"span"},{"text":"estarting trick: Initialize ","element":"span"},{"style":{"height":31.26},"width":662.4,"height":78.14,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/48-6.png","element":"img","alt":" pi⋆ = 12M . 
If pi⋆t < pi⋆, set pi⋆ = pi⋆t2 ","inline":true,"padRight":true},{"text":"and restart the base.","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"Proof of Theorem ","element":"span"},{"href":"#id-91","style":{"fontStyle":"italic"},"text":"4.10","element":"a"},{"style":{"fontStyle":"italic"},"text":". ","element":"span"},{"text":"The proof follows that of Theorem 15 in [","element":"span"},{"href":"#id-8","referenceIndex":4,"text":"Aga+17","element":"a"},{"text":"]. Let ","element":"span"},{"style":{"height":17.19},"width":237.72,"height":42.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/48-7.png","element":"img","alt":" ℓ1, · · · , ℓdi <","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"T ","element":"span"},{"text":"be the rounds where Line 10 of the CORRAL is executed. Let ","element":"span"},{"style":{"height":18.78},"width":479.95,"height":46.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/48-8.png","element":"img","alt":" ℓ0 = 0 and ℓdi⋆+1 = T for","inline":true,"padRight":true},{"text":"notational convenience. Let ","element":"span"},{"style":{"height":17.6},"width":177.45,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/48-9.png","element":"img","alt":" el = [ℓl−1","inline":true,"padRight":true},{"text":"+ 1","element":"span"},{"style":{"height":15.6},"width":132.43,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/48-10.png","element":"img","alt":", · · · , ℓl","inline":true},{"text":"]. 
Denote by ","element":"span"},{"style":{"height":18.08},"width":84,"height":45.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/48-11.png","element":"img","alt":" pi⋆,ℓl","inline":true,"padRight":true},{"text":"the probability lower bound maintained by CORRAL during time-steps ","element":"span"},{"style":{"height":23.28},"width":749,"height":58.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/48-12.png","element":"img","alt":" t ∈ [ℓl−1, · · · , ℓl] and ρi⋆,ℓl = 1/pi⋆,ℓl. In","inline":true,"padRight":true},{"text":"the proof of Lemma 13 in [","element":"span"},{"href":"#id-8","referenceIndex":4,"text":"Aga+17","element":"a"},{"text":"], the authors prove ","element":"span"},{"style":{"height":17.6},"width":216.21,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/48-13.png","element":"img","alt":" di⋆ ≤ log(T","inline":true},{"text":") with probability one. Therefore,","element":"span"}],[{"style":{"width":"71%"},"width":1233,"height":434,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/48-14.png","element":"img"}],[{"text":"The inequality is a consequence of Lemma ","element":"span"},{"href":"#id-92","text":"A.1 ","element":"a"},{"text":"applied to the restarted segment [","element":"span"},{"style":{"height":17.2},"width":232.74,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/48-15.png","element":"img","alt":"ℓl−1, · · · , ℓl].","inline":true,"padRight":true},{"text":"This step is valid because by assumption ","element":"span"},{"style":{"height":48.44},"width":1197.95,"height":121.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/48-16.png","element":"img","alt":"1ρi⋆,ℓl ≤ mint∈[ℓl−1,··· ,ℓl] pt.If Ui⋆(t, δ) = tαc(δ","inline":true},{"text":") for some function 
","element":"span"},{"style":{"height":14.73},"width":217.66,"height":36.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/48-17.png","element":"img","alt":" c : R → R+","inline":true},{"text":", then ","element":"span"},{"style":{"height":22.54},"width":531.07,"height":56.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/48-18.png","element":"img","alt":" ρi⋆U(T/ρi⋆, δ) = ρ1−αi⋆ T αc(δ","inline":true},{"text":"). And therefore:","element":"span"}],[{"style":{"width":"59%"},"width":1024,"height":267,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/48-19.png","element":"img"}],[{"text":"Where ¯","element":"span"},{"style":{"height":12},"width":188.07,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/48-20.png","element":"img","alt":"α = 1 − α","inline":true},{"text":". The last inequality follows from the same argument as in Theorem 15 in [","element":"span"},{"href":"#id-8","referenceIndex":4,"text":"Aga+17","element":"a"},{"text":"].","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Proof of Theorem ","element":"span"},{"href":"#id-19","style":{"fontWeight":"bold"},"text":"4.11","element":"a"}],[{"style":{"fontStyle":"italic"},"text":"Proof. 
","element":"span"},{"text":"For the CORRAL meta-algorithm,","element":"span"}],[{"style":{"width":"83%"},"width":1426,"height":114,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/49-0.png","element":"img"}],[{"text":"Using Theorem ","element":"span"},{"href":"#id-91","text":"4.10 ","element":"a"},{"text":"to control term II, the total regret of CORRAL is:","element":"span"}],[{"style":{"width":"99%"},"width":1713,"height":969,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/49-1.png","element":"img"}],[{"text":"Using Lemma ","element":"span"},{"href":"#id-92","text":"A.1 ","element":"a"},{"text":"to control term II, we have the total regret of EXP3.P when ","element":"span"},{"style":{"height":17.6},"width":166.42,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/49-2.png","element":"img","alt":" δ = 1/T:","inline":true}],[{"style":{"width":"49%"},"width":856,"height":168,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/49-3.png","element":"img"}],[{"text":"When both ","element":"span"},{"style":{"height":17.2},"width":176.3,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/49-4.png","element":"img","alt":" α and c(δ","inline":true},{"text":") are known, choose ","element":"span"},{"style":{"height":24.38},"width":482.27,"height":60.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/49-5.png","element":"img","alt":" p = T − 1−α2−α M− 12−α c(δ) 12−α","inline":true,"padRight":true},{"text":". 
When only ","element":"span"},{"style":{"height":14.8},"width":213.8,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/49-6.png","element":"img","alt":" α is known,","inline":true,"padRight":true},{"text":"choose ","element":"span"},{"style":{"height":23.58},"width":341.33,"height":58.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/49-7.png","element":"img","alt":" p = T − 1−α2−α M− 12−α","inline":true,"padRight":true},{"text":". We then have the following regret:","element":"span"}]]},{"heading":"B Ancillary Technical Results","paragraphs":[[{"id":"id-84","style":{"height":19.93},"width":1024.15,"height":49.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/49-8.png","element":"img","alt":"Lemma B.1. If U(t, δ) = tβc(δ), for 0 ≤ β ≤ 1 then:","inline":true}],[{"style":{"width":"34%"},"width":597,"height":130,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/49-9.png","element":"img"}],[{"style":{"width":"87%"},"width":1498,"height":285,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/50-0.png","element":"img"}],[{"text":"Table 2: The top row shows the general regret guarantees. The middle row shows the regret guarantees when ","element":"figcaption","subtype":"caption"},{"style":{"height":17.2},"width":181.13,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/50-1.png","element":"img","alt":" α and c(δ","inline":true},{"text":") are known. 
The bottom row shows the regret guarantees when ","element":"figcaption","subtype":"caption"},{"style":{"height":8.4},"width":28,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/50-2.png","element":"img","alt":"α","inline":true,"padRight":true},{"text":"is known and ","element":"figcaption","subtype":"caption"},{"style":{"height":17.6},"width":55.85,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/50-3.png","element":"img","alt":" c(δ","inline":true},{"text":") is unknown.","element":"figcaption","subtype":"caption"}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"The LHS follows immediately from observing ","element":"span"},{"style":{"height":24.22},"width":90.31,"height":60.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/50-4.png","element":"img","alt":"U(t,δ)t","inline":true,"padRight":true},{"text":"is decreasing as a function of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"and therefore ","element":"span"},{"style":{"height":24.22},"width":564.54,"height":60.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/50-5.png","element":"img","alt":"�lt=1U(t,δ)t ≥ l U(l,δ)l = U(l, δ","inline":true},{"text":"). The RHS is a consequence of bounding the sum by the integral","element":"span"},{"style":{"height":24.33},"width":187.62,"height":60.82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/50-6.png","element":"img","alt":"� l0U(t,δ)t dt","inline":true},{"text":", substituting the definition ","element":"span"},{"style":{"height":19.93},"width":277.03,"height":49.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/50-7.png","element":"img","alt":" U(t, δ) = tβc(δ","inline":true},{"text":") and solving it.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Lemma B.2. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"If ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":") ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is a concave and doubly differentiable function on ","element":"span"},{"style":{"height":17.2},"width":360.89,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/50-8.png","element":"img","alt":" x > 0 and f(0) ≥ 0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"then ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":")","element":"span"},{"style":{"fontStyle":"italic"},"text":"/x ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is decreasing on ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x > ","element":"span"},{"text":"0","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"Proof. 
","element":"span"},{"text":"In order to show that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":")","element":"span"},{"style":{"fontStyle":"italic"},"text":"/x ","element":"span"},{"text":"is decreasing when ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x > ","element":"span"},{"text":"0, we want to show that","element":"span"}],[{"style":{"width":"99%"},"width":1706,"height":77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/50-9.png","element":"img"}],[{"style":{"height":17.6},"width":385.73,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/50-10.png","element":"img","alt":"g(x) = xf′(x) − f(x","inline":true},{"text":") is a non-increasing function on ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x > ","element":"span"},{"text":"0. We have ","element":"span"},{"style":{"height":17.6},"width":330.62,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/50-11.png","element":"img","alt":" g′(x) = xf′′(x) ≤","inline":true,"padRight":true},{"text":"0 when ","element":"span"},{"style":{"height":13.6},"width":71.49,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/50-12.png","element":"img","alt":" x ≥","inline":true,"padRight":true},{"text":"0 because ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":") is concave. 
Therefore ","element":"span"},{"style":{"height":17.6},"width":384.4,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/50-13.png","element":"img","alt":" xf′(x) − f(x) ≤ 0f′","inline":true},{"text":"(0) ","element":"span"},{"style":{"height":16.4},"width":68.81,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/50-14.png","element":"img","alt":" − f","inline":true},{"text":"(0) ","element":"span"},{"style":{"height":13.6},"width":34,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/50-15.png","element":"img","alt":" ≤","inline":true,"padRight":true},{"text":"0 for all ","element":"span"},{"style":{"height":13.6},"width":71.06,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/50-16.png","element":"img","alt":"x ≥","inline":true,"padRight":true},{"text":"0, which completes the proof.","element":"span"}],[{"id":"id-41","style":{"width":"99%"},"width":1711,"height":128,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/50-17.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"By definition ","element":"span"},{"style":{"height":24.71},"width":794.54,"height":61.78,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/50-18.png","element":"img","alt":" kl(p, q) = p log(p/q) + (1 − p) log( 1−p1−q), so","inline":true}],[{"style":{"width":"83%"},"width":1431,"height":436,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01704/images/50-19.png","element":"img"}]]}],"_version":"3.3.4"},"paperNode":"$1b:props:children:props:children:0:props:product"}]]]}]}]