1b:["$","$L29",null,{"isWhiteLabelled":false,"children":["$","$Lb",null,{"pt":{"compact":0,"expanded":3},"children":[["$","$L2a",null,{"noStar":true,"publisher":true,"task":true,"params":true,"size":"xl","product":{"id":"eyJwYXBlcklEIjoiMTgwNS4xMTc5MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","updated":"2025-05-07T00:00:00.000Z","paperID":"1805.11792","published":"2018-05-29T00:00:00.000Z","authors":"[\"Scarlett Jonathan\"]","title":"Tight Regret Bounds for Bayesian Optimization in One Dimension","scoreTrending":null,"summary":"We consider the problem of Bayesian optimization (BO) in one dimension, under\na Gaussian process prior and Gaussian sampling noise. We provide a theoretical\nanalysis showing that, under fairly mild technical assumptions on the kernel,\nthe best possible cumulative regret up to time $T$ behaves as\n$\\Omega(\\sqrt{T})$ and $O(\\sqrt{T\\log T})$. This gives a tight characterization\nup to a $\\sqrt{\\log T}$ factor, and includes the first non-trivial lower bound\nfor noisy BO. 
Our assumptions are satisfied, for example, by the squared\nexponential and Mat\\'ern-$\\nu$ kernels, with the latter requiring $\\nu > 2$.\nOur results certify the near-optimality of existing bounds (Srinivas {\\em et\nal.}, 2009) for the SE kernel, while proving them to be strictly suboptimal for\nthe Mat\\'ern kernel with $\\nu > 2$.","lastCheckedForCode":"2025-05-08T05:11:46.722Z","links":[{"id":"eyJ1cmwiOiJodHRwczovL3BhcGVyc3dpdGhjb2RlLmNvbS9wYXBlci90aWdodC1yZWdyZXQtYm91bmRzLWZvci1iYXllc2lhbi1vcHRpbWl6YXRpb24ifQ==","type":"pwc","url":"https://paperswithcode.com/paper/tight-regret-bounds-for-bayesian-optimization","data":"{\"date\":\"2025-05-10T04:03:31.489Z\"}"},{"id":"eyJ1cmwiOiJodHRwczovL3BhcGVyc3dpdGhjb2RlLmNvbS9tZXRob2QvZ2F1c3NpYW4tcHJvY2VzcyJ9","type":"method","url":"https://paperswithcode.com/method/gaussian-process","data":null}],"reposConnection":{"edges":[]},"models":[],"tags":[{"id":"eyJuYW1lIjoiYmF5ZXNpYW4gb3B0aW1pemF0aW9uIiwidHlwZSI6InRhc2sifQ==","name":"bayesian optimization","description":"Bayesian optimization is a method where the input is a set of parameters and the output is the optimal parameters that maximize a function. 
It's commonly used in hyperparameter tuning in machine learning models, where it helps to find the best parameters to improve the model's performance.","scoreTrending":null,"count":{"stars":1529,"papers":806,"models":360},"__typename":"Tag"}],"summaries":[],"emailsConnection":{"edges":[{"author":"scarlett jonathan","node":{"id":"eyJhZGRyZXNzIjoic2NhcmxldHRAY29tcC5udXMuZWR1LnNnIn0=","address":"scarlett@comp.nus.edu.sg","name":"Scarlett","avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[{"name":"National University of Singapore"}],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[],"scholar":[{"thirdPartyID":"a4D08aQAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiJjMTNkOTA1Yy05YTUyLTQ1M2MtODU2NC0yYTMwMDhkYjdiN2YifQ==","name":"jonathan 
scarlett","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMTUwMS4wNzQ0MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1501.07440"},{"id":"eyJwYXBlcklEIjoiMTYwMS4wNjY1MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1601.06650"},{"id":"eyJwYXBlcklEIjoiMTgxMC4xMDc3NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1810.10775"},{"id":"eyJwYXBlcklEIjoiMTcwNi4wNDkxOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1706.04918"},{"id":"eyJwYXBlcklEIjoiMTcwNi4wMDA5MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1706.00090"},{"id":"eyJwYXBlcklEIjoiMjAwNi4xMjQxNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.12415"},{"id":"eyJwYXBlcklEIjoiMTYwMi4wMzY0NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1602.03647"},{"id":"eyJwYXBlcklEIjoiMTYxMC4wNzM3OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1610.07379"},{"id":"eyJwYXBlcklEIjoiMTkwMS4wMDU1NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1901.00555"},{"id":"eyJwYXBlcklEIjoiMTgwNS4xMTc5MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1805.11792"},{"id":"eyJwYXBlcklEIjoiMTgwMi4wNzAyOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1802.07028"},{"id":"eyJwYXBlcklEIjoiMTkwOS4wNzQyNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1909.07425"},{"id":"eyJwYXBlcklEIjoiMjAwOC4wODc1NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2008.08757"},{"id":"eyJwYXBlcklEIjoiMTkwOC4xMDc0NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1908.10744"},{"id":"eyJwYXBlcklEIjoiMTYwNy4wMjQxMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1607.02413"},{"id":"eyJwYXBlcklEIjoiMjAwMi4wMTY5NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.01697"},{"id":"eyJwYXBlcklEIjoiMjExMC4wNzc4OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.07788"},{"id":"eyJwYXBlcklEIjoiMjIwMi4wNDA
wNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2202.04005"},{"id":"eyJwYXBlcklEIjoiMjIwNi4xNDM3MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2206.14373"},{"id":"eyJwYXBlcklEIjoiMjAwMi4wODY2MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.08663"},{"id":"eyJwYXBlcklEIjoiMTcxMC4wNjc2NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1710.06766"},{"id":"eyJwYXBlcklEIjoiMjAwMS4wOTMyNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2001.09327"},{"id":"eyJwYXBlcklEIjoiMTkwMS4xMDY0NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1901.10647"},{"id":"eyJwYXBlcklEIjoiMjExMC4wODQ0OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.08449"},{"id":"eyJwYXBlcklEIjoiMjIxMS4wNTQzMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2211.05430"},{"id":"eyJwYXBlcklEIjoiMjAxMi4xMzA4OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2012.13088"},{"id":"eyJwYXBlcklEIjoiMTYwMi4wMDg3NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1602.00877"},{"id":"eyJwYXBlcklEIjoiMTkwNS4wMzQxMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1905.03410"},{"id":"eyJwYXBlcklEIjoiMjEwMi4wNTc5MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.05793"},{"id":"eyJwYXBlcklEIjoiMjEwNi4xNTM1OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.15358"},{"id":"eyJwYXBlcklEIjoiMjEwOC4wMzU3MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2108.03570"},{"id":"eyJwYXBlcklEIjoiMjExMS4wODg2MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2111.08862"},{"id":"eyJwYXBlcklEIjoiMjIwMi4xMDYxNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2202.10615"},{"id":"eyJwYXBlcklEIjoiMjIwMy4wOTY5MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2203.09693"},{"id":"eyJwYXBlcklEIjoiMjMwNC4xMjY4MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2304.12680"},
{"id":"eyJwYXBlcklEIjoiMjQwMS4wNTcxNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2401.05716"},{"id":"eyJwYXBlcklEIjoiMjIxMS4wMTU2MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2211.01561"},{"id":"eyJwYXBlcklEIjoiMjMwOS4wNDIyMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2309.04221"},{"id":"eyJwYXBlcklEIjoiNTM0MTEiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"53411"},{"id":"eyJwYXBlcklEIjoiMjMxMC4wMzc1OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2310.03758"},{"id":"eyJwYXBlcklEIjoiNzAwOTkiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"70099"}]}]}}]},"__typename":"paper","authorArray":["Scarlett Jonathan"]}}],["$","$L18",null,{"container":true,"columns":100,"spacing":{"compact":0,"expanded":2,"large":3},"children":[["$","$L18",null,{"size":{"compact":100,"expanded":100,"large":68},"children":[["$","$7",null,{"children":["$","$L2b",null,{"publisher":"arxiv","paperID":"1805.11792","product":{"paper":"$1b:props:children:props:children:0:props:product","models":"$1b:props:children:props:children:0:props:product:models"},"isWhiteLabelled":false}]}],["$","$7",null,{"children":["$","$L2c",null,{"article":"$L2d","model":"$undefined"}]}]]}],["$","$L18",null,{"size":"grow","children":["$","$L2e",null,{}]}]]}],["$","$7",null,{"children":null}],[["$","audio",null,{"id":"tts"}],["$","$L2f",null,{"paperID":"1805.11792","publisher":"arxiv","paperJSON":{"title":"Tight Regret Bounds for Bayesian Optimization in One Dimension","paperID":"1805.11792","avgLineHeight":11.91,"imgScale":4,"sections":[{"heading":"Abstract","paragraphs":[[{"text":"We consider the problem of Bayesian optimization (BO) in one dimension, under a Gaussian process prior and Gaussian sampling noise. 
We provide a theoretical analysis showing that, under fairly mild technical assumptions on the kernel, the best possible cumulative regret up to time ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T ","element":"span"},{"text":"behaves as ","element":"span"},{"style":{"height":18.3},"width":416.92,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/0-0.png","element":"img","alt":" Ω(√T) and O(√T log T)","inline":true},{"text":". This gives a tight characterization up to a ","element":"span"},{"style":{"height":16.03},"width":120.32,"height":40.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/0-1.png","element":"img","alt":"√log T","inline":true,"padRight":true},{"text":"factor, and includes the first non-trivial lower bound for noisy BO. Our assumptions are satisfied, for example, by the squared exponential and Matérn-","element":"span"},{"style":{"height":13.6},"width":155.46,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/0-2.png","element":"img","alt":"ν kernels,","inline":true,"padRight":true},{"text":"with the latter requiring ","element":"span"},{"style":{"height":11.6},"width":95.36,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/0-3.png","element":"img","alt":" ν > 2","inline":true},{"text":". Our results certify the near-optimality of existing bounds (Srinivas ","element":"span"},{"style":{"fontStyle":"italic"},"text":"et al.","element":"span"},{"text":", 2009) for the SE kernel, while proving them to be strictly suboptimal for the Matérn kernel with ","element":"span"},{"style":{"height":11.6},"width":105.28,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/0-4.png","element":"img","alt":" ν > 2.","inline":true}]]},{"heading":"1. 
Introduction","paragraphs":[[{"text":"Bayesian optimization (BO) (","element":"span"},{"href":"#id-0","referenceIndex":21,"text":"Shahriari et al.","element":"a"},{"href":"#id-0","referenceIndex":21,"text":", ","element":"a"},{"href":"#id-0","referenceIndex":21,"text":"2016","element":"a"},{"text":") is a powerful and versatile tool for black-box function optimization, with applications including parameter tuning, robotics, molecular design, sensor networks, and more. The idea is to model the unknown function as a Gaussian process with a given ","element":"span"},{"style":{"fontStyle":"italic"},"text":"kernel function ","element":"span"},{"text":"dictating the smoothness properties. This model is updated using (typically noisy) samples, which are selected to steer towards the function maximum.","element":"span"}],[{"text":"One of the most attractive properties of BO is its efficiency in terms of the number of function samples used. Consequently, algorithms with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"rigorous guarantees ","element":"span"},{"text":"on the tradeoff between samples and optimization performance are particularly valuable. 
Perhaps the most prominent work in the literature giving such guarantees is that of (","element":"span"},{"href":"#id-1","referenceIndex":23,"text":"Srinivas et al.","element":"a"},{"text":",","element":"span"}],[{"href":"#id-1","referenceIndex":23,"text":"2010","element":"a"},{"text":"), who consider the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"cumulative regret","element":"span"},{"text":":","element":"span"}],[{"id":"id-27","style":{"width":"78%"},"width":736,"height":117,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/0-5.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"is the function being optimized, and ","element":"span"},{"style":{"height":9.19},"width":34.78,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/0-6.png","element":"img","alt":" xt","inline":true,"padRight":true},{"text":"is the point chosen at time ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":". 
Under a Gaussian process (GP) prior and Gaussian noise, it is shown in (","element":"span"},{"href":"#id-1","referenceIndex":23,"text":"Srinivas et al.","element":"a"},{"href":"#id-1","referenceIndex":23,"text":", ","element":"a"},{"href":"#id-1","referenceIndex":23,"text":"2010","element":"a"},{"text":") that an algorithm called Gaussian Process Upper Confidence Bound (GP-UCB) achieves a cumulative regret of the form","element":"span"}],[{"id":"id-3","style":{"width":"66%"},"width":625,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/0-7.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":16.79},"width":415.81,"height":41.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/0-8.png","element":"img","alt":" γT = maxx1,...,xT I(f; y)","inline":true,"padRight":true},{"text":"(with function values ","element":"span"},{"style":{"fontWeight":"bold"},"text":"f ","element":"span"},{"text":"= ","element":"span"},{"style":{"height":16},"width":318.18,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/0-9.png","element":"img","alt":"(f(x1), . . . , f(xT ))","inline":true,"padRight":true},{"text":"and noisy samples ","element":"span"},{"style":{"height":16},"width":287.26,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/0-10.png","element":"img","alt":" y = (y1, . . . , yT )","inline":true},{"text":") is known as the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"maximum information gain","element":"span"},{"text":". 
Here ","element":"span"},{"style":{"fontStyle":"italic"},"text":"I","element":"span"},{"text":"(","element":"span"},{"style":{"fontWeight":"bold"},"text":"f","element":"span"},{"text":"; ","element":"span"},{"style":{"fontWeight":"bold"},"text":"y","element":"span"},{"text":") ","element":"span"},{"text":"denotes the mutual information (","element":"span"},{"href":"#id-2","referenceIndex":6,"text":"Cover & Thomas","element":"a"},{"href":"#id-2","referenceIndex":6,"text":", ","element":"a"},{"href":"#id-2","referenceIndex":6,"text":"2001","element":"a"},{"text":") between the function values and noisy samples, and ","element":"span"},{"style":{"height":16},"width":92.39,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/0-11.png","element":"img","alt":" O∗(·)","inline":true,"padRight":true},{"text":"denotes asymptotic notation up to logarithmic factors.","element":"span"}],[{"text":"The guarantee (","element":"span"},{"href":"#id-3","text":"2","element":"a"},{"text":") ensures sub-linear cumulative regret for many kernels of interest. However, the literature is severely lacking in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"algorithm-independent lower bounds","element":"span"},{"text":", and without these, it is impossible to know to what extent the upper bounds, including (","element":"span"},{"href":"#id-3","text":"2","element":"a"},{"text":"), can be improved. In this work, we address this gap in detail in the special case of a one-dimensional function. 
We show that the best possible cumulative regret behaves as ","element":"span"},{"style":{"height":18.29},"width":142.84,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/0-12.png","element":"img","alt":" Θ∗(√T)","inline":true,"padRight":true},{"text":"under mild assumptions on the kernel, thus identifying both cases where (","element":"span"},{"href":"#id-3","text":"2","element":"a"},{"text":") is nearoptimal, and cases where it is strictly suboptimal.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"1.1. Related Work","element":"span"}],[{"text":"An extensive range of BO algorithms have been proposed in the literature, typically involving the maximization of an acquisition function (","element":"span"},{"href":"#id-4","referenceIndex":10,"text":"Hennig & Schuler","element":"a"},{"href":"#id-4","referenceIndex":10,"text":", ","element":"a"},{"href":"#id-4","referenceIndex":10,"text":"2012","element":"a"},{"text":"; ","element":"span"},{"href":"#id-5","referenceIndex":11,"text":"Hernández- ","element":"a"},{"href":"#id-5","referenceIndex":11,"text":"Lobato et al.","element":"a"},{"href":"#id-5","referenceIndex":11,"text":", ","element":"a"},{"href":"#id-5","referenceIndex":11,"text":"2014","element":"a"},{"text":"; ","element":"span"},{"href":"#id-6","referenceIndex":19,"text":"Russo & Van Roy","element":"a"},{"href":"#id-6","referenceIndex":19,"text":", ","element":"a"},{"href":"#id-6","referenceIndex":19,"text":"2014","element":"a"},{"text":"; ","element":"span"},{"href":"#id-7","referenceIndex":25,"text":"Wang et al.","element":"a"},{"href":"#id-7","referenceIndex":25,"text":", ","element":"a"},{"href":"#id-7","referenceIndex":25,"text":"2016","element":"a"},{"text":"); see (","element":"span"},{"href":"#id-0","referenceIndex":21,"text":"Shahriari et al.","element":"a"},{"href":"#id-0","referenceIndex":21,"text":", ","element":"a"},{"href":"#id-0","referenceIndex":21,"text":"2016","element":"a"},{"text":") for a 
recent overview. As mentioned above, the most relevant algorithm to this work for the noisy setting is GP-UCB (","element":"span"},{"href":"#id-1","referenceIndex":23,"text":"Srinivas et al.","element":"a"},{"href":"#id-1","referenceIndex":23,"text":", ","element":"a"},{"href":"#id-1","referenceIndex":23,"text":"2010","element":"a"},{"text":"), which constructs ","element":"span"},{"style":{"fontStyle":"italic"},"text":"confidence bounds ","element":"span"},{"text":"in which the function lies with high probability, and samples the point with the highest upper confidence bound. Several extensions to GPUCB have also been proposed, including contextual (","element":"span"},{"href":"#id-8","referenceIndex":15,"text":"Krause ","element":"a"},{"href":"#id-8","referenceIndex":15,"text":"& Ong","element":"a"},{"href":"#id-8","referenceIndex":15,"text":", ","element":"a"},{"href":"#id-8","referenceIndex":15,"text":"2011","element":"a"},{"text":"; ","element":"span"},{"href":"#id-9","referenceIndex":1,"text":"Bogunovic et al.","element":"a"},{"href":"#id-9","referenceIndex":1,"text":", ","element":"a"},{"href":"#id-9","referenceIndex":1,"text":"2016a","element":"a"},{"text":"), batch (","element":"span"},{"href":"#id-10","referenceIndex":5,"text":"Contal et al.","element":"a"},{"href":"#id-10","referenceIndex":5,"text":", ","element":"a"},{"href":"#id-10","referenceIndex":5,"text":"2013","element":"a"},{"text":"; ","element":"span"},{"href":"#id-11","referenceIndex":8,"text":"Desautels et al.","element":"a"},{"href":"#id-11","referenceIndex":8,"text":", ","element":"a"},{"href":"#id-11","referenceIndex":8,"text":"2014","element":"a"},{"text":"), and high-dimensional (","element":"span"},{"href":"#id-12","referenceIndex":12,"text":"Kan- ","element":"a"},{"href":"#id-12","referenceIndex":12,"text":"dasamy et al.","element":"a"},{"href":"#id-12","referenceIndex":12,"text":", ","element":"a"},{"href":"#id-12","referenceIndex":12,"text":"2015","element":"a"},{"text":"; 
","element":"span"},{"href":"#id-13","referenceIndex":18,"text":"Rolland et al.","element":"a"},{"href":"#id-13","referenceIndex":18,"text":", ","element":"a"},{"href":"#id-13","referenceIndex":18,"text":"2018","element":"a"},{"text":") variants.","element":"span"}],[{"text":"In the noiseless setting, it has been shown that it is possible to achieve ","element":"span"},{"style":{"fontStyle":"italic"},"text":"bounded ","element":"span"},{"text":"cumulative regret (","element":"span"},{"href":"#id-14","referenceIndex":7,"text":"de Freitas et al.","element":"a"},{"href":"#id-14","referenceIndex":7,"text":", ","element":"a"},{"href":"#id-14","referenceIndex":7,"text":"2012","element":"a"},{"text":"; ","element":"span"},{"href":"#id-15","referenceIndex":13,"text":"Kawaguchi et al.","element":"a"},{"href":"#id-15","referenceIndex":13,"text":", ","element":"a"},{"href":"#id-15","referenceIndex":13,"text":"2015","element":"a"},{"text":") under some technical assumptions. In (","element":"span"},{"href":"#id-14","referenceIndex":7,"text":"de Freitas et al.","element":"a"},{"href":"#id-14","referenceIndex":7,"text":", ","element":"a"},{"href":"#id-14","referenceIndex":7,"text":"2012","element":"a"},{"text":"), this is done by keeping track of a set of potential maximizers, and sampling increasingly finely in order to shrink that set and “zoom in” towards the optimal point. 
Similar ideas have also been used in the noisy setting for studying batch variants of GP-UCB (","element":"span"},{"href":"#id-10","referenceIndex":5,"text":"Con- ","element":"a"},{"href":"#id-10","referenceIndex":5,"text":"tal et al.","element":"a"},{"href":"#id-10","referenceIndex":5,"text":", ","element":"a"},{"href":"#id-10","referenceIndex":5,"text":"2013","element":"a"},{"text":"), simultaneous online optimization (SOO) methods (","element":"span"},{"href":"#id-16","referenceIndex":24,"text":"Wang et al.","element":"a"},{"href":"#id-16","referenceIndex":24,"text":", ","element":"a"},{"href":"#id-16","referenceIndex":24,"text":"2014","element":"a"},{"text":"), and lookahead algorithms that use confidence bounds (","element":"span"},{"href":"#id-17","referenceIndex":2,"text":"Bogunovic et al.","element":"a"},{"href":"#id-17","referenceIndex":2,"text":", ","element":"a"},{"href":"#id-17","referenceIndex":2,"text":"2016b","element":"a"},{"text":"). Returning to the noiseless setting, upper and lower bounds were given in (","element":"span"},{"href":"#id-18","referenceIndex":9,"text":"Grünewälder et al.","element":"a"},{"href":"#id-18","referenceIndex":9,"text":", ","element":"a"},{"href":"#id-18","referenceIndex":9,"text":"2010","element":"a"},{"text":") for kernels satisfying certain smoothness assumptions, with the lower bounds showing that bounded cumulative regret is not always to be expected.","element":"span"}],[{"text":"Alongside the Bayesian view of the Gaussian process model, several works have also considered a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"non-Bayesian ","element":"span"},{"text":"counterpart assuming that the function has a bounded norm in the associated reproducing kernel Hilbert space (RKHS). 
Interestingly, GP-UCB still provides similar guarantees to (","element":"span"},{"href":"#id-3","text":"2","element":"a"},{"text":") in this setting (","element":"span"},{"href":"#id-1","referenceIndex":23,"text":"Srinivas et al.","element":"a"},{"href":"#id-1","referenceIndex":23,"text":", ","element":"a"},{"href":"#id-1","referenceIndex":23,"text":"2010","element":"a"},{"text":"). Moreover, lower bounds have been proved; see (","element":"span"},{"href":"#id-19","referenceIndex":4,"text":"Bull","element":"a"},{"href":"#id-19","referenceIndex":4,"text":", ","element":"a"},{"href":"#id-19","referenceIndex":4,"text":"2011","element":"a"},{"text":") for the noiseless setting, and (","element":"span"},{"href":"#id-20","referenceIndex":20,"text":"Scarlett et al.","element":"a"},{"href":"#id-20","referenceIndex":20,"text":", ","element":"a"},{"href":"#id-20","referenceIndex":20,"text":"2017","element":"a"},{"text":") for the noisy setting. In the latter, the lower bounds nearly match the GP-UCB upper bound for the squared exponential (SE) kernel, but gaps remain for the Matérn kernel. 
For reference, we note that these kernels are defined as follows:","element":"span"}],[{"style":{"width":"86%"},"width":811,"height":333,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/1-0.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"l > ","element":"span"},{"text":"0 ","element":"span"},{"text":"is a lengthscale parameter, ","element":"span"},{"style":{"height":11.6},"width":95.36,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/1-1.png","element":"img","alt":" ν > 0","inline":true,"padRight":true},{"text":"is a smoothness parameter, ","element":"span"},{"style":{"height":13.59},"width":47.23,"height":33.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/1-2.png","element":"img","alt":" Bν","inline":true,"padRight":true},{"text":"is the modified Bessel function, and ","element":"span"},{"style":{"height":10.8},"width":25,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/1-3.png","element":"img","alt":" Γ","inline":true,"padRight":true},{"text":"is the gamma function.","element":"span"}],[{"text":"The multi-armed bandit (MAB) (","element":"span"},{"href":"#id-21","referenceIndex":3,"text":"Bubeck & Cesa-Bianchi","element":"a"},{"href":"#id-21","referenceIndex":3,"text":", ","element":"a"},{"href":"#id-21","referenceIndex":3,"text":"2012","element":"a"},{"text":") literature has developed alongside the BO literature, with the two often bearing similar concepts. 
The MAB literature is far too extensive to cover here, but it is worth mentioning that sharp lower bounds are known in numerous settings (","element":"span"},{"href":"#id-21","referenceIndex":3,"text":"Bubeck & Cesa-Bianchi","element":"a"},{"href":"#id-21","referenceIndex":3,"text":", ","element":"a"},{"href":"#id-21","referenceIndex":3,"text":"2012","element":"a"},{"text":"), and the abovementioned concept of “zooming in” to the optimal point has also been explored (","element":"span"},{"href":"#id-22","referenceIndex":14,"text":"Kleinberg et al.","element":"a"},{"href":"#id-22","referenceIndex":14,"text":", ","element":"a"},{"href":"#id-22","referenceIndex":14,"text":"2008","element":"a"},{"text":"). To our knowledge, however, none of the existing MAB results are closely related to our own.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"1.2. Our Results and Their Implications","element":"span"}],[{"text":"The main results of this paper are informally summarized as follows.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Main Results (Informal). 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Under mild technical assumptions on the kernel, satisfied (for example) by the SE kernel and Matérn-","element":"span"},{"style":{"height":6.8},"width":21,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/1-4.png","element":"img","alt":"ν","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"kernel with ","element":"span"},{"style":{"height":11.6},"width":95.41,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/1-5.png","element":"img","alt":" ν > 2","inline":true},{"style":{"fontStyle":"italic"},"text":", the best possible cumulative regret of noisy BO in one dimension behaves as ","element":"span"},{"style":{"height":18.3},"width":122.31,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/1-6.png","element":"img","alt":" Ω(√T)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"height":16.83},"width":228.11,"height":42.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/1-7.png","element":"img","alt":" O(√T log T).","inline":true}],[{"text":"Our results have several important implications:","element":"span"}],[{"text":"• To our knowledge, our lower bound is the first of any kind in the noisy Bayesian setting, and is tight up to a ","element":"span"},{"style":{"height":16.03},"width":120.32,"height":40.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/1-8.png","element":"img","alt":"√log T","inline":true,"padRight":true},{"text":"factor under our technical assumptions.","element":"span"}],[{"text":"• Our lower bound also establishes the order-optimality of the ","element":"span"},{"style":{"height":18.3},"width":143.36,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/1-9.png","element":"img","alt":" O∗(√T)","inline":true,"padRight":true},{"text":"upper bound of 
(","element":"span"},{"href":"#id-1","referenceIndex":23,"text":"Srinivas et al.","element":"a"},{"href":"#id-1","referenceIndex":23,"text":", ","element":"a"},{"href":"#id-1","referenceIndex":23,"text":"2010","element":"a"},{"text":") applied to the SE kernel, up to logarithmic factors.","element":"span"}],[{"text":"• On the other hand, our upper bound establishes that the upper bound of (","element":"span"},{"href":"#id-1","referenceIndex":23,"text":"Srinivas et al.","element":"a"},{"href":"#id-1","referenceIndex":23,"text":", ","element":"a"},{"href":"#id-1","referenceIndex":23,"text":"2010","element":"a"},{"text":") for the Matérn-","element":"span"},{"style":{"height":6.8},"width":21,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/1-10.png","element":"img","alt":"ν","inline":true,"padRight":true},{"text":"kernel, namely ","element":"span"},{"style":{"height":16},"width":94.32,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/1-11.png","element":"img","alt":" O∗(T","inline":true},{"style":{"height":7.2},"width":50.08,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/1-12.png","element":"img","alt":"ν+2","inline":true},{"style":{"height":16},"width":85.97,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/1-13.png","element":"img","alt":"2ν+2 )","inline":true},{"text":", is strictly suboptimal for ","element":"span"},{"style":{"height":11.6},"width":95.36,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/1-14.png","element":"img","alt":"ν > 2","inline":true},{"text":". 
For example, if ","element":"span"},{"style":{"height":10.8},"width":95.38,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/1-15.png","element":"img","alt":" ν = 3","inline":true},{"text":", then this is ","element":"span"},{"style":{"height":17.78},"width":194.63,"height":44.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/1-16.png","element":"img","alt":" O∗(T 0.625),","inline":true,"padRight":true},{"text":"as opposed to our upper bound of ","element":"span"},{"style":{"height":17.78},"width":153.37,"height":44.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/1-17.png","element":"img","alt":" O∗(T 0.5)","inline":true},{"text":". (See also (","element":"span"},{"href":"#id-23","referenceIndex":22,"text":"Shekhar & Javidi","element":"a"},{"href":"#id-23","referenceIndex":22,"text":", ","element":"a"},{"href":"#id-23","referenceIndex":22,"text":"2017","element":"a"},{"text":") for recent improvements over (","element":"span"},{"href":"#id-1","referenceIndex":23,"text":"Srinivas et al.","element":"a"},{"href":"#id-1","referenceIndex":23,"text":", ","element":"a"},{"href":"#id-1","referenceIndex":23,"text":"2010","element":"a"},{"text":") under the Matérn kernel in higher dimensions and/or with smaller ","element":"span"},{"style":{"height":14},"width":45.49,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/1-18.png","element":"img","alt":" ν).","inline":true}],[{"text":"• Another important implication for the Matérn kernel with ","element":"span"},{"style":{"height":11.6},"width":95.36,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/1-19.png","element":"img","alt":" ν > 2","inline":true,"padRight":true},{"text":"is that the Bayesian setting is provably less difficult than the non-Bayesian RKHS counterpart; the latter has cumulative regret 
","element":"span"},{"style":{"height":16},"width":73.28,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/1-20.png","element":"img","alt":" Ω(T","inline":true},{"style":{"height":7.2},"width":50.08,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/1-21.png","element":"img","alt":"ν+1","inline":true},{"style":{"height":16},"width":85.97,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/1-22.png","element":"img","alt":"2ν+1 )","inline":true,"padRight":true},{"text":"(","element":"span"},{"href":"#id-20","referenceIndex":20,"text":"Scarlett et al.","element":"a"},{"href":"#id-20","referenceIndex":20,"text":", ","element":"a"},{"href":"#id-20","referenceIndex":20,"text":"2017","element":"a"},{"text":"), which is strictly worse than ","element":"span"},{"style":{"height":16.83},"width":228.11,"height":42.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/1-23.png","element":"img","alt":" O(√T log T).","inline":true}],[{"text":"Our upper bound is stated formally in Section ","element":"span"},{"text":"3","element":"span"},{"text":", and its technical assumptions are given in Section ","element":"span"},{"href":"#id-24","text":"2.1","element":"a"},{"text":". We build on the ideas of (","element":"span"},{"href":"#id-14","referenceIndex":7,"text":"de Freitas et al.","element":"a"},{"href":"#id-14","referenceIndex":7,"text":", ","element":"a"},{"href":"#id-14","referenceIndex":7,"text":"2012","element":"a"},{"text":") for the noiseless setting, while addressing highly non-trivial challenges arising in the presence of noise.","element":"span"}],[{"text":"Our lower bound is stated formally in Section ","element":"span"},{"href":"#id-25","text":"4","element":"a"},{"text":", and its technical assumptions are given in Section ","element":"span"},{"href":"#id-24","text":"2.1","element":"a"},{"text":". 
The analysis is based on a reduction to binary hypothesis testing and an application of Fano’s inequality (","element":"span"},{"href":"#id-2","referenceIndex":6,"text":"Cover & Thomas","element":"a"},{"href":"#id-2","referenceIndex":6,"text":", ","element":"a"},{"href":"#id-2","referenceIndex":6,"text":"2001","element":"a"},{"text":"). This approach is inspired by previous work on lower bounds for stochastic convex optimization (","element":"span"},{"href":"#id-26","referenceIndex":16,"text":"Raginsky & Rakhlin","element":"a"},{"href":"#id-26","referenceIndex":16,"text":", ","element":"a"},{"href":"#id-26","referenceIndex":16,"text":"2011","element":"a"},{"text":"), but the details are very different.","element":"span"}]]},{"heading":"2. Problem Setup","paragraphs":[[{"id":"id-24","style":{"fontWeight":"bold"},"text":"2.1. Bayesian Optimization","element":"span"}],[{"text":"We seek to sequentially optimize an unknown reward function ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":") ","element":"span"},{"text":"over the one-dimensional domain ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D ","element":"span"},{"text":"= [0","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"1]","element":"span"},{"text":"; note that any interval can be transformed to this choice via re-scaling. 
At time ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":", we query a single point ","element":"span"},{"style":{"height":13.19},"width":244.21,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/1-24.png","element":"img","alt":" xt ∈ D and ob-","inline":true,"padRight":true},{"text":"serve a noisy sample ","element":"span"},{"style":{"height":17.38},"width":610.24,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/1-25.png","element":"img","alt":" yt = f(xt)+zt, where zt ∼ N(0, σ2)","inline":true,"padRight":true},{"text":"for some noise variance ","element":"span"},{"style":{"height":14.18},"width":115.21,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/1-26.png","element":"img","alt":" σ2 > 0","inline":true},{"text":", with independence across different times. We measure the performance using the cumulative regret ","element":"span"},{"style":{"height":13.19},"width":53.26,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/1-27.png","element":"img","alt":" RT","inline":true,"padRight":true},{"text":", defined in (","element":"span"},{"href":"#id-27","text":"1","element":"a"},{"text":").","element":"span"}],[{"text":"We henceforth assume ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"to be distributed according to Gaussian process (GP) (","element":"span"},{"href":"#id-28","referenceIndex":17,"text":"Rasmussen","element":"a"},{"href":"#id-28","referenceIndex":17,"text":", ","element":"a"},{"href":"#id-28","referenceIndex":17,"text":"2006","element":"a"},{"text":") having mean zero and kernel function ","element":"span"},{"style":{"height":16},"width":127.94,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-0.png","element":"img","alt":" k(x, x′)","inline":true},{"text":". 
The posterior distribution of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"given the points ","element":"span"},{"style":{"height":17.39},"width":320.39,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-1.png","element":"img","alt":" xt = [x1, . . . , xt]T","inline":true,"padRight":true},{"text":"and observations ","element":"span"},{"style":{"height":17.38},"width":314.38,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-2.png","element":"img","alt":"yt = [y1, . . . , yt]T","inline":true,"padRight":true},{"text":"up to time ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"is again a GP, with the posterior mean and variance given by (","element":"span"},{"href":"#id-28","referenceIndex":17,"text":"Rasmussen","element":"a"},{"href":"#id-28","referenceIndex":17,"text":", ","element":"a"},{"href":"#id-28","referenceIndex":17,"text":"2006","element":"a"},{"text":")","element":"span"}],[{"id":"id-75","style":{"width":"94%"},"width":888,"height":130,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-3.png","element":"img"}],[{"id":"id-76","text":"where ","element":"span"},{"style":{"height":25.58},"width":702.47,"height":63.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-4.png","element":"img","alt":" kt(x) = [k(xi, x)]ti=1, Kt = [k(xi, xj)]i,j","inline":true},{"text":", and ","element":"span"},{"style":{"height":13.19},"width":29.38,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-5.png","element":"img","alt":" It","inline":true,"padRight":true},{"text":"is the ","element":"span"},{"style":{"height":10},"width":77.1,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-6.png","element":"img","alt":" t × t","inline":true,"padRight":true},{"text":"identity 
matrix.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"2.2. Technical Assumptions","element":"span"}],[{"text":"Here we introduce several assumptions that will be adopted in our main results, some of which were also used in the noiseless setting (","element":"span"},{"href":"#id-14","referenceIndex":7,"text":"de Freitas et al.","element":"a"},{"href":"#id-14","referenceIndex":7,"text":", ","element":"a"},{"href":"#id-14","referenceIndex":7,"text":"2012","element":"a"},{"text":").","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Assumption 1. ","element":"span"},{"text":"We have the following:","element":"span"}],[{"text":"1. The kernel ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"is stationary, depending on its inputs ","element":"span"},{"style":{"height":16},"width":105.94,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-7.png","element":"img","alt":"(x, x′)","inline":true,"padRight":true},{"text":"only through ","element":"span"},{"style":{"height":10},"width":191.5,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-8.png","element":"img","alt":" τ = x − x′;","inline":true}],[{"text":"2. 
The kernel ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"satisfies ","element":"span"},{"style":{"height":16},"width":200.57,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-9.png","element":"img","alt":" k(x, x′) ≤ 1","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"height":16},"width":105.94,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-10.png","element":"img","alt":" (x, x′)","inline":true},{"text":", and ","element":"span"},{"style":{"height":16},"width":422.1,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-11.png","element":"img","alt":"k(x, x) = 1 for all x ∈ D;","inline":true}],[{"text":"Given the stationarity assumption, ","element":"span"},{"text":"the assumptions ","element":"span"},{"style":{"height":16},"width":213.53,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-12.png","element":"img","alt":"k(x, x′) ≤ 1","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x, x","element":"span"},{"text":") = 1 ","element":"span"},{"text":"are without loss of generality, as one can always re-scale the function and adjust the noise variance ","element":"span"},{"style":{"height":13.38},"width":40.21,"height":33.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-13.png","element":"img","alt":" σ2 ","inline":true,"padRight":true},{"text":"accordingly.","element":"span"}],[{"text":"Next, we give some high-probability assumptions on the random function ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"itself.","element":"span"}],[{"id":"id-31","style":{"fontWeight":"bold"},"text":"Assumption 2. 
","element":"span"},{"text":"There exists a constant ","element":"span"},{"style":{"height":16},"width":184.71,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-14.png","element":"img","alt":" δ1 ∈ (0, 1)","inline":true,"padRight":true},{"text":"such that, with probability at least ","element":"span"},{"style":{"height":13.99},"width":102.34,"height":34.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-15.png","element":"img","alt":" 1 − δ1","inline":true},{"text":", we have the following:","element":"span"}],[{"style":{"width":"96%"},"width":904,"height":127,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-16.png","element":"img"}],[{"id":"id-35","text":"for any ","element":"span"},{"style":{"fontStyle":"italic"},"text":"local maximum ","element":"span"},{"style":{"height":6.8},"width":36.78,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-17.png","element":"img","alt":" x′","inline":true,"padRight":true},{"text":"that differs from ","element":"span"},{"style":{"height":10.98},"width":38.78,"height":27.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-18.png","element":"img","alt":" x∗","inline":true},{"text":", for some constant ","element":"span"},{"style":{"height":11.6},"width":99.23,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-19.png","element":"img","alt":" ϵ > 0.","inline":true}],[{"text":"2. The function ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"is twice differentiable;","element":"span"}],[{"text":"3. 
The function ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"and its first two derivatives are bounded:","element":"span"}],[{"style":{"width":"85%"},"width":802,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-20.png","element":"img"}],[{"text":"for all ","element":"span"},{"style":{"height":11.6},"width":116.31,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-21.png","element":"img","alt":" x ∈ D","inline":true,"padRight":true},{"text":"and some constants ","element":"span"},{"style":{"height":16},"width":172.29,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-22.png","element":"img","alt":" (c0, c1, c2)","inline":true},{"text":". This implies that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"is ","element":"span"},{"style":{"height":9.19},"width":33.25,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-23.png","element":"img","alt":" c1","inline":true},{"text":"-Lipschitz continuous, and ","element":"span"},{"style":{"height":14},"width":37.8,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-24.png","element":"img","alt":" f ′","inline":true,"padRight":true},{"text":"is ","element":"span"},{"style":{"height":9.19},"width":33.24,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-25.png","element":"img","alt":"c2","inline":true},{"text":"-Lipschitz continuous.","element":"span"}],[{"text":"The assumption of a unique maximizer holds with probability one in most non-trivial cases (","element":"span"},{"href":"#id-14","referenceIndex":7,"text":"de Freitas et al.","element":"a"},{"href":"#id-14","referenceIndex":7,"text":", ","element":"a"},{"href":"#id-14","referenceIndex":7,"text":"2012","element":"a"},{"text":"), and 
","element":"span"},{"text":"(","element":"span"},{"href":"#id-29","text":"7","element":"a"},{"text":") simply formally defines the gap to the second-highest peak. Moreover, given twice differentiability, the remaining conditions in (","element":"span"},{"href":"#id-30","text":"8","element":"a"},{"text":") are very mild, only requiring that the function value and its derivatives are bounded, and formally defining the corresponding constants.","element":"span"}],[{"text":"Next, we provide assumptions regarding the derivatives of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"and the resulting Taylor expansions (typically around the optimizer ","element":"span"},{"style":{"height":10.98},"width":38.78,"height":27.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-26.png","element":"img","alt":" x∗","inline":true},{"text":"). We adopt slightly different assumptions for the upper and lower bounds, starting with the former.","element":"span"}],[{"id":"id-34","style":{"fontWeight":"bold"},"text":"Assumption 3. 
","element":"span"},{"text":"There exist constants ","element":"span"},{"style":{"height":16},"width":327.02,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-27.png","element":"img","alt":" δ2 ∈ (0, 1) and ρ0 ∈","inline":true},{"style":{"height":19.37},"width":99.35,"height":48.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-28.png","element":"img","alt":"(0, 1/2)","inline":true},{"text":"such that conditioned on the events in Assumption ","element":"span"},{"href":"#id-31","text":"2","element":"a"},{"text":", we have with probability at least ","element":"span"},{"style":{"height":13.99},"width":103.91,"height":34.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-29.png","element":"img","alt":" 1 − δ2","inline":true,"padRight":true},{"text":"that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"one of the following ","element":"span"},{"text":"is true:","element":"span"}],[{"text":"1. The maximizer is at an endpoint (i.e., ","element":"span"},{"style":{"height":11.38},"width":250.49,"height":28.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-30.png","element":"img","alt":" x∗ = 0 or x∗ =","inline":true,"padRight":true},{"text":"1","element":"span"},{"text":"), and satisfies the following ","element":"span"},{"style":{"fontStyle":"italic"},"text":"locally linear ","element":"span"},{"text":"behavior: For all ","element":"span"},{"style":{"height":16},"width":184.22,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-31.png","element":"img","alt":" ξ ∈ [0, ρ0]","inline":true,"padRight":true},{"text":"(if ","element":"span"},{"style":{"height":10.98},"width":132.28,"height":27.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-32.png","element":"img","alt":" x∗ = 0","inline":true},{"text":") or 
","element":"span"},{"style":{"height":16},"width":215.22,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-33.png","element":"img","alt":" ξ ∈ [−ρ0, 0]","inline":true,"padRight":true},{"text":"(if ","element":"span"},{"style":{"height":10.99},"width":114.25,"height":27.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-34.png","element":"img","alt":"x∗ = 1","inline":true},{"text":"), it holds that","element":"span"}],[{"id":"id-32","style":{"width":"91%"},"width":857,"height":111,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-35.png","element":"img"}],[{"text":"2. The maximizer satisfies ","element":"span"},{"style":{"height":16},"width":483.64,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-36.png","element":"img","alt":" x∗ ∈ (ρ0, 1 − ρ0), and f satis-","inline":true,"padRight":true},{"text":"fies the following ","element":"span"},{"style":{"fontStyle":"italic"},"text":"locally quadratic ","element":"span"},{"text":"behavior: For all ","element":"span"},{"style":{"height":16},"width":366.28,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-37.png","element":"img","alt":"ξ ∈ [−ρ0, ρ0], we have","inline":true}],[{"id":"id-33","style":{"width":"91%"},"width":857,"height":115,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-38.png","element":"img"}],[{"text":"This assumption is near-identical to the main assumption adopted in the noiseless setting (","element":"span"},{"href":"#id-14","referenceIndex":7,"text":"de Freitas et al.","element":"a"},{"href":"#id-14","referenceIndex":7,"text":", ","element":"a"},{"href":"#id-14","referenceIndex":7,"text":"2012","element":"a"},{"text":"), and is also mild given the assumption of twice differentiability. 
Indeed, (","element":"span"},{"href":"#id-32","text":"9","element":"a"},{"text":") and (","element":"span"},{"href":"#id-33","text":"10","element":"a"},{"text":") amount to standard Taylor expansions, with the assumptions ","element":"span"},{"style":{"height":14.78},"width":110.07,"height":36.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-39.png","element":"img","alt":" c1 > 0","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":14.78},"width":110.07,"height":36.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-40.png","element":"img","alt":" c2 > 0","inline":true,"padRight":true},{"text":"only requiring non-vanishing gradient at the endpoint (first case) or non-vanishing second derivative at the function maximizer (second case). These conditions typically hold with probability one (","element":"span"},{"href":"#id-14","referenceIndex":7,"text":"de Freitas et al.","element":"a"},{"href":"#id-14","referenceIndex":7,"text":", ","element":"a"},{"href":"#id-14","referenceIndex":7,"text":"2012","element":"a"},{"text":").","element":"span"}],[{"id":"id-29","text":"The following assumption will be used for the lower bound. ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Assumption 4. 
","element":"span"},{"text":"There exist constants ","element":"span"},{"style":{"height":16},"width":325.37,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-41.png","element":"img","alt":" δ′2 ∈ (0, 1) and ρ0 ∈","inline":true},{"style":{"height":19.37},"width":99.35,"height":48.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-42.png","element":"img","alt":"(0, 1/2)","inline":true},{"text":"such that conditioned on the events in Assumption ","element":"span"},{"href":"#id-31","text":"2","element":"a"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"both of the following ","element":"span"},{"text":"hold with probability at least ","element":"span"},{"style":{"height":15.56},"width":115.22,"height":38.9,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-43.png","element":"img","alt":" 1 − δ′2:","inline":true}],[{"text":"1. For any ","element":"span"},{"style":{"height":16},"width":734.94,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-44.png","element":"img","alt":" x ∈ D and ξ ∈ [−ρ0, ρ0] for which x+ξ ∈ D,","inline":true,"padRight":true},{"text":"we have","element":"span"}],[{"id":"id-30","style":{"width":"91%"},"width":857,"height":141,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-45.png","element":"img"}],[{"text":"2. 
The maximizer satisfies ","element":"span"},{"style":{"height":16},"width":483.64,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-46.png","element":"img","alt":" x∗ ∈ (ρ0, 1 − ρ0), and f satis-","inline":true,"padRight":true},{"text":"fies the following for all ","element":"span"},{"style":{"height":16},"width":226.78,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-47.png","element":"img","alt":" ξ ∈ [−ρ0, ρ0]:","inline":true}],[{"id":"id-92","style":{"width":"91%"},"width":857,"height":115,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/2-48.png","element":"img"}],[{"id":"id-36","style":{"width":"97%"},"width":915,"height":716,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/3-0.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Figure 1. ","element":"figcaption","subtype":"caption"},{"text":"Illustration of some of the main assumptions: The function is bounded within ","element":"figcaption","subtype":"caption"},{"style":{"height":14.4},"width":130.42,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/3-1.png","element":"img","alt":" [−c0, c0]","inline":true,"padRight":true},{"text":"and its derivative within ","element":"figcaption","subtype":"caption"},{"style":{"height":14.4},"width":139.66,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/3-2.png","element":"img","alt":" [−c1, c1],","inline":true,"padRight":true},{"text":"the gap to the second highest peak is at least ","element":"figcaption","subtype":"caption"},{"style":{"height":0},"width":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/3-3.png","element":"img","alt":" ϵ","inline":true},{"text":", and the function is locally quadratic for points within a distance 
","element":"figcaption","subtype":"caption"},{"style":{"height":9.6},"width":33.99,"height":24,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/3-4.png","element":"img","alt":" ρ0","inline":true,"padRight":true},{"text":"of the maximizer.","element":"figcaption","subtype":"caption"}],[{"text":"The first part is similar to (","element":"span"},{"href":"#id-33","text":"10","element":"a"},{"text":"), but performs a Taylor expansion around an arbitrary point rather than the specific point ","element":"span"},{"style":{"height":10.98},"width":38.78,"height":27.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/3-5.png","element":"img","alt":"x∗","inline":true},{"text":", and the second part is precisely (","element":"span"},{"href":"#id-33","text":"10","element":"a"},{"text":"). Note, however, that here we are assuming ","element":"span"},{"style":{"fontStyle":"italic"},"text":"both ","element":"span"},{"text":"of two conditions to hold, rather than one of two. Hence, we are implicitly assuming that the first item of Assumption ","element":"span"},{"href":"#id-34","text":"3 ","element":"a"},{"text":"does ","element":"span"},{"style":{"fontStyle":"italic"},"text":"not ","element":"span"},{"text":"have a significant probability of occurring. For stationary kernels, the only situations where an endpoint has a high probability of being optimal are those where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"varies very slowly (e.g., the SE kernel with a larger lengthscale than the domain width). 
Such functions are of limited practical interest.","element":"span"}],[{"text":"Similarly to the noiseless setting (","element":"span"},{"href":"#id-14","referenceIndex":7,"text":"de Freitas et al.","element":"a"},{"href":"#id-14","referenceIndex":7,"text":", ","element":"a"},{"href":"#id-14","referenceIndex":7,"text":"2012","element":"a"},{"text":"), all of the above assumptions hold for the SE kernel, as well as the Matérn-","element":"span"},{"style":{"height":6.8},"width":21,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/3-6.png","element":"img","alt":"ν","inline":true,"padRight":true},{"text":"kernel with ","element":"span"},{"style":{"height":11.6},"width":98.03,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/3-7.png","element":"img","alt":" ν > 2","inline":true},{"text":", with the added caveat that ","element":"span"},{"style":{"height":15.56},"width":33.71,"height":38.89,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/3-8.png","element":"img","alt":" δ′2","inline":true,"padRight":true},{"text":"in Assumption ","element":"span"},{"href":"#id-35","text":"4 ","element":"a"},{"text":"is a function of the lengthscale ","element":"span"},{"text":"and cannot be chosen arbitrarily. Specifically, a smaller lengthscale implies a smaller value of ","element":"span"},{"style":{"height":15.56},"width":33.71,"height":38.89,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/3-9.png","element":"img","alt":" δ′2","inline":true},{"text":". 
In contrast, ","element":"span"},{"style":{"height":13.99},"width":101.5,"height":34.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/3-10.png","element":"img","alt":" δ1 and","inline":true},{"style":{"height":13.99},"width":33.72,"height":34.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/3-11.png","element":"img","alt":"δ2","inline":true,"padRight":true},{"text":"in Assumptions ","element":"span"},{"href":"#id-31","text":"2 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-34","text":"3 ","element":"a"},{"text":"can be made arbitrarily small by suitably changing the constants ","element":"span"},{"style":{"height":10},"width":237.85,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/3-12.png","element":"img","alt":" ϵ, c0, c1, c2, ρ0","inline":true},{"text":", and so on.","element":"span"}],[{"text":"An illustration of some of the main assumptions and their associated constants is given in Figure ","element":"span"},{"href":"#id-36","text":"1","element":"a"},{"text":".","element":"span"}]]},{"heading":"3. Upper Bound","paragraphs":[[{"text":"Our upper bound is formally stated as follows.","element":"span"}],[{"id":"id-37","style":{"fontWeight":"bold"},"text":"Theorem 1. 
","element":"span"},{"text":"(Upper Bound) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Consider the problem of BO in one dimension described in Section ","element":"span"},{"href":"#id-24","style":{"fontStyle":"italic"},"text":"2.1","element":"a"},{"style":{"fontStyle":"italic"},"text":", with time horizon ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and noise variance ","element":"span"},{"style":{"height":19.08},"width":627.98,"height":47.7,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/3-13.png","element":"img","alt":" σ2 satisfying σ2 ≥ cσT 1−ζ for some cσ >","inline":true,"padRight":true},{"text":"0 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"height":14},"width":97.13,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/3-14.png","element":"img","alt":" ζ > 0","inline":true},{"style":{"fontStyle":"italic"},"text":". 
Under Assumptions ","element":"span"},{"href":"#id-32","style":{"fontStyle":"italic"},"text":"1","element":"a"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"href":"#id-31","style":{"fontStyle":"italic"},"text":"2","element":"a"},{"style":{"fontStyle":"italic"},"text":", and ","element":"span"},{"href":"#id-34","style":{"fontStyle":"italic"},"text":"3","element":"a"},{"style":{"fontStyle":"italic"},"text":", there exists an algorithm satisfying the following: With probability at least ","element":"span"},{"style":{"height":13.99},"width":199.8,"height":34.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/3-15.png","element":"img","alt":" 1 − δ1 − δ2","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"(with respect to the Gaussian process ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"style":{"fontStyle":"italic"},"text":"), the average cumulative regret (averaged over the noisy","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"samples) satisfies","element":"span"}],[{"id":"id-48","style":{"width":"76%"},"width":720,"height":73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/3-16.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Here ","element":"span"},{"style":{"height":13.99},"width":33.71,"height":34.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/3-17.png","element":"img","alt":" δ1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"height":13.99},"width":33.71,"height":34.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/3-18.png","element":"img","alt":" δ2","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"are defined in Assumptions ","element":"span"},{"href":"#id-31","style":{"fontStyle":"italic"},"text":"2 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"and 
","element":"span"},{"href":"#id-34","style":{"fontStyle":"italic"},"text":"3","element":"a"},{"style":{"fontStyle":"italic"},"text":", and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"C ","element":"span"},{"style":{"fontStyle":"italic"},"text":"depends only on the constants therein and ","element":"span"},{"style":{"height":16},"width":117.79,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/3-19.png","element":"img","alt":" (cσ, ζ).","inline":true}],[{"text":"The assumption that ","element":"span"},{"style":{"height":19.08},"width":180.9,"height":47.7,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/3-20.png","element":"img","alt":" σ2 ≥ cσT 1−ζ","inline":true,"padRight":true},{"text":"for some ","element":"span"},{"style":{"height":16},"width":108.29,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/3-21.png","element":"img","alt":" (cσ, ζ)","inline":true,"padRight":true},{"text":"is very ","element":"span"},{"text":"mild, since typically ","element":"span"},{"style":{"height":13.39},"width":40.2,"height":33.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/3-22.png","element":"img","alt":" σ2","inline":true,"padRight":true},{"text":"is constant with respect to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T","element":"span"},{"text":". 
The proof of Theorem ","element":"span"},{"href":"#id-37","text":"1 ","element":"a"},{"text":"extends immediately to a high probability guarantee with respect to both ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"and the noisy samples (i.e., holding with probability ","element":"span"},{"style":{"height":13.99},"width":473.03,"height":34.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/3-23.png","element":"img","alt":" 1−δ1−δ2−δ for δ in Lemma","inline":true,"padRight":true},{"href":"#id-38","text":"1 ","element":"a"},{"text":"below). We have stated the above form for consistency with the lower bound, which will be given in Section ","element":"span"},{"href":"#id-25","text":"4","element":"a"},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"3.1. High-Level Description of the Algorithm","element":"span"}],[{"text":"The algorithm considered in the proof of Theorem ","element":"span"},{"href":"#id-37","text":"1 ","element":"a"},{"text":"is described informally in Algorithm ","element":"span"},{"href":"#id-39","text":"1","element":"a"},{"text":"; the details will be established throughout the proof of Theorem ","element":"span"},{"href":"#id-37","text":"1","element":"a"},{"text":", and a complete description is given in Appendix ","element":"span"},{"href":"#id-40","text":"B","element":"a"},{"text":".","element":"span"}],[{"id":"id-39","style":{"width":"100%"},"width":939,"height":1237,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/3-24.png","element":"img"}],[{"text":"As in the noiseless setting (","element":"span"},{"href":"#id-14","referenceIndex":7,"text":"de Freitas et al.","element":"a"},{"href":"#id-14","referenceIndex":7,"text":", ","element":"a"},{"href":"#id-14","referenceIndex":7,"text":"2012","element":"a"},{"text":"), the idea is to operate in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"epochs ","element":"span"},{"text":"and 
sample a set of increasingly closely-packed points ","element":"span"},{"style":{"height":16.48},"width":63.22,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/3-25.png","element":"img","alt":" L(i)","inline":true,"padRight":true},{"text":"to reduce the posterior variance, but only within a set of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"potential maximizers ","element":"span"},{"text":"that are updated according to the confidence bounds. As a simple means of bringing the effective noise level down, we perform ","element":"span"},{"style":{"fontStyle":"italic"},"text":"resampling","element":"span"},{"text":", i.e., ","element":"span"},{"text":"sampling the same point ","element":"span"},{"style":{"height":16.48},"width":69.58,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-0.png","element":"img","alt":" K(i)","inline":true,"padRight":true},{"text":"times consecutively. In each epoch, we sample enough to be able to produce upper and lower confidence bounds ","element":"span"},{"style":{"height":16},"width":378.92,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-1.png","element":"img","alt":" UCBt(x) and LCBt(x)","inline":true,"padRight":true},{"text":"that differ by at most a target value ","element":"span"},{"style":{"height":16.48},"width":75.44,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-2.png","element":"img","alt":" 2η(i)","inline":true,"padRight":true},{"text":"within ","element":"span"},{"style":{"height":16.48},"width":115.2,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-3.png","element":"img","alt":" M(i−1)","inline":true},{"text":", and then the target is halved for the next epoch.","element":"span"}],[{"text":"We do not expect our algorithm to perform well in practice by any means, but it still suffices for our purposes in establishing 
","element":"span"},{"style":{"height":16.84},"width":218.61,"height":42.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-4.png","element":"img","alt":" O(√T log T)","inline":true,"padRight":true},{"text":"regret. Indeed, we have made no attempt to optimize the corresponding constant factors, and doing so would require more sophisticated techniques. Moreover, the quantities ","element":"span"},{"style":{"height":16.48},"width":283.92,"height":41.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-5.png","element":"img","alt":" L(i), K(i), UCBt","inline":true},{"text":", and ","element":"span"},{"style":{"height":13.19},"width":93.91,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-6.png","element":"img","alt":" LCBt","inline":true,"padRight":true},{"text":"in Algorithm ","element":"span"},{"href":"#id-39","text":"1 ","element":"a"},{"text":"are chosen as functions of both the kernel and the constants appearing in our assumptions, which limits the algorithm’s practical utility even further. 
Note, however, that these constants are merely a function of the kernel, and that suitable bounds suffice in place of exact values (e.g., lower bound on ","element":"span"},{"style":{"height":10},"width":36.6,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-7.png","element":"img","alt":" ρ0","inline":true},{"text":", upper bound on ","element":"span"},{"style":{"height":14},"width":134.74,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-8.png","element":"img","alt":" c0, etc.).","inline":true}],[{"text":"While our algorithm assumes a known time horizon ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T ","element":"span"},{"text":"(which is used when selecting ","element":"span"},{"style":{"height":16.48},"width":69.58,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-9.png","element":"img","alt":" K(i)","inline":true},{"text":"; see Appendix ","element":"span"},{"href":"#id-40","text":"B","element":"a"},{"text":"), this ","element":"span"},{"id":"id-46","text":"assumption can easily be dropped via a standard doubling ","element":"span"},{"text":"trick. The details are given in Appendix ","element":"span"},{"href":"#id-41","text":"A","element":"a"},{"text":".","element":"span"}],[{"id":"id-44","style":{"fontWeight":"bold"},"text":"3.2. Auxiliary Lemmas","element":"span"}],[{"text":"Here we present two very standard auxiliary lemmas. We begin with a simpler version of the conditions of Srinivas ","element":"span"},{"style":{"fontStyle":"italic"},"text":"et al. 
","element":"span"},{"text":"(","element":"span"},{"href":"#id-1","referenceIndex":23,"text":"Srinivas et al.","element":"a"},{"href":"#id-1","referenceIndex":23,"text":", ","element":"a"},{"href":"#id-1","referenceIndex":23,"text":"2010","element":"a"},{"text":") guaranteeing that the posterior mean and variance provide valid confidence bounds with high probability.","element":"span"},{"text":"1 ","element":"span"},{"text":"The reason for being slightly simpler is that we are considering a fixed time horizon.","element":"span"}],[{"id":"id-38","style":{"fontWeight":"bold"},"text":"Lemma 1. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Fix ","element":"span"},{"style":{"height":16},"width":156.99,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-10.png","element":"img","alt":" δ ∈ (0, 1)","inline":true},{"style":{"fontStyle":"italic"},"text":". For any finite set of points ","element":"span"},{"style":{"height":13.2},"width":113.62,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-11.png","element":"img","alt":" L ⊆ D","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and time horizon ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T","element":"span"},{"style":{"fontStyle":"italic"},"text":", under the choice ","element":"span"},{"style":{"height":21.63},"width":266.98,"height":54.07,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-12.png","element":"img","alt":" βT = 2 log |L|·Tδ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":", it holds that","element":"span"}],[{"id":"id-74","style":{"width":"96%"},"width":904,"height":97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-13.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"with probability at least 
","element":"span"},{"style":{"height":12},"width":97.85,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-14.png","element":"img","alt":" 1 − δ.","inline":true}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"It was shown in (","element":"span"},{"href":"#id-1","referenceIndex":23,"text":"Srinivas et al.","element":"a"},{"href":"#id-1","referenceIndex":23,"text":", ","element":"a"},{"href":"#id-1","referenceIndex":23,"text":"2010","element":"a"},{"text":") that for fixed ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":", the event ","element":"span"},{"style":{"height":21.37},"width":461.3,"height":53.42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-15.png","element":"img","alt":" |f(x) − µt(x)| ≤ β1/2T σt(x)","inline":true,"padRight":true},{"text":"holds with probability at least ","element":"span"},{"style":{"height":14.18},"width":184.47,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-16.png","element":"img","alt":" 1 − e−βT /2","inline":true},{"text":". 
The lemma follows by substituting the choice of ","element":"span"},{"style":{"height":14.4},"width":45.54,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-17.png","element":"img","alt":" βT","inline":true,"padRight":true},{"text":"and taking the union bound over the ","element":"span"},{"style":{"height":16},"width":395.77,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-18.png","element":"img","alt":" |L| · T values of x and t.","inline":true}],[{"text":"The following lemma is also standard, and has been used (implicitly or explicitly) in the study of multiple algorithms that eliminate suboptimal points based on confidence bounds (","element":"span"},{"href":"#id-14","referenceIndex":7,"text":"de Freitas et al.","element":"a"},{"href":"#id-14","referenceIndex":7,"text":", ","element":"a"},{"href":"#id-14","referenceIndex":7,"text":"2012","element":"a"},{"text":"; ","element":"span"},{"href":"#id-10","referenceIndex":5,"text":"Contal et al.","element":"a"},{"href":"#id-10","referenceIndex":5,"text":", ","element":"a"},{"href":"#id-10","referenceIndex":5,"text":"2013","element":"a"},{"text":"; ","element":"span"},{"href":"#id-17","referenceIndex":2,"text":"Bogunovic et al.","element":"a"},{"href":"#id-17","referenceIndex":2,"text":", ","element":"a"},{"href":"#id-17","referenceIndex":2,"text":"2016b","element":"a"},{"text":"). For completeness, we give a short proof.","element":"span"}],[{"id":"id-47","style":{"fontWeight":"bold"},"text":"Lemma 2. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Suppose that at time ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"style":{"fontStyle":"italic"},"text":", for all ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x ","element":"span"},{"style":{"fontStyle":"italic"},"text":"within a set of points ","element":"span"},{"style":{"height":13.2},"width":113.62,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-19.png","element":"img","alt":"L̃ ⊆ D","inline":true},{"style":{"fontStyle":"italic"},"text":", it holds that","element":"span"}],[{"id":"id-42","style":{"width":"75%"},"width":713,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-20.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"for some bounds ","element":"span"},{"style":{"height":13.59},"width":431.88,"height":33.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-21.png","element":"img","alt":" UCBt and LCBt such that","inline":true}],[{"id":"id-45","style":{"width":"80%"},"width":751,"height":110,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-22.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Then any point ","element":"span"},{"style":{"height":11.6},"width":99.69,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-23.png","element":"img","alt":" x ∈ L̃","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"satisfying ","element":"span"},{"style":{"height":18.19},"width":414.52,"height":45.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-24.png","element":"img","alt":" f(x) < maxx′∈ L̃ f(x′) 
−","inline":true},{"style":{"height":14.4},"width":39.92,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-25.png","element":"img","alt":"4η","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"must also satisfy","element":"span"}],[{"style":{"width":"73%"},"width":693,"height":67,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-26.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"That is, any ","element":"span"},{"style":{"height":14.4},"width":39.92,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-27.png","element":"img","alt":" 4η","inline":true},{"style":{"fontStyle":"italic"},"text":"-suboptimal point can be ruled out according to the confidence bounds ","element":"span"},{"text":"(","element":"span"},{"href":"#id-42","text":"15","element":"a"},{"text":")","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"Proof. 
","element":"span"},{"text":"We have","element":"span"}],[{"id":"id-43","style":{"width":"78%"},"width":741,"height":365,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-28.png","element":"img"}],[{"text":"where (","element":"span"},{"href":"#id-43","text":"18","element":"a"},{"text":") and (","element":"span"},{"href":"#id-44","text":"22","element":"a"},{"text":") follow from (","element":"span"},{"href":"#id-45","text":"16","element":"a"},{"text":"), (","element":"span"},{"href":"#id-43","text":"19","element":"a"},{"text":") and (","element":"span"},{"href":"#id-44","text":"21","element":"a"},{"text":") follow from the confidence bounds in (","element":"span"},{"href":"#id-42","text":"15","element":"a"},{"text":"), and (","element":"span"},{"href":"#id-46","text":"20","element":"a"},{"text":") follows from the assumption ","element":"span"},{"style":{"height":18.19},"width":474.51,"height":45.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-29.png","element":"img","alt":" f(x) < maxx′∈ L̃ f(x′) − 4η.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"3.3. Outline of Proof of Theorem ","element":"span"},{"href":"#id-37","style":{"fontWeight":"bold"},"text":"1","element":"a"}],[{"text":"Here we provide a high-level outline of the Proof of Theorem ","element":"span"},{"href":"#id-37","text":"1","element":"a"},{"text":"; the details are given in Appendix ","element":"span"},{"href":"#id-40","text":"B","element":"a"},{"text":".","element":"span"}],[{"text":"Algorithm ","element":"span"},{"href":"#id-39","text":"1 ","element":"a"},{"text":"only samples on a discrete sub-domain ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L","element":"span"},{"text":". 
This set is chosen to be a set of regularly-spaced points that are fine enough to ensure that the cumulative regret with respect to ","element":"span"},{"style":{"height":16},"width":222.5,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-30.png","element":"img","alt":" maxx∈L f(x)","inline":true,"padRight":true},{"text":"is within a constant value of the cumulative regret with respect to ","element":"span"},{"style":{"height":16},"width":227.33,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-31.png","element":"img","alt":" maxx∈D f(x)","inline":true},{"text":". Working with the finite set ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L ","element":"span"},{"text":"helps to simplify the subsequent analysis.","element":"span"}],[{"text":"We split the epochs into two classes, which we call ","element":"span"},{"style":{"fontStyle":"italic"},"text":"early epochs ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"late epochs","element":"span"},{"text":". The late epochs are those in which we have shrunk the potential maximizers down enough to be entirely within the locally quadratic region, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"cf.","element":"span"},{"text":", Figure ","element":"span"},{"href":"#id-36","text":"1","element":"a"},{"text":"; here we only discuss the second case of Assumption ","element":"span"},{"href":"#id-34","text":"3","element":"a"},{"text":", which is the more interesting of the two. 
Since the width of the locally quadratic region is constant, we can show that this occurs after a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"finite number of epochs","element":"span"},{"text":", each lasting for at most ","element":"span"},{"style":{"fontStyle":"italic"},"text":"O","element":"span"},{"text":"(log ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T","element":"span"},{"text":") ","element":"span"},{"text":"time. Hence, even if we naively upper bound the instant regret by ","element":"span"},{"style":{"height":13.19},"width":53.16,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/4-32.png","element":"img","alt":" 2c0","inline":true,"padRight":true},{"text":"according to (","element":"span"},{"href":"#id-30","text":"8","element":"a"},{"text":"), the overall regret incurred within the early epochs is insignificant.","element":"span"}],[{"text":"In the later epochs, we exploit the locally quadratic behavior to show that the set of potential maximizers shrinks rapidly, i.e., by a constant factor after each epoch. 
As a result, we can let the repeatedly-sampled set ","element":"span"},{"style":{"height":16.48},"width":63.22,"height":41.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-0.png","element":"img","alt":" L(i)","inline":true,"padRight":true},{"text":"in Algorithm ","element":"span"},{"href":"#id-39","text":"1 ","element":"a"},{"text":"lie within a given interval that similarly shrinks, thereby controlling the number of samples we need to take in the epoch.","element":"span"}],[{"text":"By Lemma ","element":"span"},{"href":"#id-47","text":"2","element":"a"},{"text":", after we attain uniform ","element":"span"},{"style":{"height":12.48},"width":55.52,"height":31.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-1.png","element":"img","alt":" η(i)","inline":true},{"text":"-confidence, the instant regret incurred at each time thereafter is at most ","element":"span"},{"style":{"height":16.48},"width":87.89,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-2.png","element":"img","alt":" 4η(i).","inline":true,"padRight":true},{"text":"Using the fact that ","element":"span"},{"style":{"height":18.66},"width":237.58,"height":46.66,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-3.png","element":"img","alt":" η(i) = η(0)2−i","inline":true,"padRight":true},{"text":"and summing over the epochs, we find that the overall regret behaves as in (","element":"span"},{"href":"#id-48","text":"13","element":"a"},{"text":").","element":"span"}],[{"text":"A notable difficulty that we omitted above is how we attain the confidence bounds in order to update the potential maximizers ","element":"span"},{"style":{"height":16.48},"width":74.39,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-4.png","element":"img","alt":" M(i)","inline":true},{"text":". 
While we directly apply Lemma ","element":"span"},{"href":"#id-38","text":"1 ","element":"a"},{"text":"for the points that were repeatedly sampled, we found it difficult to do this for the non-sampled points. For those, we instead use Lipschitz properties of the function. In the early epochs, we use the global Lipschitz constant ","element":"span"},{"style":{"height":9.19},"width":33.24,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-5.png","element":"img","alt":" c1","inline":true,"padRight":true},{"text":"from Assumption ","element":"span"},{"href":"#id-31","text":"2","element":"a"},{"text":", whereas in the later epochs, we find a considerably smaller Lipschitz constant due to the locally quadratic behavior.","element":"span"}]]},{"heading":"4. Lower Bound","paragraphs":[[{"text":"Our lower bound is formally stated as follows.","element":"span"}],[{"id":"id-60","style":{"fontWeight":"bold"},"text":"Theorem 2. ","element":"span"},{"text":"(Lower Bound) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Consider the one-dimensional BO problem from Section ","element":"span"},{"href":"#id-24","style":{"fontStyle":"italic"},"text":"2.1","element":"a"},{"style":{"fontStyle":"italic"},"text":", with time horizon ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and noise variance ","element":"span"},{"style":{"height":17.33},"width":788.85,"height":43.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-6.png","element":"img","alt":" σ2 satisfying σ2 ≤ c′σT 1−ζ′ for some c′σ > 0 and","inline":true},{"style":{"height":14},"width":119.63,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-7.png","element":"img","alt":"ζ′ > 0","inline":true},{"style":{"fontStyle":"italic"},"text":". 
Under Assumptions ","element":"span"},{"href":"#id-32","style":{"fontStyle":"italic"},"text":"1","element":"a"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"href":"#id-31","style":{"fontStyle":"italic"},"text":"2","element":"a"},{"style":{"fontStyle":"italic"},"text":", and ","element":"span"},{"href":"#id-35","style":{"fontStyle":"italic"},"text":"4","element":"a"},{"style":{"fontStyle":"italic"},"text":", any algorithm must yield the following: With probability at least","element":"span"},{"text":"2 ","element":"span"},{"style":{"height":13.99},"width":137.13,"height":34.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-8.png","element":"img","alt":"1 − δ1 −","inline":true},{"style":{"height":15.56},"width":33.71,"height":38.9,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-9.png","element":"img","alt":"δ′2","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"(with respect to the Gaussian process ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"style":{"fontStyle":"italic"},"text":"), the average ","element":"span"},{"style":{"fontStyle":"italic"},"text":"cumulative regret (averaged over the noisy samples) satisfies","element":"span"}],[{"id":"id-49","style":{"width":"72%"},"width":676,"height":73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-10.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Here ","element":"span"},{"style":{"height":15.56},"width":148.78,"height":38.89,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-11.png","element":"img","alt":" δ1 and δ′2 ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"are defined in Assumptions ","element":"span"},{"href":"#id-31","style":{"fontStyle":"italic"},"text":"2 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"and 
","element":"span"},{"href":"#id-35","style":{"fontStyle":"italic"},"text":"4","element":"a"},{"style":{"fontStyle":"italic"},"text":", and ","element":"span"},{"style":{"height":10.8},"width":45.33,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-12.png","element":"img","alt":" C′","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"depends only on the constants therein and ","element":"span"},{"style":{"height":16},"width":128.97,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-13.png","element":"img","alt":" (c′σ, ζ′).","inline":true}],[{"text":"The assumption that ","element":"span"},{"style":{"height":17.33},"width":245.84,"height":43.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-14.png","element":"img","alt":" σ2 ≤ c′σT 1−ζ′","inline":true,"padRight":true},{"text":"for some ","element":"span"},{"style":{"height":16},"width":119.47,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-15.png","element":"img","alt":" (c′σ, ζ′)","inline":true,"padRight":true},{"text":"is ","element":"span"},{"id":"id-93","text":"very mild, since typically ","element":"span"},{"style":{"height":13.38},"width":40.2,"height":33.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-16.png","element":"img","alt":" σ2","inline":true,"padRight":true},{"text":"is constant with respect to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T","element":"span"},{"text":". ","element":"span"},{"id":"id-94","text":"The assumption is required to avoid ","element":"span"},{"text":"(","element":"span"},{"href":"#id-49","text":"23","element":"a"},{"text":") contradicting the trivial ","element":"span"},{"style":{"fontStyle":"italic"},"text":"O","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"T","element":"span"},{"text":") ","element":"span"},{"text":"upper bound. 
We also note that Theorem ","element":"span"},{"href":"#id-50","text":"2 ","element":"a"},{"text":"immediately implies an ","element":"span"},{"style":{"height":19.79},"width":220.14,"height":49.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-17.png","element":"img","alt":" Ω(1 + σ√T)","inline":true},{"text":"lower bound on the expected regret ","element":"span"},{"style":{"height":16},"width":104,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-18.png","element":"img","alt":" E[RT ]","inline":true,"padRight":true},{"text":"with respect to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"both ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"and the noisy samples, as long as ","element":"span"},{"style":{"height":15.56},"width":246.43,"height":38.9,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-19.png","element":"img","alt":" 1−δ1 −δ′2 > 0","inline":true},{"text":". As discussed following ","element":"span"},{"text":"Assumption ","element":"span"},{"href":"#id-35","text":"4","element":"a"},{"text":", the latter condition is mild.","element":"span"}],[{"text":"In the remainder of the section, we introduce some of the main tools and ideas, and then outline the proof. We note that ","element":"span"},{"style":{"height":16},"width":245.68,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-20.png","element":"img","alt":" E[RT ] = Ω(1)","inline":true,"padRight":true},{"text":"is trivial, as the average regret of the first sample alone is lower bounded by a constant. 
As a result, we only need to show that ","element":"span"},{"style":{"height":18.3},"width":313.21,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-21.png","element":"img","alt":" E[RT ] = Ω(σ√T).","inline":true}],[{"id":"id-51","style":{"width":"83%"},"width":779,"height":628,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-22.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Figure 2. ","element":"figcaption","subtype":"caption"},{"text":"Examples of functions ","element":"figcaption","subtype":"caption"},{"style":{"height":13.2},"width":148.08,"height":32.99,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-23.png","element":"img","alt":" f+ and f−","inline":true,"padRight":true},{"text":"considered in the lower bound. The two are identical up to a small horizontal shift.","element":"figcaption","subtype":"caption"}],[{"id":"id-66","style":{"fontWeight":"bold"},"text":"4.1. Reduction to Binary Hypothesis Testing","element":"span"}],[{"id":"id-25","text":"Recall that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"is a one-dimensional GP on ","element":"span"},{"text":"[0","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"1] ","element":"span"},{"text":"with a stationary kernel ","element":"span"},{"style":{"height":16},"width":370.54,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-24.png","element":"img","alt":" k(x, x′). We fix ∆ > 0","inline":true},{"text":", and think of the GP as being generated by the following procedure:","element":"span"}],[{"id":"id-50","text":"1. 
","element":"span"},{"text":"Generate a GP ","element":"span"},{"style":{"height":14},"width":35.51,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-25.png","element":"img","alt":" f0","inline":true,"padRight":true},{"text":"with the same kernel on the larger domain ","element":"span"},{"style":{"height":16},"width":216.9,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-26.png","element":"img","alt":" [−∆, 1 + ∆];","inline":true}],[{"text":"2. Randomly shift ","element":"span"},{"style":{"height":15.2},"width":607.8,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-27.png","element":"img","alt":" f0 along the x-axis by +∆ or −∆ with","inline":true,"padRight":true},{"text":"equal probability, to obtain ","element":"span"},{"style":{"height":14},"width":34.8,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-28.png","element":"img","alt":"f̃;","inline":true}],[{"text":"3. 
Let ","element":"span"},{"style":{"height":16},"width":435.85,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-29.png","element":"img","alt":" f(x) = f̃(x) for x ∈ [0, 1].","inline":true}],[{"text":"Since the kernel is stationary, the shifting does not affect the distribution, so the induced distribution of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"is indeed the desired GP on ","element":"span"},{"text":"[0","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"1]","element":"span"},{"text":".","element":"span"}],[{"text":"We consider a genie argument in which ","element":"span"},{"style":{"height":14},"width":35.51,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-30.png","element":"img","alt":" f0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is revealed to the algorithm","element":"span"},{"text":". Clearly this additional information can only help the algorithm, so any lower bound still remains valid for the original setting. Stated differently, the algorithm knows that","element":"span"}],[{"style":{"width":"99%"},"width":935,"height":182,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-31.png","element":"img"}],[{"text":"See Figure ","element":"span"},{"href":"#id-51","text":"2 ","element":"a"},{"text":"for an illustrative example.","element":"span"}],[{"text":"This argument allows us to reduce the BO problem to a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"binary hypothesis test ","element":"span"},{"text":"with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"adaptive sampling","element":"span"},{"text":", as depicted in Figure ","element":"span"},{"href":"#id-52","text":"3","element":"a"},{"text":". 
The hypothesis, indexed by ","element":"span"},{"style":{"height":16},"width":303.47,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-32.png","element":"img","alt":" v ∈ {−, +}, is that","inline":true,"padRight":true},{"text":"the underlying function is ","element":"span"},{"style":{"height":14},"width":35.51,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-33.png","element":"img","alt":" fv","inline":true},{"text":". We show that under a suitable choice of ","element":"span"},{"style":{"height":11.6},"width":33,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-34.png","element":"img","alt":" ∆","inline":true},{"text":", achieving small cumulative regret means that we can construct a decision rule ","element":"span"},{"style":{"height":18.83},"width":430.71,"height":47.07,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/5-35.png","element":"img","alt":"ˆV (x) such that ˆV = v with","inline":true,"padRight":true},{"text":"high probability, i.e., the hypothesis test is successful. The contrapositive statement is then that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"if the hypothesis test cannot be successful, we cannot achieve small cumulative regret","element":"span"},{"text":", from which it only remains to prove the former. This","element":"span"}],[{"id":"id-52","style":{"width":"64%"},"width":1265,"height":399,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-0.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Figure 3. ","element":"figcaption","subtype":"caption"},{"text":"Illustration of reduction from optimization to binary hypothesis testing. 
The gray boxes are considered to be fixed, whereas the white boxes are introduced for the purpose of proving the lower bound.","element":"figcaption","subtype":"caption"}],[{"text":"idea was used previously for stochastic convex optimization in (","element":"span"},{"href":"#id-26","referenceIndex":16,"text":"Raginsky & Rakhlin","element":"a"},{"href":"#id-26","referenceIndex":16,"text":", ","element":"a"},{"href":"#id-26","referenceIndex":16,"text":"2011","element":"a"},{"text":").","element":"span"}],[{"id":"id-59","text":"In the remainder of the analysis, we implicitly condition on ","element":"span"},{"text":"an arbitrary realization of ","element":"span"},{"style":{"height":14},"width":35.51,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-1.png","element":"img","alt":" f0","inline":true},{"text":", meaning that all expectations and probabilities are only with respect to the random index ","element":"span"},{"style":{"fontStyle":"italic"},"text":"V ","element":"span"},{"text":"and/or the noise. We also assume that ","element":"span"},{"style":{"height":14},"width":35.51,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-2.png","element":"img","alt":" f0","inline":true,"padRight":true},{"text":"satisfies the conditions in Assumptions ","element":"span"},{"href":"#id-32","text":"1","element":"a"},{"text":", ","element":"span"},{"href":"#id-31","text":"2","element":"a"},{"text":", and ","element":"span"},{"href":"#id-35","text":"4","element":"a"},{"text":", which holds with probability at least ","element":"span"},{"style":{"height":15.56},"width":191.27,"height":38.89,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-3.png","element":"img","alt":" 1 − δ1 − δ′2","inline":true},{"text":". 
For sufficiently small ","element":"span"},{"style":{"height":11.6},"width":33,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-4.png","element":"img","alt":" ∆","inline":true},{"text":", ","element":"span"},{"text":"the same assumptions are directly inherited by ","element":"span"},{"style":{"height":14.79},"width":44.51,"height":36.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-5.png","element":"img","alt":" f+","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":14},"width":44.51,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-6.png","element":"img","alt":" f−","inline":true},{"text":". We henceforth assume that ","element":"span"},{"style":{"height":11.6},"width":33,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-7.png","element":"img","alt":" ∆","inline":true,"padRight":true},{"text":"is indeed sufficiently small; we will verify that this is the case when we set its value.","element":"span"}],[{"text":"We introduce some further notation. 
Letting ","element":"span"},{"style":{"height":16.52},"width":118.78,"height":41.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-8.png","element":"img","alt":" x∗+, x∗−","inline":true},{"text":", and ","element":"span"},{"style":{"height":14.94},"width":38.78,"height":37.35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-9.png","element":"img","alt":"x∗0","inline":true,"padRight":true},{"text":"denote the maximizers of ","element":"span"},{"style":{"height":14.79},"width":117.01,"height":36.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-10.png","element":"img","alt":" f+, f−","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":14},"width":35.51,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-11.png","element":"img","alt":" f0","inline":true,"padRight":true},{"text":"(which are ","element":"span"},{"text":"unique by Assumption ","element":"span"},{"href":"#id-31","text":"2","element":"a"},{"text":"), we see that Assumption ","element":"span"},{"href":"#id-35","text":"4 ","element":"a"},{"text":"ensures these are in the interior ","element":"span"},{"text":"(0","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"1)","element":"span"},{"text":", and hence the optimal values coincide: ","element":"span"},{"style":{"height":17.54},"width":587.94,"height":43.86,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-12.png","element":"img","alt":" f+(x∗+) = f−(x∗−) = f0(x∗0) =: f ∗","inline":true},{"text":". 
To ","element":"span"},{"text":"simplify some of the notation, instead of working with these functions directly, we consider the equivalent problem of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"minimizing the corresponding regret functions","element":"span"},{"text":":","element":"span"}],[{"style":{"width":"68%"},"width":645,"height":101,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-13.png","element":"img"}],[{"text":"Indeed, since we assume the algorithm knows ","element":"span"},{"style":{"height":14.4},"width":206.11,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-14.png","element":"img","alt":" f0 and hence","inline":true,"padRight":true},{"text":"also the optimal value ","element":"span"},{"style":{"height":14.18},"width":39.8,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-15.png","element":"img","alt":" f ∗","inline":true},{"text":", it can always choose to transform the samples as ","element":"span"},{"style":{"height":14.19},"width":215.98,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-16.png","element":"img","alt":" y → f ∗ − y","inline":true},{"text":". In this form, we have the convenient normalization ","element":"span"},{"style":{"height":17.54},"width":386.4,"height":43.85,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-17.png","element":"img","alt":" r+(x∗+) = r−(x∗−) = 0.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"4.2. Auxiliary Lemmas","element":"span"}],[{"text":"We first state the following useful properties of ","element":"span"},{"style":{"height":15.19},"width":176.78,"height":37.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-18.png","element":"img","alt":" r+ and r−.","inline":true,"padRight":true},{"id":"id-54","style":{"fontWeight":"bold"},"text":"Lemma 3. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"The functions ","element":"span"},{"style":{"height":10.79},"width":42.98,"height":26.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-19.png","element":"img","alt":" r+","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"height":9.19},"width":42.98,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-20.png","element":"img","alt":" r−","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"constructed above satisfy the following for sufficiently small ","element":"span"},{"style":{"height":11.6},"width":33,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-21.png","element":"img","alt":" ∆","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"under the conditions in Assumptions ","element":"span"},{"href":"#id-31","style":{"fontStyle":"italic"},"text":"2 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"href":"#id-35","style":{"fontStyle":"italic"},"text":"4","element":"a"},{"style":{"fontStyle":"italic"},"text":":","element":"span"}],[{"id":"id-57","style":{"width":"96%"},"width":909,"height":337,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-22.png","element":"img"}],[{"style":{"width":"96%"},"width":908,"height":187,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-23.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Proof. 
","element":"span"},{"text":"See Appendix ","element":"span"},{"href":"#id-53","text":"C","element":"a"},{"text":".","element":"span"}],[{"text":"The first part states that any point can be better than ","element":"span"},{"style":{"height":17.37},"width":84.33,"height":43.42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-24.png","element":"img","alt":" c2∆2","inline":true},{"text":"-optimal for at most one of the two functions, the second part shows that the two functions are close for points near ","element":"span"},{"style":{"height":14.94},"width":38.78,"height":37.35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-25.png","element":"img","alt":"x∗0","inline":true},{"text":", and the third part shows that the instant regret is lower ","element":"span"},{"text":"bounded by a quadratic function.","element":"span"}],[{"text":"The first part of the Lemma ","element":"span"},{"href":"#id-54","text":"3 ","element":"a"},{"text":"allows us to bound the cumulative regret using Fano’s inequality for binary hypothesis testing with adaptive sampling (","element":"span"},{"href":"#id-26","referenceIndex":16,"text":"Raginsky & Rakhlin","element":"a"},{"href":"#id-26","referenceIndex":16,"text":", ","element":"a"},{"href":"#id-26","referenceIndex":16,"text":"2011","element":"a"},{"text":"). This inequality lower bounds the success probability of such a hypothesis test in terms of a mutual information quantity (","element":"span"},{"href":"#id-2","referenceIndex":6,"text":"Cover & Thomas","element":"a"},{"href":"#id-2","referenceIndex":6,"text":", ","element":"a"},{"href":"#id-2","referenceIndex":6,"text":"2001","element":"a"},{"text":"). 
The resulting lower bound on regret is stated in the following; it is worth noting that the consideration of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"cumulative ","element":"span"},{"text":"regret here provides a distinction from the analogous bound on the instant regret in (","element":"span"},{"href":"#id-26","referenceIndex":16,"text":"Raginsky & Rakhlin","element":"a"},{"href":"#id-26","referenceIndex":16,"text":", ","element":"a"},{"href":"#id-26","referenceIndex":16,"text":"2011","element":"a"},{"text":").","element":"span"}],[{"id":"id-56","style":{"fontWeight":"bold"},"text":"Lemma 4. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Under the preceding setup, we have","element":"span"}],[{"id":"id-58","style":{"width":"92%"},"width":865,"height":50,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-26.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"V ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is equiprobable on ","element":"span"},{"style":{"height":16},"width":119.64,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-27.png","element":"img","alt":" {+, −}","inline":true},{"style":{"fontStyle":"italic"},"text":", and ","element":"span"},{"text":"(","element":"span"},{"style":{"fontWeight":"bold"},"text":"x","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"style":{"fontWeight":"bold"},"text":"y","element":"span"},{"text":") ","element":"span"},{"style":{"fontStyle":"italic"},"text":"are the selected points and samples when the minimization algorithm is applied to ","element":"span"},{"style":{"height":9.19},"width":42.98,"height":22.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-28.png","element":"img","alt":" rV","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":". 
Here ","element":"span"},{"style":{"height":19.91},"width":445.09,"height":49.78,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-29.png","element":"img","alt":" H_2^{−1} : [0, log 2] → [0, 1/2]","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is the functional inverse of the binary entropy function ","element":"span"},{"style":{"height":19.37},"width":601.25,"height":48.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/6-30.png","element":"img","alt":"H_2(α) = α log(1/α) + (1 − α) log(1/(1 − α)).","inline":true}],[{"text":"Since this result is particularly fundamental to our analysis, we provide a proof at the end of this section.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"4.3. Outline of Proof of Theorem ","element":"span"},{"href":"#id-50","style":{"fontWeight":"bold"},"text":"2","element":"a"}],[{"text":"Here we provide a high-level outline of the proof of Theorem ","element":"span"},{"href":"#id-50","text":"2","element":"a"},{"text":"; the details are given in Appendix ","element":"span"},{"href":"#id-55","text":"D","element":"a"},{"text":".","element":"span"}],[{"text":"Once the lower bound in Lemma ","element":"span"},{"href":"#id-56","text":"4 ","element":"a"},{"text":"is established, the main technical challenge is upper bounding the mutual information. 
A useful property called ","element":"span"},{"style":{"fontStyle":"italic"},"text":"tensorization ","element":"span"},{"text":"(e.g., see (","element":"span"},{"href":"#id-26","referenceIndex":16,"text":"Raginsky & Rakhlin","element":"a"},{"href":"#id-26","referenceIndex":16,"text":", ","element":"a"},{"href":"#id-26","referenceIndex":16,"text":"2011","element":"a"},{"text":")) allows us to simplify the mutual information with the vectors ","element":"span"},{"text":"(","element":"span"},{"style":{"fontWeight":"bold"},"text":"x","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"style":{"fontWeight":"bold"},"text":"y","element":"span"},{"text":") ","element":"span"},{"text":"to a sum of mutual informations containing only a single pair ","element":"span"},{"style":{"height":16},"width":119.58,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-0.png","element":"img","alt":" (xt, yt)","inline":true},{"text":": ","element":"span"},{"style":{"height":20.4},"width":517.29,"height":50.99,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-1.png","element":"img","alt":"I(V ; x, y) ≤ ∑_{t=1}^{T} I(V ; y_t | x_t).","inline":true}],[{"text":"Each such mutual information term ","element":"span"},{"style":{"height":16},"width":183.4,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-2.png","element":"img","alt":" I(V ; yt|xt)","inline":true,"padRight":true},{"text":"can further be upper bounded by the KL divergence (","element":"span"},{"href":"#id-2","referenceIndex":6,"text":"Cover & Thomas","element":"a"},{"href":"#id-2","referenceIndex":6,"text":", ","element":"a"},{"href":"#id-2","referenceIndex":6,"text":"2001","element":"a"},{"text":") between the conditional output distributions corresponding to 
","element":"span"},{"style":{"height":14.79},"width":161.96,"height":36.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-3.png","element":"img","alt":" r+ and r−","inline":true},{"text":", which in turn equals ","element":"span"},{"style":{"height":23.77},"width":225.42,"height":59.42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-4.png","element":"img","alt":"(r+(x) − r−(x))^2 / (2σ^2)","inline":true}],[{"id":"id-63","text":"when ","element":"span"},{"style":{"height":9.19},"width":121.89,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-5.png","element":"img","alt":" xt = x","inline":true},{"text":". By substituting the property (","element":"span"},{"href":"#id-57","text":"29","element":"a"},{"text":") given in Lemma ","element":"span"},{"href":"#id-54","text":"3","element":"a"},{"text":", we find that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"I","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"V ","element":"span"},{"text":"; ","element":"span"},{"style":{"fontWeight":"bold"},"text":"x","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"style":{"fontWeight":"bold"},"text":"y","element":"span"},{"text":") ","element":"span"},{"text":"is upper bounded by a constant times ","element":"span"},{"style":{"height":21.11},"width":586.44,"height":52.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-6.png","element":"img","alt":"(1/σ^2) ( ∆^2 E[ ∑_{t=1}^{T} |x_t − x_0^∗|^2 ] + T∆^4 )","inline":true},{"text":". 
If we can further upper bound ","element":"span"},{"style":{"fontStyle":"italic"},"text":"I","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"V ","element":"span"},{"text":"; ","element":"span"},{"style":{"fontWeight":"bold"},"text":"x","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"style":{"fontWeight":"bold"},"text":"y","element":"span"},{"text":") ","element":"span"},{"text":"by a constant in ","element":"span"},{"text":"(0","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"log 2)","element":"span"},{"text":", then (","element":"span"},{"href":"#id-58","text":"31","element":"a"},{"text":") establishes an ","element":"span"},{"style":{"height":17.38},"width":140.19,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-7.png","element":"img","alt":" Ω(T∆2)","inline":true,"padRight":true},{"text":"lower bound.","element":"span"}],[{"id":"id-65","text":"We proceed by considering the cases ","element":"span"},{"style":{"height":17.39},"width":342.34,"height":43.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-8.png","element":"img","alt":" E[RT ] ≥ c′′T∆2 and","inline":true},{"style":{"height":17.38},"width":272.88,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-9.png","element":"img","alt":"E[RT ] < c′′T∆2 ","inline":true,"padRight":true},{"text":"separately, with ","element":"span"},{"href":"#id-59","style":{"height":14.8},"width":401.47,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-10.png","element":"img","alt":" c′′ given in (30). 
The for-","inline":true,"padRight":true},{"text":"mer case will immediately give the lower bound in Theorem ","element":"span"},{"href":"#id-60","text":"2 ","element":"a"},{"text":"when we set ","element":"span"},{"style":{"height":11.6},"width":33,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-11.png","element":"img","alt":" ∆","inline":true},{"text":", whereas in the latter case, we can use (","element":"span"},{"href":"#id-59","text":"30","element":"a"},{"text":") to show that ","element":"span"},{"style":{"height":21.1},"width":340.34,"height":52.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-12.png","element":"img","alt":" E[ ∑_{t=1}^{T} |x_t − x_0^∗|^2 ]","inline":true},{"text":"is upper bounded by a constant times ","element":"span"},{"style":{"height":13.39},"width":78.03,"height":33.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-13.png","element":"img","alt":" T∆2","inline":true},{"text":", which means that the desired mutual information upper bound (see the previous paragraph) is attained under a choice of ","element":"span"},{"style":{"height":11.6},"width":33,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-14.png","element":"img","alt":" ∆","inline":true,"padRight":true},{"text":"scaling as","element":"span"},{"style":{"height":23.51},"width":129.33,"height":58.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-15.png","element":"img","alt":"(σ^2 / T)^{1/4}","inline":true},{"text":". 
Under this choice, the lower bound ","element":"span"},{"style":{"height":17.39},"width":301.95,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-16.png","element":"img","alt":" E[RT ] = Ω(T∆2)","inline":true,"padRight":true},{"text":"evaluates to ","element":"span"},{"style":{"height":18.3},"width":146.51,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-17.png","element":"img","alt":" Ω(σ√T)","inline":true},{"text":", as required.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"4.4. Proof of Lemma ","element":"span"},{"href":"#id-56","style":{"fontWeight":"bold"},"text":"4","element":"a"}],[{"text":"As mentioned above, the proof of Lemma ","element":"span"},{"href":"#id-56","text":"4 ","element":"a"},{"text":"follows along the lines of (","element":"span"},{"href":"#id-26","referenceIndex":16,"text":"Raginsky & Rakhlin","element":"a"},{"href":"#id-26","referenceIndex":16,"text":", ","element":"a"},{"href":"#id-26","referenceIndex":16,"text":"2011","element":"a"},{"text":"), which in turn builds on previous works using Fano’s inequality to establish minimax lower bounds in statistical estimation problems; see for example (","element":"span"},{"href":"#id-61","referenceIndex":26,"text":"Yu","element":"a"},{"href":"#id-61","referenceIndex":26,"text":", ","element":"a"},{"href":"#id-61","referenceIndex":26,"text":"1997","element":"a"},{"text":").","element":"span"}],[{"text":"In the following, we use ","element":"span"},{"style":{"height":20.4},"width":558.1,"height":51.01,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-18.png","element":"img","alt":" R_{T,+} = ∑_{t=1}^{T} r+(x_t) and R_{T,−} =","inline":true},{"style":{"height":20.4},"width":216.27,"height":50.99,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-19.png","element":"img","alt":"∑_{t=1}^{T} r−(x_t)","inline":true,"padRight":true},{"text":"to denote the cumulative regret associated 
","element":"span"},{"text":"with ","element":"span"},{"style":{"height":14.79},"width":162.07,"height":36.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-20.png","element":"img","alt":" r+ and r−","inline":true},{"text":", respectively, and we generically write ","element":"span"},{"style":{"height":15.59},"width":77.11,"height":38.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-21.png","element":"img","alt":" RT,v","inline":true,"padRight":true},{"text":"to denote one of the two with ","element":"span"},{"style":{"height":16},"width":199,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-22.png","element":"img","alt":" v ∈ {+, −}.","inline":true}],[{"text":"We first use Markov’s inequality to write","element":"span"}],[{"id":"id-62","style":{"width":"97%"},"width":911,"height":46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-23.png","element":"img"}],[{"text":"for any ","element":"span"},{"style":{"height":16},"width":163.4,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-24.png","element":"img","alt":" α ∈ (0, 1)","inline":true},{"text":". We proceed by analyzing the probability on the right-hand side.","element":"span"}],[{"text":"Recall that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"V ","element":"span"},{"text":"is equiprobable on ","element":"span"},{"style":{"height":16},"width":433.78,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-25.png","element":"img","alt":" {+, −}, and (x, y) are gen-","inline":true,"padRight":true},{"text":"erated by running the optimization algorithm on ","element":"span"},{"style":{"height":13.19},"width":163.51,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-26.png","element":"img","alt":" rV . 
Given","inline":true,"padRight":true},{"text":"the sequence of inputs ","element":"span"},{"style":{"fontWeight":"bold"},"text":"x","element":"span"},{"text":", let ","element":"span"},{"style":{"height":14.83},"width":31,"height":37.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-27.png","element":"img","alt":"ˆV","inline":true,"padRight":true},{"text":"be the index ","element":"span"},{"style":{"height":16},"width":197.22,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-28.png","element":"img","alt":" ˆv ∈ {+, −}","inline":true,"padRight":true},{"text":"with the lower cumulative regret ","element":"span"},{"style":{"height":20.4},"width":341.49,"height":51.01,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-29.png","element":"img","alt":" R_{T,ˆv} = ∑_{t=1}^{T} r_{ˆv}(x_t)","inline":true},{"text":". By ","element":"span"},{"text":"Lemma ","element":"span"},{"href":"#id-54","text":"3","element":"a"},{"text":", ","element":"span"},{"style":{"height":13.19},"width":53.26,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-30.png","element":"img","alt":" RT","inline":true,"padRight":true},{"text":"can be less than ","element":"span"},{"style":{"height":17.37},"width":113.16,"height":43.42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-31.png","element":"img","alt":" c2T∆2","inline":true,"padRight":true},{"text":"for at most one of the two functions, and hence, if ","element":"span"},{"style":{"height":18.17},"width":441.21,"height":45.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-32.png","element":"img","alt":" RT,v ≤ (1−α)c2T∆2 then","inline":true,"padRight":true},{"text":"we must have ","element":"span"},{"style":{"height":14.83},"width":104.24,"height":37.07,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-33.png","element":"img","alt":"ˆV = v","inline":true},{"text":". 
Therefore,","element":"span"}],[{"id":"id-64","style":{"width":"85%"},"width":799,"height":53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-34.png","element":"img"}],[{"text":"where, here and subsequently, ","element":"span"},{"style":{"height":13.19},"width":40.35,"height":32.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-35.png","element":"img","alt":" Pv","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":13.19},"width":42.57,"height":32.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-36.png","element":"img","alt":" Ev","inline":true,"padRight":true},{"text":"denote probabilities and expectations when the underlying instant regret function is ","element":"span"},{"style":{"height":9.19},"width":33.97,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-37.png","element":"img","alt":" rv","inline":true,"padRight":true},{"text":"(i.e., the underlying function that the algorithm seeks to maximize is ","element":"span"},{"style":{"height":14},"width":61.72,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-38.png","element":"img","alt":" fv).","inline":true}],[{"text":"Continuing, we can lower bound the probability appearing in (","element":"span"},{"href":"#id-62","text":"32","element":"a"},{"text":") as follows:","element":"span"}],[{"style":{"width":"90%"},"width":846,"height":373,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-39.png","element":"img"}],[{"text":"where (","element":"span"},{"href":"#id-63","text":"35","element":"a"},{"text":") follows from (","element":"span"},{"href":"#id-64","text":"33","element":"a"},{"text":"), and (","element":"span"},{"href":"#id-65","text":"36","element":"a"},{"text":") follows from Fano’s inequality for binary hypothesis testing with adaptive sampling (see Eq. 
(22) and (24) of (","element":"span"},{"href":"#id-26","referenceIndex":16,"text":"Raginsky & Rakhlin","element":"a"},{"href":"#id-26","referenceIndex":16,"text":", ","element":"a"},{"href":"#id-26","referenceIndex":16,"text":"2011","element":"a"},{"text":")). The proof is completed by combining (","element":"span"},{"href":"#id-62","text":"32","element":"a"},{"text":") and (","element":"span"},{"href":"#id-65","text":"36","element":"a"},{"text":"), and recalling that ","element":"span"},{"style":{"height":6.8},"width":26,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-40.png","element":"img","alt":" α","inline":true,"padRight":true},{"text":"can be arbitrarily small.","element":"span"}]]},{"heading":"5. Conclusion and Discussion","paragraphs":[[{"text":"We have established tight scaling laws on the regret for Bayesian optimization in one dimension, showing that the optimal scaling is ","element":"span"},{"style":{"height":18.3},"width":122.3,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-41.png","element":"img","alt":" Ω(√T)","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":16.84},"width":218.61,"height":42.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-42.png","element":"img","alt":" O(√T log T)","inline":true,"padRight":true},{"text":"under mild technical assumptions on the kernel. Our results highlight some limitations of the widespread upper bounds based on the information gain, as well as providing cases where the noisy Bayesian setting is provably less difficult than its non-Bayesian RKHS counterpart.","element":"span"}],[{"text":"An immediate direction for further work is to sharpen the constant factors in the upper and lower bounds, and to establish whether the upper bound is attained by any algorithm that can also provide state-of-the-art performance in practice. 
We re-iterate that our algorithm is certainly not suitable for this purpose, as its cumulative regret contains large constant factors, and the algorithm makes use of a variety of specific constants present in the assumptions (though they are merely a function of the kernel).","element":"span"}],[{"text":"We expect our techniques to extend to any constant dimension ","element":"span"},{"style":{"height":13.2},"width":93.88,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-43.png","element":"img","alt":" d ≥ 1","inline":true},{"text":"; the main ideas from the noiseless upper bound still apply (","element":"span"},{"href":"#id-14","referenceIndex":7,"text":"de Freitas et al.","element":"a"},{"href":"#id-14","referenceIndex":7,"text":", ","element":"a"},{"href":"#id-14","referenceIndex":7,"text":"2012","element":"a"},{"text":"), and in the lower bound we can choose an arbitrary single dimension and introduce a random shift in that direction as per Section ","element":"span"},{"href":"#id-66","text":"4.1","element":"a"},{"text":". While these extensions may still yield","element":"span"},{"style":{"height":18.3},"width":256.3,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/7-44.png","element":"img","alt":"√T poly(logT)","inline":true,"padRight":true},{"text":"regret, the dependence on ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d ","element":"span"},{"text":"would be exponential or worse in the upper bound, but constant in the lower bound, with the latter dependence certainly being suboptimal. 
Multi-dimensional lower bounding techniques based on Fano’s inequality (","element":"span"},{"href":"#id-26","referenceIndex":16,"text":"Ra- ","element":"a"},{"href":"#id-26","referenceIndex":16,"text":"ginsky & Rakhlin","element":"a"},{"href":"#id-26","referenceIndex":16,"text":", ","element":"a"},{"href":"#id-26","referenceIndex":16,"text":"2011","element":"a"},{"text":") may improve the latter to ","element":"span"},{"text":"poly(","element":"span"},{"style":{"fontStyle":"italic"},"text":"d","element":"span"},{"text":")","element":"span"},{"text":", but overall, attaining a sharp joint dependence on ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d ","element":"span"},{"text":"appears to require different techniques.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Acknowledgments. ","element":"span"},{"text":"I would like to thank Ilija Bogunovic for his helpful comments and suggestions. This work was supported by an NUS startup grant.","element":"span"}]]},{"heading":"6. Errata","paragraphs":[[{"text":"This section has been added to the arXiv paper to correct two minor mistakes in the published ICML 2018 paper. I am grateful to Shogo Iwazaki for pointing these out.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"6.1. Correction to the Proof of Theorem ","element":"span"},{"href":"#id-37","style":{"fontWeight":"bold"},"text":"1","element":"a"}],[{"text":"The upper bound in Theorem ","element":"span"},{"href":"#id-37","text":"1 ","element":"a"},{"text":"is stated using a probability bound with respect to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"and an expectation with respect to the noise. 
However, in the current analysis of the expected regret, the confidence bound in Lemma ","element":"span"},{"href":"#id-38","text":"1 ","element":"a"},{"text":"is used, and such a confidence bound holds with respect to both the function and the noise. This means that Lemma ","element":"span"},{"href":"#id-38","text":"1 ","element":"a"},{"text":"is applied in a part of the analysis where we are only meant to be studying the randomness of the noise. Specifically, the current analysis averages over the event ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"in (","element":"span"},{"href":"#id-67","text":"44","element":"a"},{"text":")–(","element":"span"},{"href":"#id-67","text":"46","element":"a"},{"text":") and this event inadvertently includes randomness in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":".","element":"span"}],[{"text":"However, it is stated following (","element":"span"},{"href":"#id-68","text":"51","element":"a"},{"text":") that, after repeatedly sampling a particular point, “we performed enough repetitions to attain a variance of at most","element":"span"},{"style":{"height":28.57},"width":52.95,"height":71.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/8-0.png","element":"img","alt":"η2(i)4βT","inline":true,"padRight":true},{"text":"based on those ","element":"span"},{"text":"samples alone”, i.e., any possible variance reduction from querying other points is ignored. 
As a result, we may avoid Lemma ","element":"span"},{"href":"#id-38","text":"1 ","element":"a"},{"text":"altogether and instead use a simpler Gaussian concentration inequality with respect to the noise alone.","element":"span"},{"text":"3","element":"span"}],[{"text":"Specifically, if ","element":"span"},{"style":{"height":17.38},"width":234.85,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/8-1.png","element":"img","alt":" Z ∼ N(µ, σ2)","inline":true,"padRight":true},{"text":"and we let ","element":"span"},{"style":{"height":10},"width":52.01,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/8-2.png","element":"img","alt":" �µK","inline":true,"padRight":true},{"text":"be the empirical mean of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K ","element":"span"},{"text":"independent observations of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Z","element":"span"},{"text":", then it holds with probability at least ","element":"span"},{"style":{"height":13.99},"width":174.88,"height":34.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/8-3.png","element":"img","alt":" 1 − δ0 that","inline":true}],[{"id":"id-71","style":{"width":"71%"},"width":671,"height":120,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/8-4.png","element":"img"}],[{"text":"We can apply this result (separately) for each repeatedly-sampled point in the algorithm, with ","element":"span"},{"style":{"height":16.48},"width":159.4,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/8-5.png","element":"img","alt":" K = K(i)","inline":true,"padRight":true},{"text":"as specified in Line 8 of Algorithm ","element":"span"},{"href":"#id-69","text":"2","element":"a"},{"text":". 
We need to set ","element":"span"},{"style":{"height":13.99},"width":33.71,"height":34.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/8-6.png","element":"img","alt":" δ0","inline":true,"padRight":true},{"text":"to be small enough to permit a union bound over ","element":"span"},{"style":{"height":17.68},"width":87.73,"height":44.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/8-7.png","element":"img","alt":" |L(i)|","inline":true,"padRight":true},{"text":"points and all epochs ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i ","element":"span"},{"text":"= 1","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"2","element":"span"},{"style":{"fontStyle":"italic"},"text":", . . . ","element":"span"},{"text":", and a similar argument to (","element":"span"},{"href":"#id-70","text":"43","element":"a"},{"text":") reveals that ","element":"span"},{"style":{"height":21.77},"width":179.47,"height":54.42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/8-8.png","element":"img","alt":" δ0 = δ2c1T 3","inline":true,"padRight":true},{"text":"suffices, thus giving the same confidence ","element":"span"},{"text":"width as the original analysis. 
In Line 10 of Algorithm ","element":"span"},{"href":"#id-69","text":"2","element":"a"},{"text":", we can use the empirical mean of the ","element":"span"},{"style":{"height":14.18},"width":72.42,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/8-9.png","element":"img","alt":" K(i) ","inline":true,"padRight":true},{"text":"responses (to querying ","element":"span"},{"style":{"height":6.8},"width":36.78,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/8-10.png","element":"img","alt":" x′","inline":true},{"text":") in place of the GP posterior mean, in accordance with (","element":"span"},{"href":"#id-71","text":"37","element":"a"},{"text":"), and similarly replace all occurrences of ","element":"span"},{"style":{"height":16},"width":121.4,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/8-11.png","element":"img","alt":" µt−1(·)","inline":true,"padRight":true},{"text":"in the analysis. Apart from this, the analysis is unchanged and the same final result is obtained.","element":"span"}],[{"text":"In summary, Theorem ","element":"span"},{"href":"#id-37","text":"1 ","element":"a"},{"text":"still holds as stated, but the algorithm should use a separate confidence bound for each repeatedly-sampled point based on simple Gaussian concentration, rather than using GP-based confidence bounds. Note that for the “purely high-probability” version discussed after the statement of Theorem ","element":"span"},{"href":"#id-37","text":"1","element":"a"},{"text":", this change is no longer necessary (i.e., it can be done but doesn’t need to be).","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"6.2. 
Correction to the Statement of Theorem ","element":"span"},{"href":"#id-50","style":{"fontWeight":"bold"},"text":"2","element":"a"}],[{"text":"Theorem ","element":"span"},{"href":"#id-50","text":"2 ","element":"a"},{"text":"is stated as holding with probability at least ","element":"span"},{"style":{"height":15.56},"width":160.59,"height":38.89,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/8-12.png","element":"img","alt":"1−δ1−δ′2 ","inline":true,"padRight":true},{"text":"with respect to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":", and as averaging the regret with ","element":"span"},{"text":"respect to only the noise. However, the proof is based on interpreting ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"as first drawing a function ","element":"span"},{"style":{"height":14},"width":35.51,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/8-13.png","element":"img","alt":" f0","inline":true,"padRight":true},{"text":"and then shifting it in a random direction ","element":"span"},{"style":{"height":16},"width":211.81,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/8-14.png","element":"img","alt":" V ∈ {+, −}","inline":true},{"text":", and in the current analysis, the cumulative regret also averages over ","element":"span"},{"style":{"fontStyle":"italic"},"text":"V ","element":"span"},{"text":"(not just the noise). Thus, the actual result proved is that in which the probability is with respect to ","element":"span"},{"style":{"height":14},"width":35.51,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/8-15.png","element":"img","alt":" f0","inline":true,"padRight":true},{"text":"and the expectation is with respect to both ","element":"span"},{"style":{"fontStyle":"italic"},"text":"V ","element":"span"},{"text":"and the noise. 
Such a result is somewhat unnatural, so here we explain how to instead obtain a slight variant of Theorem ","element":"span"},{"href":"#id-50","text":"2 ","element":"a"},{"text":"that doesn’t change which random variables are involved in the probability and the expectation.","element":"span"}],[{"text":"The idea is to decompose the average regret given ","element":"span"},{"style":{"height":14},"width":81.04,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/8-16.png","element":"img","alt":" f0 as","inline":true}],[{"id":"id-72","style":{"width":"86%"},"width":807,"height":95,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/8-17.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":19.37},"width":396.68,"height":48.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/8-18.png","element":"img","alt":" PV (+) = PV (−) = 12","inline":true},{"text":". In the notation of (","element":"span"},{"href":"#id-49","text":"23","element":"a"},{"text":"), ","element":"span"},{"text":"our analysis shows that ","element":"span"},{"style":{"height":19.78},"width":459.88,"height":49.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/8-19.png","element":"img","alt":" E[RT | f0] ≥ C′�1 + σ√T�","inline":true},{"text":"with probability at least ","element":"span"},{"style":{"height":15.56},"width":186.56,"height":38.9,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/8-20.png","element":"img","alt":" 1 − δ1 − δ′2 ","inline":true,"padRight":true},{"text":"with respect to ","element":"span"},{"href":"#id-72","style":{"height":14.4},"width":192.45,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/8-21.png","element":"img","alt":" f0. 
By (38),","inline":true,"padRight":true},{"text":"the same lower bound must apply to at least one of the two values of ","element":"span"},{"style":{"height":16},"width":204.2,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/8-22.png","element":"img","alt":" E[RT | f0, v]","inline":true},{"text":". But ","element":"span"},{"style":{"height":16},"width":107.34,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/8-23.png","element":"img","alt":" (f0, v)","inline":true,"padRight":true},{"text":"collectively determine ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":", and whichever ","element":"span"},{"style":{"fontStyle":"italic"},"text":"v ","element":"span"},{"text":"value satisfies the desired lower bound, it has probability ","element":"span"},{"style":{"height":19.37},"width":16,"height":48.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/8-24.png","element":"img","alt":"12","inline":true},{"text":". 
Consequently, the desired lower bound ","element":"span"},{"style":{"height":19.79},"width":441.19,"height":49.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/8-25.png","element":"img","alt":"E[RT | f] ≥ C′�1 + σ√T�","inline":true},{"text":"holds with probability at least ","element":"span"},{"style":{"height":16},"width":259.43,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/8-26.png","element":"img","alt":"(1 − δ1 − δ′2)/2","inline":true,"padRight":true},{"text":"with respect to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":".","element":"span"}],[{"text":"In summary, Theorem ","element":"span"},{"href":"#id-50","text":"2 ","element":"a"},{"text":"holds with the probability halved from ","element":"span"},{"style":{"height":15.56},"width":186.67,"height":38.9,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/8-27.png","element":"img","alt":" 1 − δ1 − δ′2","inline":true,"padRight":true},{"text":"to ","element":"span"},{"style":{"height":16},"width":259.47,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/8-28.png","element":"img","alt":" (1 − δ1 − δ′2)/2","inline":true},{"text":", and establishing this ","element":"span"},{"text":"only requires a small amount of additional reasoning.","element":"span"}]]},{"heading":"References","paragraphs":[[{"id":"id-9","text":"Bogunovic, I., Scarlett, J., and Cevher, V. Time-varying ","element":"span"},{"text":"Gaussian process bandit optimization. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Int. Conf. Art. Intel. Stats. (AISTATS)","element":"span"},{"text":", 2016a.","element":"span"}],[{"id":"id-17","text":"Bogunovic, I., Scarlett, J., Krause, A., and Cevher, V. Trun- ","element":"span"},{"text":"cated variance reduction: A unified approach to Bayesian optimization and level-set estimation. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Conf. 
Neur. Inf. Proc. Sys. (NIPS)","element":"span"},{"text":", 2016b.","element":"span"}],[{"id":"id-21","text":"Bubeck, S. and Cesa-Bianchi, N. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Regret Analysis of Stochastic and Nonstochastic Multi-Armed Bandit Problems","element":"span"},{"text":". Found. Trend. Mach. Learn. Now Publishers, 2012.","element":"span"}],[{"id":"id-19","text":"Bull, A. D. Convergence rates of efficient global optimiza- ","element":"span"},{"text":"tion algorithms. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"J. Mach. Learn. Res.","element":"span"},{"text":", 12(Oct.):2879– 2904, 2011.","element":"span"}],[{"id":"id-10","text":"Contal, E., Buffoni, D., Robicquet, A., and Vayatis, N. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Machine Learning and Knowledge Discovery in Databases","element":"span"},{"text":", chapter Parallel Gaussian Process Optimization with Upper Confidence Bound and Pure Exploration, pp. 225–240. Springer Berlin Heidelberg, 2013.","element":"span"}],[{"id":"id-2","text":"Cover, T. M. and Thomas, J. A. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Elements of Information Theory","element":"span"},{"text":". John Wiley & Sons, Inc., 2001.","element":"span"}],[{"id":"id-14","text":"de Freitas, N., Zoghi, M., and Smola, A. J. Exponential ","element":"span"},{"text":"regret bounds for Gaussian process bandits with deterministic observations. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Int. Conf. Mach. Learn. (ICML)","element":"span"},{"text":", 2012.","element":"span"}],[{"id":"id-11","text":"Desautels, T., Krause, A., and Burdick, J. W. Parallelizing ","element":"span"},{"text":"exploration-exploitation tradeoffs in Gaussian process bandit optimization. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"J. Mach. Learn. 
Res.","element":"span"},{"text":", 15(1):3873– 3923, 2014.","element":"span"}],[{"id":"id-18","text":"Grünewälder, S., Audibert, J.-Y., Opper, M., and Shawe- ","element":"span"},{"text":"Taylor, J. Regret bounds for Gaussian process bandit problems. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Int. Conf. Art. Intel. Stats. (AISTATS)","element":"span"},{"text":", pp. 273–280, 2010.","element":"span"}],[{"id":"id-4","text":"Hennig, P. and Schuler, C. J. Entropy search for information- ","element":"span"},{"text":"efficient global optimization. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"J. Mach. Learn. Research","element":"span"},{"text":", 13(1):1809–1837, 2012.","element":"span"}],[{"id":"id-5","text":"Hernández-Lobato, J. M., Hoffman, M. W., and Ghahra- ","element":"span"},{"text":"mani, Z. Predictive entropy search for efficient global optimization of black-box functions. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Adv. Neur. Inf. Proc. Sys. (NIPS)","element":"span"},{"text":", pp. 918–926, 2014.","element":"span"}],[{"id":"id-12","text":"Kandasamy, K., Schneider, J., and Póczos, B. High di- ","element":"span"},{"text":"mensional Bayesian optimisation and bandits via additive models. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Int. Conf. Mach. Learn.","element":"span"},{"text":", 2015.","element":"span"}],[{"id":"id-15","text":"Kawaguchi, K., Kaelbling, L. P., and Lozano-Pérez, T. ","element":"span"},{"text":"Bayesian optimization with exponential convergence. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Conf. Neur. Inf. Proc. Sys. (NIPS)","element":"span"},{"text":", 2015.","element":"span"}],[{"id":"id-22","text":"Kleinberg, R., Slivkins, A., and Upfal, E. Multi-armed ","element":"span"},{"text":"bandits in metric spaces. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proc. ACM Symp. Theory Comp. (STOC)","element":"span"},{"text":", pp. 
681–690, 2008.","element":"span"}],[{"id":"id-8","text":"Krause, A. and Ong, C. S. Contextual Gaussian process ","element":"span"},{"text":"bandit optimization. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Conf. Neur. Inf. Proc. Sys. (NIPS)","element":"span"},{"text":", pp. 2447–2455. Curran Associates, Inc., 2011.","element":"span"}],[{"id":"id-26","text":"Raginsky, M. and Rakhlin, A. ","element":"span"},{"text":"Information-based complexity, feedback and dynamics in convex programming. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"IEEE Trans. Inf. Theory","element":"span"},{"text":", 57(10):7036–7056, Oct. 2011.","element":"span"}],[{"id":"id-28","text":"Rasmussen, C. E. Gaussian processes for machine learning. ","element":"span"},{"text":"MIT Press, 2006.","element":"span"}],[{"id":"id-13","text":"Rolland, P., Scarlett, J., Bogunovic, I., and Cevher, V. High- ","element":"span"},{"text":"dimensional Bayesian optimization via additive models with overlapping groups. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Int. Conf. Art. Intel. Stats. (AISTATS)","element":"span"},{"text":", 2018.","element":"span"}],[{"id":"id-6","text":"Russo, D. and Van Roy, B. ","element":"span"},{"text":"Learning to optimize via information-directed sampling. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Conf. Neur. Inf. Proc. Sys. (NIPS)","element":"span"},{"text":", 2014.","element":"span"}],[{"id":"id-20","text":"Scarlett, J., Bogunovic, I., and Cevher, V. Lower bounds on ","element":"span"},{"text":"regret for noisy Gaussian process bandit optimization. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Conf. Learn. Theory (COLT)","element":"span"},{"text":". 2017.","element":"span"}],[{"id":"id-0","text":"Shahriari, B., Swersky, K., Wang, Z., Adams, R. P., and ","element":"span"},{"text":"de Freitas, N. Taking the human out of the loop: A review of Bayesian optimization. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proc. IEEE","element":"span"},{"text":", 104(1): 148–175, 2016.","element":"span"}],[{"id":"id-23","text":"Shekhar, S. and Javidi, T. Gaussian process bandits with ","element":"span"},{"text":"adaptive discretization. http://arxiv.org/abs/1712.01447, 2017.","element":"span"}],[{"id":"id-1","text":"Srinivas, N., Krause, A., Kakade, S. M., and Seeger, M. ","element":"span"},{"text":"Gaussian process optimization in the bandit setting: No regret and experimental design. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Int. Conf. Mach. Learn. (ICML)","element":"span"},{"text":", 2010.","element":"span"}],[{"id":"id-16","text":"Wang, Z., Shakibi, B., Jin, L., and de Freitas, N. Bayesian ","element":"span"},{"text":"multi-scale optimistic optimization. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Int. Conf. Art. Intel. Stats. (AISTATS)","element":"span"},{"text":", pp. 1005–1014, 2014.","element":"span"}],[{"id":"id-7","text":"Wang, Z., Zhou, B., and Jegelka, S. Optimization as esti- ","element":"span"},{"text":"mation with Gaussian processes in bandit settings. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Int. Conf. Art. Intel. Stats. (AISTATS)","element":"span"},{"text":", 2016.","element":"span"}],[{"id":"id-61","text":"Yu, B. Assouad, Fano, and le Cam. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Festschrift for Lucien Le Cam","element":"span"},{"text":", pp. 423–435. Springer, 1997.","element":"span"}]]},{"heading":"Supplementary Material","paragraphs":[[{"style":{"fontWeight":"bold"},"text":"Tight Regret Bounds for Bayesian Optimization in One Dimension (Jonathan Scarlett, ICML 2018)","element":"span"}],[{"id":"id-41","style":{"fontWeight":"bold"},"text":"A. 
Doubling Trick for an Unknown Time Horizon","element":"span"}],[{"text":"Suppose that we have an algorithm that depends on the time horizon ","element":"span"},{"style":{"height":10.8},"width":42.82,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-0.png","element":"img","alt":" T ′","inline":true,"padRight":true},{"text":"and achieves ","element":"span"},{"style":{"height":16.83},"width":385.16,"height":42.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-1.png","element":"img","alt":" E[RT ′] ≤ C√T ′ log T ′","inline":true,"padRight":true},{"text":"for some ","element":"span"},{"style":{"fontStyle":"italic"},"text":"C > ","element":"span"},{"text":"0","element":"span"},{"text":". We show that we can also achieve ","element":"span"},{"style":{"height":19.2},"width":516.08,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-2.png","element":"img","alt":" E[RT ] = O�√T log T�when T","inline":true,"padRight":true},{"text":"is unknown.","element":"span"}],[{"text":"To see this, fix an arbitrary integer ","element":"span"},{"style":{"height":19.37},"width":195.68,"height":48.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-3.png","element":"img","alt":" T0 ∈�1, T2�","inline":true},{"text":", and repeatedly run the algorithm with fixed time horizons ","element":"span"},{"style":{"height":13.2},"width":203,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-4.png","element":"img","alt":" T0, 2T0, 4T0","inline":true},{"text":", etc., until ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T ","element":"span"},{"text":"points have been sampled. The number of stages is no more than ","element":"span"},{"style":{"height":20.96},"width":286.19,"height":52.41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-5.png","element":"img","alt":" ℓmax = ⌈log2TT0 ⌉","inline":true},{"text":". 
Moreover, we have","element":"span"}],[{"style":{"width":"84%"},"width":1647,"height":135,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-6.png","element":"img"}],[{"text":"where the first inequality uses ","element":"span"},{"style":{"height":17.38},"width":339.93,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-7.png","element":"img","alt":" log(2ℓ−1T0) ≤ log T","inline":true},{"text":", and the last inequality uses ","element":"span"},{"style":{"height":20.4},"width":363.48,"height":50.99,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-8.png","element":"img","alt":"�Nℓ=0 2ℓ/2 ≤ 4 · 2N/2.","inline":true}],[{"id":"id-40","style":{"fontWeight":"bold"},"text":"B. Proof of Theorem ","element":"span"},{"href":"#id-37","style":{"fontWeight":"bold"},"text":"1 ","element":"a"},{"style":{"fontWeight":"bold"},"text":"(Upper Bound)","element":"span"}],[{"text":"We continue from the auxiliary results given in Section ","element":"span"},{"text":"3","element":"span"},{"text":", proceeding in several steps. Algorithm ","element":"span"},{"href":"#id-69","text":"2 ","element":"a"},{"text":"gives a full description of the algorithm; the reader is encouraged to refer to this throughout the proof, rather than trying to understand all the steps therein immediately. Note that the constants ","element":"span"},{"style":{"height":14.4},"width":269.26,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-9.png","element":"img","alt":" c0, c1, c2, and ρ0","inline":true,"padRight":true},{"text":"used in the algorithm come from Assumptions ","element":"span"},{"href":"#id-31","text":"2 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-34","text":"3","element":"a"},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Reduction to a finite domain. 
","element":"span"},{"text":"Our algorithm only samples ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"within a finite set ","element":"span"},{"style":{"height":13.2},"width":113.7,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-10.png","element":"img","alt":" L ⊆ D","inline":true,"padRight":true},{"text":"of pre-defined points. We choose these points to be regularly spaced, and close enough to ensure that the highest function value is within ","element":"span"},{"style":{"height":19.37},"width":23,"height":48.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-11.png","element":"img","alt":"1T ","inline":true,"padRight":true},{"text":"of the maximum ","element":"span"},{"style":{"height":16},"width":96.4,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-12.png","element":"img","alt":"f(x∗)","inline":true},{"text":". Under condition (","element":"span"},{"href":"#id-30","text":"8","element":"a"},{"text":") in Assumption ","element":"span"},{"href":"#id-31","text":"2 ","element":"a"},{"text":"(which implies that ","element":"span"},{"style":{"height":14},"width":103.55,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-13.png","element":"img","alt":" f is c1","inline":true},{"text":"-Lipschitz continuous), it suffices to choose","element":"span"}],[{"id":"id-73","style":{"width":"62%"},"width":1226,"height":97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-14.png","element":"img"}],[{"text":"where ","element":"span"},{"text":"Z ","element":"span"},{"text":"denotes the integers. 
Here we add ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x ","element":"span"},{"text":"= 1 ","element":"span"},{"text":"to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L ","element":"span"},{"text":"because it will be notationally convenient to ensure that the endpoints ","element":"span"},{"style":{"fontStyle":"italic"},"text":"{","element":"span"},{"text":"0","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"1","element":"span"},{"style":{"fontStyle":"italic"},"text":"} ","element":"span"},{"text":"are both included in the set. Note that ","element":"span"},{"style":{"height":16},"width":409.02,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-15.png","element":"img","alt":" L satisfies |L| ≤ c1T + 1","inline":true},{"text":", which we crudely upper bound by ","element":"span"},{"style":{"height":16},"width":196.63,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-16.png","element":"img","alt":" |L| ≤ 2c1T.","inline":true}],[{"text":"Since ","element":"span"},{"style":{"height":19.37},"width":578.46,"height":48.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-17.png","element":"img","alt":" maxx∈L f(x) ≥ maxx∈D f(x) − 1T ","inline":true,"padRight":true},{"text":", the cumulative regret ","element":"span"},{"style":{"height":21.36},"width":77.1,"height":53.41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-18.png","element":"img","alt":" R(L)T","inline":true,"padRight":true},{"text":"with respect to the best point in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L ","element":"span"},{"text":"is such that","element":"span"}],[{"id":"id-89","style":{"width":"56%"},"width":1106,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-19.png","element":"img"}],[{"text":"Hence, it suffices to bound 
","element":"span"},{"style":{"height":21.36},"width":77.1,"height":53.41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-20.png","element":"img","alt":" R(L)T","inline":true,"padRight":true},{"text":"instead of ","element":"span"},{"style":{"height":13.19},"width":53.26,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-21.png","element":"img","alt":" RT","inline":true,"padRight":true},{"text":". For convenience, we henceforth let ","element":"span"},{"style":{"height":15.38},"width":44.78,"height":38.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-22.png","element":"img","alt":" x∗L","inline":true,"padRight":true},{"text":"denote an arbitrary input that ","element":"span"},{"text":"achieves ","element":"span"},{"style":{"height":16},"width":222.5,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-23.png","element":"img","alt":" maxx∈L f(x)","inline":true},{"text":", and we define the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"instant regret ","element":"span"},{"text":"as","element":"span"}],[{"style":{"width":"83%"},"width":1622,"height":53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-24.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Conditioning on high-probability events. ","element":"span"},{"text":"By assumption, the events in Assumptions ","element":"span"},{"href":"#id-31","text":"2 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-34","text":"3 ","element":"a"},{"text":"simultaneously hold with probability at least ","element":"span"},{"style":{"height":13.99},"width":186.44,"height":34.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-25.png","element":"img","alt":" 1 − δ1 − δ2","inline":true},{"text":". 
Moreover, by setting ","element":"span"},{"style":{"height":19.37},"width":100.14,"height":48.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-26.png","element":"img","alt":" δ = 1T","inline":true,"padRight":true},{"text":"in Lemma ","element":"span"},{"href":"#id-38","text":"1 ","element":"a"},{"text":"and letting ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L ","element":"span"},{"text":"be as in (","element":"span"},{"href":"#id-73","text":"40","element":"a"},{"text":") with ","element":"span"},{"style":{"height":16},"width":186.8,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-27.png","element":"img","alt":" |L| ≤ 2c1T","inline":true},{"text":", we ","element":"span"},{"text":"deduce that (","element":"span"},{"href":"#id-74","text":"14","element":"a"},{"text":") holds with probability at least ","element":"span"},{"style":{"height":19.37},"width":211.58,"height":48.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-28.png","element":"img","alt":" 1 − 1T when4","inline":true}],[{"id":"id-70","style":{"width":"58%"},"width":1140,"height":50,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-29.png","element":"img"}],[{"text":"Denoting the intersection of all events in Assumptions ","element":"span"},{"href":"#id-31","text":"2 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-34","text":"3 ","element":"a"},{"text":"by ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A","element":"span"},{"text":", and the event in Lemma ","element":"span"},{"href":"#id-38","text":"1 ","element":"a"},{"text":"by ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B","element":"span"},{"text":", we can write the average regret given ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A ","element":"span"},{"text":"as 
follows:","element":"span"}],[{"id":"id-67","style":{"width":"99%"},"width":1946,"height":278,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/10-30.png","element":"img"}],[{"id":"id-69","style":{"fontWeight":"bold"},"text":"Algorithm 2 ","element":"span"},{"text":"Full description of our algorithm, based on reducing uncertainty in epochs via repeated sampling.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Require: ","element":"span"},{"text":"Domain ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D","element":"span"},{"text":", GP prior (","element":"span"},{"style":{"height":14},"width":98.56,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/11-0.png","element":"img","alt":"µ0, k0","inline":true},{"text":"), time horizon ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T","element":"span"},{"text":", constants ","element":"span"},{"style":{"height":10},"width":206.99,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/11-1.png","element":"img","alt":" c0, c1, c2, ρ0.","inline":true}],[{"style":{"width":"98%"},"width":1921,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/11-2.png","element":"img"}],[{"style":{"height":12.48},"width":148.95,"height":31.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/11-3.png","element":"img","alt":"η(0) = c0","inline":true},{"text":", and initial potential maximizers ","element":"span"},{"style":{"height":16.48},"width":172.06,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/11-4.png","element":"img","alt":" M(0) = L.","inline":true,"padRight":true},{"text":"2: ","element":"span"},{"text":"Initialize time index ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"= 1 ","element":"span"},{"text":"and epoch number ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i 
","element":"span"},{"text":"= 1","element":"span"},{"text":". ","element":"span"},{"text":"3: ","element":"span"},{"style":{"fontWeight":"bold"},"text":"while ","element":"span"},{"text":"less than ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T ","element":"span"},{"text":"samples have been taken ","element":"span"},{"style":{"fontWeight":"bold"},"text":"do","element":"span"}],[{"style":{"width":"97%"},"width":919,"height":627,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/11-5.png","element":"img"}],[{"text":"• ","element":"span"},{"text":"Initialize ","element":"span"},{"style":{"height":18.48},"width":157.58,"height":46.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/11-6.png","element":"img","alt":" L(i) ← ∅.","inline":true}],[{"text":"• ","element":"span"},{"text":"Construct ","element":"span"},{"style":{"height":16.48},"width":63.22,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/11-7.png","element":"img","alt":"�L(i)","inline":true,"padRight":true},{"text":"(not necessarily a subset of ","element":"span"},{"style":{"height":16.48},"width":57.43,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/11-8.png","element":"img","alt":" I(i)","inline":true,"padRight":true},{"text":"or ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L","element":"span"},{"text":") containing regularly-spaced points within the interval","element":"span"},{"style":{"height":19.39},"width":540.95,"height":48.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/11-9.png","element":"img","alt":"�min{x ∈ I(i)}, max{x ∈ I(i)}�","inline":true},{"text":", with spacing ","element":"span"},{"style":{"height":23.5},"width":86.74,"height":58.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/11-10.png","element":"img","alt":"η(i)2L(i) .","inline":true}],[{"text":"• ","element":"span"},{"text":"For each 
","element":"span"},{"style":{"height":16.48},"width":134.7,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/11-11.png","element":"img","alt":" x ∈ �L(i)","inline":true},{"text":", add its two nearest points in ","element":"span"},{"style":{"height":16.48},"width":186.46,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/11-12.png","element":"img","alt":" I(i) to L(i).","inline":true}],[{"style":{"width":"59%"},"width":1150,"height":194,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/11-13.png","element":"img"}],[{"text":"For each sample taken, increment ","element":"span"},{"style":{"height":12},"width":159.48,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/11-14.png","element":"img","alt":" t ← t + 1","inline":true},{"text":", and terminate if ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t > T","element":"span"},{"text":". ","element":"span"},{"text":"9: ","element":"span"},{"text":"Update the posterior distribution ","element":"span"},{"style":{"height":16},"width":205.64,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/11-15.png","element":"img","alt":" (µt−1, σt−1)","inline":true,"padRight":true},{"text":"according to (","element":"span"},{"href":"#id-75","text":"5","element":"a"},{"text":")–(","element":"span"},{"href":"#id-76","text":"6","element":"a"},{"text":"), with ","element":"span"},{"style":{"height":17.38},"width":399.5,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/11-16.png","element":"img","alt":" xt−1 = [x1, . . . 
, xt−1]T","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":10.4},"width":128.78,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/11-17.png","element":"img","alt":" yt−1 =","inline":true},{"style":{"height":17.38},"width":245.47,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/11-18.png","element":"img","alt":"[y1, . . . , yt−1]T ","inline":true,"padRight":true},{"text":"respectively containing all the selected points and noisy samples so far. ","element":"span"},{"text":"10: ","element":"span"},{"text":"For each ","element":"span"},{"style":{"height":16.48},"width":196.47,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/11-19.png","element":"img","alt":" x ∈ I(i), set","inline":true}],[{"style":{"width":"50%"},"width":977,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/11-20.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":31.13},"width":680.48,"height":77.82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/11-21.png","element":"img","alt":" x′ = arg minx′∈L(i) |x − x′|.11:","inline":true,"padRight":true},{"text":"Update the set of potential maximizers:","element":"span"}],[{"style":{"width":"45%"},"width":425,"height":73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/11-22.png","element":"img"}],[{"text":"12: ","element":"span"},{"text":"Increment ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":".","element":"span"}],[{"text":"13: ","element":"span"},{"style":{"fontWeight":"bold"},"text":"end while","element":"span"}],[{"text":"where (","element":"span"},{"href":"#id-67","text":"45","element":"a"},{"text":") follows since 
","element":"span"},{"style":{"height":16},"width":208.44,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-0.png","element":"img","alt":" P[B|A] ≤ 1","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":24.44},"width":535.75,"height":61.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-1.png","element":"img","alt":" P[Bc|A] ≤ P[Bc]P[A] ≤ 1T (1−δ1−δ2)","inline":true},{"text":", and (","element":"span"},{"href":"#id-67","text":"46","element":"a"},{"text":") follows since condition (","element":"span"},{"href":"#id-30","text":"8","element":"a"},{"text":") in ","element":"span"},{"text":"Assumption ","element":"span"},{"href":"#id-31","text":"2 ","element":"a"},{"text":"ensures that ","element":"span"},{"href":"#id-67","style":{"height":14.8},"width":362.59,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-2.png","element":"img","alt":" RT ≤ T · 2c0. By (46)","inline":true},{"text":", in order to prove Theorem ","element":"span"},{"href":"#id-37","text":"1","element":"a"},{"text":", it suffices to show that ","element":"span"},{"style":{"height":16.84},"width":327.11,"height":42.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-3.png","element":"img","alt":" RT = O(√T log T)","inline":true,"padRight":true},{"text":"whenever the conditions of Assumptions ","element":"span"},{"href":"#id-31","text":"2","element":"a"},{"text":"–","element":"span"},{"href":"#id-34","text":"3 ","element":"a"},{"text":"and Lemma ","element":"span"},{"href":"#id-38","text":"1 ","element":"a"},{"text":"hold true. We henceforth condition on this being the case.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Sampling mechanism. 
","element":"span"},{"text":"Recall that ","element":"span"},{"style":{"height":12.48},"width":55.51,"height":31.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-4.png","element":"img","alt":" η(i)","inline":true,"padRight":true},{"text":"represents the target confidence to attain by the end of the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":"-th epoch, and each such value is half of the previous value. For this interpretation to be valid, ","element":"span"},{"style":{"height":12.48},"width":60.13,"height":31.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-5.png","element":"img","alt":" η(0)","inline":true,"padRight":true},{"text":"should be sufficient large so that the entire function is ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a priori ","element":"span"},{"text":"known up to confidence ","element":"span"},{"href":"#id-30","style":{"height":16.88},"width":166.7,"height":42.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-6.png","element":"img","alt":" η(0); by (8","inline":true},{"text":") in Assumption ","element":"span"},{"href":"#id-31","text":"2","element":"a"},{"text":", the choice ","element":"span"},{"style":{"height":12.48},"width":148.95,"height":31.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-7.png","element":"img","alt":" η(0) = c0","inline":true,"padRight":true},{"text":"certainly suffices for this purpose.","element":"span"}],[{"text":"In the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":"-th epoch, we repeatedly sample a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"sufficiently fine ","element":"span"},{"text":"subset of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L ","element":"span"},{"style":{"fontStyle":"italic"},"text":"sufficiently many times ","element":"span"},{"text":"to attain an overall confidence 
of ","element":"span"},{"style":{"height":16.88},"width":564,"height":42.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-8.png","element":"img","alt":" η(i) within M(i−1) (with M(0) = L","inline":true},{"text":"). Specifically:","element":"span"}],[{"text":"• We sample each point ","element":"span"},{"style":{"height":16.48},"width":69.57,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-9.png","element":"img","alt":" K(i)","inline":true,"padRight":true},{"text":"times and average the resulting observations, yielding an ","element":"span"},{"style":{"fontStyle":"italic"},"text":"effective noise variance ","element":"span"},{"text":"of ","element":"span"},{"style":{"height":26.32},"width":59.21,"height":65.79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-10.png","element":"img","alt":"σ2K(i)","inline":true,"padRight":true},{"text":", ","element":"span"},{"text":"and we choose ","element":"span"},{"style":{"height":16.48},"width":69.57,"height":41.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-11.png","element":"img","alt":" K(i)","inline":true,"padRight":true},{"text":"large enough so that ","element":"span"},{"style":{"height":32.93},"width":589.92,"height":82.34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-12.png","element":"img","alt":"σ2K(i) ≤η2(i)4βT . 
Hence, K(i) = ⌈ 4σ2βTη2(i) ⌉","inline":true,"padRight":true},{"text":"is sufficient.","element":"span"}],[{"text":"• To design ","element":"span"},{"style":{"height":16.48},"width":146.81,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-13.png","element":"img","alt":" L(i) ⊆ L","inline":true},{"text":", we consider the interval","element":"span"}],[{"text":"which is the smallest interval (intersected with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L","element":"span"},{"text":") containing ","element":"span"},{"style":{"height":16.48},"width":115.2,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-14.png","element":"img","alt":" M(i−1)","inline":true},{"text":". We select a Lipschitz constant ","element":"span"},{"style":{"height":16.48},"width":62.85,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-15.png","element":"img","alt":" L(i)","inline":true,"padRight":true},{"text":"(to be specified later) such that ","element":"span"},{"style":{"height":16.48},"width":133.15,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-16.png","element":"img","alt":" f is L(i)","inline":true},{"text":"-Lipschitz within ","element":"span"},{"style":{"height":16.48},"width":57.43,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-17.png","element":"img","alt":" I(i)","inline":true},{"text":", and then we choose ","element":"span"},{"style":{"height":16.48},"width":176.23,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-18.png","element":"img","alt":" L(i) ⊆ I(i)","inline":true,"padRight":true},{"text":"to ensure the following:","element":"span"}],[{"id":"id-78","style":{"width":"95%"},"width":1865,"height":284,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-19.png","element":"img"}],[{"id":"id-80","text":"is the width of the 
interval. With the restriction of sampling within the fine discretization ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L","element":"span"},{"text":", we can simply “round” to the two nearest points,","element":"span"},{"text":"5 ","element":"span"},{"text":"yielding a suitable set ","element":"span"},{"style":{"height":16.48},"width":176.22,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-20.png","element":"img","alt":" L(i) ⊆ I(i)","inline":true,"padRight":true},{"text":"of cardinality at most ","element":"span"},{"style":{"height":26.7},"width":196.53,"height":66.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-21.png","element":"img","alt":" 2� 2w(i)L(i)η(i) �","inline":true}],[{"text":"Combining these, the total number of samples ","element":"span"},{"style":{"height":16.48},"width":59.02,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-22.png","element":"img","alt":" T(i)","inline":true,"padRight":true},{"text":"is given by","element":"span"}],[{"id":"id-68","text":"At the points that were sampled, we performed enough repetitions to attain a variance of at most ","element":"span"},{"style":{"height":15.74},"width":48.84,"height":39.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-23.png","element":"img","alt":"η2(i)","inline":true},{"style":{"height":10.4},"width":52.96,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-24.png","element":"img","alt":"4βT ","inline":true,"padRight":true},{"text":"based on those samples","element":"span"}],[{"text":"alone. 
The information from any earlier samples only reduces the variance further, so the overall posterior variance","element":"span"},{"text":"6 ","element":"span"},{"style":{"height":17.38},"width":131.87,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-25.png","element":"img","alt":"σ2t−1(x)","inline":true}],[{"id":"id-77","style":{"width":"99%"},"width":1944,"height":155,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-26.png","element":"img"}],[{"text":"For the points in ","element":"span"},{"style":{"height":16.48},"width":115.2,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-27.png","element":"img","alt":" M(i−1)","inline":true,"padRight":true},{"text":"that we didn’t sample, we note that the following confidence bounds are valid as long as ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"is ","element":"span"},{"style":{"height":16.48},"width":62.85,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-28.png","element":"img","alt":"L(i)","inline":true},{"text":"-Lipschitz continuous within ","element":"span"},{"style":{"height":16.48},"width":70.87,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-29.png","element":"img","alt":" I(i):","inline":true}],[{"id":"id-79","style":{"width":"99%"},"width":1946,"height":192,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/12-30.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":20.03},"width":458.92,"height":50.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-0.png","element":"img","alt":" x′ = arg minx′∈L(i) |x − x′|","inline":true,"padRight":true},{"text":"is the closest sampled point to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":". 
If ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x ","element":"span"},{"text":"is itself in ","element":"span"},{"style":{"height":16.48},"width":63.21,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-1.png","element":"img","alt":" L(i)","inline":true},{"text":", these expressions reduce to (","element":"span"},{"href":"#id-77","text":"52","element":"a"},{"text":").","element":"span"}],[{"text":"Now, since we have ensured the condition (","element":"span"},{"href":"#id-78","text":"48","element":"a"},{"text":"), we find that we can weaken (","element":"span"},{"href":"#id-79","text":"53","element":"a"},{"text":")–(","element":"span"},{"href":"#id-79","text":"54","element":"a"},{"text":") to","element":"span"}],[{"style":{"width":"75%"},"width":1463,"height":47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-2.png","element":"img"}],[{"text":"That is, as long as the Lipschitz constant ","element":"span"},{"style":{"height":16.48},"width":62.85,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-3.png","element":"img","alt":" L(i)","inline":true,"padRight":true},{"text":"is valid, we have ","element":"span"},{"style":{"height":12.48},"width":55.52,"height":31.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-4.png","element":"img","alt":" η(i)","inline":true},{"text":"-confidence at the end of the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":"-th epoch. 
As a result, by Lemma ","element":"span"},{"href":"#id-47","text":"2","element":"a"},{"text":", the updated set of potential maximizers","element":"span"}],[{"style":{"width":"72%"},"width":1409,"height":96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-5.png","element":"img"}],[{"text":"with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"being the ending time of the epoch, must only contain points within ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L ","element":"span"},{"text":"whose function value is within ","element":"span"},{"style":{"height":17.68},"width":242.29,"height":44.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-6.png","element":"img","alt":" 4η(i) of f(x∗L).","inline":true,"padRight":true},{"text":"Below, we will choose ","element":"span"},{"style":{"height":16.48},"width":62.85,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-7.png","element":"img","alt":" L(i)","inline":true,"padRight":true},{"text":"differently in different epochs, while still ensuring the required Lipschitz condition is valid.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Analysis of early epochs. 
","element":"span"},{"text":"Recall the following:","element":"span"}],[{"text":"• By Assumption ","element":"span"},{"href":"#id-32","text":"1","element":"a"},{"text":", the constant ","element":"span"},{"style":{"height":0},"width":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-8.png","element":"img","alt":" ϵ","inline":true,"padRight":true},{"text":"lower bounds the separation between ","element":"span"},{"style":{"height":16},"width":96.39,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-9.png","element":"img","alt":" f(x∗)","inline":true,"padRight":true},{"text":"and the function value at the second highest local maximum (if any).","element":"span"}],[{"text":"• By Assumption ","element":"span"},{"href":"#id-34","text":"3","element":"a"},{"text":", we either have ","element":"span"},{"style":{"height":10.98},"width":38.78,"height":27.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-10.png","element":"img","alt":" x∗","inline":true,"padRight":true},{"text":"at an endpoint and the locally linear behavior (","element":"span"},{"href":"#id-32","text":"9","element":"a"},{"text":"), or we have ","element":"span"},{"style":{"height":16},"width":284.66,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-11.png","element":"img","alt":" x∗ ∈ (ρ0, 1 − ρ0)","inline":true,"padRight":true},{"text":"and the locally quadratic behavior (","element":"span"},{"href":"#id-33","text":"10","element":"a"},{"text":").","element":"span"}],[{"text":"In the epochs for which ","element":"span"},{"style":{"height":17.39},"width":157.51,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-12.png","element":"img","alt":" w(i) > ρ0","inline":true},{"text":", we choose 
","element":"span"},{"href":"#id-30","style":{"height":16.48},"width":279.92,"height":41.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-13.png","element":"img","alt":" L(i) = c1 (cf., (8)","inline":true},{"text":"), which is clearly a valid Lipschitz constant. We claim that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"after a finite number of epochs","element":"span"},{"text":", all points ","element":"span"},{"style":{"height":17.68},"width":861.4,"height":44.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-14.png","element":"img","alt":" x ∈ M(i) satisfy f(x) > f(x∗) − ϵ and |x − x∗| ≤ ρ02 ","inline":true,"padRight":true},{"text":", and therefore, ","element":"span"},{"style":{"height":12.88},"width":176.23,"height":32.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-15.png","element":"img","alt":" w(i) ceases","inline":true,"padRight":true},{"text":"to be greater than ","element":"span"},{"style":{"height":10},"width":36.6,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-16.png","element":"img","alt":" ρ0","inline":true},{"text":". 
We henceforth distinguish between the two cases using the terminology ","element":"span"},{"style":{"fontStyle":"italic"},"text":"early epochs ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"late epochs","element":"span"},{"text":".","element":"span"}],[{"text":"To see that the preceding claim is true, we consider the two cases of Assumption ","element":"span"},{"href":"#id-34","text":"3","element":"a"},{"text":":","element":"span"}],[{"text":"• In the first case, all points satisfying ","element":"span"},{"style":{"height":16},"width":224.5,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-17.png","element":"img","alt":" |x − x∗| > ρ0","inline":true,"padRight":true},{"text":"are at least ","element":"span"},{"style":{"height":16},"width":213.84,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-18.png","element":"img","alt":" min{c1ρ0, ϵ}","inline":true},{"text":"-suboptimal by the locally linear behavior (","element":"span"},{"href":"#id-32","text":"9","element":"a"},{"text":") and the ","element":"span"},{"href":"#id-29","style":{"height":14.4},"width":150.92,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-19.png","element":"img","alt":" ϵ gap (7);","inline":true}],[{"text":"• In the second case, all points satisfying ","element":"span"},{"style":{"height":16},"width":224.52,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-20.png","element":"img","alt":" |x − x∗| > ρ0","inline":true,"padRight":true},{"text":"are at least ","element":"span"},{"style":{"height":17.38},"width":213.83,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-21.png","element":"img","alt":" min{c2ρ20, ϵ}","inline":true},{"text":"-suboptimal by the locally quadratic ","element":"span"},{"text":"behavior (","element":"span"},{"href":"#id-32","text":"9","element":"a"},{"text":") 
and the ","element":"span"},{"href":"#id-29","style":{"height":14.4},"width":149.91,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-22.png","element":"img","alt":" ϵ gap (7).","inline":true}],[{"text":"Hence, in either case, all points satisfying ","element":"span"},{"style":{"height":16},"width":225.63,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-23.png","element":"img","alt":" |x − x∗| > ρ0","inline":true,"padRight":true},{"text":"are at least ","element":"span"},{"style":{"height":5.79},"width":30.18,"height":14.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-24.png","element":"img","alt":" ϵ′","inline":true},{"text":"-suboptimal, where ","element":"span"},{"style":{"height":17.39},"width":386.48,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-25.png","element":"img","alt":" ϵ′ = min{c1ρ0, c2ρ20, ϵ}","inline":true},{"text":". As a ","element":"span"},{"text":"result, to establish the desired claim, we only need to show that ","element":"span"},{"style":{"height":16.48},"width":74.39,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-26.png","element":"img","alt":" M(i)","inline":true,"padRight":true},{"text":"contains no points with instant regret ","element":"span"},{"style":{"height":16},"width":163.34,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-27.png","element":"img","alt":" r(x) ≥ ϵ′.","inline":true}],[{"text":"Since ","element":"span"},{"style":{"height":19.37},"width":328.74,"height":48.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-28.png","element":"img","alt":" f(x∗L) ≥ f(x∗) − 1T","inline":true,"padRight":true},{"text":"(as stated following (","element":"span"},{"href":"#id-73","text":"40","element":"a"},{"text":")), we find that as long as 
","element":"span"},{"style":{"height":19.37},"width":111.14,"height":48.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-29.png","element":"img","alt":" T > 2ϵ′","inline":true,"padRight":true},{"text":",","element":"span"},{"text":"7 ","element":"span"},{"text":"it suffices that ","element":"span"},{"style":{"height":16.48},"width":74.39,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-30.png","element":"img","alt":" M(i)","inline":true,"padRight":true},{"text":"only contains","element":"span"}],[{"text":"points such that ","element":"span"},{"href":"#id-47","style":{"height":22.17},"width":437.47,"height":55.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-31.png","element":"img","alt":" r(L)t (x) ≤ ϵ′2 . By Lemma 2","inline":true},{"text":", this happens as soon as ","element":"span"},{"style":{"height":16.76},"width":294.82,"height":41.91,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-32.png","element":"img","alt":" η(i) < ϵ′8 . 
Since ϵ′","inline":true,"padRight":true},{"text":"is constant and we halve ","element":"span"},{"style":{"height":16.88},"width":155.1,"height":42.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-33.png","element":"img","alt":" η(i) at the","inline":true,"padRight":true},{"text":"end of each epoch, it must be that only a finite number of epochs ","element":"span"},{"style":{"height":15.19},"width":98,"height":37.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-34.png","element":"img","alt":" imax,1","inline":true,"padRight":true},{"text":"pass before this occurs, with ","element":"span"},{"style":{"height":15.19},"width":98,"height":37.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-35.png","element":"img","alt":" imax,1","inline":true,"padRight":true},{"text":"depending only on ","element":"span"},{"style":{"height":16.88},"width":177.4,"height":42.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-36.png","element":"img","alt":" η(0) and ϵ′.","inline":true}],[{"text":"For these early epochs, we simply upper bound ","element":"span"},{"href":"#id-68","style":{"height":16.88},"width":170.83,"height":42.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-37.png","element":"img","alt":" w(i) in (51","inline":true},{"text":") by one, meaning their overall cumulative time ","element":"span"},{"style":{"height":15.59},"width":230.65,"height":38.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-38.png","element":"img","alt":" Tearly satisfies","inline":true}],[{"id":"id-81","style":{"width":"71%"},"width":1393,"height":120,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-39.png","element":"img"}],[{"text":"where we have used the fact that 
","element":"span"},{"style":{"height":16.88},"width":373.86,"height":42.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-40.png","element":"img","alt":" η(i) ≥ ϵ′8 and L(i) = c1","inline":true,"padRight":true},{"text":"in these epochs.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Analysis of late epochs. ","element":"span"},{"text":"Recall that we consider ourselves in a late epoch as soon as ","element":"span"},{"style":{"height":16.08},"width":156.46,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-41.png","element":"img","alt":" w(i) ≤ ρ0","inline":true},{"text":". This condition implies that all points in ","element":"span"},{"style":{"height":16.48},"width":115.2,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-42.png","element":"img","alt":" M(i−1)","inline":true,"padRight":true},{"text":"are within a distance ","element":"span"},{"style":{"height":10},"width":36.6,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-43.png","element":"img","alt":" ρ0","inline":true,"padRight":true},{"text":"of ","element":"span"},{"style":{"height":10.98},"width":38.78,"height":27.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-44.png","element":"img","alt":" x∗","inline":true},{"text":",","element":"span"},{"text":"8 ","element":"span"},{"text":"yielding the locally linear behavior (","element":"span"},{"href":"#id-32","text":"9","element":"a"},{"text":") if ","element":"span"},{"style":{"height":10.98},"width":38.78,"height":27.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-45.png","element":"img","alt":" x∗","inline":true,"padRight":true},{"text":"is an endpoint, and the locally quadratic behavior (","element":"span"},{"href":"#id-33","text":"10","element":"a"},{"text":") otherwise. 
Moreover, Assumption ","element":"span"},{"href":"#id-34","text":"3 ","element":"a"},{"text":"assumes ","element":"span"},{"style":{"height":16},"width":284.69,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-46.png","element":"img","alt":" x∗ ∈ (ρ0, 1 − ρ0)","inline":true,"padRight":true},{"text":"in the latter case, and as a result, the algorithm can identify which case has occurred: If ","element":"span"},{"style":{"height":16.48},"width":57.43,"height":41.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-47.png","element":"img","alt":" I(i)","inline":true,"padRight":true},{"text":"contains an endpoint, then we are in the first case, whereas if ","element":"span"},{"style":{"height":17.68},"width":202.07,"height":44.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/13-48.png","element":"img","alt":" I(i) ⊆ (0, 1)","inline":true},{"text":", then we are in the second case.","element":"span"}],[{"text":"Accordingly, the algorithm can choose the Lipschitz constant ","element":"span"},{"style":{"height":16.48},"width":62.85,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-0.png","element":"img","alt":" L(i)","inline":true,"padRight":true},{"text":"differently in the two cases. In the first case, we simply continue to use the global choice ","element":"span"},{"style":{"height":16.48},"width":151.67,"height":41.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-1.png","element":"img","alt":" L(i) = c1","inline":true,"padRight":true},{"text":"from (","element":"span"},{"href":"#id-30","text":"8","element":"a"},{"text":"). 
In the second case, we observe that ","element":"span"},{"style":{"height":16},"width":180.2,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-2.png","element":"img","alt":" f ′(x∗) = 0","inline":true},{"text":", and recall from (","element":"span"},{"href":"#id-30","text":"8","element":"a"},{"text":") that ","element":"span"},{"style":{"height":14},"width":114.53,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-3.png","element":"img","alt":" f ′ is c2","inline":true},{"text":"-Lipschitz continuous. Since the width of the interval of interest ","element":"span"},{"style":{"height":16.48},"width":170.44,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-4.png","element":"img","alt":" I(i) is w(i)","inline":true},{"text":", we conclude that ","element":"span"},{"style":{"height":17.68},"width":263.4,"height":44.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-5.png","element":"img","alt":" |f ′(x)| ≤ c2w(i)","inline":true,"padRight":true},{"text":"within ","element":"span"},{"style":{"height":16.48},"width":57.43,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-6.png","element":"img","alt":" I(i)","inline":true},{"text":", and accordingly, we can set","element":"span"}],[{"id":"id-87","style":{"width":"55%"},"width":1088,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-7.png","element":"img"}],[{"text":"We initially focus on this second case (which is the more interesting of the two), and later return to the first case.","element":"span"}],[{"text":"Recall that within the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":"-th epoch, all points with ","element":"span"},{"style":{"height":17.68},"width":398.36,"height":44.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-8.png","element":"img","alt":" f(x) < 
f(x∗L) − 4η(i−1)","inline":true,"padRight":true},{"text":"have already been removed from the potential ","element":"span"},{"text":"maximizers (","element":"span"},{"style":{"fontStyle":"italic"},"text":"cf.","element":"span"},{"text":", Lemma ","element":"span"},{"href":"#id-47","text":"2","element":"a"},{"text":"). This implies that the points sampled incur instant regret at most","element":"span"}],[{"id":"id-82","style":{"width":"56%"},"width":1098,"height":56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-9.png","element":"img"}],[{"text":"and hence, since we have established that ","element":"span"},{"style":{"height":19.37},"width":342.05,"height":48.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-10.png","element":"img","alt":" f(x∗L) ≥ f(x∗) − 1T ,","inline":true}],[{"text":"From this fact and the locally quadratic behavior (","element":"span"},{"href":"#id-33","text":"10","element":"a"},{"text":"), we deduce that the width ","element":"span"},{"style":{"height":12.48},"width":64.26,"height":31.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-11.png","element":"img","alt":" w(i)","inline":true,"padRight":true},{"text":"defined in (","element":"span"},{"href":"#id-80","text":"49","element":"a"},{"text":") satisfies ","element":"span"},{"style":{"height":16.08},"width":115.05,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-12.png","element":"img","alt":" w(i) ≤","inline":true},{"style":{"height":38.4},"width":194.54,"height":96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-13.png","element":"img","alt":"�4η(i−1)+ 1T","inline":true},{"style":{"height":15},"width":139.61,"height":37.51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-14.png","element":"img","alt":"c2 
=","inline":true}],[{"style":{"width":"67%"},"width":1315,"height":108,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-15.png","element":"img"}],[{"text":"Grouping all the constants together and writing ","element":"span"},{"style":{"height":16},"width":196.47,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-16.png","element":"img","alt":" ⌈z⌉ ≤ 1 + z","inline":true},{"text":", we can simplify this to","element":"span"}],[{"id":"id-83","style":{"width":"67%"},"width":1304,"height":109,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-17.png","element":"img"}],[{"text":"for suitably-chosen ","element":"span"},{"style":{"height":11.6},"width":111.49,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-18.png","element":"img","alt":" c′ > 0.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Bounding the cumulative regret. ","element":"span"},{"text":"In the early epochs, we crudely upper bound the regret at each time instant by ","element":"span"},{"style":{"height":13.19},"width":53.17,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-19.png","element":"img","alt":" 2c0","inline":true,"padRight":true},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"cf.","element":"span"},{"text":", (","element":"span"},{"href":"#id-30","text":"8","element":"a"},{"text":")). 
Hence, since the total cumulative time of these epochs satisfies (","element":"span"},{"href":"#id-81","text":"57","element":"a"},{"text":") for bounded ","element":"span"},{"style":{"height":15.19},"width":98,"height":37.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-20.png","element":"img","alt":" imax,1","inline":true},{"text":", and ","element":"span"},{"style":{"height":16},"width":251.62,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-21.png","element":"img","alt":" βT = O(log T)","inline":true,"padRight":true},{"text":"as per (","element":"span"},{"href":"#id-70","text":"43","element":"a"},{"text":"), the corresponding total cumulative regret ","element":"span"},{"style":{"height":23.89},"width":98.78,"height":59.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-22.png","element":"img","alt":" R(L)early ","inline":true,"padRight":true},{"text":"is upper bounded by","element":"span"}],[{"id":"id-90","style":{"width":"60%"},"width":1185,"height":60,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-23.png","element":"img"}],[{"text":"for some ","element":"span"},{"style":{"height":11.6},"width":120.67,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-24.png","element":"img","alt":" c′′ > 0.","inline":true}],[{"text":"For the late epochs, we make use of the instant regret bound in (","element":"span"},{"href":"#id-82","text":"59","element":"a"},{"text":"), depending on the epoch index ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":". 
Since this upper bound is decreasing in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":", and the epoch lengths satisfy (","element":"span"},{"href":"#id-83","text":"62","element":"a"},{"text":"), we can upper bound ","element":"span"},{"style":{"height":21.37},"width":77.1,"height":53.42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-25.png","element":"img","alt":" R(L)T","inline":true,"padRight":true},{"text":"by considering the hypothetical case that the epoch lengths are ","element":"span"},{"style":{"fontStyle":"italic"},"text":"exactly ","element":"span"},{"text":"the right-hand side of (","element":"span"},{"href":"#id-83","text":"62","element":"a"},{"text":"), and the instant regret incurred at time ","element":"span"},{"style":{"height":22.36},"width":432.8,"height":55.9,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-26.png","element":"img","alt":" t is exactly r(L)t = 4η(i−1).","inline":true}],[{"text":"In this situation, we can easily upper bound the total number of epochs: The last epoch must certainly be no larger than ","element":"span"},{"style":{"height":15.19},"width":98,"height":37.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-27.png","element":"img","alt":"imax,2","inline":true},{"text":", defined to be the smallest ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i ","element":"span"},{"text":"such that the term ","element":"span"},{"style":{"height":29.77},"width":105.3,"height":74.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-28.png","element":"img","alt":" c′ σ2βTη2(i)","inline":true,"padRight":true},{"text":"on the right-hand side of (","element":"span"},{"href":"#id-83","text":"62","element":"a"},{"text":") is ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T ","element":"span"},{"text":"or higher. 
Substituting","element":"span"}],[{"id":"id-84","style":{"width":"99%"},"width":1944,"height":212,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-29.png","element":"img"}],[{"text":"For technical reasons, here and subsequently we can assume without loss of generality that ","element":"span"},{"style":{"height":28.8},"width":142.65,"height":72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-30.png","element":"img","alt":" σ ≤ κ�","inline":true}],[{"style":{"height":21.77},"width":69.67,"height":54.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-31.png","element":"img","alt":"Tlog T","inline":true,"padRight":true},{"text":"for arbitrarily ","element":"span"},{"text":"small ","element":"span"},{"style":{"height":11.6},"width":96.24,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-32.png","element":"img","alt":" κ > 0","inline":true,"padRight":true},{"text":"and sufficiently large ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T","element":"span"},{"text":"; otherwise, Theorem ","element":"span"},{"href":"#id-37","text":"1 ","element":"a"},{"text":"states the trivial bound ","element":"span"},{"style":{"height":16},"width":217.71,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-33.png","element":"img","alt":" E[RT ] ≤ CT","inline":true},{"text":". 
Since ","element":"span"},{"style":{"height":16},"width":250.35,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/14-34.png","element":"img","alt":" βT = Θ(log T)","inline":true},{"text":", this technical condition means the right-hand side of (","element":"span"},{"href":"#id-84","text":"64","element":"a"},{"text":") exceeds one.","element":"span"}],[{"id":"id-85","style":{"width":"99%"},"width":1944,"height":884,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-0.png","element":"img"}],[{"id":"id-86","text":"where ","element":"span"},{"text":"(","element":"span"},{"href":"#id-85","text":"66","element":"a"},{"text":") follows from (","element":"span"},{"href":"#id-83","text":"62","element":"a"},{"text":") and the fact that ","element":"span"},{"style":{"height":16.48},"width":245.58,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-1.png","element":"img","alt":" η(i−1) = 2η(i)","inline":true},{"text":", (","element":"span"},{"href":"#id-85","text":"67","element":"a"},{"text":") follows since ","element":"span"},{"style":{"height":16.08},"width":230.27,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-2.png","element":"img","alt":" η(i−1) ≤ η(0)","inline":true},{"text":", (","element":"span"},{"href":"#id-86","text":"68","element":"a"},{"text":") follows since ","element":"span"},{"style":{"height":19.55},"width":167.67,"height":48.86,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-3.png","element":"img","alt":"η(i) =η(0)2i","inline":true,"padRight":true},{"text":", (","element":"span"},{"href":"#id-86","text":"69","element":"a"},{"text":") follows since ","element":"span"},{"style":{"height":20.4},"width":284.26,"height":50.99,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-4.png","element":"img","alt":" �Ni=1 2i ≤ 2 · 2N","inline":true,"padRight":true},{"text":"and 
","element":"span"},{"style":{"height":20.4},"width":284.25,"height":50.99,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-5.png","element":"img","alt":" �Ni=1 4i ≤ 2 · 4N","inline":true},{"text":", and (","element":"span"},{"href":"#id-86","text":"70","element":"a"},{"text":") follows by substituting the upper bound ","element":"span"},{"text":"on ","element":"span"},{"href":"#id-84","style":{"height":15.99},"width":250.47,"height":39.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-6.png","element":"img","alt":" imax,2 from (64","inline":true},{"text":"). Using the fact that ","element":"span"},{"style":{"height":16},"width":250.72,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-7.png","element":"img","alt":" βT = O(log T)","inline":true},{"text":", and recalling that ","element":"span"},{"style":{"height":12.48},"width":148.95,"height":31.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-8.png","element":"img","alt":" η(0) = c0","inline":true,"padRight":true},{"text":"is constant, we simplify (","element":"span"},{"href":"#id-86","text":"70","element":"a"},{"text":") to","element":"span"}],[{"id":"id-88","style":{"width":"62%"},"width":1209,"height":73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-9.png","element":"img"}],[{"text":"for some ","element":"span"},{"style":{"height":14.18},"width":112.04,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-10.png","element":"img","alt":" c† > 0","inline":true},{"text":". 
Note that we can safely drop the ","element":"span"},{"style":{"height":21.77},"width":528,"height":54.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-11.png","element":"img","alt":" O�log Tσ2βT�= O�log Tσ2 log T�","inline":true},{"text":"term in (","element":"span"},{"href":"#id-86","text":"70","element":"a"},{"text":") due to the assumption ","element":"span"},{"style":{"height":19.08},"width":171.26,"height":47.7,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-12.png","element":"img","alt":"σ2 ≥ cσT 1−ζ","inline":true,"padRight":true},{"text":"in Theorem ","element":"span"},{"href":"#id-37","text":"1","element":"a"},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Handling the first case in Assumption ","element":"span"},{"href":"#id-34","style":{"fontWeight":"bold"},"text":"3","element":"a"},{"style":{"fontWeight":"bold"},"text":". ","element":"span"},{"text":"From (","element":"span"},{"href":"#id-87","text":"58","element":"a"},{"text":") onwards, we focused only on the second case of Assumption ","element":"span"},{"href":"#id-34","text":"3","element":"a"},{"text":". 
In the first case, we have a worse Lipschitz constant ","element":"span"},{"style":{"height":16.48},"width":152.05,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-13.png","element":"img","alt":" L(i) = c1","inline":true},{"text":", but the width also shrinks faster: By the locally linear behavior (","element":"span"},{"href":"#id-32","text":"9","element":"a"},{"text":"), achieving ","element":"span"},{"style":{"height":12.48},"width":55.52,"height":31.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-14.png","element":"img","alt":" η(i)","inline":true},{"text":"-confidence not only brings the interval width ","element":"span"},{"style":{"height":12.48},"width":64.26,"height":31.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-15.png","element":"img","alt":" w(i)","inline":true,"padRight":true},{"text":"down to at most ","element":"span"},{"style":{"height":18.55},"width":154.17,"height":46.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-16.png","element":"img","alt":" O(√η(i))","inline":true},{"text":", but also further down to ","element":"span"},{"style":{"height":17.68},"width":120.96,"height":44.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-17.png","element":"img","alt":"O(η(i))","inline":true},{"text":". 
Hence, we lose a factor of ","element":"span"},{"style":{"height":16},"width":88.73,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-18.png","element":"img","alt":"√η(i)","inline":true,"padRight":true},{"text":"in the Lipschitz constant, but we gain a factor of ","element":"span"},{"style":{"height":16},"width":88.72,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-19.png","element":"img","alt":"√η(i)","inline":true,"padRight":true},{"text":"in the upper bound on ","element":"span"},{"style":{"height":12.48},"width":76.7,"height":31.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-20.png","element":"img","alt":" w(i).","inline":true,"padRight":true},{"text":"Since the number of points sampled in (","element":"span"},{"href":"#id-68","text":"51","element":"a"},{"text":") contains the product of the two, the final result remains unchanged, i.e., we still have (","element":"span"},{"href":"#id-88","text":"71","element":"a"},{"text":"), possibly with a different constant ","element":"span"},{"style":{"height":13.78},"width":43.85,"height":34.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-21.png","element":"img","alt":" c†.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Completion of the proof. 
","element":"span"},{"text":"Combining (","element":"span"},{"href":"#id-89","text":"41","element":"a"},{"text":"), (","element":"span"},{"href":"#id-67","text":"46","element":"a"},{"text":"), (","element":"span"},{"href":"#id-90","text":"63","element":"a"},{"text":") and (","element":"span"},{"href":"#id-88","text":"71","element":"a"},{"text":"), we obtain","element":"span"}],[{"id":"id-91","style":{"width":"67%"},"width":1314,"height":53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-22.png","element":"img"}],[{"text":"for some constant ","element":"span"},{"style":{"height":13.38},"width":46.34,"height":33.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-23.png","element":"img","alt":" C†","inline":true},{"text":". As stated following (","element":"span"},{"href":"#id-84","text":"64","element":"a"},{"text":"), we can assume without loss of generality that ","element":"span"},{"style":{"height":28.8},"width":171.81,"height":72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-24.png","element":"img","alt":" σ ≤ O��","inline":true}],[{"style":{"height":7.6},"width":23,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-25.png","element":"img","alt":"T","inline":true},{"style":{"height":10},"width":69.66,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-26.png","element":"img","alt":"log T","inline":true},{"style":{"height":19.2},"width":18,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-27.png","element":"img","alt":"�","inline":true},{"text":", which means that the third term of (","element":"span"},{"href":"#id-91","text":"72","element":"a"},{"text":") dominates the second, and the proof is compete.","element":"span"}],[{"id":"id-53","style":{"fontWeight":"bold"},"text":"C. 
Proof of Lemma ","element":"span"},{"href":"#id-54","style":{"fontWeight":"bold"},"text":"3","element":"a"}],[{"text":"For the first part, we consider ","element":"span"},{"style":{"height":11.6},"width":33,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-28.png","element":"img","alt":" ∆","inline":true,"padRight":true},{"text":"sufficiently small so that ","element":"span"},{"style":{"height":17.37},"width":159.38,"height":43.42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-29.png","element":"img","alt":" c2∆2 < ϵ","inline":true},{"text":", for ","element":"span"},{"style":{"height":0},"width":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-30.png","element":"img","alt":" ϵ","inline":true,"padRight":true},{"text":"given in Assumption ","element":"span"},{"href":"#id-31","text":"2 ","element":"a"},{"text":"and ","element":"span"},{"style":{"height":10.79},"width":33.25,"height":26.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-31.png","element":"img","alt":" c2","inline":true,"padRight":true},{"text":"in Assumption ","element":"span"},{"href":"#id-35","text":"4","element":"a"},{"text":". 
Since all local maxima are at least ","element":"span"},{"style":{"height":0},"width":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-32.png","element":"img","alt":" ϵ","inline":true},{"text":"-suboptimal, achieving ","element":"span"},{"style":{"height":17.38},"width":235.67,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-33.png","element":"img","alt":" r+(x) < c2∆2 ","inline":true,"padRight":true},{"text":"requires that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x ","element":"span"},{"text":"lies within a small interval around ","element":"span"},{"style":{"height":16.52},"width":47.78,"height":41.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-34.png","element":"img","alt":"x∗+","inline":true},{"text":". Moreover, the locally quadratic behavior (","element":"span"},{"href":"#id-92","text":"12","element":"a"},{"text":") in Assumption ","element":"span"},{"href":"#id-35","text":"4 ","element":"a"},{"text":"yields ","element":"span"},{"style":{"height":18.92},"width":354.01,"height":47.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-35.png","element":"img","alt":" r+(x) ≥ c2(x − x∗+)2","inline":true,"padRight":true},{"text":"within this interval when ","element":"span"},{"style":{"height":11.6},"width":33,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-36.png","element":"img","alt":"∆","inline":true,"padRight":true},{"text":"is sufficiently small. 
Combining this with ","element":"span"},{"style":{"height":17.39},"width":235.67,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-37.png","element":"img","alt":" r+(x) < c2∆2","inline":true,"padRight":true},{"text":"gives ","element":"span"},{"style":{"height":17.54},"width":228.97,"height":43.85,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-38.png","element":"img","alt":" |x − x∗+| < ∆","inline":true},{"text":", and since ","element":"span"},{"style":{"height":17.54},"width":275.81,"height":43.85,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-39.png","element":"img","alt":" |x∗+ − x∗−| = 2∆","inline":true},{"text":", the triangle ","element":"span"},{"text":"inequality yields ","element":"span"},{"style":{"height":16},"width":229.43,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-40.png","element":"img","alt":" |x − x∗−| > ∆","inline":true},{"text":". Again using (","element":"span"},{"href":"#id-92","text":"12","element":"a"},{"text":"), we conclude that ","element":"span"},{"style":{"height":17.38},"width":236.11,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-41.png","element":"img","alt":" r−(x) > c2∆2","inline":true},{"text":", as required.","element":"span"}],[{"text":"For the second part, we recall from (","element":"span"},{"href":"#id-93","text":"24","element":"a"},{"text":")–(","element":"span"},{"href":"#id-94","text":"25","element":"a"},{"text":") that ","element":"span"},{"style":{"height":16},"width":352.8,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-42.png","element":"img","alt":" r+(x) = r0(x + ∆)","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":16},"width":353.22,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-43.png","element":"img","alt":" r−(x) = r0(x − 
∆)","inline":true},{"text":", where ","element":"span"},{"style":{"height":16},"width":142.19,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-44.png","element":"img","alt":" r0(x) =","inline":true},{"style":{"height":16},"width":251.99,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-45.png","element":"img","alt":"f0(x∗0) − f0(x)","inline":true},{"text":". Again assuming ","element":"span"},{"style":{"height":11.6},"width":33,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-46.png","element":"img","alt":" ∆","inline":true,"padRight":true},{"text":"is sufficiently small (i.e., less than ","element":"span"},{"style":{"height":10},"width":36.6,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-47.png","element":"img","alt":" ρ0","inline":true},{"text":"), we can apply the general Taylor expansion ","element":"span"},{"id":"id-96","text":"according to (","element":"span"},{"href":"#id-30","text":"11","element":"a"},{"text":") to obtain","element":"span"}],[{"id":"id-95","style":{"width":"67%"},"width":1305,"height":112,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/15-48.png","element":"img"}],[{"id":"id-104","text":"where ","element":"span"},{"style":{"height":16.79},"width":410.57,"height":41.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/16-0.png","element":"img","alt":" c2,max = max{|c′2|, |c′2|}","inline":true},{"text":". 
Since ","element":"span"},{"style":{"height":16},"width":90.13,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/16-1.png","element":"img","alt":" r′0(x)","inline":true,"padRight":true},{"text":"is ","element":"span"},{"style":{"height":9.19},"width":33.25,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/16-2.png","element":"img","alt":" c2","inline":true},{"text":"-Lipschitz continuous (see (","element":"span"},{"href":"#id-30","text":"8","element":"a"},{"text":")) and equals zero at ","element":"span"},{"style":{"height":14.94},"width":38.78,"height":37.35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/16-3.png","element":"img","alt":" x∗0","inline":true},{"text":", we must have ","element":"span"},{"style":{"height":16},"width":334.67,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/16-4.png","element":"img","alt":"|r′0(x)| ≤ c2|x − x∗0|","inline":true},{"text":". Hence, and using the triangle inequality along with (","element":"span"},{"href":"#id-95","text":"73","element":"a"},{"text":")–(","element":"span"},{"href":"#id-96","text":"74","element":"a"},{"text":"), we have","element":"span"}],[{"style":{"width":"69%"},"width":1358,"height":47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/16-5.png","element":"img"}],[{"text":"which proves (","element":"span"},{"href":"#id-57","text":"29","element":"a"},{"text":").","element":"span"}],[{"text":"Using the locally quadratic behavior in (","element":"span"},{"href":"#id-92","text":"12","element":"a"},{"text":"), we deduce that (","element":"span"},{"href":"#id-59","text":"30","element":"a"},{"text":") holds for all ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x ","element":"span"},{"text":"within distance 
","element":"span"},{"style":{"height":10},"width":36.6,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/16-6.png","element":"img","alt":" ρ0","inline":true,"padRight":true},{"text":"of the respective function optimizer. On the other hand, if the distance from the optimizer is more than ","element":"span"},{"style":{"height":10},"width":36.6,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/16-7.png","element":"img","alt":" ρ0","inline":true},{"text":", then a combination of (","element":"span"},{"href":"#id-29","text":"7","element":"a"},{"text":") and (","element":"span"},{"href":"#id-92","text":"12","element":"a"},{"text":") reveals that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":") ","element":"span"},{"text":"is bounded away from zero. Since the quadratic terms in (","element":"span"},{"href":"#id-59","text":"30","element":"a"},{"text":") are also bounded from above due to the fact that ","element":"span"},{"style":{"height":16},"width":151.12,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/16-8.png","element":"img","alt":"x ∈ [0, 1]","inline":true},{"text":", we conclude that (","element":"span"},{"href":"#id-59","text":"30","element":"a"},{"text":") holds for sufficiently small ","element":"span"},{"style":{"height":7.2},"width":47.62,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/16-9.png","element":"img","alt":" c′′.","inline":true}],[{"id":"id-55","style":{"fontWeight":"bold"},"text":"D. 
Proof of Theorem ","element":"span"},{"href":"#id-50","style":{"fontWeight":"bold"},"text":"2 ","element":"a"},{"style":{"fontWeight":"bold"},"text":"(Lower Bound)","element":"span"}],[{"text":"We continue from the reduction to binary hypothesis testing and auxiliary results given in Section ","element":"span"},{"href":"#id-25","text":"4","element":"a"},{"text":". These results hold for an arbitrary given (deterministic) BO algorithm, which in general is simply a sequence of mappings that return the next point ","element":"span"},{"style":{"height":9.19},"width":34.78,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/16-10.png","element":"img","alt":" xt","inline":true,"padRight":true},{"text":"based on the previous samples ","element":"span"},{"style":{"height":10},"width":198.46,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/16-11.png","element":"img","alt":" y1, . . . , yt−1","inline":true},{"text":". Recall also that we implicitly condition on an arbitrary realization of ","element":"span"},{"style":{"height":14},"width":35.51,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/16-12.png","element":"img","alt":"f0","inline":true,"padRight":true},{"text":"satisfying the events in Assumptions ","element":"span"},{"href":"#id-31","text":"2 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-35","text":"4","element":"a"},{"text":", meaning that all expectations and probabilities are only with respect to the random index ","element":"span"},{"style":{"height":16},"width":200.44,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/16-13.png","element":"img","alt":" V ∈ {+, −}","inline":true,"padRight":true},{"text":"and/or the noise. We proceed in two main steps.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Bounding the mutual information. 
","element":"span"},{"text":"To bound the mutual information term ","element":"span"},{"style":{"fontStyle":"italic"},"text":"I","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"V ","element":"span"},{"text":"; ","element":"span"},{"style":{"fontWeight":"bold"},"text":"x","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"style":{"fontWeight":"bold"},"text":"y","element":"span"},{"text":") ","element":"span"},{"text":"appearing in (","element":"span"},{"href":"#id-58","text":"31","element":"a"},{"text":"), we first apply the following tensorization bound for adaptive sampling, which is based on the chain rule for mutual information (e.g., see (","element":"span"},{"href":"#id-26","referenceIndex":16,"text":"Raginsky & Rakhlin","element":"a"},{"href":"#id-26","referenceIndex":16,"text":", ","element":"a"},{"href":"#id-26","referenceIndex":16,"text":"2011","element":"a"},{"text":")):","element":"span"},{"text":"9","element":"span"}],[{"id":"id-98","style":{"width":"62%"},"width":1212,"height":116,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/16-14.png","element":"img"}],[{"text":"It is well known that the conditional mutual information ","element":"span"},{"style":{"height":16},"width":259.31,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/16-15.png","element":"img","alt":" I(V ; yt|xt = x)","inline":true,"padRight":true},{"text":"is upper bounded by the maximum KL divergence ","element":"span"},{"style":{"height":17.68},"width":730.82,"height":44.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/16-16.png","element":"img","alt":"maxv,v′ D(PY |V,X(· | v, x)∥PY |V,X(· | v′, x))","inline":true,"padRight":true},{"text":"between the resulting conditional output distributions 
","element":"span"},{"style":{"height":16.48},"width":117.66,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/16-17.png","element":"img","alt":" PY |V,X","inline":true,"padRight":true},{"text":"(e.g., see Eq. (31) of (","element":"span"},{"href":"#id-26","referenceIndex":16,"text":"Raginsky & Rakhlin","element":"a"},{"href":"#id-26","referenceIndex":16,"text":", ","element":"a"},{"href":"#id-26","referenceIndex":16,"text":"2011","element":"a"},{"text":")). In our setting, there are only two values of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"v","element":"span"},{"text":", and since we are considering Gaussian noise, their conditional output distributions are ","element":"span"},{"style":{"height":17.38},"width":527.35,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/16-18.png","element":"img","alt":" N(r+(x), σ2) and N(r−(x), σ2)","inline":true},{"text":". Using the standard property that the KL divergence between the ","element":"span"},{"style":{"height":17.38},"width":416.04,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/16-19.png","element":"img","alt":" N(µ1, σ2) and N(µ2, σ2)","inline":true,"padRight":true},{"text":"density functions is ","element":"span"},{"style":{"height":23.64},"width":133.67,"height":59.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/16-20.png","element":"img","alt":"(µ2−µ1)22σ2","inline":true,"padRight":true},{"text":", we deduce that","element":"span"}],[{"style":{"width":"66%"},"width":1286,"height":88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/16-21.png","element":"img"}],[{"text":"Substituting property (","element":"span"},{"href":"#id-57","text":"29","element":"a"},{"text":") in Lemma ","element":"span"},{"href":"#id-54","text":"3 ","element":"a"},{"text":"gives","element":"span"}],[{"id":"id-97","text":"where 
","element":"span"},{"text":"(","element":"span"},{"href":"#id-97","text":"80","element":"a"},{"text":") follows since ","element":"span"},{"style":{"height":17.38},"width":341.6,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/16-22.png","element":"img","alt":" (a+b)2 ≤ 3(a2+b2)","inline":true},{"text":". Averaging over ","element":"span"},{"style":{"height":9.19},"width":34.78,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/16-23.png","element":"img","alt":" xt","inline":true},{"text":", we obtain ","element":"span"},{"style":{"height":23.64},"width":733.47,"height":59.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/16-24.png","element":"img","alt":" I(X; yt|xt) ≤ 3(c′)22σ2 �∆2E�|xt−x∗0|2�+∆4�,","inline":true,"padRight":true},{"text":"and substitution into (","element":"span"},{"href":"#id-98","text":"77","element":"a"},{"text":") gives","element":"span"}],[{"id":"id-105","style":{"width":"99%"},"width":1946,"height":145,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/16-25.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Bounding the regret. 
","element":"span"},{"text":"We consider the cases ","element":"span"},{"style":{"height":17.38},"width":277.59,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-0.png","element":"img","alt":" E[RT ] ≥ c′′T∆2","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":17.38},"width":277.59,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-1.png","element":"img","alt":" E[RT ] < c′′T∆2","inline":true,"padRight":true},{"text":"separately, where ","element":"span"},{"style":{"height":6.8},"width":40.42,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-2.png","element":"img","alt":" c′′","inline":true,"padRight":true},{"text":"is defined in Lemma ","element":"span"},{"href":"#id-54","text":"3","element":"a"},{"text":". In the former case, we immediately have a lower bound on the average cumulative regret, whereas in the latter case, the following lemma is useful.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Lemma 5. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"If ","element":"span"},{"style":{"height":17.39},"width":403.74,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-3.png","element":"img","alt":" E[RT ] < c′′T∆2 with c′′ ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"defined in Lemma ","element":"span"},{"href":"#id-54","style":{"fontStyle":"italic"},"text":"3","element":"a"},{"style":{"fontStyle":"italic"},"text":", then ","element":"span"},{"style":{"height":21.1},"width":499.1,"height":52.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-4.png","element":"img","alt":" E� �Tt=1 |xt − x∗0|2�< 4T∆2.","inline":true}],[{"id":"id-99","style":{"fontStyle":"italic"},"text":"Proof. 
","element":"span"},{"text":"Since ","element":"span"},{"style":{"fontStyle":"italic"},"text":"V ","element":"span"},{"text":"is equiprobable on ","element":"span"},{"style":{"height":16},"width":270.06,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-5.png","element":"img","alt":" {+, −}, we have","inline":true}],[{"id":"id-100","text":"where ","element":"span"},{"text":"(","element":"span"},{"href":"#id-99","text":"83","element":"a"},{"text":") follows from (","element":"span"},{"href":"#id-59","text":"30","element":"a"},{"text":") in Lemma ","element":"span"},{"href":"#id-54","text":"3","element":"a"},{"text":", and (","element":"span"},{"href":"#id-100","text":"84","element":"a"},{"text":") follows by expanding the square and lower bounding the cross-term by its negative absolute value.","element":"span"}],[{"id":"id-101","text":"Substituting the assumption ","element":"span"},{"href":"#id-100","style":{"height":17.38},"width":422.52,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-6.png","element":"img","alt":" E[RT ] < c′′T∆2 into (84)","inline":true},{"text":", and canceling the term ","element":"span"},{"style":{"height":13.38},"width":115.65,"height":33.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-7.png","element":"img","alt":" c′′T∆2 ","inline":true,"padRight":true},{"text":"appearing on both sides, we obtain","element":"span"}],[{"style":{"width":"71%"},"width":1392,"height":442,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-8.png","element":"img"}],[{"id":"id-102","text":"where ","element":"span"},{"text":"(","element":"span"},{"href":"#id-101","text":"86","element":"a"},{"text":") follows from the Cauchy-Schwarz inequality, and (","element":"span"},{"href":"#id-102","text":"87","element":"a"},{"text":") follows from Jensen’s inequality. 
","element":"span"},{"text":"Solving for ","element":"span"},{"style":{"height":21.1},"width":338.74,"height":52.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-9.png","element":"img","alt":"E� �Tt=1(xt − x∗0)2�","inline":true},{"text":"yields the desired claim.","element":"span"}],[{"text":"In the case ","element":"span"},{"style":{"height":17.38},"width":274.06,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-10.png","element":"img","alt":" E[RT ] < c′′T∆2","inline":true},{"text":", we claim that under the choice ","element":"span"},{"style":{"height":22.17},"width":150.52,"height":55.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-11.png","element":"img","alt":" ∆ =� σ2","inline":true},{"style":{"height":0},"width":14,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-12.png","element":"img","alt":"�","inline":true},{"style":{"height":25.86},"width":119.18,"height":64.65,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-13.png","element":"img","alt":"CT�1/4","inline":true,"padRight":true},{"text":"with a sufficiently large constant ","element":"span"},{"style":{"height":10.8},"width":31,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-14.png","element":"img","alt":" �C","inline":true},{"text":", it holds","element":"span"}],[{"text":"that ","element":"span"},{"style":{"height":18.3},"width":260.86,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-15.png","element":"img","alt":" E[RT ] ≥ �cσ√T","inline":true,"padRight":true},{"text":"for some constant ","element":"span"},{"style":{"height":6.8},"width":20,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-16.png","element":"img","alt":" �c","inline":true},{"text":". 
Once this is established, combining the two cases with the choice of ","element":"span"},{"style":{"height":15.2},"width":126.2,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-17.png","element":"img","alt":" ∆ gives","inline":true}],[{"id":"id-103","style":{"width":"62%"},"width":1225,"height":104,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-18.png","element":"img"}],[{"text":"which yields Theorem ","element":"span"},{"href":"#id-50","text":"2","element":"a"},{"text":". We also note that by the assumption ","element":"span"},{"href":"#id-50","style":{"height":15.78},"width":444.63,"height":39.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-19.png","element":"img","alt":" σ2 ≤ cσT 1−ζ in Theorem 2","inline":true},{"text":", we have for sufficiently large ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T ","element":"span"},{"text":"that ","element":"span"},{"style":{"height":11.6},"width":33,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-20.png","element":"img","alt":" ∆","inline":true,"padRight":true},{"text":"is indeed arbitrarily small under the above choice, as was assumed throughout the proof.","element":"span"},{"text":"10","element":"span"}],[{"text":"It only remains to establish the claim stated above (","element":"span"},{"href":"#id-103","text":"88","element":"a"},{"text":") when ","element":"span"},{"style":{"height":18.3},"width":286.04,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-21.png","element":"img","alt":" E[RT ] < c′′σ√T","inline":true},{"text":". 
By Lemma ","element":"span"},{"href":"#id-104","text":"5","element":"a"},{"text":", we have ","element":"span"},{"style":{"height":21.1},"width":241.6,"height":52.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-22.png","element":"img","alt":" E� �Tt=1 |xt −","inline":true},{"style":{"height":19.2},"width":237.74,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-23.png","element":"img","alt":"x∗0|2�< 4T∆2","inline":true},{"text":", and substitution into (","element":"span"},{"href":"#id-105","text":"81","element":"a"},{"text":") gives","element":"span"}],[{"style":{"width":"61%"},"width":1192,"height":89,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-24.png","element":"img"}],[{"text":"Since ","element":"span"},{"style":{"height":16.68},"width":153.1,"height":41.71,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-25.png","element":"img","alt":" ∆4 = σ2","inline":true},{"style":{"height":0},"width":14,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-26.png","element":"img","alt":"�","inline":true},{"style":{"height":7.6},"width":47.8,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-27.png","element":"img","alt":"CT","inline":true,"padRight":true},{"text":", we deduce that ","element":"span"},{"style":{"height":20.21},"width":292.93,"height":50.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-28.png","element":"img","alt":" I(V ; x, y) ≤ log 24","inline":true,"padRight":true},{"text":"(say) when ","element":"span"},{"style":{"height":10.8},"width":31,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-29.png","element":"img","alt":"�C","inline":true,"padRight":true},{"text":"is sufficiently large. 
As a result, (","element":"span"},{"href":"#id-58","text":"31","element":"a"},{"text":") gives ","element":"span"},{"style":{"height":16},"width":148.23,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-30.png","element":"img","alt":" E[RT ] ≥","inline":true}],[{"style":{"width":"100%"},"width":1947,"height":46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-31.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":28.8},"width":145.74,"height":72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1805.11792/images/17-32.png","element":"img","alt":" �c = c2�","inline":true}]]}],"_version":"3.3.4"},"paperNode":"$1b:props:children:props:children:0:props:product"}]]]}]}]