1b:["$","$L29",null,{"isWhiteLabelled":false,"children":["$","$Lb",null,{"pt":{"compact":0,"expanded":3},"children":[["$","$L2a",null,{"noStar":true,"publisher":true,"task":true,"params":true,"size":"xl","product":{"id":"eyJwYXBlcklEIjoiMjAwMy4wMTcwMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","updated":"2021-02-23T23:33:43.000Z","paperID":"2003.01703","published":"2020-03-03T18:46:29.000Z","authors":"[\"Allen Liu\",\"Renato Paes Leme\",\"Jon Schneider\"]","title":"Optimal Contextual Pricing and Extensions","scoreTrending":null,"summary":"$2b","lastCheckedForCode":"2022-09-04T23:32:56.270Z","links":[{"id":"eyJ1cmwiOiJodHRwczovL3BhcGVyc3dpdGhjb2RlLmNvbS9wYXBlci9jb250ZXh0dWFsLXNlYXJjaC1mb3ItZ2VuZXJhbC1oeXBvdGhlc2lzIn0=","type":"pwc","url":"https://paperswithcode.com/paper/contextual-search-for-general-hypothesis","data":null}],"reposConnection":{"edges":[]},"models":[],"tags":[],"summaries":[],"emailsConnection":{"edges":[{"author":"renato paes leme","node":{"id":"eyJhZGRyZXNzIjoicmVuYXRvcHBsQGdvb2dsZS5jb20ifQ==","address":"renatoppl@google.com","name":"Renato Paes Leme","avatar":"https://img.fullcontact.com/static/fe9e7c9d79a3cb605b2bebf59343a8ba_60a9322ea1cfdec6ad7408443cdff25725e81f9fa0e9c43c1e9d59c02f885a3b","linkedin":"https://www.linkedin.com/in/renato-paes-leme-05922322","bio":"Research Scientist at Google","site":"http://www.cs.cornell.edu/~renatoppl","override":null,"membership":[{"name":"Google"}],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[],"scholar":[{"thirdPartyID":"RiLCRDwAAAAJ"}],"twitter":[],"location":[{"formatted":"New York, NY, 
USA"}],"owner":[{"id":"eyJ1aWQiOiJiOWFjYjBhZi1lZDU5LTRjNDAtOGVhYi1lZjg2NGFlYjkzMzAifQ==","name":"Renato Paes Leme","github":[],"email":[{"avatar":"https://img.fullcontact.com/static/fe9e7c9d79a3cb605b2bebf59343a8ba_60a9322ea1cfdec6ad7408443cdff25725e81f9fa0e9c43c1e9d59c02f885a3b"}],"authored":[{"id":"eyJwYXBlcklEIjoiMTYxMS4wMDgyOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1611.00829"},{"id":"eyJwYXBlcklEIjoiMTUxMi4wODYwMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1512.08602"},{"id":"eyJwYXBlcklEIjoiMTgwMy4wOTM1MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1803.09353"},{"id":"eyJwYXBlcklEIjoiMjAwMy4wMjI4NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2003.02287"},{"id":"eyJwYXBlcklEIjoiMjMwNy4wMDE2OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2307.00168"},{"id":"eyJwYXBlcklEIjoiMjMwNy4xMTY1NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2307.11655"},{"id":"eyJwYXBlcklEIjoiMjIwNi4wNzUyOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2206.07528"},{"id":"eyJwYXBlcklEIjoiMjAwMy4wMTcwMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2003.01703"},{"id":"eyJwYXBlcklEIjoiMjEwNi4wNDY4OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.04689"}]}]}},{"author":"jon 
schneider","node":{"id":"eyJhZGRyZXNzIjoianNjaG5laUBnb29nbGUuY29tIn0=","address":"jschnei@google.com","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[],"scholar":[{"thirdPartyID":"Jc97EyAAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiI3MDIwMWQ5MS0yODQyLTQ1YzgtYjhiOC0xNzQwZjI5NWNjYWYifQ==","name":"jon 
schneider","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMTgwOS4wOTU4MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1809.09582"},{"id":"eyJwYXBlcklEIjoiMTcxMS4wOTE3NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1711.09176"},{"id":"eyJwYXBlcklEIjoiMTYwNS4wMzkzMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1605.03933"},{"id":"eyJwYXBlcklEIjoiMTkwOS4xMzg2MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1909.13861"},{"id":"eyJwYXBlcklEIjoiMTcwNi4wOTA2MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1706.09060"},{"id":"eyJwYXBlcklEIjoiMjQwMS4xNjE5OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2401.16198"},{"id":"eyJwYXBlcklEIjoiMjAwOS4wNTEzOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2009.05138"},{"id":"eyJwYXBlcklEIjoiMjQwMi4wNzM2MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2402.07363"},{"id":"eyJwYXBlcklEIjoiMjQwNi4wNzU4NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2406.07585"},{"id":"eyJwYXBlcklEIjoiMjMwNy4wMDE2OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2307.00168"},{"id":"eyJwYXBlcklEIjoiMjIwNS4xNDUxOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2205.14519"},{"id":"eyJwYXBlcklEIjoiMjMwNS4xOTQ5NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2305.19496"},{"id":"eyJwYXBlcklEIjoiMjQwMS4wMTg1NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2401.01857"},{"id":"eyJwYXBlcklEIjoiMjIwNi4wNzUyOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2206.07528"},{"id":"eyJwYXBlcklEIjoiMjAwMy4wMTcwMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2003.01703"},{"id":"eyJwYXBlcklEIjoiMjIxMC4xMjE5OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2210.12198"},{"id":"eyJwYXBlcklEIjoiNTMxNDgiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"53148"},{"id":"eyJwYXBlcklEIjoiNzA2OTQiLCJwdWJsa
XNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"70694"},{"id":"eyJwYXBlcklEIjoiNzIzOTgiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"72398"},{"id":"eyJwYXBlcklEIjoiMjQwNy4wMDU3MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2407.00571"}]}]}},{"author":"allen liu","node":{"id":"eyJhZGRyZXNzIjoiY2xpdTU2OEBnbWFpbC5jb20ifQ==","address":"cliu568@gmail.com","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}}],"github":[],"scholar":[{"thirdPartyID":"O8UYsMYAAAAJ"}],"twitter":[],"location":[],"owner":[]}}]},"__typename":"paper","authorArray":["Allen Liu","Renato Paes Leme","Jon Schneider"]}}],["$","$L18",null,{"container":true,"columns":100,"spacing":{"compact":0,"expanded":2,"large":3},"children":[["$","$L18",null,{"size":{"compact":100,"expanded":100,"large":68},"children":[["$","$7",null,{"children":["$","$L2c",null,{"publisher":"arxiv","paperID":"2003.01703","product":{"paper":"$1b:props:children:props:children:0:props:product","models":"$1b:props:children:props:children:0:props:product:models"},"isWhiteLabelled":false}]}],["$","$7",null,{"children":["$","$L2d",null,{"article":"$L2e","model":"$undefined"}]}]]}],["$","$L18",null,{"size":"grow","children":["$","$L2f",null,{}]}]]}],["$","$7",null,{"children":null}],[["$","audio",null,{"id":"tts"}],["$","$L30",null,{"paperID":"2003.01703","publisher":"arxiv","paperJSON":{"title":"Optimal Contextual Pricing and Extensions","paperID":"2003.01703","avgLineHeight":13.56,"imgScale":4,"sections":[{"heading":"Abstract","paragraphs":[[{"text":"In the contextual pricing problem a seller repeatedly obtains products described by an adversarially chosen feature vector in ","element":"span"},{"style":{"height":13.36},"width":45.8,"height":33.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/0-0.png","element":"img","alt":" Rd ","inline":true,"padRight":true},{"text":"and only 
observes the purchasing decisions of a buyer with a fixed but unknown linear valuation over the products. The regret measures the difference between the revenue the seller could have obtained knowing the buyer valuation and what can be obtained by the learning algorithm.","element":"span"}],[{"text":"We give a poly-time algorithm for contextual pricing with ","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"d ","element":"span"},{"text":"log log ","element":"span"},{"text":"T ","element":"span"},{"text":"+","element":"span"},{"text":"d ","element":"span"},{"text":"log ","element":"span"},{"text":"d","element":"span"},{"text":") regret which matches the Ω(","element":"span"},{"text":"d ","element":"span"},{"text":"log log ","element":"span"},{"text":"T ","element":"span"},{"text":") lower bound up to the ","element":"span"},{"text":"d ","element":"span"},{"text":"log ","element":"span"},{"text":"d ","element":"span"},{"text":"additive factor. If we replace pricing loss by the symmetric loss, we obtain an algorithm with nearly optimal regret of ","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"d ","element":"span"},{"text":"log ","element":"span"},{"text":"d","element":"span"},{"text":") matching the Ω(","element":"span"},{"text":"d","element":"span"},{"text":") lower bound up to log ","element":"span"},{"text":"d","element":"span"},{"text":". These algorithms are based on a novel technique of bounding the value of the Steiner polynomial of a convex region at various scales. 
The Steiner polynomial is a degree ","element":"span"},{"text":"d ","element":"span"},{"text":"polynomial with intrinsic volumes as the coefficients.","element":"span"}],[{"text":"We also study a generalized version of contextual search where the hidden linear function over the Euclidean space is replaced by a hidden function ","element":"span"},{"style":{"height":14},"width":196.52,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/0-1.png","element":"img","alt":" f : X → Y","inline":true,"padRight":true},{"text":"in a certain hypothesis class ","element":"span"},{"text":"H","element":"span"},{"text":". We provide a generic algorithm with ","element":"span"},{"style":{"height":17.36},"width":83.2,"height":43.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/0-2.png","element":"img","alt":" O(d2","inline":true},{"text":") regret where ","element":"span"},{"text":"d ","element":"span"},{"text":"is the covering dimension of this class. This leads in particular to a ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":17.36},"width":81.76,"height":43.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/0-3.png","element":"img","alt":"O(s2","inline":true},{"text":") regret algorithm for linear contextual search if the linear function is guaranteed to be ","element":"span"},{"text":"s","element":"span"},{"text":"-sparse. 
Finally we also extend our results to the noisy feedback model, where each round our feedback is flipped with a fixed probability ","element":"span"},{"text":"p < ","element":"span"},{"text":"1","element":"span"},{"text":"/","element":"span"},{"text":"2.","element":"span"}]]},{"heading":"1 Introduction","paragraphs":[[{"text":"In the ","element":"span"},{"text":"contextual search problem ","element":"span"},{"text":"a learner tries to learn a hidden linear function ","element":"span"},{"style":{"height":19.54},"width":361.64,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/1-0.png","element":"img","alt":" x ∈ Rd ↦ ⟨v, x⟩ for","inline":true,"padRight":true},{"text":"some unknown ","element":"span"},{"style":{"height":15.94},"width":126,"height":39.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/1-1.png","element":"img","alt":" v ∈ Rd","inline":true},{"text":". In every round, the learner is presented with an adversarially chosen vector ","element":"span"},{"style":{"height":17.82},"width":145.68,"height":44.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/1-2.png","element":"img","alt":"xt ∈ Rd","inline":true},{"text":", and is asked to provide a guess ","element":"span"},{"style":{"height":16},"width":124.16,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/1-3.png","element":"img","alt":" yt ∈ R","inline":true,"padRight":true},{"text":"for the dot-product ","element":"span"},{"style":{"height":17.6},"width":115.4,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/1-4.png","element":"img","alt":" ⟨v, xt⟩","inline":true},{"text":", subsequently learning whether ","element":"span"},{"style":{"height":17.6},"width":494.6,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/1-5.png","element":"img","alt":" yt ≤ ⟨v, xt⟩ or yt > ⟨v, xt⟩","inline":true,"padRight":true},{"text":"and incurring a loss 
","element":"span"},{"style":{"height":17.6},"width":205.64,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/1-6.png","element":"img","alt":" ℓ(yt, ⟨v, xt⟩","inline":true},{"text":"). The goal of the learner is to minimize the total loss (the ","element":"span"},{"text":"regret","element":"span"},{"text":"), which is given by ","element":"span"},{"style":{"height":18.38},"width":301.92,"height":45.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/1-7.png","element":"img","alt":"�t ℓ(yt, ⟨v, xt⟩).","inline":true},{"text":"A special case of this problem is ","element":"span"},{"text":"contextual pricing ","element":"span"},{"href":"#id-0","referenceIndex":2,"text":"[2, ","element":"a"},{"href":"#id-1","referenceIndex":7,"text":"8, ","element":"a"},{"href":"#id-2","referenceIndex":11,"text":"11, ","element":"a"},{"href":"#id-3","referenceIndex":14,"text":"14, ","element":"a"},{"href":"#id-4","referenceIndex":16,"text":"16, ","element":"a"},{"href":"#id-5","referenceIndex":17,"text":"17, ","element":"a"},{"href":"#id-6","referenceIndex":19,"text":"19, ","element":"a"},{"href":"#id-7","referenceIndex":23,"text":"23]","element":"a"},{"text":". In this setup, the vectors ","element":"span"},{"style":{"height":10.69},"width":36.96,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/1-8.png","element":"img","alt":" xt","inline":true,"padRight":true},{"text":"are features representing differentiated products and the learner is a seller whose decision at round ","element":"span"},{"text":"t ","element":"span"},{"text":"is how to price item ","element":"span"},{"style":{"height":10.69},"width":36.96,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/1-9.png","element":"img","alt":" xt","inline":true},{"text":". 
Given a price ","element":"span"},{"style":{"height":12},"width":33.6,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/1-10.png","element":"img","alt":" yt","inline":true,"padRight":true},{"text":"a buyer with valuation ","element":"span"},{"style":{"height":17.6},"width":220.52,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/1-11.png","element":"img","alt":" ut = ⟨v, xt⟩","inline":true,"padRight":true},{"text":"buys the product if ","element":"span"},{"style":{"height":15.2},"width":138.24,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/1-12.png","element":"img","alt":" ut ≥ yt","inline":true,"padRight":true},{"text":"and doesn’t buy otherwise. The seller only observes the purchase or no-purchase decision. The loss in each round is the difference between the revenue made by the seller ","element":"span"},{"style":{"height":17.6},"width":283.6,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/1-13.png","element":"img","alt":" yt · 1{yt ≤ ut}","inline":true,"padRight":true},{"text":"and the revenue the seller could have made if ","element":"span"},{"text":"v ","element":"span"},{"text":"was known. 
Formally, the pricing loss is given by:","element":"span"}],[{"style":{"width":"30%"},"width":563,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/1-14.png","element":"img"}],[{"text":"A second important case is called ","element":"span"},{"text":"symmetric contextual search ","element":"span"},{"text":"where the loss is the difference between the guess ","element":"span"},{"style":{"height":12},"width":33.6,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/1-15.png","element":"img","alt":" yt","inline":true,"padRight":true},{"text":"and the actual dot product ","element":"span"},{"style":{"height":17.6},"width":114.92,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/1-16.png","element":"img","alt":" ⟨v, xt⟩","inline":true},{"text":". This loss function arises in ","element":"span"},{"text":"personalized medicine ","element":"span"},{"href":"#id-8","referenceIndex":4,"text":"[4] ","element":"a"},{"text":"where the learner chooses the dosage of a medicine and observes whether the patient was over-dosed or under-dosed. Another application is one-bit compressed sensing ","element":"span"},{"href":"#id-9","referenceIndex":9,"text":"[9,","element":"a"},{"href":"#id-10","referenceIndex":22,"text":"22] ","element":"a"},{"text":"where the learner only observes the sign of a measurement. In either case, we will consider the following loss:","element":"span"}],[{"style":{"width":"18%"},"width":352,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/1-17.png","element":"img"}],[{"text":"Optimal Regret Bounds for Contextual Search ","element":"span"},{"text":"In one dimension, both contextual pricing and symmetric contextual search reduce to non-contextual problems and are well understood. For the symmetric loss the optimal regret in the one-dimensional case is Θ(1) using binary search. 
For pricing, the optimal regret in the one-dimensional case is Θ(log log ","element":"span"},{"text":"T","element":"span"},{"text":") using the algorithm of Kleinberg and Leighton ","element":"span"},{"href":"#id-3","referenceIndex":14,"text":"[14]","element":"a"},{"text":". These results immediately imply Ω(","element":"span"},{"style":{"height":17.6},"width":369.4,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/1-18.png","element":"img","alt":"d) and Ω(d log log T","inline":true},{"text":") lower bounds for the general contextual case (see Section ","element":"span"},{"href":"#id-11","text":"2.4 ","element":"a"},{"text":"for details on optimality).","element":"span"}],[{"text":"In this paper, we design polynomial-time algorithms with regret ","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"d ","element":"span"},{"text":"log log ","element":"span"},{"text":"T ","element":"span"},{"text":"+ ","element":"span"},{"text":"d ","element":"span"},{"text":"log ","element":"span"},{"text":"d","element":"span"},{"text":") for contextual pricing and ","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"d ","element":"span"},{"text":"log ","element":"span"},{"text":"d","element":"span"},{"text":") for symmetric contextual search, matching these lower bounds up to a log ","element":"span"},{"text":"d ","element":"span"},{"text":"factor. 
This improves over the previously known bounds in ","element":"span"},{"href":"#id-5","referenceIndex":17,"text":"[17]","element":"a"},{"text":", which are ","element":"span"},{"style":{"height":19.14},"width":276.2,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/1-19.png","element":"img","alt":" O(d4 log log T)","inline":true,"padRight":true},{"text":"for pricing and ","element":"span"},{"style":{"height":19.13},"width":90.92,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/1-20.png","element":"img","alt":" O(d4","inline":true},{"text":") for symmetric contextual search.","element":"span"}],[{"text":"Steiner polynomial ","element":"span"},{"text":"The main technique driving these results is a new potential function based on the ","element":"span"},{"text":"Steiner polynomial","element":"span"},{"text":". This is an object from integral geometry that is closely connected with the notion of ","element":"span"},{"text":"intrinsic volumes ","element":"span"},{"text":"which were used in ","element":"span"},{"href":"#id-5","referenceIndex":17,"text":"[17] ","element":"a"},{"text":"to derive the previously known bounds for this problem.","element":"span"}],[{"text":"Given a convex set ","element":"span"},{"style":{"height":17.54},"width":137.04,"height":43.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/1-21.png","element":"img","alt":" S ⊆ Rd","inline":true},{"text":", Steiner showed that the volume of the Minkowski sum ","element":"span"},{"text":"Vol","element":"span"},{"text":"(","element":"span"},{"text":"S ","element":"span"},{"text":"+ ","element":"span"},{"text":"z","element":"span"},{"text":"B","element":"span"},{"text":") is a polynomial of degree ","element":"span"},{"text":"d ","element":"span"},{"text":"in ","element":"span"},{"text":"z ","element":"span"},{"text":"(where ","element":"span"},{"text":"B ","element":"span"},{"text":"is the unit ball). 
The intrinsic volumes ","element":"span"},{"style":{"height":17.49},"width":132.2,"height":43.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/1-22.png","element":"img","alt":" Vj of S","inline":true,"padRight":true},{"text":"correspond to the coefficients of this polynomial after normalization by volume ","element":"span"},{"style":{"height":17.49},"width":200.92,"height":43.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/1-23.png","element":"img","alt":" κj of the j","inline":true},{"text":"-dimensional ball:","element":"span"}],[{"style":{"width":"33%"},"width":628,"height":137,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/1-24.png","element":"img"}],[{"text":"Both our algorithm and the algorithm in ","element":"span"},{"href":"#id-5","referenceIndex":17,"text":"[17] ","element":"a"},{"text":"keep track of the set ","element":"span"},{"style":{"height":15.09},"width":38.88,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/2-0.png","element":"img","alt":" St","inline":true,"padRight":true},{"text":"of vectors consistent with observations seen so far. The intrinsic volumes approach keeps track of ","element":"span"},{"style":{"height":18.29},"width":98.4,"height":45.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/2-1.png","element":"img","alt":" Vj(St","inline":true},{"text":") and shows that the loss incurred in round ","element":"span"},{"text":"t ","element":"span"},{"text":"is proportional to the decrease of one of the suitably normalized intrinsic volumes (i.e. 
","element":"span"},{"style":{"height":21.02},"width":170.04,"height":52.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/2-2.png","element":"img","alt":" Vj(Sj)1/j ","inline":true,"padRight":true},{"text":"for some index ","element":"span"},{"style":{"height":17.6},"width":265.36,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/2-3.png","element":"img","alt":" j ∈ {1, . . . , d}","inline":true},{"text":"). In this paper, instead of keeping track of each coefficient individually, we control the value of the Steiner polynomial itself at different values of ","element":"span"},{"text":"z","element":"span"},{"text":". Specifically, we show that for some set of ","element":"span"},{"style":{"height":17.6},"width":439.12,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/2-4.png","element":"img","alt":" d values {z1, z2, . . . , zd}","inline":true,"padRight":true},{"text":"it is possible to always choose an ","element":"span"},{"text":"i ","element":"span"},{"text":"(based on the current width of our set) so that ","element":"span"},{"style":{"height":17.6},"width":221.96,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/2-5.png","element":"img","alt":" Vol(S + ziB","inline":true},{"text":") decreases by a constant fraction. 
This leads to nearly optimal bounds on regret (via a much simpler proof than that in ","element":"span"},{"href":"#id-5","referenceIndex":17,"text":"[17]","element":"a"},{"text":").","element":"span"}],[{"text":"Framework for learning with binary feedback ","element":"span"},{"text":"While the Steiner polynomial technique largely resolves the classical problem of contextual search, there is a wide class of learning problems with binary feedback that either do not fit within the framework of learning a linear function, or which impose additional constraints on the linear function that one would hope to leverage.","element":"span"}],[{"text":"One example is ","element":"span"},{"text":"sparse contextual search","element":"span"},{"text":", where the hidden vector ","element":"span"},{"style":{"height":15.94},"width":131.76,"height":39.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/2-6.png","element":"img","alt":" v ∈ Rd ","inline":true,"padRight":true},{"text":"is guaranteed to be ","element":"span"},{"text":"s","element":"span"},{"text":"-sparse, i.e., ","element":"span"},{"style":{"height":17.6},"width":180.84,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/2-7.png","element":"img","alt":" ∥v∥0 ≤ s","inline":true},{"text":". This captures settings where we expect few features to matter to the buyer. Another interesting problem is when we are asked to guess max","element":"span"},{"style":{"height":10.69},"width":92.64,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/2-8.png","element":"img","alt":"i vixi","inline":true,"padRight":true},{"text":"instead of ","element":"span"},{"style":{"height":17.6},"width":217.64,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/2-9.png","element":"img","alt":" ⟨v, x⟩. 
This","inline":true,"padRight":true},{"text":"corresponds to learning the valuation of a ","element":"span"},{"text":"unit-demand buyer","element":"span"},{"text":". This problem is challenging since the set of vectors ","element":"span"},{"style":{"height":15.94},"width":125.52,"height":39.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/2-10.png","element":"img","alt":" v ∈ Rd ","inline":true,"padRight":true},{"text":"consistent with observations seen so far is not necessarily convex.","element":"span"}],[{"text":"Both of these examples are special cases of a general framework for online learning problems under binary feedback. ","element":"span"},{"text":"In our general setup, the learner is trying to learn a function ","element":"span"},{"text":"f ","element":"span"},{"text":"in a hypothesis class ","element":"span"},{"text":"H ","element":"span"},{"text":"containing functions mapping from a context space ","element":"span"},{"text":"X ","element":"span"},{"text":"to an outcome space ","element":"span"},{"text":"Y","element":"span"},{"text":". 
In each step a context ","element":"span"},{"style":{"height":14.69},"width":128.64,"height":36.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/2-11.png","element":"img","alt":" xt ∈ X","inline":true,"padRight":true},{"text":"is chosen adversarially and the learner is asked to submit a guess ","element":"span"},{"style":{"height":12},"width":33.6,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/2-12.png","element":"img","alt":" yt","inline":true,"padRight":true},{"text":"for the value of ","element":"span"},{"style":{"height":17.6},"width":80.16,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/2-13.png","element":"img","alt":" f(xt","inline":true},{"text":") and incurs a loss ","element":"span"},{"style":{"height":17.6},"width":170.4,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/2-14.png","element":"img","alt":" ℓ(yt, f(xt","inline":true},{"text":")). The goal of the learner is to minimize the total loss ","element":"span"},{"style":{"height":21.86},"width":280.8,"height":54.64,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/2-15.png","element":"img","alt":"∑Tt=1 ℓ(yt, f(xt","inline":true},{"text":")). 
The original setup corresponds to the case where ","element":"span"},{"text":"X ","element":"span"},{"text":"is some subset of ","element":"span"},{"style":{"height":19.14},"width":128.64,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/2-16.png","element":"img","alt":" Rd e.g.","inline":true,"padRight":true},{"text":"the unit ball ","element":"span"},{"style":{"height":19.54},"width":440.68,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/2-17.png","element":"img","alt":" B ⊆ Rd, Y = [−1, 1], H","inline":true,"padRight":true},{"text":"is the class of all linear functions ","element":"span"},{"style":{"height":17.6},"width":457.44,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/2-18.png","element":"img","alt":" fv(x) = ⟨v, x⟩ for v ∈ B.","inline":true}],[{"text":"Our main result in this space is an algorithm with regret ","element":"span"},{"style":{"height":19.14},"width":265.88,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/2-19.png","element":"img","alt":" O(d2) where d","inline":true,"padRight":true},{"text":"is the covering dimension of the hypothesis class ","element":"span"},{"text":"H ","element":"span"},{"text":"(see Definition ","element":"span"},{"href":"#id-12","text":"3.3)","element":"a"},{"text":". ","element":"span"},{"text":"This result immediately improves the regret of symmetric contextual search from ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":20.42},"width":305.48,"height":51.04,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/2-20.png","element":"img","alt":"O(d) to ˜O(s2) 1","inline":true},{"text":". 
Similarly, this result immediately implies an ","element":"span"},{"style":{"height":19.13},"width":90.92,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/2-21.png","element":"img","alt":"O(d2","inline":true},{"text":") regret algorithm for the unit-demand buyer problem. We accomplish this by generalizing the Steiner polynomial idea for linear contextual search to a general “Steiner potential” defined for any hypothesis class (see the Techniques subsection below).","element":"span"}],[{"text":"We contrast these results in Section ","element":"span"},{"text":"6 ","element":"span"},{"text":"with the full feedback case in which the algorithm learns ","element":"span"},{"style":{"height":17.6},"width":80.16,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/2-22.png","element":"img","alt":"f(xt","inline":true},{"text":") each round. In this full-feedback setting, we give matching upper and lower bounds (up to constant factors) on the achievable regret. Our results here are based on a notion we introduce of ","element":"span"},{"text":"tree-dimension ","element":"span"},{"text":"of a hypothesis class, which is a continuous analogue of Littlestone dimension.","element":"span"}],[{"text":"Techniques in the general case ","element":"span"},{"text":"The Steiner polynomial is defined for convex sets living in the Euclidean space. Intriguingly, it is possible to generalize (in some sense) this geometric technique to arbitrary classes ","element":"span"},{"text":"H ","element":"span"},{"text":"of hypotheses. 
Instead of keeping track of all functions in the hypothesis class that are consistent with the feedback so far, we keep track of an expanded set of functions that don’t violate the feedback up to a certain margin (in the linear case, this is exactly ","element":"span"},{"style":{"height":17.6},"width":270.44,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/2-23.png","element":"img","alt":" S + λB). This","inline":true,"padRight":true},{"text":"has the effect of regularizing the set of consistent hypotheses and allows for faster progress. Instead of volume, in the general case we control the size of an ","element":"span"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/2-24.png","element":"img","alt":" ǫ","inline":true},{"text":"-net of the set of these approximately valid hypotheses.","element":"span"}],[{"text":"A second technique we use is ","element":"span"},{"text":"adaptive scaling","element":"span"},{"text":", which involves keeping track of multiple levels of discretization. For the linear case, this boils down to controlling the value of the Steiner polynomial at different values of ","element":"span"},{"style":{"height":12.8},"width":26,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/3-0.png","element":"img","alt":" λ","inline":true},{"text":". More generally, at each step, we can estimate the maximum possible loss achievable in this round given the previous feedback. Based on this value, we will choose a scale, which will dictate the granularity of the ","element":"span"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/3-1.png","element":"img","alt":" ǫ","inline":true},{"text":"-net and the margin with which we prune inconsistent hypotheses. 
After picking the scale we show that it is possible to pick a (random) cut that will either: (i) reduce the number of valid hypotheses in the chosen granularity by half; or (ii) eliminate one valid hypothesis at a much coarser granularity. This will require a careful coupling between the discretizations at two different levels. This coupling between two levels is what allows us to overcome the fact that in the general case we can’t rely on techniques from convex geometry. See Section ","element":"span"},{"href":"#id-13","text":"4.2 ","element":"a"},{"text":"for details.","element":"span"}],[{"text":"One important feature of all our algorithms (not shared by previous algorithms) will be our use of randomness, in particular ","element":"span"},{"text":"perturbed guesses","element":"span"},{"text":". Every round, we compute the median ","element":"span"},{"style":{"height":15.09},"width":108.16,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/3-2.png","element":"img","alt":" mt of","inline":true,"padRight":true},{"text":"the set ","element":"span"},{"style":{"height":17.6},"width":267.4,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/3-3.png","element":"img","alt":" f(xt) where f","inline":true,"padRight":true},{"text":"ranges over the set of approximately valid hypotheses. 
However, instead of guessing the median ","element":"span"},{"style":{"height":10.69},"width":50.4,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/3-4.png","element":"img","alt":" mt","inline":true,"padRight":true},{"text":"directly we guess one of the two values (chosen uniformly at random) in ","element":"span"},{"style":{"height":17.6},"width":315.76,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/3-5.png","element":"img","alt":"{mt − δ, mt + δ}","inline":true},{"text":", where the size of perturbation ","element":"span"},{"style":{"height":12.8},"width":20,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/3-6.png","element":"img","alt":" δ","inline":true,"padRight":true},{"text":"depends on our current scale ","element":"span"},{"style":{"height":12.8},"width":26,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/3-7.png","element":"img","alt":" λ","inline":true},{"text":". Our guarantee is that the potential function will decrease significantly for one of the two choices (and thus in expectation).","element":"span"}],[{"text":"Noisy Contextual Search ","element":"span"},{"text":"The final direction in which we extend the original contextual search problem is by considering noisy binary feedback, i.e., the feedback of the algorithm is flipped with probability ","element":"span"},{"text":"p < ","element":"span"},{"text":"1","element":"span"},{"text":"/","element":"span"},{"text":"2. In this setting we move from keeping track of a set of approximately valid hypotheses to a pseudo-Bayesian approach, where we maintain a distribution ","element":"span"},{"text":"w ","element":"span"},{"text":"over approximately valid hypotheses and update it as we receive feedback. 
By carefully bounding the weight of hypotheses within a ball of radius 1","element":"span"},{"text":"/T","element":"span"},{"text":", this results in an algorithm with regret ","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"d ","element":"span"},{"text":"log ","element":"span"},{"text":"T","element":"span"},{"text":").","element":"span"}],[{"text":"Ideally, it would be possible to combine this algorithm with the adaptive scaling technique of the noiseless setting, resulting in an ","element":"span"},{"text":"O","element":"span"},{"text":"(poly(","element":"span"},{"text":"d","element":"span"},{"text":")) regret algorithm for general hypothesis classes. One such approach is to replace the notion of width with a fuzzier notion, based on how tightly concentrated the distribution is along the current context vector (e.g. the width of the smallest strip in this direction which contains 1 ","element":"span"},{"style":{"height":8},"width":61.68,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/3-8.png","element":"img","alt":" − ǫ","inline":true,"padRight":true},{"text":"of the mass of the distribution). We can then choose a scale based on this distributional width, and choose the size of the perturbation based on this scale (as in the deterministic case).","element":"span"}],[{"text":"This type of approach works, conditional on being able to show that when the distribution concentrates along a thin strip, the true hypothesis is close to this thin strip with high probability. Unfortunately, doing this for general hypothesis classes seems hard – fortunately, it is possible to do this for the specific case of symmetric contextual search by leveraging the Euclidean geometry of the ambient space (see Section ","element":"span"},{"href":"#id-14","text":"5.2 ","element":"a"},{"text":"for more details). 
This leads to an algorithm for noisy linear contextual search which gets ","element":"span"},{"text":"O","element":"span"},{"text":"(poly(","element":"span"},{"text":"d","element":"span"},{"text":")) regret, and is the first algorithm we are aware of for contextual search in the noisy setting which gets any regret independent of ","element":"span"},{"text":"T ","element":"span"},{"text":"for ","element":"span"},{"text":"d > ","element":"span"},{"text":"1.","element":"span"}],[{"text":"Summary of main results ","element":"span"},{"text":"To summarize, our results include:","element":"span"}],[{"text":"• ","element":"span"},{"text":"Algorithms with regret ","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"d ","element":"span"},{"text":"log ","element":"span"},{"text":"d","element":"span"},{"text":") for symmetric contextual search (Section ","element":"span"},{"href":"#id-15","text":"2.1) ","element":"a"},{"text":"and ","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"d ","element":"span"},{"text":"log log ","element":"span"},{"text":"T","element":"span"},{"text":"+ ","element":"span"},{"text":"d ","element":"span"},{"text":"log ","element":"span"},{"text":"d","element":"span"},{"text":") for contextual pricing (Section ","element":"span"},{"href":"#id-16","text":"2.2)","element":"a"},{"text":". Both algorithms are optimal (up to log ","element":"span"},{"text":"d","element":"span"},{"text":") and have only ","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"d","element":"span"},{"text":") overhead with respect to the non-contextual case. 
Both algorithms can be implemented efficiently in poly(","element":"span"},{"text":"d, T","element":"span"},{"text":") time.","element":"span"}],[{"text":"• ","element":"span"},{"text":"General algorithm for learning a function from a hypothesis class ","element":"span"},{"text":"H ","element":"span"},{"text":"under binary feedback with regret ","element":"span"},{"style":{"height":19.14},"width":274.04,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/4-0.png","element":"img","alt":" O(d2) where d","inline":true,"padRight":true},{"text":"is the covering dimension of the hypothesis class (Section ","element":"span"},{"href":"#id-17","text":"4.3)","element":"a"},{"text":".","element":"span"}],[{"text":"• ","element":"span"},{"text":"An algorithm for symmetric contextual search with noisy binary feedback with ","element":"span"},{"text":"O","element":"span"},{"text":"(poly(","element":"span"},{"text":"d","element":"span"},{"text":")) regret (Section ","element":"span"},{"href":"#id-14","text":"5.2)","element":"a"},{"text":".","element":"span"}],[{"text":"Related work ","element":"span"},{"text":"Core to our results is the idea of coupling together potentials at many different scales. Similar ideas of “adaptive discretization”, “zooming”, and “chaining” exist throughout the online learning literature ","element":"span"},{"href":"#id-18","referenceIndex":6,"text":"[6,","element":"a"},{"href":"#id-19","referenceIndex":15,"text":"15,","element":"a"},{"href":"#id-7","referenceIndex":23,"text":"24] ","element":"a"},{"text":"and statistical learning theory literature ","element":"span"},{"href":"#id-18","referenceIndex":6,"text":"[7,","element":"a"},{"href":"#id-20","referenceIndex":10,"text":"10]","element":"a"},{"text":". Algorithms in these works also often construct several layers of discretizations and have learning rates parameterized by the covering dimension of the ambient space. 
However, these algorithms are usually designed for settings where (1) one cannot hope for better than ","element":"span"},{"style":{"height":19.98},"width":118.84,"height":49.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/4-1.png","element":"img","alt":" O(√T","inline":true},{"text":") regret (let alone regret independent of ","element":"span"},{"text":"T","element":"span"},{"text":"), (2) feedback is not binary but rather zeroth-order ( ","element":"span"},{"href":"#id-21","referenceIndex":20,"text":"[21] ","element":"a"},{"text":"study a pricing setting where feedback is binary, but where the hypothesis class is large enough that one must incur ","element":"span"},{"text":"O","element":"span"},{"text":"(poly(","element":"span"},{"text":"T","element":"span"},{"text":")) regret). In particular, we believe our technique of coupling together the potentials for different scales in the analysis of Theorem ","element":"span"},{"href":"#id-22","text":"4.3 ","element":"a"},{"text":"is novel.","element":"span"}],[{"text":"Our results in the full feedback case (Section ","element":"span"},{"text":"6) ","element":"span"},{"text":"– parameterizing the optimal regret in terms of the tree dimension – can be seen as a generalization of similar results for Littlestone dimension ","element":"span"},{"href":"#id-23","referenceIndex":18,"text":"[18] ","element":"a"},{"text":"(indeed, in the case where ","element":"span"},{"text":"Y ","element":"span"},{"text":"= ","element":"span"},{"text":"{","element":"span"},{"text":"0","element":"span"},{"text":", ","element":"span"},{"text":"1","element":"span"},{"text":"}","element":"span"},{"text":", our notion of tree dimension reduces to Littlestone dimension). 
","element":"span"},{"text":"While there do exist measures which capture the learnability of functions taking values over a metric space (for example, the fat-shattering dimension ","element":"span"},{"href":"#id-24","referenceIndex":3,"text":"[3] ","element":"a"},{"text":"for real-valued functions), as far as we are aware the notion of tree dimension we introduce does not currently exist in the literature. It is an interesting open direction to connect the notion of tree dimension we present with previously studied measures.","element":"span"}]]},{"heading":"2 Optimal Contextual Search","paragraphs":[[{"text":"We start by describing the contextual search setup and establishing some useful notation. The hidden object is a vector ","element":"span"},{"style":{"height":10.69},"width":38.12,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/4-2.png","element":"img","alt":" v0","inline":true,"padRight":true},{"text":"belonging to the unit ball ","element":"span"},{"style":{"height":19.54},"width":477.04,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/4-3.png","element":"img","alt":" B = {v ∈ Rd; ∥v∥2 ≤ 1}","inline":true},{"text":". In each round ","element":"span"},{"style":{"height":17.6},"width":262.96,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/4-4.png","element":"img","alt":"t ∈ {1, . . . , T}","inline":true,"padRight":true},{"text":"the learner is provided an (adversarially chosen) vector ","element":"span"},{"style":{"height":15.09},"width":121.64,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/4-5.png","element":"img","alt":" xt ∈ B","inline":true,"padRight":true},{"text":"and asked to provide a guess ","element":"span"},{"style":{"height":18.74},"width":160.52,"height":46.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/4-6.png","element":"img","alt":" yt ∈ R 2","inline":true},{"text":". 
Upon guessing, the learner incurs loss ","element":"span"},{"style":{"height":17.6},"width":222.44,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/4-7.png","element":"img","alt":" ℓ(yt, ⟨v0, xt⟩","inline":true},{"text":") and receives feedback ","element":"span"},{"style":{"height":12.29},"width":83.24,"height":30.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/4-8.png","element":"img","alt":" σt ∈","inline":true},{"style":{"height":17.6},"width":174.16,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/4-9.png","element":"img","alt":"{−1, +1}","inline":true,"padRight":true},{"text":"corresponding to whether ","element":"span"},{"style":{"height":17.6},"width":1166.8,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/4-10.png","element":"img","alt":" yt > ⟨v0, xt⟩ (σt = +1) or yt < ⟨v0, xt⟩ (σt = −1). If yt =","inline":true},{"style":{"height":17.6},"width":132.68,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/4-11.png","element":"img","alt":"⟨v0, xt⟩","inline":true},{"text":", then the feedback is arbitrary. 
In other words:","element":"span"}],[{"style":{"width":"22%"},"width":419,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/4-12.png","element":"img"}],[{"text":"This allows the learner to keep track of the set ","element":"span"},{"style":{"height":15.09},"width":38.88,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/4-13.png","element":"img","alt":" St","inline":true,"padRight":true},{"text":"of vectors consistent with observations seen so far:","element":"span"}],[{"style":{"width":"47%"},"width":891,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/4-14.png","element":"img"}],[{"text":"It is clear from the above setup that for both pricing and symmetric loss, it suffices to consider when ","element":"span"},{"style":{"height":17.6},"width":104.36,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/4-15.png","element":"img","alt":" ||xt||2","inline":true,"padRight":true},{"text":"= 1 for all rounds.","element":"span"}],[{"text":"Throughout the execution of the algorithm we will keep track of the Steiner potential ","element":"span"},{"style":{"height":17.6},"width":220.52,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-0.png","element":"img","alt":" Vol(St+zB)","inline":true,"padRight":true},{"text":"where ","element":"span"},{"style":{"height":17.6},"width":88.8,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-1.png","element":"img","alt":" Vol(·","inline":true},{"text":") is the standard volume in ","element":"span"},{"style":{"height":15.14},"width":49.68,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-2.png","element":"img","alt":" Rd ","inline":true,"padRight":true},{"text":"and the sum is the Minkowski 
sum:","element":"span"}],[{"style":{"width":"40%"},"width":761,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-3.png","element":"img"}],[{"text":"We will evaluate the potential at different points depending on the width of ","element":"span"},{"style":{"height":15.09},"width":38.88,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-4.png","element":"img","alt":" St","inline":true,"padRight":true},{"text":"in the direction ","element":"span"},{"style":{"height":10.69},"width":36.96,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-5.png","element":"img","alt":"xt","inline":true},{"text":". We define the width as:","element":"span"}],[{"style":{"width":"38%"},"width":728,"height":67,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-6.png","element":"img"}],[{"id":"id-15","text":"2.1 ","element":"span"},{"text":"Symmetric loss with ","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"d ","element":"span"},{"text":"log ","element":"span"},{"text":"d","element":"span"},{"text":") ","element":"span"},{"text":"regret","element":"span"}],[{"text":"We start with the symmetric loss function ","element":"span"},{"style":{"height":17.6},"width":366.24,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-7.png","element":"img","alt":" ℓ(yt, ut) = |yt − ut|","inline":true},{"text":", where we will show it is possible to obtain ","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"d ","element":"span"},{"text":"log ","element":"span"},{"text":"d","element":"span"},{"text":") regret. 
The main idea of this algorithm is to choose a value ","element":"span"},{"style":{"height":10.69},"width":32.16,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-8.png","element":"img","alt":" zi","inline":true,"padRight":true},{"text":"based on the width of ","element":"span"},{"style":{"height":15.09},"width":38.88,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-9.png","element":"img","alt":" St","inline":true,"padRight":true},{"text":"in the direction of the current context and then choose a guess ","element":"span"},{"style":{"height":12},"width":33.6,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-10.png","element":"img","alt":" yt","inline":true,"padRight":true},{"text":"that splits the set ","element":"span"},{"style":{"height":15.09},"width":152.84,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-11.png","element":"img","alt":"St + ziB","inline":true,"padRight":true},{"text":"in two parts of equal volume. By doing this, we will show that ","element":"span"},{"style":{"height":17.6},"width":230.12,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-12.png","element":"img","alt":" Vol(St + ziB","inline":true},{"text":") (the “Steiner potential”) decreases by a constant multiplicative fraction. 
Since ","element":"span"},{"style":{"height":17.6},"width":236.36,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-13.png","element":"img","alt":" Vol(St + ziB","inline":true},{"text":") is bounded below by ","element":"span"},{"style":{"height":17.6},"width":139.88,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-14.png","element":"img","alt":" Vol(ziB","inline":true},{"text":"), we can only do this some number of times (roughly ","element":"span"},{"style":{"height":17.6},"width":178.08,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-15.png","element":"img","alt":" d log(1/zi","inline":true},{"text":") times), from which our regret bound will follow.","element":"span"}],[{"text":"We describe the algorithm below (ignore for now issues of computational efficiency; we will address these in Section ","element":"span"},{"href":"#id-25","text":"2.3)","element":"a"},{"text":":","element":"span"}],[{"id":"id-27","style":{"width":"100%"},"width":1874,"height":481,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-16.png","element":"img"}],[{"text":"Proof. ","element":"span"},{"text":"Assume the feedback is ","element":"span"},{"style":{"height":10.29},"width":36.96,"height":25.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-17.png","element":"img","alt":" σt","inline":true,"padRight":true},{"text":"= +1 (the other case is analogous). Then ","element":"span"},{"style":{"height":17.6},"width":461.68,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-18.png","element":"img","alt":" St+1 = {v ∈ St; ⟨v, xt⟩ ≥","inline":true},{"style":{"height":17.6},"width":57.52,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-19.png","element":"img","alt":"yt}","inline":true},{"text":". 
In Figure ","element":"span"},{"href":"#id-26","text":"1 ","element":"a"},{"text":"we depict the set ","element":"span"},{"style":{"height":16.29},"width":446.12,"height":40.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-20.png","element":"img","alt":" St + ziB and St+1 + ziB","inline":true},{"text":". The part of ","element":"span"},{"style":{"height":17.6},"width":514.56,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-21.png","element":"img","alt":" St+1 + ziB with ⟨v, xt⟩ ≥ yt","inline":true,"padRight":true},{"text":"is exactly ","element":"span"},{"style":{"height":17.6},"width":505.36,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-22.png","element":"img","alt":" {v ∈ St + ziB; ⟨v, xt⟩ ≥ yt}","inline":true,"padRight":true},{"text":"which has volume ","element":"span"},{"style":{"height":21.26},"width":284.64,"height":53.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-23.png","element":"img","alt":"(1/2)Vol(St + ziB).","inline":true}],[{"text":"To bound the remaining part, let ","element":"span"},{"text":"C ","element":"span"},{"text":"be the largest volume of a section of ","element":"span"},{"style":{"height":15.09},"width":145.16,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-24.png","element":"img","alt":" St+ziB","inline":true,"padRight":true},{"text":"in the direction ","element":"span"},{"style":{"height":10.69},"width":36.96,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-25.png","element":"img","alt":"xt","inline":true,"padRight":true},{"text":"(see the right part of Figure ","element":"span"},{"href":"#id-26","text":"1)","element":"a"},{"text":". 
The total volume of ","element":"span"},{"style":{"height":15.09},"width":144.68,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-26.png","element":"img","alt":" St+ziB","inline":true,"padRight":true},{"text":"can be bounded below by comparing it to the two cones formed by taking the convex hull of the section of largest volume inside the band and the extreme points ","element":"span"},{"style":{"height":16},"width":177.8,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-27.png","element":"img","alt":" q1 and q2","inline":true},{"text":", which are at least 2","element":"span"},{"style":{"height":12.8},"width":107.08,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-28.png","element":"img","alt":"−(i+1) ","inline":true,"padRight":true},{"text":"apart in the ","element":"span"},{"style":{"height":10.69},"width":36.96,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-29.png","element":"img","alt":" xt","inline":true,"padRight":true},{"text":"direction. 
The volume of the two cones is at least:","element":"span"}],[{"style":{"width":"55%"},"width":1040,"height":100,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-30.png","element":"img"}],[{"text":"Finally, note that the region of ","element":"span"},{"style":{"height":17.6},"width":499.68,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-31.png","element":"img","alt":" St+1+ziB with ⟨v, xt⟩ ≤ yt","inline":true,"padRight":true},{"text":"has cross-section with volume at most ","element":"span"},{"style":{"height":15.09},"width":468.96,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-32.png","element":"img","alt":"C and width zi in the xt","inline":true,"padRight":true},{"text":"direction so its volume is at most ","element":"span"},{"style":{"height":21.27},"width":387.08,"height":53.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-33.png","element":"img","alt":" Czi ≤ (1/4)Vol(St + ziB","inline":true},{"text":"), thus completing ","element":"span"},{"text":"the proof. ","element":"span"},{"style":{"height":0},"width":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/5-34.png","element":"img","alt":"■","inline":true,"padRight":true},{"id":"id-32","text":"Theorem 2.2. ","element":"span"},{"text":"The regret of the ","element":"span"},{"text":"Multiscale Steiner Potential ","element":"span"},{"text":"algorithm is at most ","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"d ","element":"span"},{"text":"log ","element":"span"},{"text":"d","element":"span"},{"text":").","element":"span"}],[{"text":"Proof. 
","element":"span"},{"text":"Every time we choose index ","element":"span"},{"text":"i","element":"span"},{"text":", the loss is at most 2","element":"span"},{"style":{"height":8.4},"width":38.4,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/6-0.png","element":"img","alt":"−i ","inline":true,"padRight":true},{"text":"and the volume of ","element":"span"},{"style":{"height":17.6},"width":216.68,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/6-1.png","element":"img","alt":" Vol(St+ziB","inline":true},{"text":") decreases by a constant factor. The set ","element":"span"},{"style":{"height":15.09},"width":38.88,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/6-2.png","element":"img","alt":" St","inline":true,"padRight":true},{"text":"is never empty since ","element":"span"},{"style":{"height":15.09},"width":303.52,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/6-3.png","element":"img","alt":" v0 ∈ St for all t","inline":true},{"text":", therefore ","element":"span"},{"style":{"height":17.6},"width":302.8,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/6-4.png","element":"img","alt":" Vol(St + ziB) ≥","inline":true},{"style":{"height":19.94},"width":371.44,"height":49.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/6-5.png","element":"img","alt":"Vol(ziB) = zdi Vol(B","inline":true},{"text":"). 
For this reason we can’t pick index ","element":"span"},{"text":"i ","element":"span"},{"text":"by more than ","element":"span"},{"style":{"height":17.6},"width":229.92,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/6-6.png","element":"img","alt":" O(d log(1/zi","inline":true},{"text":")) times, so ","element":"span"},{"text":"the total regret is at most:","element":"span"}],[{"id":"id-26","style":{"width":"94%"},"width":1775,"height":821,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/6-7.png","element":"img"}],[{"text":"Figure 1: ","element":"figcaption","subtype":"caption"},{"text":"Illustration of the proof of Lemma ","element":"figcaption","subtype":"caption"},{"href":"#id-27","text":"2.1","element":"a","subtype":"caption"}],[{"text":"2.1.1 ","element":"span"},{"text":"Comparison with other approaches","element":"span"}],[{"text":"Is the Steiner potential necessary? One natural algorithm for this problem is to query ","element":"span"},{"style":{"height":16.4},"width":226.28,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/6-8.png","element":"img","alt":" yt such that","inline":true},{"style":{"height":21.27},"width":700.32,"height":53.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/6-9.png","element":"img","alt":"Vol({v ∈ St; ⟨v, xt⟩ ≥ yt}) = (1/2)Vol(St","inline":true},{"text":") (i.e. guess the median without inflating the set). 
The best ","element":"span"},{"text":"upper bound from ","element":"span"},{"href":"#id-5","referenceIndex":17,"text":"[17] ","element":"a"},{"text":"shows only that this has regret at most 2","element":"span"},{"style":{"height":12.8},"width":142.6,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/6-10.png","element":"img","alt":"O(d log d) ","inline":true,"padRight":true},{"text":"and a lower bound given in the example in Section 8 of ","element":"span"},{"href":"#id-6","referenceIndex":19,"text":"[19] ","element":"a"},{"text":"shows that this algorithm has regret at least Ω(","element":"span"},{"style":{"height":15.14},"width":39.56,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/6-11.png","element":"img","alt":"d2","inline":true},{"text":"). Inflating the set by taking the Minkowski sum with a ball seems to be the appropriate regularization that allows us to overcome the ","element":"span"},{"style":{"height":15.14},"width":39.56,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/6-12.png","element":"img","alt":" d2 ","inline":true,"padRight":true},{"text":"lower bound.","element":"span"}],[{"text":"Another natural algorithm is to guess ","element":"span"},{"style":{"height":21.46},"width":735.08,"height":53.64,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/6-13.png","element":"img","alt":" yt = 12 (minv∈St⟨v, xt⟩ + maxv∈St⟨v, xt⟩","inline":true},{"text":"). 
This algorithm ","element":"span"},{"text":"was shown to have regret at least 2","element":"span"},{"href":"#id-1","referenceIndex":7,"style":{"height":20.34},"width":193.92,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/6-14.png","element":"img","alt":"Ω(d) in [8].","inline":true}],[{"id":"id-16","text":"2.2 ","element":"span"},{"text":"Pricing loss with ","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"d ","element":"span"},{"text":"log log ","element":"span"},{"text":"T ","element":"span"},{"text":"+ ","element":"span"},{"text":"d ","element":"span"},{"text":"log ","element":"span"},{"text":"d","element":"span"},{"text":") ","element":"span"},{"text":"regret","element":"span"}],[{"text":"We now study the pricing loss ","element":"span"},{"style":{"height":17.6},"width":591.28,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/6-15.png","element":"img","alt":" ℓ(yt, ut) = ut − yt · 1{yt ≤ ut}","inline":true},{"text":". Unlike the previous case the loss function is discontinuous. While the loss when under-estimating ","element":"span"},{"style":{"height":10.69},"width":36.96,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/6-16.png","element":"img","alt":" ut","inline":true,"padRight":true},{"text":"is small, the loss when over-estimating is very large. 
In the one-dimensional setting, ","element":"span"},{"href":"#id-3","referenceIndex":14,"text":"[14] ","element":"a"},{"text":"obtains a ","element":"span"},{"text":"O","element":"span"},{"text":"(log log ","element":"span"},{"text":"T","element":"span"},{"text":")-regret algorithm by a conservative variant of binary search that avoids over-estimating the actual value as much as possible.","element":"span"}],[{"text":"As before, our algorithm will keep track of ","element":"span"},{"style":{"height":17.6},"width":240.2,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/6-17.png","element":"img","alt":" Vol(St + ziB","inline":true},{"text":") for different values ","element":"span"},{"style":{"height":10.69},"width":32.16,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/6-18.png","element":"img","alt":" zi","inline":true},{"text":". This time, however, we will guess more conservatively so that in the case of a no-purchase event, the potential will decrease by a large amount. We will do this in a way that each ","element":"span"},{"style":{"height":10.69},"width":32.16,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/7-0.png","element":"img","alt":" zi","inline":true,"padRight":true},{"text":"can lead to a no-purchase event approximately ","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"d","element":"span"},{"text":") times.","element":"span"}],[{"id":"id-30","style":{"width":"100%"},"width":1874,"height":797,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/7-1.png","element":"img"}],[{"text":"Proof. 
","element":"span"},{"text":"Note that in this case, the set ","element":"span"},{"style":{"height":17.6},"width":522.16,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/7-2.png","element":"img","alt":" {v ∈ St + ziB; ⟨v, xt⟩ > mt}","inline":true,"padRight":true},{"text":"is disjoint from ","element":"span"},{"style":{"height":16.29},"width":313.76,"height":40.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/7-3.png","element":"img","alt":" St+1 + ziB. The","inline":true,"padRight":true},{"text":"definition of ","element":"span"},{"style":{"height":10.69},"width":50.4,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/7-4.png","element":"img","alt":" mt","inline":true,"padRight":true},{"text":"immediately gives the desired result. ","element":"span"},{"style":{"height":0},"width":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/7-5.png","element":"img","alt":"■","inline":true}],[{"id":"id-31","style":{"width":"91%"},"width":1713,"height":197,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/7-6.png","element":"img"}],[{"text":"Proof. 
","element":"span"},{"text":"First we upper bound the volume ","element":"span"},{"text":"V ","element":"span"},{"text":"of the strip ","element":"span"},{"style":{"height":17.6},"width":827.52,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/7-7.png","element":"img","alt":" Vol({v ∈ St +ziB; mt −2zi ≤ ⟨v, xt⟩ ≤ mt}).","inline":true,"padRight":true},{"text":"Let ","element":"span"},{"text":"C ","element":"span"},{"text":"be the largest volume of a section of ","element":"span"},{"style":{"height":15.09},"width":153.8,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/7-8.png","element":"img","alt":" St + ziB","inline":true,"padRight":true},{"text":"in the direction ","element":"span"},{"style":{"height":10.69},"width":36.96,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/7-9.png","element":"img","alt":" xt","inline":true,"padRight":true},{"text":"(see right part of Figure ","element":"span"},{"href":"#id-26","text":"1)","element":"a"},{"text":". Then ","element":"span"},{"style":{"height":15.09},"width":181.92,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/7-10.png","element":"img","alt":" V ≤ 2Czi","inline":true},{"text":". 
On the other hand since ","element":"span"},{"style":{"height":15.09},"width":162.16,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/7-11.png","element":"img","alt":" St + ziB","inline":true,"padRight":true},{"text":"is a convex set and has width at least 2","element":"span"},{"style":{"height":20.75},"width":107.04,"height":51.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/7-12.png","element":"img","alt":"−2i+1,","inline":true}],[{"style":{"width":"27%"},"width":511,"height":104,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/7-13.png","element":"img"}],[{"text":"Thus","element":"span"}],[{"style":{"width":"87%"},"width":1634,"height":200,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/7-14.png","element":"img"}],[{"text":"Since the entire set ","element":"span"},{"style":{"height":17.6},"width":621.04,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/7-15.png","element":"img","alt":" {v ∈ St + ziB; ⟨v, xt⟩ ≤ mt − 2zi}","inline":true,"padRight":true},{"text":"is disjoint from ","element":"span"},{"style":{"height":16.29},"width":195.08,"height":40.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/7-16.png","element":"img","alt":" St+1 + ziB","inline":true},{"text":", this completes the proof of the lemma. ","element":"span"},{"style":{"height":0},"width":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/7-17.png","element":"img","alt":"■","inline":true}],[{"id":"id-33","text":"Theorem 2.5. 
","element":"span"},{"text":"The regret of the ","element":"span"},{"text":"Multiscale Steiner Potential for Pricing ","element":"span"},{"text":"algorithm is at most ","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"d ","element":"span"},{"text":"log log ","element":"span"},{"text":"T ","element":"span"},{"text":"+ ","element":"span"},{"text":"d ","element":"span"},{"text":"log ","element":"span"},{"text":"d","element":"span"},{"text":").","element":"span"}],[{"text":"Proof. ","element":"span"},{"text":"First, note that the regret contributed from all rounds where ","element":"span"},{"style":{"height":17.6},"width":537.6,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/7-18.png","element":"img","alt":" width(St; xt) ≤ 1/T is O(1).","inline":true,"padRight":true},{"text":"Next, we consider rounds with ","element":"span"},{"style":{"height":17.6},"width":367,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/7-19.png","element":"img","alt":" width(St; xt) > 1/T","inline":true},{"text":". 
Note that for all of these rounds ","element":"span"},{"style":{"height":16.4},"width":273.12,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/7-20.png","element":"img","alt":" i ≤ 2 log log T.","inline":true,"padRight":true},{"text":"If we choose index ","element":"span"},{"style":{"height":17.6},"width":342.44,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-0.png","element":"img","alt":" i and yt > ⟨v0, xt⟩","inline":true,"padRight":true},{"text":"(leading to a no-purchase), the volume of ","element":"span"},{"style":{"height":16.4},"width":344.6,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-1.png","element":"img","alt":" St + ziB is cut by","inline":true,"padRight":true},{"text":"a factor of 2","element":"span"},{"style":{"height":22.54},"width":709.36,"height":56.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-2.png","element":"img","alt":"−2i−1. Since Vol(St + ziB) ≥ zdi Vol(B","inline":true},{"text":") always, this can happen at most","element":"span"}],[{"style":{"width":"67%"},"width":1271,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-3.png","element":"img"}],[{"text":"times. The loss from each such query is at most 1. 
If we choose index ","element":"span"},{"style":{"height":17.6},"width":453.44,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-4.png","element":"img","alt":" i and yt ≤ ⟨v0, xt⟩, the","inline":true,"padRight":true},{"text":"volume of ","element":"span"},{"style":{"height":15.09},"width":162.16,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-5.png","element":"img","alt":" St + ziB","inline":true,"padRight":true},{"text":"is cut by a factor of","element":"span"},{"style":{"height":31.6},"width":264.08,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-6.png","element":"img","alt":"(1 − 1/(10·2^{2^{i−1}}))","inline":true},{"text":". This can happen at most","element":"span"}],[{"style":{"width":"90%"},"width":1692,"height":164,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-7.png","element":"img"}],[{"text":"times. The loss from each such query is at most 2","element":"span"},{"style":{"height":11.41},"width":54.2,"height":28.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-8.png","element":"img","alt":"−2^i","inline":true},{"text":". 
Thus, our regret is at most","element":"span"}],[{"style":{"width":"78%"},"width":1473,"height":208,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-9.png","element":"img"}],[{"id":"id-25","text":"2.3 ","element":"span"},{"text":"Polynomial time implementation","element":"span"}],[{"text":"We note that ","element":"span"},{"style":{"height":12},"width":33.6,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-10.png","element":"img","alt":" yt","inline":true,"padRight":true},{"text":"can be approximated using binary search as long as we can compute the volume of (","element":"span"},{"style":{"height":17.6},"width":278.52,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-11.png","element":"img","alt":"St + ziB) ∩ H","inline":true,"padRight":true},{"text":"for any half-space ","element":"span"},{"text":"H","element":"span"},{"text":". ","element":"span"},{"text":"It is enough to notice that we only need a constant approximation of the volume in the previous proof and in order to approximate the volume we only need access to a separation oracle ","element":"span"},{"href":"#id-28","referenceIndex":5,"text":"[5,","element":"a"},{"href":"#id-29","referenceIndex":13,"text":"13,","element":"a"},{"href":"#id-21","referenceIndex":20,"text":"20]","element":"a"},{"text":". 
Since ","element":"span"},{"style":{"height":15.09},"width":38.88,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-12.png","element":"img","alt":" St","inline":true,"padRight":true},{"text":"is a ball intersected with at most ","element":"span"},{"text":"t ","element":"span"},{"text":"halfspaces, it is trivial to obtain a separation oracle for it.","element":"span"}],[{"text":"To obtain a separation oracle for ","element":"span"},{"style":{"height":15.09},"width":159.08,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-13.png","element":"img","alt":" St + ziB","inline":true,"padRight":true},{"text":"it is enough to solve the problem of computing the distance from a query point to the convex set ","element":"span"},{"style":{"height":15.09},"width":38.88,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-14.png","element":"img","alt":" St","inline":true,"padRight":true},{"text":"which is itself a convex problem. 
Technically, this requires ","element":"span"},{"style":{"height":15.09},"width":38.88,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-15.png","element":"img","alt":" St","inline":true,"padRight":true},{"text":"to not be too small since the guarantee of cutting plane methods (like the ellipsoid algorithm) tells us that (given an initial ellipsoid ","element":"span"},{"style":{"height":15.09},"width":136.8,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-16.png","element":"img","alt":" E ⊇ St","inline":true},{"text":"), it is possible to compute an ","element":"span"},{"style":{"height":16},"width":177.12,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-17.png","element":"img","alt":" ǫ-optimal","inline":true,"padRight":true},{"text":"solution in time ","element":"span"},{"style":{"height":31.6},"width":589.45,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-18.png","element":"img","alt":" O(T · poly(d) · log(Vol(E)/(ǫ·Vol(St)))).","inline":true}],[{"text":"We can take ","element":"span"},{"text":"E ","element":"span"},{"text":"to be the unit ball; then this algorithm is efficient as long as ","element":"span"},{"style":{"height":17.6},"width":184.52,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-19.png","element":"img","alt":" Vol(St) is","inline":true,"padRight":true},{"text":"never too small (anything at least exp(","element":"span"},{"style":{"height":17.6},"width":164.92,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-20.png","element":"img","alt":"−poly(T","inline":true},{"text":")) is fine). 
Here we present a simple modification of Algorithms ","element":"span"},{"href":"#id-27","text":"1 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-30","text":"2 ","element":"a"},{"text":"that makes sure that the volume of ","element":"span"},{"style":{"height":15.09},"width":38.88,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-21.png","element":"img","alt":" St","inline":true,"padRight":true},{"text":"stays large enough throughout by preserving a small ball around ","element":"span"},{"style":{"height":10.69},"width":51.84,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-22.png","element":"img","alt":" v0.","inline":true}],[{"text":"Initialize ","element":"span"},{"style":{"height":19.14},"width":319.88,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-23.png","element":"img","alt":" S1 = (1 + T −4)B","inline":true,"padRight":true},{"text":"to ensure that ","element":"span"},{"style":{"height":19.14},"width":566.24,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-24.png","element":"img","alt":" B(v0, T −4) ⊆ S1 (where B(c, r","inline":true},{"text":") denotes the ball with center ","element":"span"},{"text":"c ","element":"span"},{"text":"and radius ","element":"span"},{"text":"r","element":"span"},{"text":"). 
Now change the guess ","element":"span"},{"style":{"height":16.4},"width":406.08,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-25.png","element":"img","alt":" yt to yt − δt where δt","inline":true,"padRight":true},{"text":"is sampled from the uniform distribution over [0","element":"span"},{"style":{"height":19.14},"width":120.48,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-26.png","element":"img","alt":", T −2].","inline":true,"padRight":true},{"text":"The total additional loss from this perturbation is ","element":"span"},{"text":"O","element":"span"},{"text":"(1). ","element":"span"},{"text":"Since the perturbation is smaller than ","element":"span"},{"style":{"height":10.69},"width":32.16,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-27.png","element":"img","alt":" zi","inline":true,"padRight":true},{"text":"we can use the same argument in Lemmas ","element":"span"},{"href":"#id-27","text":"2.1 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-31","text":"2.4 ","element":"a"},{"text":"with 2","element":"span"},{"style":{"height":10.69},"width":32.16,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-28.png","element":"img","alt":"zi","inline":true,"padRight":true},{"text":"instead of ","element":"span"},{"style":{"height":10.69},"width":32.16,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-29.png","element":"img","alt":" zi","inline":true,"padRight":true},{"text":"to bound the volume of the band making sure there is constant progress in the Steiner potential.","element":"span"}],[{"text":"The advantage of this perturbation is that the probability that the cut passes through the ball of radius 
1","element":"span"},{"style":{"height":19.14},"width":266.6,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-30.png","element":"img","alt":"/T 4 around v0","inline":true,"padRight":true},{"text":"is at most 1","element":"span"},{"style":{"height":19.14},"width":70.28,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-31.png","element":"img","alt":"/T 2 ","inline":true,"padRight":true},{"text":"per period. So with probability 1","element":"span"},{"style":{"height":19.14},"width":521.48,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-32.png","element":"img","alt":"−1/T, B(v0, 1/T 4) ⊆ St for","inline":true,"padRight":true},{"text":"all periods ","element":"span"},{"text":"t","element":"span"},{"text":". It follows that ","element":"span"},{"style":{"height":22.22},"width":364.99,"height":55.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/8-33.png","element":"img","alt":" Vol(St) ≥ 1T 4d Vol(B","inline":true},{"text":"), and with this, the convex minimization problem ","element":"span"},{"text":"can be solved in poly(","element":"span"},{"text":"d, T","element":"span"},{"text":") time.","element":"span"}],[{"id":"id-11","text":"2.4 ","element":"span"},{"text":"Optimality","element":"span"}],[{"text":"Here we briefly discuss the optimality of our results, namely Theorem ","element":"span"},{"href":"#id-32","text":"2.2 ","element":"a"},{"text":"and Theorem ","element":"span"},{"href":"#id-33","text":"2.5. 
","element":"a"},{"text":"Note that we set up the problem by assuming that the hidden vector ","element":"span"},{"style":{"height":10.69},"width":38.12,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/9-0.png","element":"img","alt":" v0","inline":true,"padRight":true},{"text":"and all of the adversarially chosen contexts ","element":"span"},{"style":{"height":10.69},"width":36.96,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/9-1.png","element":"img","alt":" xt","inline":true,"padRight":true},{"text":"are drawn from the ","element":"span"},{"style":{"height":14.74},"width":46.76,"height":36.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/9-2.png","element":"img","alt":" L2 ","inline":true,"padRight":true},{"text":"unit ball. We may alternatively set up the problem by assuming that the hidden vector ","element":"span"},{"text":"v ","element":"span"},{"text":"is drawn from the cube [","element":"span"},{"style":{"height":19.53},"width":126.48,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/9-3.png","element":"img","alt":"−1, 1]d ","inline":true,"padRight":true},{"text":"and the contexts ","element":"span"},{"style":{"height":10.69},"width":36.96,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/9-4.png","element":"img","alt":" xt","inline":true,"padRight":true},{"text":"are chosen from the ","element":"span"},{"style":{"height":19.14},"width":558.4,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/9-5.png","element":"img","alt":" L1 ball i.e. ∥xt∥ ≤ 1 for all t","inline":true},{"text":". 
With this setup, it follows from a direct reduction to the one-dimensional case that we cannot do better than ","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"d","element":"span"},{"text":") for symmetric loss and ","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"d ","element":"span"},{"text":"log log ","element":"span"},{"text":"T","element":"span"},{"text":") for pricing loss.","element":"span"}],[{"text":"Now we show how our proofs can be modified to achieve the same results, ","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"d ","element":"span"},{"text":"log ","element":"span"},{"text":"d","element":"span"},{"text":") for symmetric loss and ","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"d ","element":"span"},{"text":"log log ","element":"span"},{"text":"T ","element":"span"},{"text":"+","element":"span"},{"text":"d ","element":"span"},{"text":"log ","element":"span"},{"text":"d","element":"span"},{"text":") for pricing loss, which are optimal in this modified setting up to logarithmic factors. We will run exactly the same algorithms. 
To see that the analysis remains the same, it suffices to note that our maximum loss in any round is still bounded by ","element":"span"},{"text":"O","element":"span"},{"text":"(1) and the volumes are scaled by a factor of poly(","element":"span"},{"style":{"height":19.54},"width":57.36,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/9-6.png","element":"img","alt":"d)d ","inline":true,"padRight":true},{"text":"which becomes an additive ","element":"span"},{"text":"d ","element":"span"},{"text":"log ","element":"span"},{"text":"d ","element":"span"},{"text":"factor after taking the logarithm.","element":"span"}]]},{"heading":"3 Framework for Online Learning with Binary Feedback","paragraphs":[[{"text":"We will now consider a general model of learning with binary feedback that has contextual search as a special case. We will derive regret bounds based on the covering dimension of the hypothesis class. The driving technique will still be the Steiner potential together with two new ideas: (i) a new adaptive scaling that will either make a lot of progress in a finer granularity or slower progress in coarser granularity; (ii) randomized cuts that will reduce the potential with constant probability.","element":"span"}],[{"text":"Consider a hypothesis class ","element":"span"},{"text":"H ","element":"span"},{"text":"consisting of functions mapping ","element":"span"},{"text":"X ","element":"span"},{"text":"to ","element":"span"},{"text":"Y","element":"span"},{"text":". We refer to ","element":"span"},{"text":"X ","element":"span"},{"text":"as the context space and ","element":"span"},{"text":"Y ","element":"span"},{"text":"as the output space. 
We assume that the output space ","element":"span"},{"text":"Y ","element":"span"},{"text":"is a totally ordered set, i.e, for each ","element":"span"},{"style":{"height":16.8},"width":437.48,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/9-7.png","element":"img","alt":" y1, y2 ∈ Y with y1 ̸= y2","inline":true,"padRight":true},{"text":"we have either ","element":"span"},{"style":{"height":13.6},"width":357.6,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/9-8.png","element":"img","alt":" y1 < y2 or y2 < y1.","inline":true}],[{"text":"The learning protocol is as follows: an adversary chooses some ","element":"span"},{"style":{"height":16.4},"width":133.56,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/9-9.png","element":"img","alt":" f0 ∈ H","inline":true,"padRight":true},{"text":"and in each round they choose some ","element":"span"},{"style":{"height":14.69},"width":132,"height":36.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/9-10.png","element":"img","alt":" xt ∈ X","inline":true},{"text":". The learner makes a prediction ","element":"span"},{"style":{"height":16},"width":124.64,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/9-11.png","element":"img","alt":" yt ∈ Y","inline":true,"padRight":true},{"text":"and incurs loss ","element":"span"},{"style":{"height":17.6},"width":184.32,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/9-12.png","element":"img","alt":" ℓ(yt, f0(xt","inline":true},{"text":")) for some loss function ","element":"span"},{"style":{"height":17.6},"width":276,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/9-13.png","element":"img","alt":" ℓ : Y ×Y → [0,","inline":true,"padRight":true},{"text":"1]. 
Upon making a prediction, the learner receives feedback on whether ","element":"span"},{"style":{"height":17.6},"width":477.12,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/9-14.png","element":"img","alt":"yt ≤ f0(xt) or yt ≥ f0(xt","inline":true},{"text":") (the feedback is arbitrary in case of equality). It will be convenient to represent the feedback as a variable ","element":"span"},{"style":{"height":17.6},"width":1169.2,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/9-15.png","element":"img","alt":" σt ∈ {−1, +1} such that σt = +1 if yt > f0(xt) and σt = −1","inline":true,"padRight":true},{"text":"otherwise.","element":"span"}],[{"text":"We make the following assumptions about the loss function throughout the paper:","element":"span"}],[{"text":"• ","element":"span"},{"style":{"height":17.6},"width":303.76,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/9-16.png","element":"img","alt":" Reflexive: ℓ(y, y","inline":true},{"text":") = 0 for all ","element":"span"},{"style":{"height":16},"width":120.96,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/9-17.png","element":"img","alt":" y ∈ Y.","inline":true}],[{"text":"• ","element":"span"},{"style":{"height":17.6},"width":929.28,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/9-18.png","element":"img","alt":" Symmetry: ℓ(y1, y2) = ℓ(y2, y1) for all y1, y2 ∈ Y.","inline":true}],[{"text":"• ","element":"span"},{"style":{"height":17.6},"width":1339.68,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/9-19.png","element":"img","alt":" Triangle inequality: ℓ(y1, y2) ≤ ℓ(y1, y′) + ℓ(y′, y2) for all y1, y2, y′ ∈ Y.","inline":true}],[{"text":"• ","element":"span"},{"style":{"height":17.6},"width":1441.44,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/9-20.png","element":"img","alt":" Order 
consistency: If y1 < y2 < y3 then max{ℓ(y1, y2), ℓ(y2, y3)} ≤ ℓ(y1, y3).","inline":true}],[{"text":"• ","element":"span"},{"style":{"height":17.6},"width":571.88,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/9-21.png","element":"img","alt":" Continuity: If 0 < ℓ < ℓ(y1, y2","inline":true},{"text":") then there are ","element":"span"},{"style":{"height":17.6},"width":838.08,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/9-22.png","element":"img","alt":" y′, y′′ ∈ Y such that ℓ = ℓ(y1, y′) = ℓ(y′′, y2).","inline":true}],[{"text":"If ","element":"span"},{"text":"Y ","element":"span"},{"text":"= ","element":"span"},{"text":"R","element":"span"},{"text":", then for any continuous increasing function ","element":"span"},{"style":{"height":17.6},"width":211.68,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/9-23.png","element":"img","alt":" φ : R → [0,","inline":true,"padRight":true},{"text":"1] and parameter ","element":"span"},{"style":{"height":15.6},"width":199.04,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/9-24.png","element":"img","alt":" α ≤ 1, the","inline":true,"padRight":true},{"text":"loss ","element":"span"},{"style":{"height":17.6},"width":510.16,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/9-25.png","element":"img","alt":" ℓ(y1, y2) = |φ(y1) − φ(y2)|α ","inline":true,"padRight":true},{"text":"satisfies the desired properties. Note that while the symmetric loss function is easily cast in this framework the pricing loss is not captured. 
In Section ","element":"span"},{"href":"#id-34","text":"4.4 ","element":"a"},{"text":"we give an impossibility result showing it is impossible to obtain covering-dimension bounds for the pricing loss.","element":"span"}],[{"text":"3.1 ","element":"span"},{"text":"Covering Dimension","element":"span"}],[{"text":"Our loss bounds will be in terms of the covering dimension of the hypothesis class ","element":"span"},{"text":"H","element":"span"},{"text":". We start by defining a metric ","element":"span"},{"style":{"height":17.6},"width":248.2,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/10-0.png","element":"img","alt":" d∞(·, ·) on H","inline":true,"padRight":true},{"text":"induced by the loss function:","element":"span"}],[{"text":"Definition 3.1. ","element":"span"},{"text":"For two hypotheses ","element":"span"},{"style":{"height":18.22},"width":979.88,"height":45.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/10-1.png","element":"img","alt":" f1, f2 ∈ H, let d∞(f1, f2) = supx∈X (ℓ(f1(x), f2(x)))","inline":true}],[{"text":"We can now introduce the notions of ","element":"span"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/10-2.png","element":"img","alt":" ǫ","inline":true},{"text":"-net and covering dimension.","element":"span"}],[{"style":{"height":17.6},"width":638.64,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/10-3.png","element":"img","alt":"Definition 3.2 (ǫ-net). 
For an ǫ","inline":true},{"text":", we say that a set ","element":"span"},{"style":{"height":15.09},"width":860.96,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/10-4.png","element":"img","alt":" S ⊆ H is an ǫ-net of H under the d∞ metric","inline":true,"padRight":true},{"text":"if for every ","element":"span"},{"style":{"height":17.6},"width":1393.96,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/10-5.png","element":"img","alt":" h ∈ H, there is h′ ∈ S with d∞(h, h′) ≤ ǫ. Let Nǫ(H) be an ǫ-net of H","inline":true,"padRight":true},{"text":"of minimum cardinality.","element":"span"}],[{"id":"id-12","text":"Definition 3.3 ","element":"span"},{"text":"(Covering Dimension)","element":"span"},{"text":". ","element":"span"},{"text":"Define the covering dimension ","element":"span"},{"text":"H ","element":"span"},{"text":"as","element":"span"}],[{"style":{"width":"29%"},"width":552,"height":120,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/10-6.png","element":"img"}],[{"text":"Note that this definition of ","element":"span"},{"text":"Cdim ","element":"span"},{"text":"differs from Hausdorff dimension in that we care not just about the limit ","element":"span"},{"style":{"height":9.6},"width":73.76,"height":24,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/10-7.png","element":"img","alt":" ǫ →","inline":true,"padRight":true},{"text":"0, but the largest value for any ","element":"span"},{"style":{"height":17.6},"width":167.44,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/10-8.png","element":"img","alt":" ǫ ∈ [0, 1/","inline":true},{"text":"2]); importantly, this guarantees us that, for any ","element":"span"},{"style":{"height":17.6},"width":179.04,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/10-9.png","element":"img","alt":" ǫ ∈ (0, 
1),","inline":true}],[{"style":{"width":"28%"},"width":538,"height":53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/10-10.png","element":"img"}],[{"text":"Note that we specify ","element":"span"},{"style":{"height":21.26},"width":105.8,"height":53.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/10-11.png","element":"img","alt":" ǫ ≤ 12 ","inline":true,"padRight":true},{"text":"to avoid issues when ","element":"span"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/10-12.png","element":"img","alt":" ǫ","inline":true,"padRight":true},{"text":"is close to 1 - any other fixed constant upper ","element":"span"},{"text":"bound ","element":"span"},{"text":"p ","element":"span"},{"text":"only changes this dimension by a constant factor of at most 1","element":"span"},{"text":"/ ","element":"span"},{"text":"log(1","element":"span"},{"text":"/p","element":"span"},{"text":").","element":"span"}],[{"text":"We give a few quick examples to give intuition about covering dimension.","element":"span"}],[{"text":"Example 3.4. ","element":"span"},{"text":"The space of functions ","element":"span"},{"style":{"height":17.6},"width":304.24,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/10-13.png","element":"img","alt":" f : [d] → {0, 1}","inline":true,"padRight":true},{"text":"with loss function ","element":"span"},{"style":{"height":18.88},"width":418.76,"height":47.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/10-14.png","element":"img","alt":" ℓ(y1, y2) = 1y1̸=y2 has","inline":true,"padRight":true},{"text":"covering dimension ","element":"span"},{"text":"d","element":"span"},{"text":".","element":"span"}],[{"text":"Example 3.5 ","element":"span"},{"text":"(Contextual Search)","element":"span"},{"text":". 
","element":"span"},{"text":"Let ","element":"span"},{"text":"B ","element":"span"},{"text":"be the unit-ball in ","element":"span"},{"style":{"height":19.53},"width":692.36,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/10-15.png","element":"img","alt":" Rd, i.e. B = {x ∈ Rd; ∥x∥2 ≤ 1}. For","inline":true,"padRight":true},{"text":"each ","element":"span"},{"style":{"height":16.4},"width":431.84,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/10-16.png","element":"img","alt":" v ∈ B, let fv : B → R","inline":true,"padRight":true},{"text":"be defined by the dot product ","element":"span"},{"style":{"height":17.6},"width":267.08,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/10-17.png","element":"img","alt":" fv(x) = ⟨v, x⟩","inline":true},{"text":". The linear contextual search problem is defined as the learning problem for class ","element":"span"},{"style":{"height":17.6},"width":332.56,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/10-18.png","element":"img","alt":" H = {fv; v ∈ B}","inline":true,"padRight":true},{"text":"with loss function ","element":"span"},{"style":{"height":17.6},"width":368.64,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/10-19.png","element":"img","alt":"ℓ(y1, y2) = |y1 − y2|","inline":true},{"text":". 
This class has covering dimension ","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"d","element":"span"},{"text":").","element":"span"}],[{"style":{"width":"96%"},"width":1797,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/10-20.png","element":"img"}],[{"text":"show that for any 0 ","element":"span"},{"style":{"height":21.45},"width":151.4,"height":53.64,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/10-21.png","element":"img","alt":" < ǫ ≤ 12","inline":true},{"text":", there is an ","element":"span"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/10-22.png","element":"img","alt":" ǫ","inline":true},{"text":"-net of the sphere of size (1","element":"span"},{"style":{"height":22.06},"width":126.76,"height":55.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/10-23.png","element":"img","alt":"/ǫ)O(d)","inline":true},{"text":". To do this, we can ","element":"span"},{"text":"greedily place points in the unit ball such that no two are within ","element":"span"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/10-24.png","element":"img","alt":" ǫ","inline":true,"padRight":true},{"text":"of each other. If we draw an","element":"span"}],[{"style":{"height":8.4},"width":17,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/10-25.png","element":"img","alt":"2","inline":true},{"text":"-radius ball around each point, these balls must be disjoint and contained in a ball centered at ","element":"span"},{"text":"the origin of radius 1 + ","element":"span"},{"style":{"height":12.86},"width":17.44,"height":32.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/10-26.png","element":"img","alt":"ǫ2","inline":true},{"text":". 
Thus, the maximum number of points we will place is","element":"span"}],[{"style":{"width":"23%"},"width":433,"height":119,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/10-27.png","element":"img"}],[{"text":"giving us an ","element":"span"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/10-28.png","element":"img","alt":" ǫ","inline":true},{"text":"-net of the same size.","element":"span"}],[{"id":"id-38","text":"Example 3.6 ","element":"span"},{"text":"(Sparse Contextual Search)","element":"span"},{"text":". ","element":"span"},{"text":"The sparse version of the contextual search problem is given by class ","element":"span"},{"style":{"height":17.6},"width":1036.32,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/11-0.png","element":"img","alt":" H = {fv; v ∈ B, ∥v∥0 ≤ s} where ∥v∥0 := |{i; vi ̸= 0}|","inline":true},{"text":". The loss function is still ","element":"span"},{"style":{"height":17.6},"width":391.68,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/11-1.png","element":"img","alt":"ℓ(y1, y2) = |y1 − y2|","inline":true},{"text":". The covering dimension of this class is ","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"s ","element":"span"},{"text":"log ","element":"span"},{"text":"d","element":"span"},{"text":") (where we treat ","element":"span"},{"text":"s ","element":"span"},{"text":"as a constant and ","element":"span"},{"text":"d ","element":"span"},{"text":"as tending to ","element":"span"},{"style":{"height":8},"width":44,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/11-2.png","element":"img","alt":" ∞","inline":true},{"text":"). 
To see that the covering dimension is ","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"s ","element":"span"},{"text":"log ","element":"span"},{"text":"d","element":"span"},{"text":"), note that for any ","element":"span"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/11-3.png","element":"img","alt":" ǫ","inline":true,"padRight":true},{"text":"we can use the result in the previous example to obtain an ","element":"span"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/11-4.png","element":"img","alt":" ǫ","inline":true},{"text":"-net of size","element":"span"}],[{"style":{"width":"31%"},"width":583,"height":107,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/11-5.png","element":"img"}],[{"text":"Example 3.7 ","element":"span"},{"text":"(Unit demand)","element":"span"},{"text":". ","element":"span"},{"text":"In the unit demand version of contextual search the set ","element":"span"},{"style":{"height":19.35},"width":219.12,"height":48.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/11-6.png","element":"img","alt":" X = {0, 1}d","inline":true,"padRight":true},{"text":"and the hypothesis class consists of functions ","element":"span"},{"style":{"height":21.98},"width":696.24,"height":54.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/11-7.png","element":"img","alt":" fw(x) = maxi∈[d] wixi for w ∈ [0, 1]d","inline":true},{"text":". The covering dimension of this class is ","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"d","element":"span"},{"text":"). 
This example corresponds to the puzzle in the introduction and corresponds to the economic situation where a seller wants to price a bundle of goods (represented by the context) but the buyer has a unit-demand valuation, i.e., only cares about the highest-valued item in the bundle.","element":"span"}],[{"text":"To see that the covering dimension is ","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"d","element":"span"},{"text":"), note that the set ","element":"span"},{"style":{"height":19.54},"width":601.36,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/11-8.png","element":"img","alt":" S = {ǫx|x ∈ {0, 1, . . . , ⌊1/ǫ⌋}d}","inline":true,"padRight":true},{"text":"forms an ","element":"span"},{"style":{"height":24.93},"width":401.76,"height":62.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/11-9.png","element":"img","alt":" ǫ-net and |S| ∼ (1/ǫ)^d.","inline":true}]]},{"heading":"4 Loss Bounds based on Covering Dimension","paragraphs":[[{"text":"For simplicity, in the following theorems we will assume our hypotheses map ","element":"span"},{"style":{"height":12.4},"width":137.6,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/11-10.png","element":"img","alt":" X → R","inline":true,"padRight":true},{"text":"and that our loss is given by ","element":"span"},{"style":{"height":17.6},"width":321.6,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/11-11.png","element":"img","alt":" ℓ(y, y′) = |y − y′|","inline":true},{"text":". 
We will then remark on how to generalize our proof to other loss functions.","element":"span"}],[{"text":"4.1 ","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"d ","element":"span"},{"text":"log(","element":"span"},{"text":"T","element":"span"},{"text":")) ","element":"span"},{"text":"regret via Single-scale Steiner Potential","element":"span"}],[{"text":"Bounds that depend on log(","element":"span"},{"text":"T","element":"span"},{"text":") are typically easy for learning with binary feedback and can be obtained using different algorithms. ","element":"span"},{"text":"The interesting question in this setting is how to obtain bounds that are constant in ","element":"span"},{"text":"T","element":"span"},{"text":". Nevertheless, it is instructive to start with a simpler algorithm with regret ","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"d ","element":"span"},{"text":"log ","element":"span"},{"text":"T","element":"span"},{"text":") for ","element":"span"},{"text":"d ","element":"span"},{"text":"= ","element":"span"},{"text":"Cdim","element":"span"},{"text":"(","element":"span"},{"text":"H","element":"span"},{"text":"). It will illustrate how the ","element":"span"},{"text":"Steiner potential ","element":"span"},{"text":"can be generalized to an abstract setting. Instead of keeping track of the hypotheses that are consistent with the feedback so far, we will keep an inflated version of that set.","element":"span"}],[{"text":"This algorithm starts with a ","element":"span"},{"style":{"height":14.74},"width":75.08,"height":36.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/11-12.png","element":"img","alt":" T −1","inline":true},{"text":"-net of the hypothesis class and keeps a set of candidate hypotheses that are approximately consistent with observations seen so far. 
For each ","element":"span"},{"style":{"height":10.69},"width":36.97,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/11-13.png","element":"img","alt":" xt","inline":true},{"text":", it queries a random point around the median, halving the set of hypotheses with at least half probability.","element":"span"}],[{"style":{"width":"100%"},"width":1874,"height":410,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/11-14.png","element":"img"}],[{"text":"The analysis is based on the following lemma:","element":"span"}],[{"id":"id-36","style":{"width":"88%"},"width":1648,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/12-0.png","element":"img"}],[{"style":{"height":17.6},"width":668.44,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/12-1.png","element":"img","alt":"Proof. Assume f0(xt) > mt + 2/T","inline":true,"padRight":true},{"text":"(the other case is analogous), then with probability half, the algorithm guesses ","element":"span"},{"style":{"height":17.6},"width":279.16,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/12-2.png","element":"img","alt":" yt = mt + 2/T","inline":true,"padRight":true},{"text":"and gets the feedback that ","element":"span"},{"style":{"height":17.6},"width":208.8,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/12-3.png","element":"img","alt":" f0(xt) ≥ yt","inline":true},{"text":", which eliminates all the hypotheses ","element":"span"},{"style":{"height":17.6},"width":571,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/12-4.png","element":"img","alt":" f such that f(xt) < mt + 1/T","inline":true},{"text":". 
These hypotheses constitute at least half of ","element":"span"},{"style":{"height":14.69},"width":190.48,"height":36.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/12-5.png","element":"img","alt":" Ht. ■","inline":true}],[{"text":"Corollary 4.2. ","element":"span"},{"text":"The ","element":"span"},{"text":"Single-scale Steiner Potential ","element":"span"},{"text":"algorithm obtains regret ","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"d ","element":"span"},{"text":"log ","element":"span"},{"text":"T","element":"span"},{"text":") ","element":"span"},{"text":"in expectation.","element":"span"}],[{"text":"Proof. ","element":"span"},{"text":"The regret from periods where ","element":"span"},{"style":{"height":17.6},"width":390.52,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/12-6.png","element":"img","alt":" |mt − f0(xt)| ≤ 2/T","inline":true,"padRight":true},{"text":"is at most ","element":"span"},{"text":"O","element":"span"},{"text":"(1). For the remaining periods, the size of ","element":"span"},{"style":{"height":17.6},"width":74.4,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/12-7.png","element":"img","alt":" |Ht|","inline":true,"padRight":true},{"text":"is halved with at least ","element":"span"},{"style":{"height":21.26},"width":17,"height":53.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/12-8.png","element":"img","alt":"1/2 ","inline":true,"padRight":true},{"text":"probability. 
Note that for such a ","element":"span"},{"text":"t","element":"span"},{"text":",","element":"span"}],[{"style":{"width":"24%"},"width":456,"height":106,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/12-9.png","element":"img"}],[{"text":"Next, ","element":"span"},{"style":{"height":17.6},"width":341.44,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/12-10.png","element":"img","alt":" |Ht| ≥ 1 for all t","inline":true,"padRight":true},{"text":"since there is some element in ","element":"span"},{"style":{"height":21.26},"width":475.88,"height":53.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/12-11.png","element":"img","alt":" H1 that is 1T -close to f0","inline":true,"padRight":true},{"text":"which is never ","element":"span"},{"text":"eliminated. However, ","element":"span"},{"style":{"height":24.66},"width":586.72,"height":61.64,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/12-12.png","element":"img","alt":"1|H1| ≥ 1T d and 1|Ht| ≤ 1 for all t","inline":true},{"text":". Thus for any integer ","element":"span"},{"text":"c","element":"span"},{"text":", the probability that ","element":"span"},{"text":"there are ","element":"span"},{"text":"c ","element":"span"},{"text":"periods with loss greater than ","element":"span"},{"style":{"height":32.32},"width":317.8,"height":80.8,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/12-13.png","element":"img","alt":"2T is at most T d( 32)c","inline":true,"padRight":true},{"text":". 
Thus, the expected number of periods","element":"span"}],[{"text":"with loss larger than 2","element":"span"},{"text":"/T ","element":"span"},{"text":"is at most","element":"span"}],[{"style":{"width":"70%"},"width":1321,"height":209,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/12-14.png","element":"img"}],[{"id":"id-13","text":"4.2 ","element":"span"},{"text":"Strategy for achieving constant (in ","element":"span"},{"text":"T","element":"span"},{"text":") regret","element":"span"}],[{"text":"In this subsection we provide some intuition on how to improve the regret of our algorithm from ","element":"span"},{"style":{"height":19.13},"width":1173.56,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/12-15.png","element":"img","alt":"O(d log T) to O(d^2). In Single-scale Steiner Potential","inline":true,"padRight":true},{"text":"a loss larger than 1","element":"span"},{"style":{"height":17.6},"width":310.48,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/12-16.png","element":"img","alt":"/T causes Ht to","inline":true,"padRight":true},{"text":"halve in size, but whenever it halves in size the only bound we can get for the loss is 1. To improve this bound, we need to guarantee that a loss of 1 can’t occur very often. Our strategy for doing that involves keeping multiple levels of discretization. 
Given ","element":"span"},{"style":{"height":12},"width":33.6,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/12-17.png","element":"img","alt":" yt","inline":true,"padRight":true},{"text":"and the feedback ","element":"span"},{"style":{"height":17.6},"width":330.56,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/12-18.png","element":"img","alt":" σt ∈ {−1, +1} we","inline":true,"padRight":true},{"text":"will keep for each ","element":"span"},{"text":"z > ","element":"span"},{"text":"0:","element":"span"}],[{"style":{"width":"60%"},"width":1125,"height":47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/12-19.png","element":"img"}],[{"text":"In other words, we keep a ","element":"span"},{"text":"z","element":"span"},{"text":"-discretization of hypotheses along with all the hypotheses that are consistent with the feedback so far with a ","element":"span"},{"text":"z","element":"span"},{"text":"-margin. The ","element":"span"},{"text":"z","element":"span"},{"text":"-margin is important to guarantee that any hypothesis that is ","element":"span"},{"style":{"height":16.4},"width":234.92,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/12-20.png","element":"img","alt":" z-close to f0","inline":true,"padRight":true},{"text":"will never be eliminated. We will also refer to ","element":"span"},{"style":{"height":19.15},"width":189.92,"height":47.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/12-21.png","element":"img","alt":" H0t as the","inline":true,"padRight":true},{"text":"set of hypotheses consistent with observations so far without any discretization or margin.","element":"span"}],[{"text":"Our strategy will be to choose in each round some discretization level ","element":"span"},{"text":"z ","element":"span"},{"text":"based on the maximum possible loss achievable in this round. 
We will divide the space of losses in exponentially sized buckets and define ","element":"span"},{"text":"i ","element":"span"},{"text":"to be the index of the bucket where the maximum loss falls:","element":"span"}],[{"style":{"width":"66%"},"width":1246,"height":94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/12-22.png","element":"img"}],[{"text":"Now we choose ","element":"span"},{"style":{"height":15.09},"width":324.12,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/12-23.png","element":"img","alt":" z = zi based on i","inline":true},{"text":", compute the median ","element":"span"},{"style":{"height":18.08},"width":427.6,"height":45.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/12-24.png","element":"img","alt":" mt of {f(xt); f ∈ Hzit }","inline":true,"padRight":true},{"text":"and guess either ","element":"span"},{"style":{"height":15.6},"width":482.88,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/12-25.png","element":"img","alt":"yt = mt − 2zi or mt + 2zi","inline":true,"padRight":true},{"text":"with half probability each. Now one of two things can happen:","element":"span"}],[{"text":"• ","element":"span"},{"text":"If the loss is larger than 2","element":"span"},{"style":{"height":18.08},"width":273.56,"height":45.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/13-0.png","element":"img","alt":"zi, the set Hzit","inline":true,"padRight":true},{"text":"will decrease by a factor of 2 with half probability. 
This should happen at most log ","element":"span"},{"style":{"height":17.81},"width":89.76,"height":44.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/13-1.png","element":"img","alt":" |Nzi|","inline":true,"padRight":true},{"text":"times in expectation, generating loss 10 ","element":"span"},{"style":{"height":19.34},"width":288.4,"height":48.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/13-2.png","element":"img","alt":" · 2−i log |Nzi| =","inline":true},{"style":{"height":19.13},"width":371.04,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/13-3.png","element":"img","alt":"O(2−i · d log(1/zi)).","inline":true}],[{"text":"• ","element":"span"},{"text":"If the loss is smaller than 2","element":"span"},{"style":{"height":10.69},"width":32.16,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/13-4.png","element":"img","alt":"zi","inline":true,"padRight":true},{"text":"we will show that the set ","element":"span"},{"style":{"height":21.97},"width":90.68,"height":54.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/13-5.png","element":"img","alt":" H2−it","inline":true,"padRight":true},{"text":"will decrease by at least 1 element in expectation (Lemma ","element":"span"},{"href":"#id-35","text":"4.5)","element":"a"},{"text":", so we get loss ","element":"span"},{"style":{"height":19.54},"width":392.36,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/13-6.png","element":"img","alt":" zi · |N2−i| = O(zi2di)","inline":true}],[{"text":"This leads to a regret of:","element":"span"}],[{"style":{"width":"60%"},"width":1138,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/13-7.png","element":"img"}],[{"id":"id-17","text":"4.3 ","element":"span"},{"text":"Analysis of the 
","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"d","element":"span"},{"text":"2","element":"span"},{"text":") ","element":"span"},{"text":"algorithm","element":"span"}],[{"id":"id-22","text":"Theorem 4.3. ","element":"span"},{"text":"Let ","element":"span"},{"text":"d ","element":"span"},{"text":"= ","element":"span"},{"text":"Cdim","element":"span"},{"text":"(","element":"span"},{"text":"H","element":"span"},{"text":"). ","element":"span"},{"text":"The ","element":"span"},{"text":"Multi-scale Steiner Potential ","element":"span"},{"text":"algorithm incurs expected regret ","element":"span"},{"style":{"height":19.14},"width":90.92,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/13-8.png","element":"img","alt":" O(d2","inline":true},{"text":") in the binary feedback model.","element":"span"}],[{"style":{"width":"100%"},"width":1874,"height":415,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/13-9.png","element":"img"}],[{"text":"Note if there does not exist an index ","element":"span"},{"style":{"height":19.14},"width":661.44,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/13-10.png","element":"img","alt":" i such that width(H0t ; xt) ≤ 10 · 2−i","inline":true,"padRight":true},{"text":"then we must actually ","element":"span"},{"text":"have max","element":"span"},{"style":{"height":21.65},"width":678.24,"height":54.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/13-11.png","element":"img","alt":"f∈H0t f(xt) = minf∈H0t f(xt) = f0(xt","inline":true},{"text":"). 
In this case we know the value of ","element":"span"},{"style":{"height":17.6},"width":94.08,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/13-12.png","element":"img","alt":" f0(xt","inline":true},{"text":") for sure so ","element":"span"},{"text":"we simply query this value and incur 0 loss.","element":"span"}],[{"id":"id-37","text":"Lemma 4.4. ","element":"span"},{"text":"If ","element":"span"},{"text":"i ","element":"span"},{"text":"is the index chosen in the ","element":"span"},{"text":"t","element":"span"},{"text":"-th step and ","element":"span"},{"style":{"height":21.51},"width":765.6,"height":53.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/13-13.png","element":"img","alt":" |mt − f0(xt)| > 2zi then |Hzit | ≤ 12|Hzit+1|","inline":true,"padRight":true},{"text":"with probability at least ","element":"span"},{"text":"1","element":"span"},{"text":"/","element":"span"},{"text":"2","element":"span"},{"text":".","element":"span"}],[{"text":"Proof. ","element":"span"},{"text":"The same as the proof of Lemma ","element":"span"},{"href":"#id-36","text":"4.1 ","element":"a"},{"text":"replacing 1","element":"span"},{"style":{"height":17.6},"width":797.2,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/13-14.png","element":"img","alt":"/T by zi. ■","inline":true}],[{"text":"The new ingredient is a “potential” argument when the loss is small:","element":"span"}],[{"id":"id-35","text":"Lemma 4.5. 
","element":"span"},{"text":"If ","element":"span"},{"text":"i ","element":"span"},{"text":"is the index chosen in the ","element":"span"},{"text":"t","element":"span"},{"text":"-th step and ","element":"span"},{"style":{"height":17.6},"width":355.68,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/13-15.png","element":"img","alt":" |mt − f0(xt)| ≤ 2zi","inline":true},{"text":", then with probability at least ","element":"span"},{"text":"1","element":"span"},{"style":{"height":21.74},"width":699.4,"height":54.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/13-16.png","element":"img","alt":"/2, |Hrt | ≤ |Hrt+1| − 1 for r = 2−(i+1)","inline":true}],[{"text":"Proof. ","element":"span"},{"text":"By the choice of the index ","element":"span"},{"style":{"height":20.34},"width":680.48,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/13-17.png","element":"img","alt":" i, width(H0t ; xt) > 10 · 2−(i+1) = 10r","inline":true},{"text":", so there must exist ","element":"span"},{"style":{"height":18.94},"width":132.68,"height":47.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/13-18.png","element":"img","alt":" f ∈ Ht0","inline":true,"padRight":true},{"text":"such that ","element":"span"},{"style":{"height":17.6},"width":342.56,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/13-19.png","element":"img","alt":" |f(xt) − mt| ≥ 5r","inline":true},{"text":". Let’s assume that ","element":"span"},{"style":{"height":17.6},"width":323.84,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/13-20.png","element":"img","alt":" f(xt) ≥ mt + 5r","inline":true,"padRight":true},{"text":"(the other case is analogous). 
The algorithm will query ","element":"span"},{"style":{"height":14.29},"width":160.8,"height":35.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/13-21.png","element":"img","alt":" mt + 2zi","inline":true,"padRight":true},{"text":"with half probability and learn that ","element":"span"},{"style":{"height":17.6},"width":493.76,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/13-22.png","element":"img","alt":" f0(xt) ≤ mt + 2zi (by the","inline":true,"padRight":true},{"text":"assumption that ","element":"span"},{"style":{"height":17.6},"width":385.92,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/13-23.png","element":"img","alt":" |mt − f0(xt)| ≤ 2zi).","inline":true}],[{"text":"Such a query must eliminate some hypothesis ","element":"span"},{"style":{"height":16.82},"width":147.96,"height":42.04,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/13-24.png","element":"img","alt":" f ′ ∈ Hrt ","inline":true,"padRight":true},{"text":"since there must be some ","element":"span"},{"style":{"height":16.82},"width":250.08,"height":42.04,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/13-25.png","element":"img","alt":" f ′ ∈ Hrt with","inline":true},{"style":{"height":17.6},"width":250.88,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/13-26.png","element":"img","alt":"d∞(f, f ′) ≤ r","inline":true},{"text":", so this hypothesis must satisfy ","element":"span"},{"style":{"height":17.6},"width":338.24,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/13-27.png","element":"img","alt":" f ′(xt) ≥ mt + 4 · r","inline":true,"padRight":true},{"text":"and hence will be ruled out by the information from querying ","element":"span"},{"style":{"height":14.29},"width":1340.56,"height":35.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/13-28.png","element":"img","alt":" mt 
+ 2zi. ■","inline":true}],[{"text":"We can now proceed to prove Theorem ","element":"span"},{"href":"#id-22","text":"4.3.","element":"a"}],[{"href":"#id-22","style":{"height":16.8},"width":565.44,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/14-0.png","element":"img","alt":"Proof of Theorem 4.3. Let Ai","inline":true,"padRight":true},{"text":"be the number of times that index ","element":"span"},{"text":"i ","element":"span"},{"text":"is chosen by the algorithm and ","element":"span"},{"style":{"height":17.6},"width":513.6,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/14-1.png","element":"img","alt":"|mt − f0(xt)| > 2zi. Let Bi","inline":true,"padRight":true},{"text":"be the number of times that index ","element":"span"},{"text":"i ","element":"span"},{"text":"is chosen and ","element":"span"},{"style":{"height":17.6},"width":370.08,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/14-2.png","element":"img","alt":" |mt − f0(xt)| ≤ 2zi.","inline":true,"padRight":true},{"text":"Combining the previous two claims (Lemmas ","element":"span"},{"href":"#id-37","text":"4.4 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-35","text":"4.5)","element":"a"},{"text":", we have that","element":"span"}],[{"style":{"width":"21%"},"width":404,"height":114,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/14-3.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":17.42},"width":183.84,"height":43.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/14-4.png","element":"img","alt":" ri = 2−i.","inline":true,"padRight":true},{"text":"For each query with index ","element":"span"},{"text":"i","element":"span"},{"text":", the loss is at most 
10","element":"span"},{"style":{"height":10.69},"width":45.6,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/14-5.png","element":"img","alt":"ri.","inline":true,"padRight":true},{"text":"Also for queries with ","element":"span"},{"style":{"height":17.6},"width":360,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/14-6.png","element":"img","alt":"|mt − f0(xt)| ≤ 2zi","inline":true},{"text":", the loss is at most 2","element":"span"},{"style":{"height":10.69},"width":32.16,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/14-7.png","element":"img","alt":"zi","inline":true},{"text":". Thus the total loss is at most ","element":"span"},{"style":{"height":18.38},"width":440.84,"height":45.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/14-8.png","element":"img","alt":"∑i(10riAi + 2ziBi). It","inline":true,"padRight":true},{"text":"remains to note that","element":"span"}],[{"style":{"width":"88%"},"width":1650,"height":203,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/14-9.png","element":"img"}],[{"text":"Corollary 4.6. ","element":"span"},{"text":"In the Contextual Search with Symmetric Loss, if the hidden vector ","element":"span"},{"style":{"height":15.94},"width":195.16,"height":39.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/14-10.png","element":"img","alt":" v ∈ Rd is","inline":true,"padRight":true},{"text":"guaranteed to be ","element":"span"},{"text":"s","element":"span"},{"text":"-sparse then there is an algorithm with total regret ","element":"span"},{"style":{"height":19.9},"width":259.24,"height":49.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/14-11.png","element":"img","alt":" O(s^2 log^2(d)).","inline":true}],[{"text":"Proof. 
","element":"span"},{"text":"By combining Theorem ","element":"span"},{"href":"#id-22","text":"4.3 ","element":"a"},{"text":"and Example ","element":"span"},{"href":"#id-38","text":"3.6. ","element":"a"},{"style":{"height":0},"width":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/14-12.png","element":"img","alt":"■","inline":true}],[{"text":"Remark. ","element":"span"},{"text":"To adjust our proof to deal with any loss functions satisfying the assumptions outlined in Section ","element":"span"},{"text":"3 ","element":"span"},{"text":"we make the following adjustments. We replace max","element":"span"},{"style":{"height":21.46},"width":621.6,"height":53.64,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/14-13.png","element":"img","alt":"f∈H0t f(xt) − minf∈H0t f(xt) with","inline":true},{"style":{"height":21.65},"width":596.16,"height":54.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/14-14.png","element":"img","alt":"L(maxf∈H0t f(xt), minf∈H0t f(xt","inline":true},{"text":")). 
Also, we replace ","element":"span"},{"style":{"height":16.62},"width":889.92,"height":41.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/14-15.png","element":"img","alt":" mt + 2zit with any y ∈ Y such that mt < y and","inline":true},{"style":{"height":17.6},"width":280.76,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/14-16.png","element":"img","alt":"L(mt, y) = 2zit","inline":true,"padRight":true},{"text":"and similarly for ","element":"span"},{"style":{"height":15.82},"width":169.4,"height":39.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/14-17.png","element":"img","alt":" mt − 2zit","inline":true,"padRight":true},{"text":"(note this ","element":"span"},{"text":"y ","element":"span"},{"text":"exists by the continuity of the loss function).","element":"span"}],[{"id":"id-34","text":"4.4 ","element":"span"},{"text":"Impossibility Results for Pricing Loss","element":"span"}],[{"text":"The results from the previous section apply for loss functions that are somewhat well-behaved i.e. satisfying the conditions outlined at the beginning of Section ","element":"span"},{"text":"3. ","element":"span"},{"text":"Clearly, the pricing loss function does not satisfy these assumptions. While one may hope to guarantee poly(","element":"span"},{"text":"d","element":"span"},{"text":") log log ","element":"span"},{"text":"T ","element":"span"},{"text":"total loss (where ","element":"span"},{"text":"d ","element":"span"},{"text":"is the covering dimension of the hypothesis class), here we show that for the pricing loss function, it is actually impossible to guarantee regret in this case that is polynomial in the covering dimension.","element":"span"}],[{"text":"Claim 4.7. 
","element":"span"},{"text":"Let ","element":"span"},{"text":"B ","element":"span"},{"text":"be the unit ball in ","element":"span"},{"style":{"height":15.14},"width":49.68,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/14-18.png","element":"img","alt":" Rd","inline":true},{"text":". Consider the domain ","element":"span"},{"text":"X ","element":"span"},{"text":"= ","element":"span"},{"text":"B ","element":"span"},{"text":"and the hypothesis class ","element":"span"},{"style":{"height":19.54},"width":1103.24,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/14-19.png","element":"img","alt":"H = {fv|v ∈ Rd, ||v||0 = 1, ||v||2 ≤ 1} where fv(x) = ⟨v, x⟩","inline":true,"padRight":true},{"text":"(note this is the same as the hypothesis class in Example ","element":"span"},{"href":"#id-38","text":"3.6 ","element":"a"},{"text":"with ","element":"span"},{"text":"s ","element":"span"},{"text":"= 1","element":"span"},{"text":"). Any learner must incur at least ","element":"span"},{"text":"Ω","element":"span"},{"style":{"height":31.6},"width":110.96,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/14-20.png","element":"img","alt":"(√d)","inline":true},{"text":"regret over ","element":"span"},{"style":{"height":15.14},"width":185.56,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/14-21.png","element":"img","alt":" d2 rounds","inline":true,"padRight":true},{"text":"with the pricing loss function.","element":"span"}],[{"text":"Remark. 
","element":"span"},{"text":"Note that the covering dimension of ","element":"span"},{"text":"H ","element":"span"},{"text":"is ","element":"span"},{"text":"O","element":"span"},{"text":"(log ","element":"span"},{"text":"d","element":"span"},{"text":") so the above claim implies that there is an exponential separation between the regret (in the pricing loss setting) and covering dimension.","element":"span"}],[{"text":"Proof. ","element":"span"},{"text":"Choose ","element":"span"},{"text":"v ","element":"span"},{"text":"uniformly at random from the ","element":"span"},{"text":"d ","element":"span"},{"text":"points (1","element":"span"},{"text":", ","element":"span"},{"text":"0","element":"span"},{"text":", . . . , ","element":"span"},{"text":"0)","element":"span"},{"text":", ","element":"span"},{"text":"(0","element":"span"},{"text":", ","element":"span"},{"text":"1","element":"span"},{"text":", . . . , ","element":"span"},{"text":"0)","element":"span"},{"text":", . . . ","element":"span"},{"text":"(0","element":"span"},{"text":", . . . , ","element":"span"},{"text":"0","element":"span"},{"text":", ","element":"span"},{"text":"1) and let the true function be ","element":"span"},{"style":{"height":16.4},"width":220.04,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/14-22.png","element":"img","alt":" fv. Now let","inline":true}],[{"style":{"width":"52%"},"width":985,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/14-23.png","element":"img"}],[{"text":"Let ","element":"span"},{"style":{"height":11.2},"width":183.6,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/15-0.png","element":"img","alt":" x2, . . . 
, xd","inline":true,"padRight":true},{"text":"be obtained by cyclically permuting the coordinates of ","element":"span"},{"style":{"height":10.69},"width":41.96,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/15-1.png","element":"img","alt":" x1","inline":true},{"text":". Now the adversary randomly permutes ","element":"span"},{"style":{"height":11.2},"width":183.6,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/15-2.png","element":"img","alt":" x1, . . . , xd","inline":true,"padRight":true},{"text":"to obtain a sequence ","element":"span"},{"style":{"height":12.99},"width":203.16,"height":32.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/15-3.png","element":"img","alt":" xi1, . . . , xid","inline":true,"padRight":true},{"text":"and presents the points in that order to the learner. Note the learner gains no information when it guesses a value ","element":"span"},{"style":{"height":30.19},"width":225.64,"height":75.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/15-4.png","element":"img","alt":" y ≤ 1√2(d−1)","inline":true},{"text":". If all of the","element":"span"}],[{"text":"learner’s guesses through the first ","element":"span"},{"text":"d ","element":"span"},{"text":"rounds are at most ","element":"span"},{"style":{"height":30},"width":139.72,"height":75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/15-5.png","element":"img","alt":"1√2(d−1) ","inline":true,"padRight":true},{"text":"then the learner incurs loss at least","element":"span"}],[{"style":{"height":8.4},"width":17,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/15-6.png","element":"img","alt":"3 ","inline":true,"padRight":true},{"text":"over the first ","element":"span"},{"text":"d ","element":"span"},{"text":"rounds and has gained no information. 
The adversary can then repeat this process.","element":"span"}],[{"text":"Now it remains to consider when the learner guesses a value above ","element":"span"},{"style":{"height":30},"width":139.24,"height":75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/15-7.png","element":"img","alt":"1√2(d−1) ","inline":true,"padRight":true},{"text":"at some point within the","element":"span"}],[{"text":"first ","element":"span"},{"text":"d ","element":"span"},{"text":"rounds. Say the first time this occurs is at round ","element":"span"},{"style":{"height":21.66},"width":228.2,"height":54.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/15-8.png","element":"img","alt":" j. With (d−1)/d","inline":true,"padRight":true},{"text":"probability, the learner incurs","element":"span"}],[{"style":{"height":17.6},"width":139.72,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/15-9.png","element":"img","alt":"√2(d−1) ","inline":true,"padRight":true},{"text":"loss this round and is able to eliminate one of the ","element":"span"},{"text":"d ","element":"span"},{"text":"possible hypotheses. The problem","element":"span"}],[{"text":"then effectively reduces to ","element":"span"},{"style":{"height":12.8},"width":65.68,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/15-10.png","element":"img","alt":" d −","inline":true,"padRight":true},{"text":"1 dimensions. 
Repeating this argument inductively, we see that the adversary can guarantee regret","element":"span"}],[{"style":{"width":"98%"},"width":1842,"height":209,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/15-11.png","element":"img"}]]},{"heading":"5 Noisy Feedback","paragraphs":[[{"text":"We now consider the binary feedback model with noise, where each round the feedback is (independently) flipped with probability ","element":"span"},{"text":"p < ","element":"span"},{"text":"1","element":"span"},{"text":"/","element":"span"},{"text":"2. We will no longer be able to eliminate a hypothesis based on the feedback since it is always possible that the feedback was flipped, instead we will keep a weight function expressing the likelihood of each hypothesis given observations.","element":"span"}],[{"text":"We start by giving an ","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"d ","element":"span"},{"text":"log ","element":"span"},{"text":"T","element":"span"},{"text":") algorithm for general hypothesis classes. ","element":"span"},{"text":"The algorithmic techniques will be standard and inspired by both Bayesian Inference and by algorithms in the Multiplicative Weight Updates family (such as Hedge or Weighted Majority). The analysis, however, will deviate from the usual analysis of multiplicative weights given the type of feedback. We don’t have access to the actual loss of the arm we pulled nor an unbiased estimator thereof. 
","element":"span"},{"text":"This will require both a new potential function as well as a modification of the multiplicative weights framework: instead of sampling from the distribution induced by the weights, we will get the (weighted) median advice on what is the right guess for this context.","element":"span"}],[{"text":"Our main innovation is in Section ","element":"span"},{"href":"#id-14","text":"5.2 ","element":"a"},{"text":"where we obtain an algorithm with ","element":"span"},{"text":"O","element":"span"},{"text":"(poly(","element":"span"},{"text":"d","element":"span"},{"text":")) regret (independent of ","element":"span"},{"text":"T","element":"span"},{"text":") for the linear contextual search case. Instead of one weight function, we keep a family of weight functions. Each weight function will correspond to different levels of uncertainty about the inner product of the hidden point with the current context. By using geometric techniques to analyze the stochastic evolution of the weights, we show that they must concentrate near the true hypothesis.","element":"span"}],[{"text":"5.1 ","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"d ","element":"span"},{"text":"log ","element":"span"},{"text":"T","element":"span"},{"text":") ","element":"span"},{"text":"algorithm for a general hypothesis class","element":"span"}],[{"text":"The usual approach in Bayesian inference is to start with a uniform prior over the set of hypotheses and given each observation, compute the posterior. It is important to emphasize that the true hypothesis ","element":"span"},{"style":{"height":16.4},"width":38.6,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/15-12.png","element":"img","alt":" f0","inline":true,"padRight":true},{"text":"in our model is still chosen adversarially. 
","element":"span"},{"text":"The Bayesian inference only serves to provide the intuition.","element":"span"}],[{"text":"The algorithm will be as follows: as before we will start with a discretized version of the hypothesis class ","element":"span"},{"style":{"height":17.6},"width":157.08,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/16-0.png","element":"img","alt":" NT −2(H","inline":true},{"text":") which we will call ","element":"span"},{"text":"N ","element":"span"},{"text":"for short in this section. We will keep a weight function ","element":"span"},{"style":{"height":14.69},"width":259.04,"height":36.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/16-1.png","element":"img","alt":" wt : N → R","inline":true,"padRight":true},{"text":"which roughly expresses the likelihood that a hypothesis is close to the true hypothesis. ","element":"span"},{"text":"Our guess will be a perturbed version of the weighted median ","element":"span"},{"style":{"height":10.69},"width":50.4,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/16-2.png","element":"img","alt":" mt","inline":true,"padRight":true},{"text":"of the set ","element":"span"},{"style":{"height":17.6},"width":281.68,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/16-3.png","element":"img","alt":"{f(xt); f ∈ N}","inline":true},{"text":". 
Formally, the weighted median ","element":"span"},{"style":{"height":10.69},"width":50.4,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/16-4.png","element":"img","alt":" mt","inline":true,"padRight":true},{"text":"is a number that satisfies:","element":"span"}],[{"style":{"width":"54%"},"width":1025,"height":123,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/16-5.png","element":"img"}],[{"text":"After receiving the feedback, we will update the weights in the following way (we will choose ","element":"span"},{"style":{"height":14.8},"width":88.52,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/16-6.png","element":"img","alt":" yt at","inline":true,"padRight":true},{"text":"random such that ","element":"span"},{"style":{"height":17.6},"width":173.76,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/16-7.png","element":"img","alt":" yt = f(xt","inline":true},{"text":") occurs with zero probability):","element":"span"}],[{"style":{"width":"52%"},"width":985,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/16-8.png","element":"img"}],[{"text":"for some parameter ","element":"span"},{"style":{"height":11.6},"width":38.08,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/16-9.png","element":"img","alt":" p′ ","inline":true,"padRight":true},{"text":"and re-normalizing afterwards:","element":"span"}],[{"style":{"width":"27%"},"width":510,"height":113,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/16-10.png","element":"img"}],[{"text":"In standard Bayesian inference, we would normally use ","element":"span"},{"style":{"height":11.6},"width":121.12,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/16-11.png","element":"img","alt":" p = p′","inline":true},{"text":". 
For this algorithm, we will choose any parameter ","element":"span"},{"style":{"height":17.6},"width":361.36,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/16-12.png","element":"img","alt":" p′ with p < p′ < 1/","inline":true},{"text":"2. The actual choice of parameter will only affect the constants. Also note that unlike Bayesian inference we don’t choose the guess with largest likelihood but a perturbed version of the median.","element":"span"}],[{"id":"id-44","style":{"width":"100%"},"width":1874,"height":472,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/16-13.png","element":"img"}],[{"id":"id-40","text":"Theorem 5.1. ","element":"span"},{"text":"In the noisy feedback model, the above algorithm incurs expected regret ","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"d ","element":"span"},{"text":"log ","element":"span"},{"text":"T","element":"span"},{"text":").","element":"span"}],[{"text":"We will denote the true hypothesis by ","element":"span"},{"style":{"height":16.4},"width":38.6,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/16-14.png","element":"img","alt":" f0","inline":true,"padRight":true},{"text":"as usual. Since ","element":"span"},{"style":{"height":16.4},"width":38.6,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/16-15.png","element":"img","alt":" f0","inline":true,"padRight":true},{"text":"might not belong to the discretized set ","element":"span"},{"text":"N","element":"span"},{"text":", we will control the weight that is placed on the closest hypothesis. 
Let ","element":"span"},{"style":{"height":16.4},"width":38.59,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/16-16.png","element":"img","alt":" f1","inline":true,"padRight":true},{"text":"be a hypothesis in ","element":"span"},{"style":{"height":19.14},"width":493.44,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/16-17.png","element":"img","alt":" N with d∞(f1, f0) ≤ T −2.","inline":true}],[{"id":"id-39","style":{"height":17.6},"width":570.44,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/16-18.png","element":"img","alt":"Lemma 5.2. If f1(xt), f0(xt)","inline":true,"padRight":true},{"text":"are on the same side of ","element":"span"},{"style":{"height":17.01},"width":211.28,"height":42.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/16-19.png","element":"img","alt":" yt and W −t","inline":true,"padRight":true},{"text":"is the total weight mass on the other side of ","element":"span"},{"style":{"height":12},"width":33.6,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/16-20.png","element":"img","alt":" yt","inline":true},{"text":", then we have the following equality:","element":"span"}],[{"style":{"width":"26%"},"width":501,"height":105,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/16-21.png","element":"img"}],[{"text":"Here the expectation is taken over the randomness in the feedback, and ","element":"span"},{"text":"c ","element":"span"},{"text":"is a constant satisfying ","element":"span"},{"text":"0 ","element":"span"},{"text":"< c < ","element":"span"},{"text":"1 ","element":"span"},{"text":"given by","element":"span"}],[{"style":{"width":"30%"},"width":576,"height":106,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/17-0.png","element":"img"}],[{"text":"Proof. 
","element":"span"},{"text":"Assume wlog that ","element":"span"},{"style":{"height":21.59},"width":1013.32,"height":53.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/17-1.png","element":"img","alt":" f1(xt), f0(xt) ≥ yt and let W −t = �f∈N;f(xt) 2T ","inline":true,"padRight":true},{"text":"then for the constant ","element":"span"},{"text":"c ","element":"span"},{"text":"in Lemma ","element":"span"},{"href":"#id-39","text":"5.2, ","element":"a"},{"text":"then in expectation over ","element":"span"},{"text":"both the randomness in the choice of ","element":"span"},{"style":{"height":12},"width":33.6,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/17-17.png","element":"img","alt":" yt","inline":true,"padRight":true},{"text":"and the randomness in the feedback, we have:","element":"span"}],[{"style":{"width":"24%"},"width":456,"height":106,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/17-18.png","element":"img"}],[{"text":"Proof. ","element":"span"},{"text":"Assume without loss of generality that ","element":"span"},{"style":{"height":21.27},"width":356.16,"height":53.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/18-0.png","element":"img","alt":" f0(xt) > mt + 2T .","inline":true,"padRight":true},{"text":"Since the magnitude of the perturbation is 1","element":"span"},{"style":{"height":17.6},"width":173.76,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/18-1.png","element":"img","alt":"/T, f1(xt","inline":true},{"text":") will be on the same side of our guess as ","element":"span"},{"style":{"height":17.6},"width":94.08,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/18-2.png","element":"img","alt":" f0(xt","inline":true},{"text":") and hence we can apply Lemma ","element":"span"},{"href":"#id-39","text":"5.2. 
","element":"a"},{"text":"With probability at least 1","element":"span"},{"style":{"height":17.81},"width":1077.44,"height":44.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/18-3.png","element":"img","alt":"/2 we have yt > mt and hence W −t ≥ 1/2. With the","inline":true,"padRight":true},{"text":"remaining probability we use the trivial bound ","element":"span"},{"style":{"height":16.61},"width":121.84,"height":41.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/18-4.png","element":"img","alt":" W −t ≥","inline":true,"padRight":true},{"text":"0. Combining those we get the bound in the ","element":"span"},{"text":"statement. ","element":"span"},{"style":{"height":0},"width":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/18-5.png","element":"img","alt":"■","inline":true}],[{"text":"The previous lemmas imply that ","element":"span"},{"style":{"height":17.6},"width":87.4,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/18-6.png","element":"img","alt":" wt(f","inline":true},{"text":") grows by a constant factor (in expectation) whenever the median is far from the true point. We conclude the proof by showing that this can’t happen too often since weights are bounded.","element":"span"}],[{"text":"Proof of Theorem ","element":"span"},{"href":"#id-40","text":"5.1. 
","element":"a"},{"text":"The regret bound follows directly from the fact that the probability of having ","element":"span"},{"style":{"height":17.6},"width":376.12,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/18-7.png","element":"img","alt":"|f0(xt) − mt| > 2/T","inline":true,"padRight":true},{"text":"for more than Ω(","element":"span"},{"text":"d ","element":"span"},{"text":"log ","element":"span"},{"text":"T","element":"span"},{"text":") periods is at most ","element":"span"},{"text":"O","element":"span"},{"text":"(1","element":"span"},{"text":"/T","element":"span"},{"text":"). Our strategy for proving this is to define a random process ","element":"span"},{"style":{"height":14.69},"width":37.44,"height":36.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/18-8.png","element":"img","alt":" Yt","inline":true,"padRight":true},{"text":"that is a super-martingale, i.e. ","element":"span"},{"style":{"height":17.6},"width":231.84,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/18-9.png","element":"img","alt":" E[Yt+1] ≤ Yt","inline":true,"padRight":true},{"text":"and argue that if ","element":"span"},{"style":{"height":17.6},"width":380.92,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/18-10.png","element":"img","alt":"|f0(xt) − mt| > 2/T","inline":true,"padRight":true},{"text":"happens too often, then ","element":"span"},{"style":{"height":14.69},"width":49.44,"height":36.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/18-11.png","element":"img","alt":" YT","inline":true,"padRight":true},{"text":"will be much larger than ","element":"span"},{"style":{"height":14.69},"width":42.44,"height":36.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/18-12.png","element":"img","alt":" Y1","inline":true},{"text":". 
This happens with small probability by Markov’s inequality.","element":"span"}],[{"style":{"width":"99%"},"width":1867,"height":276,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/18-13.png","element":"img"}],[{"text":"Now define the following stochastic process:","element":"span"}],[{"style":{"width":"11%"},"width":221,"height":90,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/18-14.png","element":"img"}],[{"text":"It is simple to see that ","element":"span"},{"style":{"height":14.69},"width":42.44,"height":36.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/18-15.png","element":"img","alt":" Y1","inline":true,"padRight":true},{"text":"= 1. The previous lemmas imply that ","element":"span"},{"style":{"height":14.69},"width":37.44,"height":36.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/18-16.png","element":"img","alt":" Yt","inline":true,"padRight":true},{"text":"is a super-martingale, i.e. ","element":"span"},{"style":{"height":17.6},"width":699.84,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/18-17.png","element":"img","alt":"E[Yt+1] ≤ Yt and hence E[YT ] ≤ 1.","inline":true,"padRight":true},{"text":"Now, in the case that ","element":"span"},{"style":{"height":17.6},"width":392.44,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/18-18.png","element":"img","alt":" |f0(xt) − mt| > 2/T","inline":true,"padRight":true},{"text":"for more than Ω(","element":"span"},{"text":"d ","element":"span"},{"text":"log ","element":"span"},{"text":"T","element":"span"},{"text":") periods, we have","element":"span"}],[{"style":{"width":"76%"},"width":1436,"height":234,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/18-19.png","element":"img"}],[{"text":"but since 
","element":"span"},{"style":{"height":17.6},"width":151.12,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/18-20.png","element":"img","alt":" E[YT ] ≤","inline":true,"padRight":true},{"text":"1, this can happen with at most ","element":"span"},{"text":"O","element":"span"},{"text":"(1","element":"span"},{"text":"/T","element":"span"},{"text":") probability by Markov’s inequality. ","element":"span"},{"style":{"height":0},"width":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/18-21.png","element":"img","alt":"■","inline":true}],[{"text":"Remark. ","element":"span"},{"text":"We’ve assumed here that ","element":"span"},{"text":"p ","element":"span"},{"text":"is a constant bounded away from 1","element":"span"},{"text":"/","element":"span"},{"text":"2. How does the regret of this algorithm depend on ","element":"span"},{"text":"p ","element":"span"},{"text":"as ","element":"span"},{"text":"p ","element":"span"},{"text":"approaches 1","element":"span"},{"style":{"height":21.26},"width":302.24,"height":53.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/18-22.png","element":"img","alt":"/2? If p = 1/2 − δ","inline":true},{"text":", then we can set ","element":"span"},{"style":{"height":21.26},"width":325.76,"height":53.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/18-23.png","element":"img","alt":" p′ = 1/2 − δ′ where","inline":true},{"style":{"height":17.6},"width":133.36,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/18-24.png","element":"img","alt":"δ′ = δ/","inline":true},{"text":"2. 
This leads to ","element":"span"},{"style":{"height":19.14},"width":166.76,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/18-25.png","element":"img","alt":" c = O(δ2","inline":true},{"text":") – adapting the proof of Theorem ","element":"span"},{"href":"#id-40","text":"5.1 ","element":"a"},{"text":"then shows we can have at most ","element":"span"},{"style":{"height":31.6},"width":200.24,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/18-26.png","element":"img","alt":" O((d log T)/δ2) ","inline":true},{"text":"inaccurate rounds, for a total of at most ","element":"span"},{"style":{"height":31.6},"width":337.92,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/18-27.png","element":"img","alt":" O((d log T)/δ2) regret.","inline":true}],[{"text":"Comparison with other approaches ","element":"span"},{"text":"It is worth comparing our algorithm with other learning techniques in the multiplicative weights family. The ‘experts’ in our problem form a continuous set with a linear structure, which resembles the settings of Kalai and Vempala ","element":"span"},{"href":"#id-41","referenceIndex":12,"text":"[12] ","element":"a"},{"text":"and Abernathy et al ","element":"span"},{"text":"[1]","element":"span"},{"text":". In their setting, however, the optimal achievable regret is ","element":"span"},{"style":{"height":19.98},"width":118.84,"height":49.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/18-28.png","element":"img","alt":" O(√T","inline":true},{"text":") while in our case we achieve ","element":"span"},{"text":"O","element":"span"},{"text":"(log ","element":"span"},{"text":"T","element":"span"},{"text":"). Another feature of our model is the stochasticity of the losses. 
With stochastic losses, Wei and Luo ","element":"span"},{"href":"#id-42","referenceIndex":25,"text":"[25] ","element":"a"},{"text":"recently showed that multiplicative weight update algorithms achieve ","element":"span"},{"text":"O","element":"span"},{"text":"(log ","element":"span"},{"text":"T","element":"span"},{"text":") regret when the learning rate is tuned properly, but their guarantees depend on the inverse of the gap between the two best arms. An important difference, however, is that in our setting we don’t have access to the loss. We only learn whether our guess was too large or too small, which doesn’t allow us to apply any of those algorithms.","element":"span"}],[{"id":"id-14","text":"5.2 ","element":"span"},{"text":"O","element":"span"},{"text":"(poly(","element":"span"},{"text":"d","element":"span"},{"text":")) ","element":"span"},{"text":"algorithm for Noisy Contextual Search","element":"span"}],[{"text":"In this section we study contextual search in the noisy feedback model. We show that here we can achieve total loss ","element":"span"},{"text":"O","element":"span"},{"text":"(poly(","element":"span"},{"text":"d","element":"span"},{"text":")) independent of ","element":"span"},{"text":"T ","element":"span"},{"text":"by exploiting the geometry of the Euclidean space. Throughout this section we will use ","element":"span"},{"style":{"height":16},"width":120.68,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/19-0.png","element":"img","alt":" q0 ∈ B","inline":true,"padRight":true},{"text":"to denote the true point, i.e. 
","element":"span"},{"style":{"height":17.6},"width":285.6,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/19-1.png","element":"img","alt":" f0(x) = ⟨q0, x⟩.","inline":true}],[{"text":"Our approach (Algorithm ","element":"span"},{"href":"#id-43","text":"6) ","element":"a"},{"text":"builds off the Bayesian inference approach in the previous section (Algorithm ","element":"span"},{"href":"#id-44","text":"5) ","element":"a"},{"text":"by combining it with the multi-scale discretization ideas in Section ","element":"span"},{"href":"#id-17","text":"4.3. ","element":"a"},{"text":"At a high (and slightly inaccurate) level, Algorithm ","element":"span"},{"href":"#id-43","text":"6 ","element":"a"},{"text":"works as follows. Throughout the algorithm, we maintain a distribution ","element":"span"},{"text":"w ","element":"span"},{"text":"over the unit ball ","element":"span"},{"text":"B","element":"span"},{"text":"(0","element":"span"},{"text":", ","element":"span"},{"text":"1) where ","element":"span"},{"text":"w","element":"span"},{"text":"(","element":"span"},{"text":"q","element":"span"},{"text":") represents the likelihood that ","element":"span"},{"text":"q ","element":"span"},{"text":"is our true point ","element":"span"},{"style":{"height":11.6},"width":36.68,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/19-2.png","element":"img","alt":" q0","inline":true},{"text":". Each round ","element":"span"},{"text":"t","element":"span"},{"text":", we are provided a direction ","element":"span"},{"style":{"height":10.69},"width":36.96,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/19-3.png","element":"img","alt":" xt","inline":true,"padRight":true},{"text":"by the adversary. 
We begin by measuring the “width” of our distribution in the direction ","element":"span"},{"style":{"height":10.69},"width":36.96,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/19-4.png","element":"img","alt":" xt","inline":true,"padRight":true},{"text":"– i.e., the length of the smallest interval in this direction which contains almost all of the mass of our distribution ","element":"span"},{"text":"w","element":"span"},{"text":". Then (similarly as in Algorithm ","element":"span"},{"href":"#id-44","text":"5)","element":"a"},{"text":", we will guess a perturbed version of the median of ","element":"span"},{"text":"w ","element":"span"},{"text":"in the direction ","element":"span"},{"style":{"height":11.2},"width":51.36,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/19-5.png","element":"img","alt":" xt,","inline":true,"padRight":true},{"text":"where the size of the perturbation depends on the width. Finally, we multiplicatively update the distribution ","element":"span"},{"text":"w","element":"span"},{"text":", penalizing points on the wrong side of our guess (again, similarly as in Algorithm ","element":"span"},{"href":"#id-44","text":"5)","element":"a"},{"text":".","element":"span"}],[{"text":"In the actual algorithm, we maintain a separate distribution ","element":"span"},{"style":{"height":10.69},"width":43.2,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/19-6.png","element":"img","alt":" wi","inline":true,"padRight":true},{"text":"for each possible scale ","element":"span"},{"style":{"height":16},"width":105.32,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/19-7.png","element":"img","alt":" γi for","inline":true,"padRight":true},{"text":"the width (in particular, we are in scale 
","element":"span"},{"style":{"height":10.69},"width":43.2,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/19-8.png","element":"img","alt":" wi","inline":true,"padRight":true},{"text":"if almost all of the mass of ","element":"span"},{"style":{"height":10.69},"width":86.12,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/19-9.png","element":"img","alt":" wi−1","inline":true,"padRight":true},{"text":"is concentrated in a small strip in direction ","element":"span"},{"style":{"height":10.69},"width":36.96,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/19-10.png","element":"img","alt":" xt","inline":true},{"text":"). This aids analysis in letting us guarantee we operate in each scale for at most a bounded number of rounds, which lets us bound the total loss of this algorithm.","element":"span"}],[{"id":"id-43","style":{"width":"100%"},"width":1874,"height":1503,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/20-0.png","element":"img"}],[{"id":"id-45","text":"Theorem 5.5. ","element":"span"},{"text":"Algorithm ","element":"span"},{"href":"#id-43","text":"6 ","element":"a"},{"text":"incurs ","element":"span"},{"text":"O","element":"span"},{"text":"(poly(","element":"span"},{"text":"d","element":"span"},{"text":")) expected total loss for the problem of noisy linear contextual search.","element":"span"}],[{"text":"The proof of Theorem ","element":"span"},{"href":"#id-45","text":"5.5 ","element":"a"},{"text":"is structured roughly as follows. ","element":"span"},{"text":"Let ","element":"span"},{"style":{"height":14.69},"width":41.76,"height":36.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/20-1.png","element":"img","alt":" Li","inline":true,"padRight":true},{"text":"be the (expected) loss sustained at scale ","element":"span"},{"text":"i","element":"span"},{"text":". 
We wish to show that ","element":"span"},{"style":{"height":18.39},"width":294.68,"height":45.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/20-2.png","element":"img","alt":"∑i Li = poly(d","inline":true},{"text":"). To bound ","element":"span"},{"style":{"height":14.69},"width":41.76,"height":36.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/20-3.png","element":"img","alt":" Li","inline":true},{"text":", we’ll start by roughly ","element":"span"},{"text":"following the analysis in Theorem ","element":"span"},{"href":"#id-40","text":"5.1. ","element":"a"},{"text":"Specifically, we’ll look at the total weight (according to ","element":"span"},{"style":{"height":17.6},"width":62.12,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/20-4.png","element":"img","alt":" wi)","inline":true,"padRight":true},{"text":"of a tiny ball surrounding the true point ","element":"span"},{"style":{"height":11.6},"width":36.68,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/20-5.png","element":"img","alt":" q0","inline":true},{"text":". Let the weight of this ball at time ","element":"span"},{"style":{"height":17.49},"width":158.4,"height":43.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/20-6.png","element":"img","alt":" t be Wi,t","inline":true},{"text":". We’ll again show that 1","element":"span"},{"style":{"height":18.29},"width":96,"height":45.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/20-7.png","element":"img","alt":"/Wi,t","inline":true,"padRight":true},{"text":"when suitably scaled is a super-martingale: it decreases in expectation by a large amount whenever our guess is far from accurate and cannot increase very much in expectation even if our guess is close to accurate. 
Since 1","element":"span"},{"style":{"height":18.29},"width":96,"height":45.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/20-8.png","element":"img","alt":"/Wi,t","inline":true,"padRight":true},{"text":"cannot decrease below 1, this lets us upper bound the number of rounds where we are far from accurate.","element":"span"}],[{"text":"Now, even when we are far from accurate, we know that since we are in scale ","element":"span"},{"text":"i","element":"span"},{"text":", almost all of the mass of ","element":"span"},{"style":{"height":10.69},"width":43.2,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/20-9.png","element":"img","alt":" wi","inline":true,"padRight":true},{"text":"is concentrated on some thin strip in direction ","element":"span"},{"style":{"height":10.69},"width":36.96,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/20-10.png","element":"img","alt":" xt","inline":true},{"text":". If the true point ","element":"span"},{"style":{"height":11.6},"width":36.68,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/20-11.png","element":"img","alt":" q0","inline":true,"padRight":true},{"text":"is located in or near this strip, this lets us bound the loss each round when we are far from accurate (since the median will lie in this strip). 
So it suffices to show that if a weight function ","element":"span"},{"style":{"height":10.69},"width":43.2,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/20-12.png","element":"img","alt":" wi","inline":true,"padRight":true},{"text":"concentrates on some thin strip, then with high probability, the true point ","element":"span"},{"style":{"height":11.6},"width":36.68,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/20-13.png","element":"img","alt":" q0","inline":true,"padRight":true},{"text":"lies close to this strip.","element":"span"}],[{"text":"To prove this, we again look at the weight of a small ball ","element":"span"},{"style":{"height":17.6},"width":250.72,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/20-14.png","element":"img","alt":" Bα = B(q0, α","inline":true},{"text":") with radius ","element":"span"},{"style":{"height":12.8},"width":176.64,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/20-15.png","element":"img","alt":" α around","inline":true}],[{"style":{"height":11.6},"width":36.68,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-0.png","element":"img","alt":"q0","inline":true,"padRight":true},{"text":"(see left side of Figure ","element":"span"},{"href":"#id-46","text":"2)","element":"a"},{"text":". 
If we know that the weight on some strip is at least some threshold ","element":"span"},{"style":{"height":18.29},"width":474.16,"height":45.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-1.png","element":"img","alt":"τ, then if wi,t(Bα) + τ >","inline":true,"padRight":true},{"text":"1, we know that the ball and strip intersect, and therefore ","element":"span"},{"style":{"height":11.6},"width":36.68,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-2.png","element":"img","alt":" q0","inline":true,"padRight":true},{"text":"is at most ","element":"span"},{"id":"id-46","text":"distance ","element":"span"},{"style":{"height":8.4},"width":28,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-3.png","element":"img","alt":" α","inline":true,"padRight":true},{"text":"away from this strip. It thus suffices to show that ","element":"span"},{"style":{"height":18.29},"width":299.48,"height":45.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-4.png","element":"img","alt":" wi,t(Bα) > 1−τ","inline":true,"padRight":true},{"text":"with high probability.","element":"span"}],[{"style":{"width":"77%"},"width":1446,"height":634,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-5.png","element":"img"}],[{"text":"Now, we will choose ","element":"span"},{"style":{"height":12.8},"width":150.2,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-6.png","element":"img","alt":" α and τ","inline":true,"padRight":true},{"text":"large enough so that this inequality is satisfied at time ","element":"span"},{"text":"t ","element":"span"},{"text":"= 0. We therefore only need to show that this is still true with high probability for all times ","element":"span"},{"text":"t","element":"span"},{"text":". 
Intuitively, this should be true – the amount of weight on a ball centered at the true point ","element":"span"},{"style":{"height":11.6},"width":36.68,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-7.png","element":"img","alt":" q0","inline":true,"padRight":true},{"text":"should only increase as time goes on and we get more feedback (the feedback is noisy, so we might occasionally decrease the weight of this ball, but overall the increases should drown out the decreases). Proving this formally, however, is technically challenging and where we need to use the Euclidean geometry specific to linear contextual search.","element":"span"}],[{"text":"To show this, we use the following lemma. Choose two points ","element":"span"},{"style":{"height":16},"width":178.28,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-8.png","element":"img","alt":" q1 and q2","inline":true,"padRight":true},{"text":"on a line through ","element":"span"},{"style":{"height":11.6},"width":36.68,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-9.png","element":"img","alt":" q0","inline":true,"padRight":true},{"text":"so that ","element":"span"},{"style":{"height":11.6},"width":36.68,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-10.png","element":"img","alt":" q1","inline":true,"padRight":true},{"text":"lies between ","element":"span"},{"style":{"height":16},"width":175.4,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-11.png","element":"img","alt":" q0 and q2","inline":true},{"text":". 
We claim that with high probability (for all times ","element":"span"},{"style":{"height":18.29},"width":245.2,"height":45.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-12.png","element":"img","alt":" t), wi,t(q1) ≥","inline":true},{"style":{"height":18.29},"width":178.76,"height":45.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-13.png","element":"img","alt":"κ · wi,t(q2","inline":true},{"text":") for some constant ","element":"span"},{"style":{"height":8.4},"width":25,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-14.png","element":"img","alt":" κ","inline":true},{"text":". To show this, observe that there is no half space which contains both ","element":"span"},{"style":{"height":16},"width":389.48,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-15.png","element":"img","alt":" q0 and q2 but not q1","inline":true},{"text":". This means that the only way ","element":"span"},{"style":{"height":18.29},"width":119.72,"height":45.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-16.png","element":"img","alt":" wi,t(q2","inline":true},{"text":") can increase relative to ","element":"span"},{"style":{"height":18.29},"width":138.92,"height":45.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-17.png","element":"img","alt":" wi,t(q1)","inline":true,"padRight":true},{"text":"is if a guess separates ","element":"span"},{"style":{"height":16},"width":198.44,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-18.png","element":"img","alt":" q2 from q1","inline":true,"padRight":true},{"text":"and if the feedback on this guess is noisy (right side of Figure ","element":"span"},{"href":"#id-46","text":"2)","element":"a"},{"text":". 
This occurs with probability ","element":"span"},{"text":"p < ","element":"span"},{"text":"1","element":"span"},{"text":"/","element":"span"},{"text":"2 and is unlikelier than the alternative (which increases the weight of ","element":"span"},{"style":{"height":11.6},"width":36.68,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-19.png","element":"img","alt":" q1","inline":true,"padRight":true},{"text":"relative to ","element":"span"},{"style":{"height":11.6},"width":36.68,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-20.png","element":"img","alt":" q2","inline":true},{"text":"). We can thus bound the ratio of ","element":"span"},{"style":{"height":18.29},"width":280.52,"height":45.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-21.png","element":"img","alt":" wi,t(q1)/wi,t(q2","inline":true},{"text":") from below with high probability over all rounds.","element":"span"}],[{"text":"If we could union bound over all points in ","element":"span"},{"style":{"height":15.09},"width":51.28,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-22.png","element":"img","alt":" Bα","inline":true,"padRight":true},{"text":"we would be done (this inequality allows us to relate the weight of all the points outside ","element":"span"},{"style":{"height":15.09},"width":51.28,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-23.png","element":"img","alt":" Bα","inline":true,"padRight":true},{"text":"to the weight of points inside ","element":"span"},{"style":{"height":15.09},"width":51.28,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-24.png","element":"img","alt":" Bα","inline":true},{"text":"). 
Unfortunately there are infinitely many points inside ","element":"span"},{"style":{"height":15.09},"width":50.8,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-25.png","element":"img","alt":" Bα","inline":true,"padRight":true},{"text":"so we cannot apply a naive union bound. Luckily, we can show that nearby points are very likely to have similar weights: the only way the relative weight of two nearby points ","element":"span"},{"style":{"height":16},"width":158.56,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-26.png","element":"img","alt":" q and q′ ","inline":true,"padRight":true},{"text":"changes is if we guess a hyperplane separating ","element":"span"},{"style":{"height":16},"width":158.56,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-27.png","element":"img","alt":" q and q′ ","inline":true,"padRight":true},{"text":"– and since we add a perturbation to our guess every round, we can bound the probability of this happening. This allows us to repeat the previous geometric argument with ","element":"span"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-28.png","element":"img","alt":" ε","inline":true},{"text":"-nets instead of single points, which completes the proof.","element":"span"}],[{"text":"Notation. ","element":"span"},{"text":"Below, we will use ","element":"span"},{"style":{"height":11.6},"width":36.68,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-29.png","element":"img","alt":" q0","inline":true,"padRight":true},{"text":"to denote the hidden point. 
We let ","element":"span"},{"text":"B","element":"span"},{"text":"(0","element":"span"},{"text":", ","element":"span"},{"text":"1) denote the unit ball and in general ","element":"span"},{"text":"B","element":"span"},{"text":"(","element":"span"},{"text":"q, r","element":"span"},{"text":") to denote the ball of radius ","element":"span"},{"text":"r ","element":"span"},{"text":"centered at ","element":"span"},{"text":"q","element":"span"},{"text":". We will use ","element":"span"},{"style":{"height":13.09},"width":64.32,"height":32.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/21-30.png","element":"img","alt":" wi,t","inline":true,"padRight":true},{"text":"to denote the","element":"span"}],[{"text":"weight function ","element":"span"},{"style":{"height":15.09},"width":254.56,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/22-0.png","element":"img","alt":" wi at round t","inline":true},{"text":". For a set ","element":"span"},{"style":{"height":17.6},"width":167.04,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/22-1.png","element":"img","alt":" S ⊂ B(0,","inline":true,"padRight":true},{"text":"1), we use the notation","element":"span"}],[{"style":{"width":"22%"},"width":417,"height":101,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/22-2.png","element":"img"}],[{"text":"Let ","element":"span"},{"style":{"height":24.53},"width":210.2,"height":61.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/22-3.png","element":"img","alt":" αi = 12104d2i","inline":true,"padRight":true},{"text":". 
Define the set ","element":"span"},{"style":{"height":17.01},"width":292.32,"height":42.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/22-4.png","element":"img","alt":" Sαi to be the αi","inline":true},{"text":"-net consisting of all points in the unit ball whose ","element":"span"},{"text":"coordinates are integer multiples of ","element":"span"},{"style":{"height":19.04},"width":32.6,"height":47.6,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/22-5.png","element":"img","alt":"αi/d ","inline":true,"padRight":true},{"text":". Note that ","element":"span"},{"style":{"height":35.19},"width":938.6,"height":87.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/22-6.png","element":"img","alt":" |Sαi| ≤ (2d/αi)^d. For all i, let Γi = B(q0, αi)∩B(0, 1)","inline":true,"padRight":true},{"text":"be the ball of radius ","element":"span"},{"style":{"height":10.69},"width":39.84,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/22-7.png","element":"img","alt":" αi","inline":true,"padRight":true},{"text":"centered at ","element":"span"},{"style":{"height":11.6},"width":36.68,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/22-8.png","element":"img","alt":" q0","inline":true},{"text":". 
For simplicity, throughout this proof we will assume that the feedback noise is fixed at ","element":"span"},{"text":"p ","element":"span"},{"text":"= 1","element":"span"},{"text":"/","element":"span"},{"text":"3 (it is straightforward to adapt this proof for any other ","element":"span"},{"text":"p < ","element":"span"},{"text":"1","element":"span"},{"text":"/","element":"span"},{"text":"2; doing so only affects the constant factor of the loss bound).","element":"span"}],[{"style":{"width":"33%"},"width":623,"height":46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/22-9.png","element":"img"}],[{"text":"As in the analysis of Algorithm ","element":"span"},{"href":"#id-44","text":"5, ","element":"a"},{"text":"we begin by understanding how the reciprocal of our weight function 1","element":"span"},{"style":{"height":18.29},"width":129.12,"height":45.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/22-10.png","element":"img","alt":"/wi,t(p","inline":true},{"text":") evolves over time. This will allow us to construct various helpful super-martingales (for example, allowing us to bound the number of rounds we spend in each scale).","element":"span"}],[{"text":"The following claim relates how 1","element":"span"},{"style":{"height":18.29},"width":141.8,"height":45.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/22-11.png","element":"img","alt":"/wi,t(q1","inline":true},{"text":") changes when ","element":"span"},{"style":{"height":16},"width":174.92,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/22-12.png","element":"img","alt":" q1 and q0","inline":true,"padRight":true},{"text":"are on the same side of the hyperplane ","element":"span"},{"style":{"height":17.6},"width":196.72,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/22-13.png","element":"img","alt":" ⟨xt, q⟩ = ˆy","inline":true,"padRight":true},{"text":"(i.e. 
is more likely to be consistent with feedback).","element":"span"}],[{"id":"id-47","text":"Claim 5.6. ","element":"span"},{"text":"Consider a round ","element":"span"},{"text":"t","element":"span"},{"text":". Say the adversary picks direction ","element":"span"},{"style":{"height":10.69},"width":36.96,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/22-14.png","element":"img","alt":" xt","inline":true,"padRight":true},{"text":"and the algorithm queries ","element":"span"},{"text":"ˆ","element":"span"},{"text":"y","element":"span"},{"text":". Let ","element":"span"},{"style":{"height":11.6},"width":36.68,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/22-15.png","element":"img","alt":" q1","inline":true,"padRight":true},{"text":"be a point such that ","element":"span"},{"style":{"height":16},"width":175.4,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/22-16.png","element":"img","alt":" q1 and q0","inline":true,"padRight":true},{"text":"are on the same side of the hyperplane ","element":"span"},{"style":{"height":17.6},"width":290.12,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/22-17.png","element":"img","alt":" ⟨xt, q⟩ = ˆy. Let","inline":true}],[{"style":{"width":"93%"},"width":1743,"height":593,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/22-18.png","element":"img"}],[{"text":"Proof. ","element":"span"},{"text":"Recall that points that violate feedback have their weight multiplied by (1 ","element":"span"},{"style":{"height":12},"width":66.64,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/22-19.png","element":"img","alt":" − η","inline":true},{"text":") (and then the distribution is renormalized). 
With probability 1 ","element":"span"},{"style":{"height":17.6},"width":165.52,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/22-20.png","element":"img","alt":" − p = 2/","inline":true},{"text":"3 (when the feedback is not flipped), we thus have that","element":"span"}],[{"style":{"width":"33%"},"width":632,"height":104,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/22-21.png","element":"img"}],[{"text":"Likewise, with probability ","element":"span"},{"text":"p ","element":"span"},{"text":"= 1","element":"span"},{"text":"/","element":"span"},{"text":"3 (when the feedback is flipped), we have that","element":"span"}],[{"style":{"width":"33%"},"width":632,"height":104,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/22-22.png","element":"img"}],[{"text":"Taking expectations over the feedback, we therefore have that","element":"span"}],[{"style":{"width":"83%"},"width":1556,"height":427,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/23-0.png","element":"img"}],[{"text":"Points very close to ","element":"span"},{"style":{"height":11.6},"width":36.68,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/23-1.png","element":"img","alt":" q0","inline":true,"padRight":true},{"text":"are likely to be on the same side of the hyperplane as ","element":"span"},{"style":{"height":11.6},"width":36.68,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/23-2.png","element":"img","alt":" q0","inline":true},{"text":", allowing us to apply Claim ","element":"span"},{"href":"#id-47","text":"5.6.","element":"a"}],[{"id":"id-48","style":{"height":17.6},"width":1012.32,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/23-3.png","element":"img","alt":"Claim 5.7. Let q1 ∈ B(0, 1) such that ∥q1 − q0∥ ≤ αi","inline":true},{"text":". 
Then, in expectation both over the randomness in the feedback and the algorithm,","element":"span"}],[{"style":{"width":"65%"},"width":1234,"height":203,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/23-4.png","element":"img"}],[{"style":{"height":17.6},"width":520.32,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/23-5.png","element":"img","alt":"Proof. Since ∥q1 − q0∥ ≤ αi","inline":true},{"text":", and since ˆ","element":"span"},{"text":"y ","element":"span"},{"text":"is chosen by adding a uniform ","element":"span"},{"style":{"height":16.4},"width":36.48,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/23-6.png","element":"img","alt":" βi","inline":true,"padRight":true},{"text":"random variable to ","element":"span"},{"text":"y","element":"span"},{"text":", the probability that ","element":"span"},{"style":{"height":16},"width":175.4,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/23-7.png","element":"img","alt":" q1 and q0","inline":true,"padRight":true},{"text":"are on opposite sides of the plane ","element":"span"},{"style":{"height":17.6},"width":198.64,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/23-8.png","element":"img","alt":" ⟨xt, q⟩ = ˆy","inline":true,"padRight":true},{"text":"is at most ","element":"span"},{"style":{"height":21.44},"width":32.6,"height":53.6,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/23-9.png","element":"img","alt":"αi/βi ","inline":true,"padRight":true},{"text":". Combining ","element":"span"},{"text":"this with Claim ","element":"span"},{"href":"#id-47","text":"5.6 ","element":"a"},{"text":"gives us the desired result. 
","element":"span"},{"style":{"height":0},"width":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/23-10.png","element":"img","alt":"■","inline":true}],[{"text":"We now use Claims ","element":"span"},{"href":"#id-48","text":"5.7 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-47","text":"5.6 ","element":"a"},{"text":"to understand how 1","element":"span"},{"style":{"height":18.29},"width":144.48,"height":45.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/23-11.png","element":"img","alt":"/wi,t(Γi","inline":true},{"text":") changes over time (generalizing from single points to small balls). This first claim bounds the decrease in 1","element":"span"},{"style":{"height":18.29},"width":433.92,"height":45.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/23-12.png","element":"img","alt":"/wit+1,t+1(Γit+1) when","inline":true,"padRight":true},{"text":"our guess is close to accurate.","element":"span"}],[{"id":"id-49","style":{"height":17.6},"width":733.88,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/23-13.png","element":"img","alt":"Claim 5.8. Assume |y − ⟨xt, q0⟩| ≤ βit","inline":true},{"text":". Then, in expectation both over the randomness in feedback and in our algorithm,","element":"span"}],[{"style":{"width":"49%"},"width":933,"height":108,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/23-14.png","element":"img"}],[{"text":"Proof. 
","element":"span"},{"text":"Recall that ","element":"span"},{"text":"y ","element":"span"},{"text":"is the median of ","element":"span"},{"style":{"height":13.09},"width":77.76,"height":32.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/23-15.png","element":"img","alt":" wit,t","inline":true,"padRight":true},{"text":"in direction ","element":"span"},{"style":{"height":10.69},"width":36.96,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/23-16.png","element":"img","alt":" xt","inline":true,"padRight":true},{"text":"as computed by our algorithm. Now, define the two quantities","element":"span"}],[{"style":{"width":"72%"},"width":1363,"height":111,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/23-17.png","element":"img"}],[{"text":"These quantities represent the mass of ","element":"span"},{"style":{"height":12.22},"width":99.56,"height":30.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/23-18.png","element":"img","alt":" wit+1","inline":true,"padRight":true},{"text":"above and below the strip of width 2","element":"span"},{"style":{"height":17.02},"width":202.08,"height":42.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/23-19.png","element":"img","alt":"βit around","inline":true,"padRight":true},{"text":"the median. 
Note that by the maximality of ","element":"span"},{"style":{"height":21.47},"width":750.64,"height":53.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/23-20.png","element":"img","alt":" it, either X− ≥ γ4dit+1/2 or X+ ≥ γ4dit+1/","inline":true},{"text":"2; if not, then ","element":"span"},{"text":"there exists a strip of width 2","element":"span"},{"style":{"height":17.02},"width":244.52,"height":42.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/23-21.png","element":"img","alt":"βit ≤ 10γit+1","inline":true,"padRight":true},{"text":"containing at least 1","element":"span"},{"style":{"height":21.07},"width":154.04,"height":52.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/23-22.png","element":"img","alt":"−γ4it+1d","inline":true},{"text":". Without loss of generality, ","element":"span"},{"text":"assume ","element":"span"},{"style":{"height":21.47},"width":274.08,"height":53.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/23-23.png","element":"img","alt":" X− ≥ γ4dit+1/2.","inline":true}],[{"text":"Now, recall that ˆ","element":"span"},{"text":"y ","element":"span"},{"text":"is chosen uniformly in the interval [","element":"span"},{"style":{"height":17.02},"width":322.04,"height":42.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/23-24.png","element":"img","alt":"y − 2βit, y + 2βit","inline":true},{"text":"]. 
We will divide the expectation in the theorem statement into three cases, based on where ˆ","element":"span"},{"text":"y ","element":"span"},{"text":"lies.","element":"span"}],[{"style":{"width":"41%"},"width":775,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/24-0.png","element":"img"}],[{"text":"This case occurs with probability ","element":"span"},{"style":{"height":25.95},"width":164.28,"height":64.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/24-1.png","element":"img","alt":"14 − αit+12βit ","inline":true,"padRight":true},{"text":". Note that since ","element":"span"},{"style":{"height":17.6},"width":340.76,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/24-2.png","element":"img","alt":" |y − ⟨xt, q0⟩| ≤ βit","inline":true},{"text":", in this case ","element":"span"},{"text":"we also have that ˆ","element":"span"},{"style":{"height":17.6},"width":362.6,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/24-3.png","element":"img","alt":"y ≤ ⟨xt, q⟩ − αit+1","inline":true},{"text":". Therefore in this case we know that the ball Γ","element":"span"},{"style":{"height":9.94},"width":68.36,"height":24.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/24-4.png","element":"img","alt":"it+1","inline":true,"padRight":true},{"text":"lies entirely to the left of the hyperplane ","element":"span"},{"style":{"height":17.6},"width":200.08,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/24-5.png","element":"img","alt":" ⟨xt, q⟩ = ˆy","inline":true},{"text":". 
By applying Claim ","element":"span"},{"href":"#id-47","text":"5.6, ","element":"a"},{"text":"we know that, conditioned on being in this case,","element":"span"}],[{"style":{"width":"79%"},"width":1495,"height":326,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/24-6.png","element":"img"}],[{"text":"This case covers the ˆ","element":"span"},{"text":"y ","element":"span"},{"text":"where the hyperplane ","element":"span"},{"style":{"height":17.6},"width":196.72,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/24-7.png","element":"img","alt":" ⟨xt, q⟩ = ˆy","inline":true,"padRight":true},{"text":"intersects the ball Γ","element":"span"},{"style":{"height":16.63},"width":332.84,"height":41.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/24-8.png","element":"img","alt":"it+1. For ˆy in this","inline":true,"padRight":true},{"text":"case, we pessimistically bound the change in weight via","element":"span"}],[{"style":{"width":"68%"},"width":1273,"height":212,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/24-9.png","element":"img"}],[{"text":"• ","element":"span"},{"text":"Case 3","element":"span"},{"text":": remainder of the interval [","element":"span"},{"style":{"height":17.6},"width":342.72,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/24-10.png","element":"img","alt":"y − 2βit, y + 2βit].","inline":true}],[{"text":"Since case 1 and case 2 together cover at least 1","element":"span"},{"text":"/","element":"span"},{"text":"4 of the interval, this case occurs with probability at most 3","element":"span"},{"text":"/","element":"span"},{"text":"4. 
In this case the ball Γ","element":"span"},{"style":{"height":9.94},"width":68.36,"height":24.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/24-11.png","element":"img","alt":"it+1","inline":true,"padRight":true},{"text":"does not intersect the hyperplane (since all such ˆ","element":"span"},{"text":"y ","element":"span"},{"text":"are covered by case 2). We can therefore apply (the weaker variant of) Claim ","element":"span"},{"href":"#id-47","text":"5.6 ","element":"a"},{"text":"to show that, conditioned on being in this case,","element":"span"}],[{"style":{"width":"40%"},"width":764,"height":107,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/24-12.png","element":"img"}],[{"text":"Combining these three cases, we have that","element":"span"}],[{"style":{"width":"98%"},"width":1840,"height":309,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/24-13.png","element":"img"}],[{"text":"When our guess is far from accurate, we can instead (more strongly) bound the decrease in 1","element":"span"},{"style":{"height":18.29},"width":189.12,"height":45.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/24-14.png","element":"img","alt":"/wit,t(Γt).","inline":true}],[{"style":{"height":17.6},"width":733.88,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/24-15.png","element":"img","alt":"Claim 5.9. Assume |y − ⟨xt, q0⟩| > βit","inline":true},{"text":". Then, in expectation both over the randomness in feedback and in our algorithm,","element":"span"}],[{"style":{"width":"40%"},"width":749,"height":101,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/24-16.png","element":"img"}],[{"text":"Proof. 
","element":"span"},{"text":"We essentially repeat the logic from the proof of Claim ","element":"span"},{"href":"#id-49","text":"5.8, ","element":"a"},{"text":"with the change that we can more strongly lower bound ","element":"span"},{"style":{"height":12},"width":65.36,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/25-0.png","element":"img","alt":" X−","inline":true},{"text":". Without loss of generality, assume that ","element":"span"},{"style":{"height":17.6},"width":469.76,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/25-1.png","element":"img","alt":" y < ⟨xt, q0⟩ − βit. Define","inline":true}],[{"style":{"width":"25%"},"width":475,"height":108,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/25-2.png","element":"img"}],[{"text":"Note that since ","element":"span"},{"text":"y ","element":"span"},{"text":"is the weighted median of ","element":"span"},{"style":{"height":13.09},"width":77.76,"height":32.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/25-3.png","element":"img","alt":" wit,t","inline":true,"padRight":true},{"text":"in the direction ","element":"span"},{"style":{"height":17.6},"width":268.8,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/25-4.png","element":"img","alt":" xt, X− ≥ 1/2.","inline":true}],[{"text":"Now, we again have three cases. ","element":"span"},{"text":"To begin, with probability ","element":"span"},{"style":{"height":25.95},"width":199.12,"height":64.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/25-5.png","element":"img","alt":"14 − αitβit , ˆy","inline":true,"padRight":true},{"text":"lies in the interval ","element":"span"},{"text":"[","element":"span"},{"style":{"height":17.6},"width":786.44,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/25-6.png","element":"img","alt":"y, y + βit − αit]. 
Since y + βit < ⟨xt, q0⟩","inline":true},{"text":", the hyperplane ","element":"span"},{"style":{"height":17.6},"width":206.32,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/25-7.png","element":"img","alt":" ⟨xt, q⟩ = ˆy","inline":true,"padRight":true},{"text":"does not intersect Γ","element":"span"},{"style":{"height":16.63},"width":127.68,"height":41.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/25-8.png","element":"img","alt":"it, and","inline":true,"padRight":true},{"text":"therefore we can apply Claim ","element":"span"},{"href":"#id-47","text":"5.6 ","element":"a"},{"text":"to show that","element":"span"}],[{"style":{"width":"57%"},"width":1076,"height":224,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/25-9.png","element":"img"}],[{"text":"Likewise, the probability that the hyperplane ","element":"span"},{"style":{"height":17.6},"width":196.72,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/25-10.png","element":"img","alt":" ⟨xt, q⟩ = ˆy","inline":true,"padRight":true},{"text":"intersects Γ","element":"span"},{"style":{"height":9.94},"width":22.52,"height":24.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/25-11.png","element":"img","alt":"it","inline":true,"padRight":true},{"text":"is at most ","element":"span"},{"style":{"height":25.95},"width":244.8,"height":64.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/25-12.png","element":"img","alt":"αitβit (in which","inline":true,"padRight":true},{"text":"case we can pessimistically bound the decrease in weight as in the proof of Claim ","element":"span"},{"href":"#id-49","text":"5.8)","element":"a"},{"text":", and with the remaining 3","element":"span"},{"text":"/","element":"span"},{"text":"4 probability the hyperplane 
","element":"span"},{"style":{"height":17.6},"width":199.6,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/25-13.png","element":"img","alt":" ⟨xt, q⟩ = ˆy","inline":true,"padRight":true},{"text":"does not intersect Γ","element":"span"},{"style":{"height":9.94},"width":22.52,"height":24.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/25-14.png","element":"img","alt":"it","inline":true},{"text":", and we can apply the weaker variant of Claim ","element":"span"},{"href":"#id-47","text":"5.6. ","element":"a"},{"text":"Combining these observations, we get that","element":"span"}],[{"style":{"width":"99%"},"width":1868,"height":428,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/25-15.png","element":"img"}],[{"text":"In this step we will bound the total number of rounds in each scale ","element":"span"},{"text":"i","element":"span"},{"text":". Specifically, our algorithm ensures that we move onto the next scale once either the weight concentrates on a strip or once ","element":"span"},{"style":{"height":15.09},"width":43.2,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/25-16.png","element":"img","alt":"Ci","inline":true,"padRight":true},{"text":"grows large enough - we will show with high probability that this is always due to the weight concentrating on a small strip.","element":"span"}],[{"text":"Let ","element":"span"},{"style":{"height":15.49},"width":44.64,"height":38.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/25-17.png","element":"img","alt":" Ai","inline":true,"padRight":true},{"text":"be the number of rounds ","element":"span"},{"style":{"height":17.6},"width":803.04,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/25-18.png","element":"img","alt":" t such that it = i and |y − ⟨xt, q0⟩| ≤ βi","inline":true,"padRight":true},{"text":"(i.e. the number of rounds where we are “accurate”). 
","element":"span"},{"text":"Let ","element":"span"},{"style":{"height":14.69},"width":45.12,"height":36.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/25-19.png","element":"img","alt":" Bi","inline":true,"padRight":true},{"text":"be the number of rounds ","element":"span"},{"style":{"height":15.09},"width":452.16,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/25-20.png","element":"img","alt":" t such that it = i and","inline":true},{"style":{"height":17.6},"width":325.44,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/25-21.png","element":"img","alt":"|y − ⟨xt, q0⟩| > βi","inline":true,"padRight":true},{"text":"(i.e. the number of rounds where we are “inaccurate”). Note that ","element":"span"},{"style":{"height":15.49},"width":260.64,"height":38.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/25-22.png","element":"img","alt":" Ai + Bi = Ci.","inline":true,"padRight":true},{"text":"Also, recall that our algorithm ensures that ","element":"span"},{"style":{"height":29.63},"width":387.48,"height":74.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/25-23.png","element":"img","alt":" Ci ≤ 100( d4iγ10di + d25i","inline":true},{"text":") + 1 for all rounds ","element":"span"},{"text":"t","element":"span"},{"text":".","element":"span"}],[{"text":"We first show that with high probability, ","element":"span"},{"style":{"height":14.69},"width":45.12,"height":36.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/25-24.png","element":"img","alt":" Bi","inline":true,"padRight":true},{"text":"will be no larger than ","element":"span"},{"text":"O","element":"span"},{"text":"(poly(","element":"span"},{"text":"d","element":"span"},{"text":")","element":"span"},{"text":"i","element":"span"},{"text":").","element":"span"}],[{"id":"id-50","text":"Claim 5.10. 
","element":"span"},{"text":"For any constant ","element":"span"},{"text":"c > ","element":"span"},{"text":"0","element":"span"},{"text":", with probability at least ","element":"span"},{"text":"1 ","element":"span"},{"style":{"height":17.15},"width":154.2,"height":42.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/25-25.png","element":"img","alt":" − 2−d4ic ","inline":true,"padRight":true},{"text":"we have throughout all rounds that","element":"span"}],[{"style":{"width":"19%"},"width":367,"height":47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/25-26.png","element":"img"}],[{"text":"Proof. ","element":"span"},{"text":"We will construct a sequence ","element":"span"},{"style":{"height":25.97},"width":320.2,"height":64.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/25-27.png","element":"img","alt":" Zt so that Ztwi,t(Γi) ","inline":true,"padRight":true},{"text":"is a super-martingale. Consider the sequence ","element":"span"},{"style":{"height":14.69},"width":41.76,"height":36.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/25-28.png","element":"img","alt":"Zt","inline":true,"padRight":true},{"text":"defined as follows.","element":"span"}],[{"text":"• ","element":"span"},{"style":{"height":24.54},"width":222.24,"height":61.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/26-0.png","element":"img","alt":" Z1 =� αi2�d.","inline":true}],[{"text":"• ","element":"span"},{"text":"If ","element":"span"},{"style":{"height":17.6},"width":566.4,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/26-1.png","element":"img","alt":" it ̸∈ {i, i − 1} then Zt+1 = Zt.","inline":true}],[{"text":"• ","element":"span"},{"text":"If ","element":"span"},{"style":{"height":31.6},"width":1156.28,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/26-2.png","element":"img","alt":" it = i and |y − ⟨xt, q⟩| ≤ βi or it 
= i − 1 then Zt+1 =�1 − αiβi","inline":true}],[{"text":"• ","element":"span"},{"text":"If ","element":"span"},{"style":{"height":21.65},"width":1008.48,"height":54.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/26-3.png","element":"img","alt":" it = i and |y − ⟨xt, q⟩| > βi then Zt+1 =�1 + 1d21�Zt.","inline":true}],[{"style":{"width":"80%"},"width":1514,"height":463,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/26-4.png","element":"img"}],[{"text":"(Here we have used the fact that ","element":"span"},{"style":{"height":17.6},"width":353.76,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/26-5.png","element":"img","alt":" Vol(B(q0, αi)∩B(0,","inline":true,"padRight":true},{"text":"1)) must contain a ball of radius ","element":"span"},{"style":{"height":17.6},"width":280.8,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/26-6.png","element":"img","alt":" αi/2.) Since Yt","inline":true,"padRight":true},{"text":"is a non-negative super-martingale, by Doob’s martingale inequality it holds that for any constant","element":"span"}],[{"style":{"width":"60%"},"width":1129,"height":122,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/26-7.png","element":"img"}],[{"text":"However note that if there exists a round where ","element":"span"},{"style":{"height":19.13},"width":355,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/26-8.png","element":"img","alt":" Bi ≥ 100d25i(1 + c","inline":true},{"text":"), then for ","element":"span"},{"text":"t ","element":"span"},{"text":"sufficiently large","element":"span"}],[{"style":{"width":"58%"},"width":1093,"height":450,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/26-9.png","element":"img"}],[{"text":"(Here in the last inequality we have used the fact that 
2","element":"span"},{"style":{"height":22.24},"width":713,"height":55.6,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/26-10.png","element":"img","alt":"d4i ≥ (2/αi)d). Since wi,t(Γi) ≤ 1 for","inline":true,"padRight":true},{"text":"all ","element":"span"},{"text":"t","element":"span"},{"text":", this implies that ","element":"span"},{"style":{"height":19.84},"width":180.12,"height":49.6,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/26-11.png","element":"img","alt":" Yt ≥ 2d4ic","inline":true},{"text":", which immediately implies the desired claim. ","element":"span"},{"style":{"height":0},"width":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/26-12.png","element":"img","alt":"■","inline":true}],[{"text":"Recall that in our algorithm, we check the following two conditions for determining the scale ","element":"span"},{"text":"i ","element":"span"},{"text":"we use for the current query:","element":"span"}],[{"text":"• ","element":"span"},{"text":"There exists ","element":"span"},{"style":{"height":24.48},"width":1260,"height":61.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/26-13.png","element":"img","alt":" a, b ∈ R such that |a − b| ≤ 10γi and�a≤⟨xt,q⟩≤b wi(q)dq ≥ 1 − γ4di .","inline":true}],[{"text":"• ","element":"span"},{"style":{"height":31.55},"width":617.76,"height":78.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/26-14.png","element":"img","alt":" Ci−1 > 100(d4(i−1)γ10di−1 + d25(i − 1)).","inline":true}],[{"text":"We now show that with high probability, only the first condition is ever relevant.","element":"span"}],[{"id":"id-55","style":{"width":"63%"},"width":1188,"height":290,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/27-0.png","element":"img"}],[{"text":"Proof. 
","element":"span"},{"text":"Again, we will construct a sequence ","element":"span"},{"style":{"height":25.97},"width":406.12,"height":64.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/27-1.png","element":"img","alt":" Zt so that Ztwi+1,t(Γi+1) ","inline":true,"padRight":true},{"text":"is a super-martingale. Consider ","element":"span"},{"text":"the sequence ","element":"span"},{"style":{"height":14.69},"width":41.76,"height":36.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/27-2.png","element":"img","alt":" Zt","inline":true,"padRight":true},{"text":"defined as follows.","element":"span"}],[{"text":"• ","element":"span"},{"style":{"height":29.26},"width":181.56,"height":73.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/27-3.png","element":"img","alt":" Z1 =αdi+12d","inline":true}],[{"text":"• ","element":"span"},{"text":"If ","element":"span"},{"style":{"height":17.6},"width":566.4,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/27-4.png","element":"img","alt":" it ̸∈ {i, i + 1} then Zt+1 = Zt.","inline":true}],[{"text":"• ","element":"span"},{"text":"If ","element":"span"},{"style":{"height":31.6},"width":1210.68,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/27-5.png","element":"img","alt":" it = i and |y − ⟨xt, q0⟩| > βi or it = i + 1 then Zt+1 =�1 − αi+1βi+1","inline":true}],[{"text":"• ","element":"span"},{"text":"If ","element":"span"},{"style":{"height":20.93},"width":1031.52,"height":52.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/27-6.png","element":"img","alt":" it = i and |y − ⟨xt, q0⟩| ≤ βi then Zt+1 =�1 + γ10di �Zt","inline":true}],[{"text":"Consider the ratio ","element":"span"},{"style":{"height":25.97},"width":303.88,"height":64.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/27-7.png","element":"img","alt":" Yt = Ztwi+1,t(Γi+1)","inline":true},{"text":". 
Similarly as in the proof of Claim ","element":"span"},{"href":"#id-50","text":"5.10, ","element":"a"},{"style":{"height":14.69},"width":95.44,"height":36.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/27-8.png","element":"img","alt":" Y1 ≤","inline":true,"padRight":true},{"text":"1. Note that ","element":"span"},{"text":"Claim ","element":"span"},{"href":"#id-48","text":"5.7 ","element":"a"},{"text":"and Claim ","element":"span"},{"href":"#id-49","text":"5.8 ","element":"a"},{"text":"imply that","element":"span"}],[{"style":{"width":"12%"},"width":231,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/27-9.png","element":"img"}],[{"text":"so ","element":"span"},{"style":{"height":14.69},"width":37.44,"height":36.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/27-10.png","element":"img","alt":" Yt","inline":true,"padRight":true},{"text":"is a super-martingale. Now assume that ","element":"span"},{"style":{"height":32.29},"width":430.16,"height":80.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/27-11.png","element":"img","alt":" Ci ≥ 100�d4iγ10di + d25i�","inline":true},{"text":". By the constraints of our algorithm, we are guaranteed that","element":"span"}],[{"style":{"width":"53%"},"width":998,"height":117,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/27-12.png","element":"img"}],[{"text":"Also by Claim ","element":"span"},{"href":"#id-50","text":"5.10, ","element":"a"},{"text":"with probability at least 1 ","element":"span"},{"style":{"height":24.53},"width":458.52,"height":61.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/27-13.png","element":"img","alt":" − 1210d4i , Bi ≤ 1100d25i","inline":true,"padRight":true},{"text":"over all rounds ","element":"span"},{"text":"t","element":"span"},{"text":". 
This ","element":"span"},{"text":"implies that eventually","element":"span"}],[{"style":{"width":"26%"},"width":500,"height":106,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/27-14.png","element":"img"}],[{"text":"Thus, for sufficiently large ","element":"span"},{"text":"t","element":"span"},{"text":", we have that","element":"span"}],[{"style":{"width":"58%"},"width":1090,"height":126,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/27-15.png","element":"img"}],[{"text":"However note ","element":"span"},{"style":{"height":15.09},"width":270.24,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/27-16.png","element":"img","alt":" Y1 ≤ 1 and Yt","inline":true,"padRight":true},{"text":"is a supermartingale. Also, ","element":"span"},{"style":{"height":18.29},"width":276.88,"height":45.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/27-17.png","element":"img","alt":" wi+1,t(Γi+1) ≤","inline":true,"padRight":true},{"text":"1 for all rounds ","element":"span"},{"text":"t","element":"span"},{"text":". Thus, by Doob’s martingale inequality, the probability we ever have ","element":"span"},{"style":{"height":32.1},"width":634.28,"height":80.24,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/27-18.png","element":"img","alt":" Ci ≥ 100�d4iγ10di + d25i�is at most","inline":true}],[{"style":{"width":"20%"},"width":390,"height":92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/27-19.png","element":"img"}],[{"text":"(the first term is from the probability that at some point ","element":"span"},{"style":{"height":19.13},"width":756.88,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/27-20.png","element":"img","alt":" Bi ≥ 1100d25i). 
■","inline":true}],[{"style":{"width":"46%"},"width":864,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/28-0.png","element":"img"}],[{"text":"We now aim to show that with high probability, if the weight function ","element":"span"},{"style":{"height":10.69},"width":43.2,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/28-1.png","element":"img","alt":" wi","inline":true,"padRight":true},{"text":"is concentrated on a thin strip, this strip must be close to the true point ","element":"span"},{"style":{"height":11.6},"width":36.68,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/28-2.png","element":"img","alt":" q0","inline":true,"padRight":true},{"text":"(this is necessary to bound the total regret we incur each round in scale ","element":"span"},{"text":"i","element":"span"},{"text":"). To do this, we will argue that we can “round” points to the ","element":"span"},{"style":{"height":10.69},"width":56.76,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/28-3.png","element":"img","alt":" αi-","inline":true,"padRight":true},{"text":"net ","element":"span"},{"style":{"height":17.01},"width":59.48,"height":42.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/28-4.png","element":"img","alt":" Sαi","inline":true,"padRight":true},{"text":"without significantly affecting their weight. 
We will then rely on the geometric observation mentioned earlier: that for points ","element":"span"},{"style":{"height":17.01},"width":848.84,"height":42.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/28-5.png","element":"img","alt":" q1, q2 ∈ Sαi for some i such that q0, q1, and q2","inline":true,"padRight":true},{"text":"are nearly collinear, we can relate the weights ","element":"span"},{"style":{"height":18.29},"width":354.44,"height":45.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/28-6.png","element":"img","alt":" wi,t(q1) and wi,t(q2","inline":true},{"text":"). We begin by relating the weights of collinear points.","element":"span"}],[{"id":"id-51","text":"Claim 5.12. ","element":"span"},{"text":"Fix an index ","element":"span"},{"style":{"height":16.4},"width":356.36,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/28-7.png","element":"img","alt":" i. If q0, q1, and q2","inline":true,"padRight":true},{"text":"are collinear in that order, then with probability at least ","element":"span"},{"text":"1 ","element":"span"},{"style":{"height":17.34},"width":152.64,"height":43.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/28-8.png","element":"img","alt":" − 2−d10i ","inline":true,"padRight":true},{"text":"we have that for all rounds ","element":"span"},{"text":"t","element":"span"}],[{"style":{"width":"20%"},"width":376,"height":55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/28-9.png","element":"img"}],[{"text":"Proof. ","element":"span"},{"text":"Consider a time step ","element":"span"},{"style":{"height":15.09},"width":462.16,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/28-10.png","element":"img","alt":" t where it = i or it = i−","inline":true},{"text":"1. 
We say a point is on the “good” side of the hyperplane ","element":"span"},{"style":{"height":17.6},"width":200.08,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/28-11.png","element":"img","alt":" ⟨xt, q⟩ = ˆy","inline":true,"padRight":true},{"text":"if it is on the same side as ","element":"span"},{"style":{"height":11.6},"width":36.68,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/28-12.png","element":"img","alt":" q0","inline":true},{"text":". Otherwise we say the point is on the “bad” side. Note for ","element":"span"},{"style":{"height":11.6},"width":94.28,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/28-13.png","element":"img","alt":" q1, q2","inline":true,"padRight":true},{"text":"satisfying the conditions of the claim, one of the following statements must be true:","element":"span"}],[{"text":"• ","element":"span"},{"style":{"height":16},"width":271.4,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/28-14.png","element":"img","alt":" Case 1: q1, q2","inline":true,"padRight":true},{"text":"are on the same side of the hyperplane ","element":"span"},{"style":{"height":17.6},"width":206.88,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/28-15.png","element":"img","alt":" ⟨xt, q⟩ = ˆy.","inline":true}],[{"text":"• ","element":"span"},{"style":{"height":16},"width":211.4,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/28-16.png","element":"img","alt":" Case 2: q1","inline":true,"padRight":true},{"text":"is on the good side of the hyperplane and ","element":"span"},{"style":{"height":11.6},"width":36.67,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/28-17.png","element":"img","alt":" q2","inline":true,"padRight":true},{"text":"is on the bad side of the hyperplane.","element":"span"}],[{"text":"We will now consider the quantity 
","element":"span"},{"style":{"height":37.39},"width":315,"height":93.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/28-18.png","element":"img","alt":" Rt =�wi,t(q2)wi,t(q1)�d9","inline":true},{"text":". Note that in Case 1, then ","element":"span"},{"style":{"height":17.6},"width":288.32,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/28-19.png","element":"img","alt":" Rt+1 = Rt (the","inline":true,"padRight":true},{"text":"relative weights remain unchanged if both ","element":"span"},{"style":{"height":16},"width":178.76,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/28-20.png","element":"img","alt":" q1 and q2","inline":true,"padRight":true},{"text":"are on the same side of the hyperplane). In Case 2,","element":"span"}],[{"style":{"width":"99%"},"width":1870,"height":548,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/28-21.png","element":"img"}],[{"text":"By Doob’s martingale inequality, the probability that this ever happens is at most ","element":"span"},{"style":{"height":22.14},"width":256.32,"height":55.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/28-22.png","element":"img","alt":" γd10i ≤ 2−d10i,","inline":true,"padRight":true},{"text":"as desired. ","element":"span"},{"style":{"height":0},"width":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/28-23.png","element":"img","alt":"■","inline":true}],[{"text":"We next relate the weights of nearby points ","element":"span"},{"style":{"height":16},"width":451.88,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/28-24.png","element":"img","alt":" q1 and q2. If q1 and q2","inline":true,"padRight":true},{"text":"are close together, then it is unlikely they are ever separated by a hyperplane, and their weights should be similar. 
The following claim captures this intuition.","element":"span"}],[{"id":"id-52","text":"Claim 5.13. ","element":"span"},{"text":"Fix an index ","element":"span"},{"style":{"height":19.73},"width":764.84,"height":49.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/28-25.png","element":"img","alt":" i. If q1, and q2 satisfy ∥q1 − q2∥ ≤ 2β10i ","inline":true,"padRight":true},{"text":", then with probability at least ","element":"span"},{"text":"1 ","element":"span"},{"style":{"height":17.34},"width":152.16,"height":43.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/28-26.png","element":"img","alt":" − 2−d10i ","inline":true,"padRight":true},{"text":"we have that for all rounds ","element":"span"},{"text":"t","element":"span"}],[{"style":{"width":"19%"},"width":367,"height":47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/28-27.png","element":"img"}],[{"text":"Proof. ","element":"span"},{"text":"We will bound the number of rounds ","element":"span"},{"style":{"height":15.09},"width":572.08,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/29-0.png","element":"img","alt":" t such that it = i or it = i −","inline":true,"padRight":true},{"text":"1 and the hyperplane ","element":"span"},{"style":{"height":17.6},"width":201.04,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/29-1.png","element":"img","alt":"⟨xt, q⟩ = ˆy","inline":true,"padRight":true},{"text":"intersects the segment connecting ","element":"span"},{"style":{"height":16},"width":176.84,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/29-2.png","element":"img","alt":" q1 and q2","inline":true},{"text":". 
We will show that with high probability, this quantity is at most ","element":"span"},{"style":{"height":15.13},"width":56.76,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/29-3.png","element":"img","alt":" d9i","inline":true},{"text":". Note that since ","element":"span"},{"style":{"height":18.29},"width":280.04,"height":45.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/29-4.png","element":"img","alt":" wi,t(q1)/wi,t(q2","inline":true},{"text":") is unchanged when ","element":"span"},{"style":{"height":16},"width":334.4,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/29-5.png","element":"img","alt":" q1 and q2 both lie","inline":true,"padRight":true},{"text":"on the same side of the hyperplane, and decreases by at most a factor of (1 ","element":"span"},{"style":{"height":12},"width":65.68,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/29-6.png","element":"img","alt":" − η","inline":true},{"text":") when they lie on different sides, this will show that with high probability","element":"span"}],[{"style":{"width":"99%"},"width":1870,"height":424,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/29-7.png","element":"img"}],[{"text":"indices ","element":"span"},{"style":{"height":15.09},"width":514,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/29-8.png","element":"img","alt":" t for which it = i or it = i−","inline":true},{"text":"1. 
The probability that at least ","element":"span"},{"style":{"height":15.13},"width":56.76,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/29-9.png","element":"img","alt":" d9i","inline":true,"padRight":true},{"text":"of the hyperplanes ","element":"span"},{"style":{"height":17.6},"width":196.72,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/29-10.png","element":"img","alt":" ⟨xt, q⟩ = ˆy","inline":true,"padRight":true},{"text":"intersect the segment connecting ","element":"span"},{"style":{"height":16},"width":174.44,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/29-11.png","element":"img","alt":" q2 and q1","inline":true,"padRight":true},{"text":"is at most","element":"span"}],[{"style":{"width":"43%"},"width":816,"height":129,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/29-12.png","element":"img"}],[{"text":"which implies our desired result.","element":"span"}],[{"style":{"width":"1%"},"width":30,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/29-13.png","element":"img"}],[{"text":"Finally, we apply Claims ","element":"span"},{"href":"#id-51","text":"5.12 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-52","text":"5.13 ","element":"a"},{"text":"to bound the relative weights for all approximately collinear pairs of points in ","element":"span"},{"style":{"height":15.09},"width":62.4,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/29-14.png","element":"img","alt":" Sα.","inline":true}],[{"id":"id-53","text":"Claim 5.14. ","element":"span"},{"text":"Fix an index ","element":"span"},{"text":"i","element":"span"},{"text":". 
With probability at least ","element":"span"},{"text":"1 ","element":"span"},{"style":{"height":17.15},"width":138.24,"height":42.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/29-15.png","element":"img","alt":" − 2−d9i ","inline":true,"padRight":true},{"text":"the following claim holds: for all","element":"span"}],[{"style":{"width":"71%"},"width":1331,"height":345,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/29-16.png","element":"img"}],[{"text":"Proof. ","element":"span"},{"text":"Fix a pair of points ","element":"span"},{"style":{"height":17.01},"width":215.48,"height":42.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/29-17.png","element":"img","alt":" q1, q2 ∈ Sαi","inline":true,"padRight":true},{"text":"satisfying the conditions in the statement. Let ","element":"span"},{"style":{"height":16},"width":187.04,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/29-18.png","element":"img","alt":" q⊥ be the","inline":true,"padRight":true},{"text":"foot of the perpendicular from ","element":"span"},{"style":{"height":11.6},"width":36.68,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/29-19.png","element":"img","alt":" q1","inline":true,"padRight":true},{"text":"to the segment connecting ","element":"span"},{"style":{"height":16},"width":174.44,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/29-20.png","element":"img","alt":" q2 and q0","inline":true},{"text":". 
Note that","element":"span"}],[{"style":{"width":"16%"},"width":318,"height":51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/29-21.png","element":"img"}],[{"text":"Therefore, by the conditions of Claim ","element":"span"},{"href":"#id-52","text":"5.13, ","element":"a"},{"text":"with probability at least 1 ","element":"span"},{"style":{"height":17.15},"width":150.24,"height":42.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/29-22.png","element":"img","alt":" − 2−d10i","inline":true},{"text":", for all rounds ","element":"span"},{"text":"t","element":"span"},{"text":",","element":"span"}],[{"style":{"width":"96%"},"width":1803,"height":112,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/29-23.png","element":"img"}],[{"style":{"height":16},"width":176.36,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/29-24.png","element":"img","alt":"q0 and q2","inline":true,"padRight":true},{"text":"on this line. 
By Claim ","element":"span"},{"href":"#id-51","text":"5.12, ","element":"a"},{"text":"this means that with probability at least 1 ","element":"span"},{"style":{"height":20.35},"width":295.68,"height":50.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/29-25.png","element":"img","alt":" − 2−d10i, for all","inline":true,"padRight":true},{"text":"rounds ","element":"span"},{"text":"t","element":"span"},{"text":",","element":"span"}],[{"style":{"width":"21%"},"width":397,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/29-26.png","element":"img"}],[{"style":{"width":"99%"},"width":1871,"height":220,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/30-0.png","element":"img"}],[{"text":"This is for a specific pair of points in ","element":"span"},{"style":{"height":17.01},"width":59.48,"height":42.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/30-1.png","element":"img","alt":" Sαi","inline":true},{"text":". 
Union bounding over all ","element":"span"},{"style":{"height":19.35},"width":104.36,"height":48.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/30-2.png","element":"img","alt":" |Sαi|2 ","inline":true,"padRight":true},{"text":"pairs of points, we have that the theorem statement fails with probability at most","element":"span"}],[{"style":{"width":"70%"},"width":1329,"height":174,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/30-3.png","element":"img"}],[{"text":"Finally, we prove that if ","element":"span"},{"style":{"height":10.69},"width":43.2,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/30-4.png","element":"img","alt":" wi","inline":true,"padRight":true},{"text":"concentrates on a strip, ","element":"span"},{"style":{"height":16},"width":269.28,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/30-5.png","element":"img","alt":" q0 is within γi","inline":true,"padRight":true},{"text":"of this strip. To show this, it suffices to show that the weight of ","element":"span"},{"style":{"height":17.6},"width":138.24,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/30-6.png","element":"img","alt":" B(q0, γi","inline":true},{"text":") is large enough that it must intersect a sufficiently concentrated strip. We do this by using Claim ","element":"span"},{"href":"#id-53","text":"5.14 ","element":"a"},{"text":"to relate the weight of points of ","element":"span"},{"style":{"height":17.01},"width":59.48,"height":42.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/30-7.png","element":"img","alt":" Sαi","inline":true,"padRight":true},{"text":"inside and outside ","element":"span"},{"style":{"height":17.6},"width":168.48,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/30-8.png","element":"img","alt":" B(q0, γi).","inline":true}],[{"id":"id-56","text":"Claim 5.15. 
","element":"span"},{"text":"Fix an index ","element":"span"},{"text":"i","element":"span"},{"text":". With probability at least ","element":"span"},{"text":"1 ","element":"span"},{"style":{"height":17.34},"width":137.76,"height":43.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/30-9.png","element":"img","alt":" − 2−d8i","inline":true},{"text":", the following statement holds for all ","element":"span"},{"text":"t","element":"span"},{"text":":","element":"span"}],[{"style":{"width":"64%"},"width":1202,"height":288,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/30-10.png","element":"img"}],[{"text":"Proof. ","element":"span"},{"text":"First for each point ","element":"span"},{"style":{"height":17.01},"width":137.72,"height":42.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/30-11.png","element":"img","alt":" q ∈ Sαi","inline":true},{"text":", consider an axis-parallel box centered at that point with side length ","element":"span"},{"style":{"height":19.04},"width":32.6,"height":47.6,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/30-12.png","element":"img","alt":"αid ","inline":true,"padRight":true},{"text":". Now consider all rounds ","element":"span"},{"style":{"height":15.09},"width":444.88,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/30-13.png","element":"img","alt":" t with it = i or it = i −","inline":true,"padRight":true},{"text":"1 and all planes of the form ","element":"span"},{"style":{"height":17.6},"width":196.72,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/30-14.png","element":"img","alt":" ⟨xt, q⟩ = ˆy","inline":true,"padRight":true},{"text":"for these rounds. 
We show that with high probability, all boxes intersect at most ","element":"span"},{"style":{"height":15.14},"width":56.76,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/30-15.png","element":"img","alt":" d9i","inline":true,"padRight":true},{"text":"of these planes. Using essentially the same argument as in Claim ","element":"span"},{"href":"#id-52","text":"5.13, ","element":"a"},{"text":"we find that this probability is at least","element":"span"}],[{"id":"id-54","style":{"width":"99%"},"width":1867,"height":579,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/30-16.png","element":"img"}],[{"text":"In both of these inequalities we are using the fact that if at most ","element":"span"},{"style":{"height":15.14},"width":56.76,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/30-17.png","element":"img","alt":" d9i","inline":true,"padRight":true},{"text":"planes intersect any box, then the weights of any two points in the same box are within a factor of (1 ","element":"span"},{"style":{"height":23.28},"width":143.52,"height":58.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/30-18.png","element":"img","alt":" − η)d9i.","inline":true}],[{"text":"Next, consider the ball ","element":"span"},{"style":{"height":17.81},"width":920.08,"height":44.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/30-19.png","element":"img","alt":" B(q0, γi − αi). Let Ti = {B(q0, γi − αi) ∩ Sαi}","inline":true},{"text":". 
Consider the following two transformations: ","element":"span"},{"style":{"height":17.81},"width":426.24,"height":44.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/30-20.png","element":"img","alt":" f : Sαi → B(q0, γi − αi","inline":true},{"text":"), which sends a point ","element":"span"},{"text":"q ","element":"span"},{"text":"to","element":"span"}],[{"style":{"width":"25%"},"width":475,"height":81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/31-0.png","element":"img"}],[{"text":"and the transformation ","element":"span"},{"style":{"height":17.6},"width":646.4,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/31-1.png","element":"img","alt":" g : B(q0, γi − αi) → Ti, where g(q","inline":true},{"text":") is the point obtained by rounding the coordinates of ","element":"span"},{"text":"q ","element":"span"},{"text":"to the nearest integer multiple of ","element":"span"},{"style":{"height":19.15},"width":428.64,"height":47.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/31-2.png","element":"img","alt":"αid (note that g(q) ∈ Ti","inline":true},{"text":"). 
If we consider the map ","element":"span"},{"style":{"height":17.6},"width":281.6,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/31-3.png","element":"img","alt":"q → q′ = g(f(q","inline":true},{"text":")) given by the above, the number of points ","element":"span"},{"style":{"height":16},"width":123.28,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/31-4.png","element":"img","alt":" q ∈ Sα","inline":true,"padRight":true},{"text":"that map to a fixed point ","element":"span"},{"style":{"height":15.6},"width":123.36,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/31-5.png","element":"img","alt":" q′ ∈ Ti","inline":true,"padRight":true},{"text":"is at most","element":"span"}],[{"style":{"width":"7%"},"width":144,"height":100,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/31-6.png","element":"img"}],[{"text":"To see this, note that ","element":"span"},{"style":{"height":19.14},"width":104.48,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/31-7.png","element":"img","alt":" g−1(q","inline":true},{"text":") is an axis-parallel box with side-length ","element":"span"},{"style":{"height":20.69},"width":491.24,"height":51.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/31-8.png","element":"img","alt":"αid , and thus f −1(g−1(q))","inline":true,"padRight":true},{"text":"contains all the points in ","element":"span"},{"style":{"height":15.09},"width":48.88,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/31-9.png","element":"img","alt":" Sα","inline":true,"padRight":true},{"text":"contained within an axis aligned box with side-length ","element":"span"},{"style":{"height":24.45},"width":199.2,"height":61.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/31-10.png","element":"img","alt":"2αidγi , 
which","inline":true,"padRight":true},{"text":"contains at least (2","element":"span"},{"style":{"height":19.54},"width":526.08,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/31-11.png","element":"img","alt":"/γi + 1)d < (10/γi)d points.","inline":true}],[{"text":"Now, note that ","element":"span"},{"style":{"height":17.6},"width":320.48,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/31-12.png","element":"img","alt":" q and q′ = g(f(q","inline":true},{"text":")) satisfy the conditions of Claim ","element":"span"},{"href":"#id-53","text":"5.14. ","element":"a"},{"text":"Thus, by Claim ","element":"span"},{"href":"#id-53","text":"5.14, ","element":"a"},{"text":"with probability at least 1 ","element":"span"},{"style":{"height":24.53},"width":108.44,"height":61.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/31-13.png","element":"img","alt":" − 12d9i","inline":true,"padRight":true},{"text":"we have that","element":"span"}],[{"style":{"width":"67%"},"width":1262,"height":135,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/31-14.png","element":"img"}],[{"text":"Combining the above with equations ","element":"span"},{"href":"#id-54","text":"(1) ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-54","text":"(2)","element":"a"},{"text":", we conclude that with probability at least","element":"span"}],[{"style":{"width":"74%"},"width":1389,"height":298,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/31-15.png","element":"img"}],[{"text":"This implies that the ball ","element":"span"},{"style":{"height":17.6},"width":138.24,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/31-16.png","element":"img","alt":" B(q0, γi","inline":true},{"text":") must intersect the strip 
","element":"span"},{"style":{"height":17.6},"width":286.36,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/31-17.png","element":"img","alt":" a ≤ ⟨xt, q⟩ ≤ b","inline":true},{"text":". If this happens then the desired condition is clearly satisfied. ","element":"span"},{"style":{"height":0},"width":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/31-18.png","element":"img","alt":"■","inline":true}],[{"style":{"width":"34%"},"width":652,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/31-19.png","element":"img"}],[{"text":"Finally, we can proceed to prove the main theorem.","element":"span"}],[{"text":"Proof of Theorem ","element":"span"},{"href":"#id-45","text":"5.5. ","element":"a"},{"text":"First, for each ","element":"span"},{"style":{"height":15.6},"width":146.4,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/31-20.png","element":"img","alt":" i, let Li","inline":true,"padRight":true},{"text":"be the total loss at scale ","element":"span"},{"text":"i","element":"span"},{"text":". We will bound ","element":"span"},{"style":{"height":17.6},"width":108.48,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/31-21.png","element":"img","alt":" E[Li].","inline":true}],[{"text":"By Claim ","element":"span"},{"href":"#id-55","text":"5.11, ","element":"a"},{"text":"with probability at least 1 ","element":"span"},{"style":{"height":31.36},"width":766.96,"height":78.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/31-22.png","element":"img","alt":" − 12d4(i−1) , Ci−1 ≤ 100(d4(i−1)γ10di−1 + d25(i −","inline":true,"padRight":true},{"text":"1)) for all rounds. 
If this is true, then the only time we query at level ","element":"span"},{"text":"i","element":"span"},{"text":", there must be some strip given by ","element":"span"},{"style":{"height":17.6},"width":292.12,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/31-23.png","element":"img","alt":"a ≤ ⟨xt, q⟩ ≤ b","inline":true,"padRight":true},{"text":"of width at most 10","element":"span"},{"style":{"height":11.6},"width":34.56,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/31-24.png","element":"img","alt":"γi","inline":true,"padRight":true},{"text":"that contains 1 ","element":"span"},{"style":{"height":20.13},"width":105.36,"height":50.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/31-25.png","element":"img","alt":" − γ4di","inline":true,"padRight":true},{"text":"of the total weight of ","element":"span"},{"style":{"height":16.4},"width":257.72,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/31-26.png","element":"img","alt":" wi. Thus, by","inline":true,"padRight":true},{"text":"Claim ","element":"span"},{"href":"#id-56","text":"5.15, ","element":"a"},{"text":"with at least 1 ","element":"span"},{"style":{"height":24.72},"width":355.68,"height":61.8,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/31-27.png","element":"img","alt":" − 12d4(i−1) − 12d8(i−1)","inline":true,"padRight":true},{"text":"probability, all queries at level ","element":"span"},{"text":"i ","element":"span"},{"text":"incur loss at most","element":"span"}],[{"text":"12","element":"span"},{"style":{"height":16.4},"width":286.08,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/32-0.png","element":"img","alt":"γi + 2βi ≤ 14γi","inline":true},{"text":". 
Now, by using Claim ","element":"span"},{"href":"#id-50","text":"5.10, ","element":"a"},{"text":"we can bound the expected total loss at level ","element":"span"},{"text":"i ","element":"span"},{"text":"as","element":"span"}],[{"style":{"width":"102%"},"width":1912,"height":584,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/32-1.png","element":"img"}],[{"text":"It follows that","element":"span"}],[{"style":{"width":"85%"},"width":1602,"height":186,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/32-2.png","element":"img"}],[{"text":"Remark. ","element":"span"},{"text":"Naively, one can implement Algorithm ","element":"span"},{"href":"#id-43","text":"6 ","element":"a"},{"text":"with time complexity ","element":"span"},{"style":{"height":15.94},"width":101.8,"height":39.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/32-3.png","element":"img","alt":" T O(d)","inline":true},{"text":", via the observation that ","element":"span"},{"text":"T ","element":"span"},{"text":"hyperplanes divide ","element":"span"},{"text":"B","element":"span"},{"text":"(0","element":"span"},{"text":", ","element":"span"},{"text":"1) into at most ","element":"span"},{"style":{"height":19.54},"width":101.04,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/32-4.png","element":"img","alt":" O(T d","inline":true},{"text":") pieces, so we can simply compute this division and the weight of each distribution ","element":"span"},{"style":{"height":10.69},"width":43.2,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/32-5.png","element":"img","alt":" wi","inline":true,"padRight":true},{"text":"(we care about at most ","element":"span"},{"text":"T ","element":"span"},{"text":"scales) on each component of this division.","element":"span"}],[{"text":"It is an interesting open question if it is possible to implement Algorithm ","element":"span"},{"href":"#id-43","text":"6 
","element":"a"},{"text":"(or otherwise achieve ","element":"span"},{"text":"O","element":"span"},{"text":"(poly(","element":"span"},{"text":"d","element":"span"},{"text":")) regret) with time complexity poly(","element":"span"},{"text":"d, T","element":"span"},{"text":"). To do so, it would suffice to be able to efficiently sample from the distributions ","element":"span"},{"style":{"height":10.69},"width":56.64,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/32-6.png","element":"img","alt":" wi.","inline":true}]]},{"heading":"6 Tight loss bounds for full feedback","paragraphs":[[{"text":"We also study the problem where the learner has full feedback, i.e, after the prediction ","element":"span"},{"style":{"height":16.4},"width":214.68,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/32-7.png","element":"img","alt":" yt the feed-","inline":true,"padRight":true},{"text":"back is the actual value of ","element":"span"},{"style":{"height":17.6},"width":94.08,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/32-8.png","element":"img","alt":" f0(xt","inline":true},{"text":"). We show that the optimal regret can be completely characterized (up to constant factors) by a continuous analogue of the Littlestone dimension.","element":"span"}],[{"text":"For this section we don’t require the assumption that ","element":"span"},{"text":"Y ","element":"span"},{"text":"is ordered, only that the loss function forms a valid metric (i.e. is symmetric and satisfies the triangle inequality).","element":"span"}],[{"text":"6.1 ","element":"span"},{"text":"Tree Dimension","element":"span"}],[{"text":"Definition 6.1. 
","element":"span"},{"text":"A (","element":"span"},{"text":"X","element":"span"},{"text":", ","element":"span"},{"text":"Y","element":"span"},{"text":")-tree of cost ","element":"span"},{"text":"c ","element":"span"},{"text":"is a labeled binary tree with the following properties","element":"span"}],[{"text":"• ","element":"span"},{"text":"There is a root node and each interior node has two children","element":"span"}],[{"text":"• ","element":"span"},{"text":"Each interior node is labeled with a triple (","element":"span"},{"style":{"height":17.6},"width":630.08,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/32-9.png","element":"img","alt":"x, y1, y2) where x ∈ X, y1, y2 ∈ Y","inline":true}],[{"text":"• ","element":"span"},{"text":"For each leaf, the sum of ","element":"span"},{"style":{"height":17.6},"width":133.16,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/32-10.png","element":"img","alt":" ℓ(y1, y2","inline":true},{"text":") over all nodes on the path from the root to the leaf is at least ","element":"span"},{"text":"c","element":"span"},{"text":".","element":"span"}],[{"text":"Definition 6.2. 
","element":"span"},{"text":"We say a (","element":"span"},{"text":"X","element":"span"},{"text":", ","element":"span"},{"text":"Y","element":"span"},{"text":")-tree ","element":"span"},{"text":"T ","element":"span"},{"text":"is ","element":"span"},{"text":"H","element":"span"},{"text":"-satisfiable if we can label each leaf with some ","element":"span"},{"style":{"height":16.4},"width":116.2,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-0.png","element":"img","alt":" f ∈ H","inline":true,"padRight":true},{"text":"such that for each node (","element":"span"},{"style":{"height":17.6},"width":249.56,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-1.png","element":"img","alt":"x, y1, y2) ∈ T","inline":true},{"text":", all leaves of the left subtree satisfy ","element":"span"},{"style":{"height":17.6},"width":181.64,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-2.png","element":"img","alt":" f(x) = y1","inline":true,"padRight":true},{"text":"and all leaves of the right subtree satisfy ","element":"span"},{"style":{"height":17.6},"width":195.36,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-3.png","element":"img","alt":" f(x) = y2.","inline":true}],[{"text":"Definition 6.3 ","element":"span"},{"text":"(Tree dimension)","element":"span"},{"style":{"height":17.6},"width":315.4,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-4.png","element":"img","alt":". We define τ(H","inline":true},{"text":"), the tree dimension of ","element":"span"},{"text":"H","element":"span"},{"text":", to be the maximum cost of a (","element":"span"},{"text":"X","element":"span"},{"text":", ","element":"span"},{"text":"Y","element":"span"},{"text":")-tree that is ","element":"span"},{"text":"H","element":"span"},{"text":"-satisfiable.","element":"span"}],[{"text":"Remark. 
","element":"span"},{"text":"Note we can naturally extend the above definition to any subset ","element":"span"},{"style":{"height":13.2},"width":156,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-5.png","element":"img","alt":" H′ ⊂ H.","inline":true}],[{"text":"It is worth noting that covering dimension is “more restrictive” than tree dimension in the sense that bounded covering dimension implies bounded tree dimension.","element":"span"}],[{"id":"id-57","text":"Theorem 6.4. ","element":"span"},{"text":"Let ","element":"span"},{"text":"H ","element":"span"},{"text":"be a hypothesis class consisting of functions mapping ","element":"span"},{"style":{"height":14.8},"width":394.4,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-6.png","element":"img","alt":" X → Y and let L be","inline":true,"padRight":true},{"text":"a loss function that defines a metric on ","element":"span"},{"text":"Y","element":"span"},{"text":". If ","element":"span"},{"text":"Cdim","element":"span"},{"text":"(","element":"span"},{"text":"H","element":"span"},{"text":") is finite then","element":"span"}],[{"style":{"width":"19%"},"width":369,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-7.png","element":"img"}],[{"text":"Before proving the above we prove a few preliminary lemmas.","element":"span"}],[{"id":"id-58","text":"Lemma 6.5. ","element":"span"},{"text":"Let ","element":"span"},{"text":"H ","element":"span"},{"text":"be a hypothesis class and ","element":"span"},{"style":{"height":12.8},"width":18,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-8.png","element":"img","alt":" ℓ","inline":true,"padRight":true},{"text":"be a loss function that defines a metric on ","element":"span"},{"text":"Y","element":"span"},{"text":". 
Let ","element":"span"},{"text":"T ","element":"span"},{"text":"be an ","element":"span"},{"text":"H","element":"span"},{"text":"-satisfiable ","element":"span"},{"text":"(","element":"span"},{"text":"X","element":"span"},{"text":", ","element":"span"},{"text":"Y","element":"span"},{"text":")","element":"span"},{"text":"-tree where all leaves have depth ","element":"span"},{"text":"d ","element":"span"},{"text":"and such that for each internal node ","element":"span"},{"text":"(","element":"span"},{"style":{"height":19.14},"width":595.28,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-9.png","element":"img","alt":"x, y1, y2), ℓ(y1, y2) > 2−i. Then","inline":true}],[{"style":{"width":"21%"},"width":398,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-10.png","element":"img"}],[{"text":"Proof. ","element":"span"},{"text":"Since the tree is ","element":"span"},{"text":"H","element":"span"},{"text":"-satisfiable, we can label the leaves with functions ","element":"span"},{"style":{"height":16.8},"width":347.44,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-11.png","element":"img","alt":" f ∈ H. 
Any two","inline":true,"padRight":true},{"text":"of these functions ","element":"span"},{"style":{"height":16.4},"width":98.12,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-12.png","element":"img","alt":" f1, f2","inline":true,"padRight":true},{"text":"must satisfy ","element":"span"},{"style":{"height":19.14},"width":328.8,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-13.png","element":"img","alt":" d∞(f1, f2) ≥ 2−i ","inline":true,"padRight":true},{"text":"since there must be some internal node (","element":"span"},{"style":{"height":17.6},"width":806.6,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-14.png","element":"img","alt":"x, y1, y2) where f1(x) = y1 and f2(x) = y2","inline":true},{"text":". Therefore, there are 2","element":"span"},{"style":{"height":8.8},"width":18,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-15.png","element":"img","alt":"d ","inline":true,"padRight":true},{"text":"functions in ","element":"span"},{"text":"H ","element":"span"},{"text":"such that any two have ","element":"span"},{"style":{"height":15.09},"width":56.56,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-16.png","element":"img","alt":" d∞","inline":true,"padRight":true},{"text":"distance bigger than 2","element":"span"},{"style":{"height":8.4},"width":38.4,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-17.png","element":"img","alt":"−i","inline":true},{"text":". 
This implies that","element":"span"}],[{"style":{"width":"18%"},"width":338,"height":53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-18.png","element":"img"}],[{"text":"Now by the definition of covering dimension, we conclude ","element":"span"},{"style":{"height":17.6},"width":748.24,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-19.png","element":"img","alt":" d ≤ (i + 1) · Cdim(H). ■","inline":true}],[{"text":"Given a rooted binary tree ","element":"span"},{"text":"T","element":"span"},{"text":", we say a rooted binary tree ","element":"span"},{"style":{"height":12},"width":51.03,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-20.png","element":"img","alt":" T′ ","inline":true,"padRight":true},{"text":"is contained in ","element":"span"},{"text":"T ","element":"span"},{"text":"if all of the nodes of ","element":"span"},{"style":{"height":12},"width":51.04,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-21.png","element":"img","alt":" T′ ","inline":true,"padRight":true},{"text":"are nodes of ","element":"span"},{"text":"T ","element":"span"},{"text":"and the nodes of ","element":"span"},{"style":{"height":12},"width":51.04,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-22.png","element":"img","alt":" T′ ","inline":true,"padRight":true},{"text":"form a binary tree where each interior node has two children under the topology given by ","element":"span"},{"text":"T","element":"span"},{"text":".","element":"span"}],[{"id":"id-59","text":"Lemma 6.6. ","element":"span"},{"text":"Consider a rooted binary tree ","element":"span"},{"text":"T ","element":"span"},{"text":"(where all interior nodes have exactly two children) and say its nodes are colored with colors ","element":"span"},{"text":"1","element":"span"},{"text":", ","element":"span"},{"text":"2","element":"span"},{"text":", . . . 
, c","element":"span"},{"text":". ","element":"span"},{"text":"We say the colored tree satisfies property ","element":"span"},{"text":"(","element":"span"},{"style":{"height":17.6},"width":199.4,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-23.png","element":"img","alt":"x1, . . . , xc)","inline":true,"padRight":true},{"text":"if for each ","element":"span"},{"style":{"height":17.6},"width":111.36,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-24.png","element":"img","alt":" i ∈ [c]","inline":true},{"text":", it does not contain a monochromatic complete binary tree of color ","element":"span"},{"text":"i ","element":"span"},{"text":"and depth ","element":"span"},{"style":{"height":10.69},"width":36.96,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-25.png","element":"img","alt":" xi","inline":true},{"text":". If the coloring of ","element":"span"},{"text":"T ","element":"span"},{"text":"satisfies property ","element":"span"},{"text":"(","element":"span"},{"style":{"height":17.6},"width":199.88,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-26.png","element":"img","alt":"x1, . . . , xc)","inline":true},{"text":", there exists a leaf such that on the path from the root to the leaf, there are at most ","element":"span"},{"style":{"height":10.69},"width":36.96,"height":26.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-27.png","element":"img","alt":" xi","inline":true,"padRight":true},{"text":"nodes of color ","element":"span"},{"style":{"height":17.6},"width":287.08,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-28.png","element":"img","alt":" i for all i ∈ [c].","inline":true}],[{"text":"Proof. 
","element":"span"},{"text":"We prove the lemma by induction on ","element":"span"},{"style":{"height":13.09},"width":245.4,"height":32.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-29.png","element":"img","alt":" x1 + · · · + xc","inline":true},{"text":". The base cases are obvious. Now say the root of ","element":"span"},{"text":"T ","element":"span"},{"text":"is colored with color ","element":"span"},{"text":"i","element":"span"},{"text":". Clearly we must have ","element":"span"},{"style":{"height":12.29},"width":87.76,"height":30.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-30.png","element":"img","alt":" xi >","inline":true,"padRight":true},{"text":"0. Then either the left or right subtree of the root must satisfy property (","element":"span"},{"style":{"height":14.8},"width":393.24,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-31.png","element":"img","alt":"x1, . . . , xi − 1, . . . , xc","inline":true},{"text":"). Using the inductive hypothesis, we get the desired. ","element":"span"},{"style":{"height":0},"width":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/33-32.png","element":"img","alt":"■","inline":true,"padRight":true},{"text":"Proof of Theorem ","element":"span"},{"href":"#id-57","text":"6.4. ","element":"a"},{"text":"Assume for the sake of contradiction that ","element":"span"},{"style":{"height":17.6},"width":359.56,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/34-0.png","element":"img","alt":" τ(H) > 6 · Cdim(H","inline":true},{"text":"). 
Consider a (","element":"span"},{"text":"X","element":"span"},{"text":", ","element":"span"},{"text":"Y","element":"span"},{"text":")-tree ","element":"span"},{"text":"T ","element":"span"},{"text":"that is ","element":"span"},{"text":"H","element":"span"},{"text":"-satisfiable and has cost larger than 6 ","element":"span"},{"style":{"height":17.6},"width":170.92,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/34-1.png","element":"img","alt":" · Cdim(H","inline":true},{"text":"). Note we can assume that there are no nodes in ","element":"span"},{"style":{"height":17.6},"width":306.92,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/34-2.png","element":"img","alt":" T where ℓ(y1, y2","inline":true},{"text":") = 0 since otherwise, we can delete that node and keep only its left subtree. Let ","element":"span"},{"text":"c ","element":"span"},{"text":"be an integer such that for all nodes (","element":"span"},{"style":{"height":17.6},"width":640.32,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/34-3.png","element":"img","alt":"x, y1, y2), we have ℓ(y1, y2) > 2−c.","inline":true}],[{"text":"Now color the internal nodes of ","element":"span"},{"text":"T ","element":"span"},{"text":"with ","element":"span"},{"text":"c ","element":"span"},{"text":"colors ","element":"span"},{"text":"{","element":"span"},{"text":"1","element":"span"},{"text":", ","element":"span"},{"text":"2","element":"span"},{"text":", . . . 
c","element":"span"},{"text":"} ","element":"span"},{"text":"where a node (","element":"span"},{"style":{"height":17.6},"width":360.6,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/34-4.png","element":"img","alt":"x, y1, y2) is color i","inline":true}],[{"style":{"width":"60%"},"width":1132,"height":114,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/34-5.png","element":"img"}],[{"text":"Note by Lemma ","element":"span"},{"href":"#id-58","text":"6.5, ","element":"a"},{"text":"T ","element":"span"},{"text":"does not contain any monochromatic, complete binary trees of color ","element":"span"},{"text":"i ","element":"span"},{"text":"with depth at least (","element":"span"},{"style":{"height":17.6},"width":288.52,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/34-6.png","element":"img","alt":"i + 1) · Cdim(H","inline":true},{"text":"). By Lemma ","element":"span"},{"href":"#id-59","text":"6.6, ","element":"a"},{"text":"this implies the total cost of ","element":"span"},{"text":"T ","element":"span"},{"text":"is at most","element":"span"}],[{"style":{"width":"36%"},"width":675,"height":123,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/34-7.png","element":"img"}],[{"text":"which completes the proof. ","element":"span"},{"style":{"height":0},"width":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/34-8.png","element":"img","alt":"■","inline":true}],[{"text":"However, we cannot hope for any sort of converse to Theorem ","element":"span"},{"href":"#id-57","text":"6.4 ","element":"a"},{"text":"as evidenced by the following example. 
Let ","element":"span"},{"text":"X ","element":"span"},{"text":"= ","element":"span"},{"text":"Y ","element":"span"},{"text":"= [0","element":"span"},{"text":", ","element":"span"},{"text":"1] and let ","element":"span"},{"style":{"height":17.6},"width":908.08,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/34-9.png","element":"img","alt":" ℓ(y1, y2) = |y1 − y2|. Let H = {1x=c | c ∈ [0, 1]}","inline":true,"padRight":true},{"text":"be the set of all indicator functions of points in [0","element":"span"},{"text":", ","element":"span"},{"text":"1]. ","element":"span"},{"text":"H ","element":"span"},{"text":"has infinite covering dimension but its tree dimension is just 1.","element":"span"}],[{"text":"6.2 ","element":"span"},{"text":"Regret Bounds from Tree Dimension","element":"span"}],[{"id":"id-61","text":"Theorem 6.7. ","element":"span"},{"text":"In the full feedback model there exists an algorithm with regret ","element":"span"},{"style":{"height":17.6},"width":129.16,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/34-10.png","element":"img","alt":" O(τ(H","inline":true},{"text":")). Furthermore, no algorithm can guarantee less than ","element":"span"},{"style":{"height":17.6},"width":276.48,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/34-11.png","element":"img","alt":" τ(H)/2 regret.","inline":true}],[{"text":"First we prove that the algorithm below achieves the upper bound.","element":"span"}],[{"id":"id-60","style":{"width":"100%"},"width":1874,"height":471,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/34-12.png","element":"img"}],[{"text":"Proof. ","element":"span"},{"text":"Assume for the sake of contradiction that this is false. 
Then we can construct a ","element":"span"},{"style":{"height":15.09},"width":38.88,"height":37.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/34-13.png","element":"img","alt":" St","inline":true},{"text":"-satisfiable (","element":"span"},{"style":{"height":15.2},"width":141.12,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/34-14.png","element":"img","alt":"X, Y, si","inline":true},{"text":")-tree with (","element":"span"},{"style":{"height":12},"width":156.68,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/34-15.png","element":"img","alt":"xt, y1, y2","inline":true},{"text":") as its root node and cost bigger than ","element":"span"},{"style":{"height":17.6},"width":553.36,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/34-16.png","element":"img","alt":" τ(St). ■","inline":true}],[{"style":{"width":"65%"},"width":1218,"height":237,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/34-17.png","element":"img"}],[{"style":{"height":16.4},"width":270.96,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/35-0.png","element":"img","alt":"Proof. 
Since ε","inline":true,"padRight":true},{"text":"is the smallest value such that ","element":"span"},{"style":{"height":17.89},"width":68.16,"height":44.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/35-1.png","element":"img","alt":" Aε,t","inline":true,"padRight":true},{"text":"is non-empty, then for every ","element":"span"},{"style":{"height":17.89},"width":329.96,"height":44.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/35-2.png","element":"img","alt":" y ∈ Aε,t we must","inline":true,"padRight":true},{"text":"have ","element":"span"},{"style":{"height":17.6},"width":652.56,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/35-3.png","element":"img","alt":" τ (St ∩ {f|f(xt) = y}) = τ (St) − ε","inline":true},{"text":". It follows that if ","element":"span"},{"style":{"height":17.6},"width":296.88,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/35-4.png","element":"img","alt":" ℓ(yt, f0(xt)) ≤ ε","inline":true,"padRight":true},{"text":"we are done.","element":"span"}],[{"text":"Consider now the case where ","element":"span"},{"style":{"height":17.6},"width":444.48,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/35-5.png","element":"img","alt":" L := ℓ(yt, f0(xt)) > ε.","inline":true,"padRight":true},{"text":"For this case, we want to argue that ","element":"span"},{"style":{"height":18.48},"width":610.8,"height":46.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/35-6.png","element":"img","alt":"f0(xt) ∉ AL′,t for any L′ < L","inline":true},{"text":", since after we get the feedback, we will update ","element":"span"},{"style":{"height":17.6},"width":262.76,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/35-7.png","element":"img","alt":" St+1 = {f 
∈","inline":true},{"style":{"height":17.6},"width":352.72,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/35-8.png","element":"img","alt":"St; f(xt) = f0(xt)}","inline":true},{"text":". Therefore ","element":"span"},{"style":{"height":18.48},"width":527.76,"height":46.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/35-9.png","element":"img","alt":" f0(xt) /∈ AL′,t for all L′ < L","inline":true,"padRight":true},{"text":"implies that: ","element":"span"},{"style":{"height":17.6},"width":394.56,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/35-10.png","element":"img","alt":" τ(St+1) ≤ τ(St) − L.","inline":true}],[{"text":"were the case that ","element":"span"},{"style":{"height":18.48},"width":254.88,"height":46.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/35-11.png","element":"img","alt":" f0(xt) ∈ AL′,t","inline":true},{"text":", we would have ","element":"span"},{"style":{"height":14.4},"width":133.6,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/35-12.png","element":"img","alt":" L ≤ L′ ","inline":true,"padRight":true},{"text":"by Lemma ","element":"span"},{"href":"#id-60","text":"6.8, ","element":"a"},{"text":"contradicting the fact that","element":"span"}],[{"style":{"width":"99%"},"width":1869,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/35-13.png","element":"img"}],[{"text":"Proof of Theorem ","element":"span"},{"href":"#id-61","text":"6.7. ","element":"a"},{"text":"The upper bound follows directly from the previous lemma. 
For the lower bound, consider an (","element":"span"},{"text":"X","element":"span"},{"text":", ","element":"span"},{"text":"Y","element":"span"},{"text":")-tree with cost ","element":"span"},{"style":{"height":17.6},"width":285.64,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/35-14.png","element":"img","alt":" τ(H) that is H","inline":true},{"text":"-satisfiable. We can ensure that all leaves have the same depth ","element":"span"},{"text":"d ","element":"span"},{"text":"(by adding nodes of the form (","element":"span"},{"text":"x, y, y","element":"span"},{"text":")). Now the adversary chooses a leaf uniformly at random. If the sequence of nodes from the root to the leaf is","element":"span"}],[{"style":{"width":"29%"},"width":554,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/35-15.png","element":"img"}],[{"text":"then the adversary presents the inputs ","element":"span"},{"style":{"height":11.2},"width":227.28,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/35-16.png","element":"img","alt":" x1, x2, . . . , xd","inline":true,"padRight":true},{"text":"in that order to the learner. Since the loss function satisfies the triangle inequality, the expected loss of any learner is at least ","element":"span"},{"style":{"height":17.6},"width":259.52,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/35-17.png","element":"img","alt":" τ(H)/2 so we","inline":true,"padRight":true},{"text":"are done. ","element":"span"},{"style":{"height":0},"width":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/35-18.png","element":"img","alt":"■","inline":true}],[{"text":"Remark. 
","element":"span"},{"text":"Algorithm ","element":"span"},{"href":"#id-60","text":"7 ","element":"a"},{"text":"assumes that the set ","element":"span"},{"style":{"height":18.69},"width":230.8,"height":46.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/35-19.png","element":"img","alt":" {ε; Aε,t ≠ ∅}","inline":true,"padRight":true},{"text":"has a minimum, which is always the case if the hypothesis class ","element":"span"},{"text":"H ","element":"span"},{"text":"is finite. For infinite ","element":"span"},{"text":"H","element":"span"},{"text":", this minimum might not exist. In such a case, choose ","element":"span"},{"style":{"height":18.69},"width":368.56,"height":46.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/35-20.png","element":"img","alt":" ε = inf{ε; Aε,t ≠ ∅}","inline":true,"padRight":true},{"text":"and choose ","element":"span"},{"style":{"height":19.42},"width":477.6,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/35-21.png","element":"img","alt":" yt ∈ Aε+gt,t for gt = 1/2^t","inline":true},{"text":". Theorem ","element":"span"},{"href":"#id-61","text":"6.7 ","element":"a"},{"text":"can then be easily adapted to provide a bound of ","element":"span"},{"style":{"height":18.24},"width":491.52,"height":45.6,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/35-22.png","element":"img","alt":" τ(H) + ∑t gt ≤ τ(H) + 1.","inline":true}],[{"text":"6.3 ","element":"span"},{"text":"Separating binary and full feedback","element":"span"}],[{"text":"With binary feedback we can no longer obtain loss bounds that depend only on tree dimension. 
To see this, consider the following example:","element":"span"}],[{"text":"Let ","element":"span"},{"text":"H ","element":"span"},{"text":"be the set of all functions ","element":"span"},{"style":{"height":17.6},"width":598,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/35-23.png","element":"img","alt":" f : [n] → {0, 1/n, . . . , (n − 1)/n}","inline":true},{"text":". There are ","element":"span"},{"style":{"height":12.14},"width":47.4,"height":30.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/35-24.png","element":"img","alt":" n^n ","inline":true,"padRight":true},{"text":"such functions. Now for each, slightly perturb the outputs (i.e. ","element":"span"},{"style":{"height":17.6},"width":293.04,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/35-25.png","element":"img","alt":" f(i) = j/n + ε","inline":true},{"text":") so that for every ","element":"span"},{"style":{"height":16.4},"width":234.72,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/35-26.png","element":"img","alt":" f1, f2 and i,","inline":true},{"style":{"height":17.6},"width":726.72,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.01703/images/35-27.png","element":"img","alt":"f1(i) ≠ f2(i). Let ℓ(y1, y2) = |y1 − y2|","inline":true},{"text":". The tree dimension of this class is ","element":"span"},{"text":"O","element":"span"},{"text":"(1). However, clearly any algorithm must incur Ω(","element":"span"},{"text":"n","element":"span"},{"text":") loss in expectation with binary feedback.","element":"span"}]]},{"heading":"References","paragraphs":[[{"text":"[1] Jacob D Abernethy, Elad Hazan, and Alexander Rakhlin. Interior-point methods for full-information and bandit online learning. 
","element":"span"},{"text":"IEEE Transactions on Information Theory","element":"span"},{"text":", 58(7):4164–4175, 2012.","element":"span"}],[{"id":"id-0","text":"[2] Kareem Amin, Afshin Rostamizadeh, and Umar Syed. Repeated contextual auctions with strategic buyers. In ","element":"span"},{"text":"Advances in Neural Information Processing Systems 27: Annual Conference on Neural Information Processing Systems 2014, December 8-13 2014, Montreal, Quebec, Canada","element":"span"},{"text":", pages 622–630, 2014.","element":"span"}],[{"id":"id-24","text":"[3] Peter L Bartlett, Philip M Long, and Robert C Williamson. Fat-shattering and the learnability of real-valued ","element":"span"},{"text":"functions. ","element":"span"},{"text":"Journal of Computer and System Sciences","element":"span"},{"text":", 52(3):434–452, 1996.","element":"span"}],[{"id":"id-8","text":"[4] Hamsa Bastani and Mohsen Bayati. Online decision-making with high-dimensional covariates. ","element":"span"},{"text":"Working paper, Stanford University","element":"span"},{"text":", 2016.","element":"span"}],[{"id":"id-28","text":"[5] Dimitris Bertsimas and Santosh Vempala. Solving convex programs by random walks. ","element":"span"},{"text":"J. ACM","element":"span"},{"text":", 51(4):540–556, 2004.","element":"span"}],[{"id":"id-18","text":"[6] Sébastien Bubeck, Rémi Munos, Gilles Stoltz, and Csaba Szepesvári. X-armed bandits. ","element":"span"},{"text":"Journal of Machine Learning Research","element":"span"},{"text":", 12(May):1655–1695, 2011.","element":"span"}],[{"id":"id-1","text":"[7] Nicolò Cesa-Bianchi, Pierre Gaillard, Claudio Gentile, and Sébastien Gerchinovitz. Algorithmic chaining and ","element":"span"},{"text":"the role of partial feedback in online nonparametric learning. ","element":"span"},{"text":"arXiv preprint arXiv:1702.08211","element":"span"},{"text":", 2017.","element":"span"}],[{"text":"[8] Maxime C. Cohen, Ilan Lobel, and Renato Paes Leme. Feature-based dynamic pricing. 
In ","element":"span"},{"text":"Proceedings of the 2016 ACM Conference on Economics and Computation, EC ’16, Maastricht, The Netherlands, July 24-28, 2016","element":"span"},{"text":", page 817, 2016.","element":"span"}],[{"id":"id-9","text":"[9] Mark A Davenport, Yaniv Plan, Ewout Van Den Berg, and Mary Wootters. 1-bit matrix completion. ","element":"span"},{"text":"Information and Inference: A Journal of the IMA","element":"span"},{"text":", 3(3):189–223, 2014.","element":"span"}],[{"id":"id-20","text":"[10] Pierre Gaillard and Sébastien Gerchinovitz. ","element":"span"},{"text":"A chaining algorithm for online nonparametric regression. ","element":"span"},{"text":"In ","element":"span"},{"text":"Conference on Learning Theory","element":"span"},{"text":", pages 764–796, 2015.","element":"span"}],[{"id":"id-2","text":"[11] Adel Javanmard and Hamid Nazerzadeh. Dynamic pricing in high-dimensions. ","element":"span"},{"text":"Working paper, University of Southern California","element":"span"},{"text":", 2016.","element":"span"}],[{"id":"id-41","text":"[12] Adam Kalai and Santosh Vempala. Efficient algorithms for online decision problems. ","element":"span"},{"text":"Journal of Computer and System Sciences","element":"span"},{"text":", 71(3):291–307, 2005.","element":"span"}],[{"id":"id-29","text":"[13] Ravi Kannan, László Lovász, and Miklós Simonovits. Random walks and an o","element":"span"},{"text":"*","element":"span"},{"text":"(n","element":"span"},{"text":"5","element":"span"},{"text":") volume algorithm for convex bodies. ","element":"span"},{"text":"Random Struct. Algorithms","element":"span"},{"text":", 11(1):1–50, 1997.","element":"span"}],[{"id":"id-3","text":"[14] Robert Kleinberg and Tom Leighton. ","element":"span"},{"text":"The value of knowing a demand curve: Bounds on regret for online posted-price auctions. In ","element":"span"},{"text":"Foundations of Computer Science, 2003. Proceedings. 
44th Annual IEEE Symposium on","element":"span"},{"text":", pages 594–605. IEEE, 2003.","element":"span"}],[{"id":"id-19","text":"[15] Robert Kleinberg, Aleksandrs Slivkins, and Eli Upfal. Bandits and experts in metric spaces. ","element":"span"},{"text":"Journal of the ACM (JACM)","element":"span"},{"text":", 66(4):1–77, 2019.","element":"span"}],[{"id":"id-4","text":"[16] Akshay Krishnamurthy, Thodoris Lykouris, and Chara Podimata. Corrupted multidimensional binary search: ","element":"span"},{"text":"Learning in the presence of irrational agents. ","element":"span"},{"text":"arXiv preprint arXiv:2002.11650","element":"span"},{"text":", 2020.","element":"span"}],[{"id":"id-5","text":"[17] Renato Paes Leme and Jon Schneider. Contextual search via intrinsic volumes. In ","element":"span"},{"text":"59th IEEE Annual Symposium on Foundations of Computer Science, FOCS 2018, Paris, France, October 7-9, 2018","element":"span"},{"text":", pages 268–282, 2018.","element":"span"}],[{"id":"id-23","text":"[18] Nick Littlestone. Learning quickly when irrelevant attributes abound: A new linear-threshold algorithm. ","element":"span"},{"text":"Machine learning","element":"span"},{"text":", 2(4):285–318, 1988.","element":"span"}],[{"id":"id-6","text":"[19] Ilan Lobel, Renato Paes Leme, and Adrian Vladu. ","element":"span"},{"text":"Multidimensional binary search for contextual decision-making. ","element":"span"},{"text":"Operations Research","element":"span"},{"text":", 2017.","element":"span"}],[{"id":"id-21","text":"[20] László Lovász. Hit-and-run mixes fast. ","element":"span"},{"text":"Mathematical Programming","element":"span"},{"text":", 86(3):443–461, 1999.","element":"span"}],[{"text":"[21] Jieming Mao, Renato Leme, and Jon Schneider. Contextual pricing for Lipschitz buyers. 
In ","element":"span"},{"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":", pages 5643–5651, 2018.","element":"span"}],[{"id":"id-10","text":"[22] Yaniv Plan and Roman Vershynin. Robust 1-bit compressed sensing and sparse logistic regression: A convex ","element":"span"},{"text":"programming approach. ","element":"span"},{"text":"IEEE Transactions on Information Theory","element":"span"},{"text":", 59(1):482–494, 2012.","element":"span"}],[{"id":"id-7","text":"[23] Sheng Qiang and Mohsen Bayati. Dynamic pricing with demand covariates. ","element":"span"},{"text":"Available at SSRN 2765257","element":"span"},{"text":", 2016.","element":"span"}],[{"text":"[24] Aleksandrs Slivkins. Contextual bandits with similarity information. ","element":"span"},{"text":"The Journal of Machine Learning Research","element":"span"},{"text":", 15(1):2533–2568, 2014.","element":"span"}],[{"id":"id-42","text":"[25] Chen-Yu Wei and Haipeng Luo. More adaptive algorithms for adversarial bandits. ","element":"span"},{"text":"Proceedings of Machine Learning Research","element":"span"},{"text":", 75, 2018.","element":"span"}]]}],"_version":"3.3.2"},"paperNode":"$1b:props:children:props:children:0:props:product"}]]]}]}]