1b:["$","$L29",null,{"isWhiteLabelled":false,"children":["$","$Lb",null,{"pt":{"compact":0,"expanded":3},"children":[["$","$L2a",null,{"noStar":true,"publisher":true,"task":true,"params":true,"size":"xl","product":{"id":"eyJwYXBlcklEIjoiMjAwMi4wODkwNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","updated":"2025-06-11T00:00:00.000Z","paperID":"2002.08907","published":"2020-02-20T00:00:00.000Z","authors":"[\"Carderera Alejandro\",\"Pokutta Sebastian\"]","title":"Second-order Conditional Gradient Sliding","scoreTrending":null,"summary":"$2b","lastCheckedForCode":"2025-06-13T04:03:11.896Z","links":[{"id":"eyJ1cmwiOiJodHRwczovL3BhcGVyc3dpdGhjb2RlLmNvbS9wYXBlci9zZWNvbmQtb3JkZXItY29uZGl0aW9uYWwtZ3JhZGllbnRzIn0=","type":"pwc","url":"https://paperswithcode.com/paper/second-order-conditional-gradients","data":"{\"date\":\"2025-06-15T04:02:09.576Z\"}"}],"reposConnection":{"edges":[{"official":null,"node":{"id":"eyJyZXBvSUQiOiIyNDE5NzY2MjAiLCJzb3VyY2UiOiJnaXRodWIifQ==","source":"github","repoID":"241976620","url":"https://github.com/alejandro-carderera/SOCGS","title":"SOCGS","language":"python","stars":0,"forks":2,"framework":null,"scoreTrending":null,"updated":null,"created":null,"downloads":null,"likes":null,"owner":[{"username":"alejandro-carderera","avatar":"https://avatars.githubusercontent.com/u/56440520?v=4"}]}}]},"models":[],"tags":[],"summaries":[],"emailsConnection":{"edges":[{"author":"pokutta 
sebastian","node":{"id":"eyJhZGRyZXNzIjoicG9rdXR0YUB6aWIuZGUifQ==","address":"pokutta@zib.de","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[],"scholar":[{"thirdPartyID":"uZS0NZUAAAAJ"},{"thirdPartyID":"FtsLNqIAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiI4YmIzNWFkMS05NzJmLTQ1NDctYjU3OC03OTM4OTM3MjViMzEifQ==","name":"sebastian 
pokutta","github":[],"email":[{"avatar":"https://img.fullcontact.com/static/33f1310c5854996c40b5dea0b4c2e98f_2a7c063fe618e935f4be6793314651cdfca159449aea2d65660fcd59a27d0dbd"}],"authored":[{"id":"eyJwYXBlcklEIjoiMTQwNy4wNzMxIiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1407.0731"},{"id":"eyJwYXBlcklEIjoiMTYxMC4wOTI2OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1610.09269"},{"id":"eyJwYXBlcklEIjoiMTkxMS4wNDQxNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1911.04415"},{"id":"eyJwYXBlcklEIjoiMTYxMC4wNTEyMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1610.05120"},{"id":"eyJwYXBlcklEIjoiMjEwMS4xMDA0MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2101.10040"},{"id":"eyJwYXBlcklEIjoiMTcwNi4wNDcxMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1706.04711"},{"id":"eyJwYXBlcklEIjoiMjIwMi4wMzM0OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2202.03349"},{"id":"eyJwYXBlcklEIjoiMjAwNi4wODQyNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.08426"},{"id":"eyJwYXBlcklEIjoiMjAwOS4xNDExNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2009.14114"},{"id":"eyJwYXBlcklEIjoiMTgxMC4xMjk5NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1810.12997"},{"id":"eyJwYXBlcklEIjoiMTgxMC4wMzIxOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1810.03218"},{"id":"eyJwYXBlcklEIjoiMTcwMy4wNTg0MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1703.05840"},{"id":"eyJwYXBlcklEIjoiMjAxMC4wNzI0MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.07243"},{"id":"eyJwYXBlcklEIjoiMjAwMy4wNjM2OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2003.06369"},{"id":"eyJwYXBlcklEIjoiMjAwMi4wODkwNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.08907"},{"id":"eyJwYXBlcklEIjoiMTgwNS4wNzMxMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1805.07311"},{"id
":"eyJwYXBlcklEIjoiMjIwNS4xMTkyMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2205.11921"},{"id":"eyJwYXBlcklEIjoiMjAwMi4wNDY3OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.04679"},{"id":"eyJwYXBlcklEIjoiMTkwNi4wNzg2NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1906.07867"},{"id":"eyJwYXBlcklEIjoiMjEwMi4wNjgwNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.06806"},{"id":"eyJwYXBlcklEIjoiMjMxMi4xNTIzMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2312.15230"},{"id":"eyJwYXBlcklEIjoiMjIwNi4wMDc1OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2206.00759"},{"id":"eyJwYXBlcklEIjoiMjMwNi4xNjc4OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2306.16788"},{"id":"eyJwYXBlcklEIjoiMTYxMC4wMjA3MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1610.02072"},{"id":"eyJwYXBlcklEIjoiMTkwNC4xMjMzNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1904.12335"},{"id":"eyJwYXBlcklEIjoiMjExMC4wODEwNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.08105"},{"id":"eyJwYXBlcklEIjoiMjIwNy4wMTIzNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2207.01236"},{"id":"eyJwYXBlcklEIjoiMjQwMy4xMjc2NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2403.12764"},{"id":"eyJwYXBlcklEIjoiMjExMS4wMDg0MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2111.00843"},{"id":"eyJwYXBlcklEIjoiMjEwMS4xMTQ0MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2101.11443"},{"id":"eyJwYXBlcklEIjoiMTUwMS4wNjI0MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1501.06241"},{"id":"eyJwYXBlcklEIjoiMTUwOS4wMDEzMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1509.00130"},{"id":"eyJwYXBlcklEIjoiMTgwNy4wOTQwNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1807.09405"},{"id":"eyJwYXBlcklEIjoiMjIwMi4xMTc5NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"a
rxiv","paperID":"2202.11797"},{"id":"eyJwYXBlcklEIjoiMjMwNC4wMzc1NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2304.03755"},{"id":"eyJwYXBlcklEIjoiMjIwOC4xMTAxMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2208.11010"},{"id":"eyJwYXBlcklEIjoiNTQyMjgiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"54228"},{"id":"eyJwYXBlcklEIjoiNzA0NTUiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"70455"},{"id":"eyJwYXBlcklEIjoiMjMxMS4xNzQzNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2311.17434"}]}]}},{"author":"carderera alejandro","node":{"id":"eyJhZGRyZXNzIjoiYWxlamFuZHJvLmNhcmRlcmVyYUBnYXRlY2guZWR1In0=","address":"alejandro.carderera@gatech.edu","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[{"avatar":"https://avatars.githubusercontent.com/u/56440520?v=4","username":"alejandro-carderera"}],"scholar":[{"thirdPartyID":"HvCPD1IAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiI2ZjI2MzQ1Mi05NjFjLTQ4ZWItODVhMC01Y2ZiOWZlYjQxMjAifQ==","name":"alejandro carderera","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMjAwMi4wODkwNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.08907"},{"id":"eyJwYXBlcklEIjoiMTkwNi4wNzg2NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1906.07867"},{"id":"eyJwYXBlcklEIjoiMjEwMi4wNjgwNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.06806"}]}]}}]},"__typename":"paper","authorArray":["Carderera Alejandro","Pokutta 
Sebastian"]}}],["$","$L18",null,{"container":true,"columns":100,"spacing":{"compact":0,"expanded":2,"large":3},"children":[["$","$L18",null,{"size":{"compact":100,"expanded":100,"large":68},"children":[["$","$7",null,{"children":["$","$L2c",null,{"publisher":"arxiv","paperID":"2002.08907","product":{"paper":"$1b:props:children:props:children:0:props:product","models":"$1b:props:children:props:children:0:props:product:models"},"isWhiteLabelled":false}]}],["$","$7",null,{"children":["$","$L2d",null,{"article":"$L2e","model":"$undefined"}]}]]}],["$","$L18",null,{"size":"grow","children":["$","$L2f",null,{}]}]]}],["$","$7",null,{"children":null}],[["$","audio",null,{"id":"tts"}],["$","$L30",null,{"paperID":"2002.08907","publisher":"arxiv","paperJSON":{"title":"Second-order Conditional Gradient Sliding","paperID":"2002.08907","avgLineHeight":11.94,"imgScale":4,"sections":[{"heading":"Abstract","paragraphs":[[{"text":"Constrained second-order convex optimization algorithms are the method of choice when a high accuracy solution to a problem is needed, due to their local quadratic convergence. These algorithms require the solution of a constrained quadratic subproblem at every iteration. We present the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Second-Order Conditional Gradient Sliding ","element":"span"},{"text":"(SOCGS) algorithm, which uses a projection-free algorithm to solve the constrained quadratic subproblems inexactly and uses inexact Hessian oracles (subject to an accuracy requirement). When the feasible region is a polytope the algorithm converges quadratically in primal gap after a finite number of linearly convergent iterations. 
Once in the quadratic regime the SOCGS algorithm requires ","element":"span"},{"style":{"height":13.6},"width":237.9,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/0-0.png","element":"img","alt":"O(log(log 1/𝜀))","inline":true,"padRight":true},{"text":"first-order and inexact Hessian oracle calls and ","element":"span"},{"style":{"height":13.6},"width":373.97,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/0-1.png","element":"img","alt":" O(log(1/𝜀) log(log 1/𝜀))","inline":true,"padRight":true},{"text":"linear minimization oracle calls to achieve an ","element":"span"},{"style":{"height":6.4},"width":18,"height":16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/0-2.png","element":"img","alt":" 𝜀","inline":true},{"text":"-optimal solution. This algorithm is useful when the feasible region can only be accessed efficiently through a linear optimization oracle, and computing first-order information of the function, although possible, is costly.","element":"span"}]]},{"heading":"1. 
Introduction","paragraphs":[[{"text":"We focus on the optimization problem defined as","element":"span"}],[{"id":"id-0","style":{"width":"54%"},"width":1017,"height":58,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/0-3.png","element":"img"}],[{"text":"where ","element":"span"},{"text":"X ","element":"span"},{"text":"is a polytope and ","element":"span"},{"style":{"height":14.8},"width":180.15,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/0-4.png","element":"img","alt":" 𝑓 : X → ℝ","inline":true,"padRight":true},{"text":"is ","element":"span"},{"style":{"height":10.8},"width":23,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/0-5.png","element":"img","alt":" 𝜇","inline":true},{"text":"-strongly convex, has ","element":"span"},{"style":{"height":10.4},"width":24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/0-6.png","element":"img","alt":" 𝐿","inline":true},{"text":"-Lipschitz continuous gradients and has ","element":"span"},{"style":{"height":12.39},"width":40.07,"height":30.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/0-7.png","element":"img","alt":"𝐿2","inline":true},{"text":"-Lipschitz continuous Hessian.","element":"span"}],[{"text":"An immensely powerful approach to tackle Problem (","element":"span"},{"href":"#id-0","text":"1.1","element":"a"},{"text":") is to construct a second-order approximation to ","element":"span"},{"style":{"height":15.2},"width":69.41,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/0-8.png","element":"img","alt":" 𝑓 (x)","inline":true,"padRight":true},{"text":"at the current iterate using ","element":"span"},{"style":{"height":15.2},"width":103.2,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/0-9.png","element":"img","alt":" ∇ 𝑓 (x)","inline":true,"padRight":true},{"text":"and 
","element":"span"},{"style":{"height":16.99},"width":121.07,"height":42.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/0-10.png","element":"img","alt":" ∇2 𝑓 (x)","inline":true},{"text":", and move in the direction that minimizes this approximation, giving rise to a family of methods known as ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Newton ","element":"span"},{"text":"methods, first developed for unconstrained problems (","element":"span"},{"href":"#id-1","referenceIndex":36,"text":"Kantorovich","element":"a"},{"text":", ","element":"span"},{"href":"#id-1","referenceIndex":36,"text":"1948","element":"a"},{"text":"). Variants of the former converge globally and have a local quadratic convergence rate when minimizing a self-concordant function or a strongly convex function with Lipschitz continuous Hessian (","element":"span"},{"href":"#id-2","referenceIndex":53,"text":"Nesterov & Nemirovskii","element":"a"},{"text":", ","element":"span"},{"href":"#id-2","referenceIndex":53,"text":"1994","element":"a"},{"text":"; ","element":"span"},{"href":"#id-3","referenceIndex":51,"text":"Nesterov","element":"a"},{"text":", ","element":"span"},{"href":"#id-3","referenceIndex":51,"text":"2013","element":"a"},{"text":"). When the problem at hand is constrained to a convex set, one can use a constrained analog of these methods (","element":"span"},{"href":"#id-4","referenceIndex":45,"text":"Levitin & Polyak","element":"a"},{"text":", ","element":"span"},{"href":"#id-4","referenceIndex":45,"text":"1966","element":"a"},{"text":"), where a quadratic approximation to the function is minimized over ","element":"span"},{"text":"X ","element":"span"},{"text":"at each iteration.","element":"span"}],[{"text":"However, there are two shortcomings to these methods. 
First, computing second-order information about ","element":"span"},{"style":{"height":15.2},"width":69.38,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/0-11.png","element":"img","alt":"𝑓 (x)","inline":true,"padRight":true},{"text":"is expensive. This has led to the development of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Variable-Metric ","element":"span"},{"text":"algorithms, which use approximate second-order information. Secondly, in many cases solving the quadratic subproblem to optimality is too costly. This has resulted in numerous ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Inexact Variable-Metric ","element":"span"},{"text":"algorithms, which in many cases inherit many of the favorable properties of Newton methods (","element":"span"},{"href":"#id-5","referenceIndex":58,"text":"Scheinberg & Tang","element":"a"},{"text":", ","element":"span"},{"href":"#id-5","referenceIndex":58,"text":"2016","element":"a"},{"text":"; ","element":"span"},{"href":"#id-6","referenceIndex":43,"text":"Lee et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-6","referenceIndex":43,"text":"2014","element":"a"},{"text":").","element":"span"}],[{"text":"The ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Conditional Gradients ","element":"span"},{"text":"(CG) algorithm (","element":"span"},{"href":"#id-4","referenceIndex":45,"text":"Levitin & Polyak","element":"a"},{"text":", ","element":"span"},{"href":"#id-4","referenceIndex":45,"text":"1966","element":"a"},{"text":") (also known as the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Frank-Wolfe ","element":"span"},{"text":"algorithm (","element":"span"},{"href":"#id-7","referenceIndex":19,"text":"Frank & Wolfe","element":"a"},{"text":", ","element":"span"},{"href":"#id-7","referenceIndex":19,"text":"1956","element":"a"},{"text":")) instead builds a linear approximation to 
","element":"span"},{"style":{"height":15.2},"width":69.4,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/0-12.png","element":"img","alt":" 𝑓 (x)","inline":true,"padRight":true},{"text":"using ","element":"span"},{"style":{"height":15.2},"width":103.19,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/0-13.png","element":"img","alt":" ∇ 𝑓 (x)","inline":true},{"text":", and moves in the direction given by the point that minimizes this linear approximation over ","element":"span"},{"text":"X","element":"span"},{"text":". Instead of solving a constrained quadratic problem at each iteration, it solves a constrained linear problem, which is usually much cheaper. As the algorithm maintains its iterates as convex combinations of extremal points of ","element":"span"},{"text":"X ","element":"span"},{"text":"obtained from the linear optimization problem it is dubbed ","element":"span"},{"style":{"fontStyle":"italic"},"text":"projection-free","element":"span"},{"text":". 
Conditional Gradients have become the method of choice in many applications where projecting onto ","element":"span"},{"text":"X ","element":"span"},{"text":"is computationally prohibitive, such as, e.g., in video co-localization (","element":"span"},{"href":"#id-8","referenceIndex":35,"text":"Joulin et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-8","referenceIndex":35,"text":"2014","element":"a"},{"text":") or greedy particle optimization in Bayesian inference (","element":"span"},{"href":"#id-9","referenceIndex":21,"text":"Futami ","element":"a"},{"href":"#id-9","referenceIndex":21,"text":"et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-9","referenceIndex":21,"text":"2019","element":"a"},{"text":").","element":"span"}],[{"text":"For constrained problems where the gradient of ","element":"span"},{"style":{"height":15.2},"width":69.4,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/1-0.png","element":"img","alt":" 𝑓 (x)","inline":true,"padRight":true},{"text":"is relatively hard to compute, using Projected Variable-Metric methods seems counter-intuitive, yet it allows the construction of a quadratic approximation whose gradients are much cheaper to compute. Minimizing a quadratic approximation at each iteration is often costly, but due to the substantial progress it provides per-iteration it can often become competitive in wall-clock time with using first-order algorithms to directly minimize ","element":"span"},{"style":{"height":16},"width":97.69,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/1-1.png","element":"img","alt":" 𝑓 (x) (","inline":true},{"href":"#id-10","referenceIndex":59,"text":"Schmidt et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-10","referenceIndex":59,"text":"2009","element":"a"},{"text":"). 
We consider the case where both the first-order oracle for ","element":"span"},{"style":{"height":15.2},"width":69.4,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/1-2.png","element":"img","alt":" 𝑓 (x)","inline":true,"padRight":true},{"text":"and the projection oracle onto ","element":"span"},{"text":"X ","element":"span"},{"text":"are computationally expensive, but linear programming oracles over ","element":"span"},{"text":"X ","element":"span"},{"text":"are relatively cheap. In this setting, we show how conditional gradient algorithms can be used to compute Inexact Projected Variable-Metric steps, in an approach that is similar in essence to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Conditional Gradient Sliding ","element":"span"},{"text":"(CGS) (","element":"span"},{"href":"#id-11","referenceIndex":41,"text":"Lan & Zhou","element":"a"},{"text":", ","element":"span"},{"href":"#id-11","referenceIndex":41,"text":"2016","element":"a"},{"text":"), where the Euclidean projections onto ","element":"span"},{"text":"X ","element":"span"},{"text":"in ","element":"span"},{"style":{"height":12},"width":708.94,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/1-3.png","element":"img","alt":" Nesterov’s Accelerated Gradient Descent","inline":true,"padRight":true},{"text":"are computed using the conditional gradient algorithm. We also show how coupling with an independent sequence of conditional gradient steps we can guarantee the global linear convergence in primal gap of the algorithm.","element":"span"}],[{"id":"id-36","style":{"fontWeight":"bold"},"text":"1.1 Contributions and related work","element":"span"}],[{"text":"We provide a projection-free Inexact Variable-Metric algorithm, denoted as the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Second-order Conditional Gradient Sliding ","element":"span"},{"text":"(SOCGS) algorithm which uses inexact second-order information. 
The algorithm has a stopping criterion that relies on a lower bound on the primal gap, e.g., via smoothness, and achieves global linear convergence and quadratic local convergence when close to the optimum.","element":"span"}],[{"text":"The use of a combination of second-order and projection-free methods was first pioneered in ","element":"span"},{"href":"#id-12","referenceIndex":28,"text":"Gonçalves & ","element":"a"},{"href":"#id-12","referenceIndex":28,"text":"Melo ","element":"a"},{"text":"(","element":"span"},{"href":"#id-12","referenceIndex":28,"text":"2017","element":"a"},{"text":"), who proposed an algorithm in which exact unconstrained Newton steps were performed, and were later projected onto ","element":"span"},{"text":"X ","element":"span"},{"text":"using the Euclidean norm and the CG algorithm. This resulted in a method that showed local linear convergence in distance to the optimum for functions whose derivative satisfied a Hölder-like condition and also for a subclass of analytic functions. This was later extended in ","element":"span"},{"href":"#id-13","referenceIndex":27,"text":"Gonçalves ","element":"a"},{"href":"#id-13","referenceIndex":27,"text":"& Oliveira ","element":"a"},{"text":"(","element":"span"},{"href":"#id-13","referenceIndex":27,"text":"2018","element":"a"},{"text":") to deal with inexact second-order information, using the inexactness criteria in ","element":"span"},{"href":"#id-14","referenceIndex":49,"text":"Morini ","element":"a"},{"text":"(","element":"span"},{"href":"#id-14","referenceIndex":49,"text":"1999","element":"a"},{"text":"), and resulting in the same local linear convergence. 
A variation of the former algorithm, was shown to converge globally (without an explicit convergence rate) using a non-monotone line search strategy (","element":"span"},{"href":"#id-15","referenceIndex":29,"text":"Gonçalves ","element":"a"},{"href":"#id-15","referenceIndex":29,"text":"& Oliveira","element":"a"},{"text":", ","element":"span"},{"href":"#id-15","referenceIndex":29,"text":"2019","element":"a"},{"text":"). Neither of these three algorithms included a complexity analysis on the number of linear minimization oracle calls needed to achieve a certain target accuracy.","element":"span"}],[{"text":"An approach that is similar in spirit is the recent ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Newton Conditional Gradient ","element":"span"},{"text":"(NCG) algorithm (","element":"span"},{"href":"#id-16","referenceIndex":46,"text":"Liu ","element":"a"},{"href":"#id-16","referenceIndex":46,"text":"et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-16","referenceIndex":46,"text":"2022","element":"a"},{"text":") which performs Inexact Newton steps using a conditional gradient algorithm to minimize a self-concordant function over ","element":"span"},{"text":"X","element":"span"},{"text":". This algorithm requires exact second-order information, as opposed to the approximate information used by the SOCGS algorithm, however it does not require the function to be smooth and strongly convex, or the feasible region to be a polytope. 
After a finite number of damped-steps the NCG algorithm reaches an ","element":"span"},{"style":{"height":7.6},"width":20,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/1-4.png","element":"img","alt":" 𝜀","inline":true},{"text":"-optimal solution with ","element":"span"},{"style":{"height":19.38},"width":147.52,"height":48.45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/1-5.png","element":"img","alt":" O(log 1𝜀 )","inline":true,"padRight":true},{"text":"first order and exact Hessian oracle calls ","element":"span"},{"text":"and ","element":"span"},{"style":{"height":15.2},"width":121.42,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/1-6.png","element":"img","alt":" O(𝜀−𝜈)","inline":true,"padRight":true},{"text":"linear optimization oracle calls with ","element":"span"},{"style":{"height":15.2},"width":197.82,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/1-7.png","element":"img","alt":" 𝜈 = 1 + 𝑜(1)","inline":true},{"text":". 
Note that ","element":"span"},{"href":"#id-17","referenceIndex":55,"text":"Ochs & Malitsky ","element":"a"},{"text":"(","element":"span"},{"href":"#id-17","referenceIndex":55,"text":"2019","element":"a"},{"text":")[Example 4.3] also proposed a conditional gradient-based Variable-Metric algorithm via their ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Model Function-Based Conditional Gradient ","element":"span"},{"text":"algorithm, however their approach is markedly different from ours: the steps performed in their algorithm can be seen as unconstrained Variable-Metric steps which are projected onto ","element":"span"},{"text":"X ","element":"span"},{"text":"using the Euclidean norm while the SOCGS performs steps which can be interpreted as unconstrained Inexact Variable-Metric steps which are projected onto ","element":"span"},{"text":"X ","element":"span"},{"text":"using a norm defined by the positive semi-definite matrix that approximates the Hessian. The same can be said regarding the algorithms in (","element":"span"},{"href":"#id-15","referenceIndex":29,"text":"Gonçalves & Oliveira","element":"a"},{"text":", ","element":"span"},{"href":"#id-15","referenceIndex":29,"text":"2019","element":"a"},{"text":"), moreover, the SOCGS algorithm directly approximately minimizes a quadratic using a CG variant, in an operation that directly represents an Inexact Projected Variable-Metric step, whereas the algorithm in (","element":"span"},{"href":"#id-15","referenceIndex":29,"text":"Gonçalves & Oliveira","element":"a"},{"text":", ","element":"span"},{"href":"#id-15","referenceIndex":29,"text":"2019","element":"a"},{"text":") proceeds in a sequential manner, first computing the Newton step, and afterwards projecting onto ","element":"span"},{"text":"X ","element":"span"},{"text":"using the CG algorithm.","element":"span"}],[{"text":"Further CG variants for the minimization of a self-concordant function have been developed in 
","element":"span"},{"href":"#id-18","referenceIndex":17,"text":"Dvurechensky ","element":"a"},{"href":"#id-18","referenceIndex":17,"text":"et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-18","referenceIndex":17,"text":"2020","element":"a"},{"text":"), in which an algorithm was developed that achieves an ","element":"span"},{"style":{"height":7.2},"width":18,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/1-8.png","element":"img","alt":" 𝜖","inline":true},{"text":"-optimal solution in primal gap after ","element":"span"},{"style":{"height":15.2},"width":118.11,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/1-9.png","element":"img","alt":"𝑂(1/𝜖)","inline":true,"padRight":true},{"text":"first-order, second-order and linear minimization oracle calls. A related second-order CG variant was later developed in ","element":"span"},{"href":"#id-19","referenceIndex":62,"text":"Zhao & Freund ","element":"a"},{"text":"(","element":"span"},{"href":"#id-19","referenceIndex":62,"text":"2022","element":"a"},{"text":") for the minimization of the sum of a logarithmically-homogeneous self-concordant barrier function and a non-smooth function with bounded domain. The aforementioned algorithm reaches an ","element":"span"},{"style":{"height":7.2},"width":18,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/1-10.png","element":"img","alt":" 𝜖","inline":true},{"text":"-optimal solution in primal gap after ","element":"span"},{"style":{"height":15.2},"width":118.76,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/1-11.png","element":"img","alt":" 𝑂(1/𝜖)","inline":true,"padRight":true},{"text":"first-order, second-order and linear minimization oracle calls.","element":"span"}],[{"text":"Later on, other variants in ","element":"span"},{"href":"#id-20","referenceIndex":11,"text":"Carderera et al. 
","element":"a"},{"text":"(","element":"span"},{"href":"#id-20","referenceIndex":11,"text":"2021","element":"a"},{"text":") and ","element":"span"},{"href":"#id-21","referenceIndex":18,"text":"Dvurechensky et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-21","referenceIndex":18,"text":"2022","element":"a"},{"text":") were shown to achieve an ","element":"span"},{"style":{"height":7.2},"width":18,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/1-12.png","element":"img","alt":" 𝜖","inline":true},{"text":"-optimal solution in primal gap when minimizing a generalized self-concordant function after ","element":"span"},{"style":{"height":15.2},"width":118.76,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/1-13.png","element":"img","alt":" 𝑂(1/𝜖)","inline":true,"padRight":true},{"text":"first-order, domain and linear minimization oracle calls (where the domain oracle call simply checks if a given point is in the domain of the function being minimized). These variants, therefore, do not require second-order information. When the feasible region under consideration is a polytope, a variant presented in ","element":"span"},{"href":"#id-20","referenceIndex":11,"text":"Carderera et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-20","referenceIndex":11,"text":"2021","element":"a"},{"text":") of the Away-step Conditional Gradient (ACG) algorithm (","element":"span"},{"href":"#id-22","referenceIndex":60,"text":"Wolfe","element":"a"},{"text":", ","element":"span"},{"href":"#id-22","referenceIndex":60,"text":"1970","element":"a"},{"text":") with the stepsize of ","element":"span"},{"href":"#id-23","referenceIndex":56,"text":"Pedregosa et al. 
","element":"a"},{"text":"(","element":"span"},{"href":"#id-23","referenceIndex":56,"text":"2020","element":"a"},{"text":") was shown to achieve an ","element":"span"},{"style":{"height":7.2},"width":18,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-0.png","element":"img","alt":" 𝜖","inline":true},{"text":"-optimal solution in primal gap after ","element":"span"},{"style":{"height":15.2},"width":176.16,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-1.png","element":"img","alt":" 𝑂(log 1/𝜖)","inline":true,"padRight":true},{"text":"first-order, domain and linear minimization oracle calls. Another CG variant in ","element":"span"},{"href":"#id-21","referenceIndex":18,"text":"Dvurechensky et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-21","referenceIndex":18,"text":"2022","element":"a"},{"text":") was shown to achieve an ","element":"span"},{"style":{"height":7.2},"width":18,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-2.png","element":"img","alt":" 𝜖","inline":true},{"text":"-optimal solution in primal gap after ","element":"span"},{"style":{"height":15.2},"width":176.08,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-3.png","element":"img","alt":" 𝑂(log 1/𝜖)","inline":true,"padRight":true},{"text":"first-order, second-order and linear minimization oracle calls when the feasible region is a polytope.","element":"span"}],[{"text":"Since our initial submission in 2020, several follow-up works and related preprints have been published in conferences and journals. We have updated the bibliography to reflect these developments.","element":"span"}]]},{"heading":"2. 
Preliminaries","paragraphs":[[{"text":"We denote the unique minimizer of Problem (","element":"span"},{"href":"#id-0","text":"1.1","element":"a"},{"text":") by ","element":"span"},{"style":{"height":15.32},"width":318.82,"height":38.3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-4.png","element":"img","alt":" x∗. Let S𝑛++ and 𝐼𝑛","inline":true,"padRight":true},{"text":"denote the set of symmetric positive ","element":"span"},{"text":"definite matrices and the identity matrix in ","element":"span"},{"style":{"height":12.19},"width":84.55,"height":30.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-5.png","element":"img","alt":" ℝ𝑛×𝑛","inline":true},{"text":". We denote the largest eigenvalue of the matrix ","element":"span"},{"style":{"height":12.59},"width":209.56,"height":31.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-6.png","element":"img","alt":" 𝐻 ∈ ℝ𝑛×𝑛 as","inline":true},{"style":{"height":15.31},"width":466.32,"height":38.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-7.png","element":"img","alt":"𝜆max (𝐻). Let ∥·∥ and ∥·∥𝐻","inline":true,"padRight":true},{"text":"denote the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Euclidean norm ","element":"span"},{"text":"and the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"matrix norm ","element":"span"},{"text":"defined by ","element":"span"},{"style":{"height":15.14},"width":134.56,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-8.png","element":"img","alt":" 𝐻 ∈ S𝑛++","inline":true},{"text":", respectively. 
","element":"span"},{"text":"We denote the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"diameter ","element":"span"},{"text":"of the polytope ","element":"span"},{"style":{"height":16.42},"width":1184.22,"height":41.04,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-9.png","element":"img","alt":" X as 𝐷 = maxx,y∈X ∥x − y∥, and its vertices by vert (X) ⊆ X. Given a","inline":true,"padRight":true},{"text":"non-empty set ","element":"span"},{"style":{"height":12},"width":125.12,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-10.png","element":"img","alt":" S ⊂ ℝ𝑛 ","inline":true,"padRight":true},{"text":"we refer to its ","element":"span"},{"style":{"height":15.2},"width":666.37,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-11.png","element":"img","alt":" convex hull as conv (S). For any x ∈ X","inline":true,"padRight":true},{"text":"we denote by ","element":"span"},{"text":"F (","element":"span"},{"style":{"fontWeight":"bold"},"text":"x","element":"span"},{"text":") ","element":"span"},{"text":"the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"minimal face ","element":"span"},{"text":"of ","element":"span"},{"text":"X ","element":"span"},{"text":"that contains ","element":"span"},{"style":{"fontWeight":"bold"},"text":"x","element":"span"},{"text":". 
Lastly, given a matrix ","element":"span"},{"style":{"height":15.14},"width":134.57,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-12.png","element":"img","alt":" 𝐻 ∈ S𝑛++ ","inline":true,"padRight":true},{"text":"we denote the ","element":"span"},{"style":{"height":15.2},"width":607.2,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-13.png","element":"img","alt":" 𝐻-scaled projection of y onto X as:","inline":true}],[{"style":{"width":"65%"},"width":1235,"height":197,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-14.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"2.1 The Conditional Gradients algorithm","element":"span"}],[{"text":"We define the linear approximation of the function ","element":"span"},{"style":{"height":15.2},"width":68.98,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-15.png","element":"img","alt":" 𝑓 (x)","inline":true,"padRight":true},{"text":"around the point ","element":"span"},{"style":{"height":9.99},"width":100.22,"height":24.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-16.png","element":"img","alt":" x𝑘 as:","inline":true}],[{"id":"id-31","style":{"width":"65%"},"width":1225,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-17.png","element":"img"}],[{"text":"At each iteration the vanilla ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Conditional Gradients ","element":"span"},{"text":"(CG) algorithm (","element":"span"},{"href":"#id-4","referenceIndex":45,"text":"Levitin & Polyak","element":"a"},{"text":", ","element":"span"},{"href":"#id-4","referenceIndex":45,"text":"1966","element":"a"},{"text":"; ","element":"span"},{"href":"#id-7","referenceIndex":19,"text":"Frank & Wolfe","element":"a"},{"text":", ","element":"span"},{"href":"#id-7","referenceIndex":19,"text":"1956","element":"a"},{"text":"; 
","element":"span"},{"href":"#id-24","referenceIndex":34,"text":"Jaggi","element":"a"},{"text":", ","element":"span"},{"href":"#id-24","referenceIndex":34,"text":"2013","element":"a"},{"text":") takes steps defined as ","element":"span"},{"style":{"height":18.78},"width":894.25,"height":46.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-18.png","element":"img","alt":" x𝑘+1 = x𝑘 + 𝛾𝑘(argminx∈X ˆ𝑙𝑘(x) − x𝑘) with 𝛾𝑘 ∈ (0, 1]","inline":true},{"text":". As the iterates are formed as convex combinations of points in ","element":"span"},{"text":"X ","element":"span"},{"text":"the algorithm is ","element":"span"},{"style":{"fontStyle":"italic"},"text":"projection-free","element":"span"},{"text":". A useful quantity that can readily be computed in all steps is ","element":"span"},{"style":{"height":15.2},"width":413.22,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-19.png","element":"img","alt":" maxv∈X⟨∇ 𝑓 (x𝑘), x𝑘 − v⟩","inline":true},{"text":", known as the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Frank-Wolfe gap","element":"span"},{"text":", which provides an upper bound on the primal gap and is often used as a stopping criterion when running the CG algorithm.","element":"span"}],[{"text":"However, the vanilla CG algorithm does not converge linearly in primal gap when applied to Problem (","element":"span"},{"href":"#id-0","text":"1.1","element":"a"},{"text":") in general. 
This motivated the development of the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Away-step Conditional Gradient ","element":"span"},{"text":"(ACG) algorithm (","element":"span"},{"href":"#id-22","referenceIndex":60,"text":"Wolfe","element":"a"},{"text":", ","element":"span"},{"href":"#id-22","referenceIndex":60,"text":"1970","element":"a"},{"text":") (shown in Algorithm ","element":"span"},{"href":"#id-25","text":"4 ","element":"a"},{"text":"in Appendix ","element":"span"},{"text":"B","element":"span"},{"text":"), which uses ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Away-steps ","element":"span"},{"text":"(shown in Algorithm ","element":"span"},{"href":"#id-26","text":"1","element":"a"},{"text":") and converges linearly when coupled with an exact line search (","element":"span"},{"href":"#id-27","referenceIndex":39,"text":"Lacoste-Julien & Jaggi","element":"a"},{"text":", ","element":"span"},{"href":"#id-27","referenceIndex":39,"text":"2015","element":"a"},{"text":") or a step size strategy dependent on ","element":"span"},{"style":{"height":10.4},"width":24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-20.png","element":"img","alt":" 𝐿","inline":true,"padRight":true},{"text":"(","element":"span"},{"href":"#id-23","referenceIndex":56,"text":"Pedregosa et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-23","referenceIndex":56,"text":"2020","element":"a"},{"text":"). 
The ACG algorithm maintains what is called an ","element":"span"},{"style":{"height":15.2},"width":424.62,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-21.png","element":"img","alt":" active set S𝑘 ⊆ vert (X)","inline":true,"padRight":true},{"text":"which represents the potentially non-unique set of vertices of ","element":"span"},{"text":"X ","element":"span"},{"text":"such that ","element":"span"},{"style":{"height":15.2},"width":247.85,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-22.png","element":"img","alt":" x𝑘 ∈ conv (S𝑘)","inline":true},{"text":". Associated with this active set ","element":"span"},{"style":{"height":13.59},"width":42.37,"height":33.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-23.png","element":"img","alt":" S𝑘","inline":true,"padRight":true},{"text":"we have a set of barycentric coordinates ","element":"span"},{"style":{"height":13.19},"width":43.04,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-24.png","element":"img","alt":" λ𝑘","inline":true,"padRight":true},{"text":"such that if we denote by ","element":"span"},{"style":{"height":15.2},"width":233.76,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-25.png","element":"img","alt":" λ𝑘(u) ∈ [0, 1]","inline":true,"padRight":true},{"text":"the element of ","element":"span"},{"style":{"height":13.19},"width":43.04,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-26.png","element":"img","alt":" λ𝑘","inline":true,"padRight":true},{"text":"associated with ","element":"span"},{"style":{"height":13.59},"width":109.67,"height":33.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-27.png","element":"img","alt":" u ∈ S𝑘","inline":true,"padRight":true},{"text":"we have that 
","element":"span"},{"style":{"height":16.96},"width":321.95,"height":42.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-28.png","element":"img","alt":" x𝑘 = ∑u∈S𝑘 λ𝑘(u)u","inline":true},{"text":", with ","element":"span"},{"style":{"height":16.96},"width":278.57,"height":42.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-29.png","element":"img","alt":" ∑u∈S𝑘 λ𝑘(u) = 1","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":15.2},"width":170.22,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-30.png","element":"img","alt":" λ𝑘(u) ≥ 0","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"height":13.59},"width":123.58,"height":33.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-31.png","element":"img","alt":" u ∈ S𝑘.","inline":true}],[{"id":"id-45","text":"2.1.1 Global convergence","element":"span"}],[{"text":"The first proof of asymptotic linear convergence of the ACG algorithm relied on the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"strict complementarity ","element":"span"},{"text":"of the problem in Equation (","element":"span"},{"href":"#id-0","text":"1.1","element":"a"},{"text":") (shown in Assumption ","element":"span"},{"href":"#id-28","text":"1","element":"a"},{"text":"), which we will also use in the convergence proof of the SOCGS algorithm. This is a mild assumption that rules out degeneracy.","element":"span"}],[{"id":"id-28","style":{"fontWeight":"bold"},"text":"Assumption 1 ","element":"span"},{"text":"(Strict Complementarity)","element":"span"},{"style":{"fontWeight":"bold"},"text":". 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"We have that ","element":"span"},{"style":{"height":15.2},"width":348.53,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-32.png","element":"img","alt":" ⟨∇ 𝑓 (x∗) , x − x∗⟩ = 0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"if and only if ","element":"span"},{"style":{"height":15.2},"width":187.84,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-33.png","element":"img","alt":" x ∈ F (x∗).","inline":true}],[{"text":"If Assumption ","element":"span"},{"href":"#id-28","text":"1 ","element":"a"},{"text":"is satisfied the iterates of the ACG algorithm reach ","element":"span"},{"style":{"height":15.2},"width":106.46,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-34.png","element":"img","alt":" F (x∗)","inline":true,"padRight":true},{"text":"in a finite number of steps, remaining in ","element":"span"},{"style":{"height":15.2},"width":106.46,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-35.png","element":"img","alt":" F (x∗)","inline":true,"padRight":true},{"text":"for all subsequent iterations (","element":"span"},{"href":"#id-29","referenceIndex":30,"text":"Guélat & Marcotte","element":"a"},{"text":", ","element":"span"},{"href":"#id-29","referenceIndex":30,"text":"1986","element":"a"},{"text":"). When inside ","element":"span"},{"style":{"height":15.2},"width":106.46,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/2-36.png","element":"img","alt":" F (x∗)","inline":true},{"text":", the iterates of the ACG algorithm contract the primal gap linearly. 
This analysis was later significantly extended to provide an explicit global linear convergence rate in primal gap (Theorem ","element":"span"},{"href":"#id-30","text":"2.1","element":"a"},{"text":"), by making use of the","element":"span"}],[{"id":"id-26","style":{"width":"99%"},"width":1872,"height":623,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-0.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"pyramidal width ","element":"span"},{"text":"of the polytope ","element":"span"},{"text":"X ","element":"span"},{"text":"(","element":"span"},{"href":"#id-27","referenceIndex":39,"text":"Lacoste-Julien & Jaggi","element":"a"},{"text":", ","element":"span"},{"href":"#id-27","referenceIndex":39,"text":"2015","element":"a"},{"text":"). With the pyramidal width one can derive a primal progress guarantee for all steps taken by the ACG algorithm except ‘bad’ away-steps that reduce the cardinality of the active set ","element":"span"},{"style":{"height":13.59},"width":42.37,"height":33.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-1.png","element":"img","alt":" S𝑘","inline":true},{"text":", that is when ","element":"span"},{"style":{"height":15.22},"width":602.72,"height":38.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-2.png","element":"img","alt":" ⟨∇ 𝑓 (x𝑘), x𝑘 − v⟩ < ⟨∇ 𝑓 (x𝑘), a − x𝑘⟩","inline":true,"padRight":true},{"text":"and the step size satisfies ","element":"span"},{"style":{"height":10.4},"width":167.62,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-3.png","element":"img","alt":"𝛾𝑘 = 𝛾max","inline":true,"padRight":true},{"text":"in Algorithm ","element":"span"},{"href":"#id-26","text":"1","element":"a"},{"text":". 
This cannot happen more than ","element":"span"},{"style":{"height":14.8},"width":99.84,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-4.png","element":"img","alt":" ⌊𝐾/2⌋","inline":true,"padRight":true},{"text":"times when running the ACG algorithm for ","element":"span"},{"style":{"height":10.4},"width":29,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-5.png","element":"img","alt":" 𝐾","inline":true,"padRight":true},{"text":"iterations (as the algorithm cannot drop more vertices with away-steps than it has picked up with Frank-Wolfe steps). This is an important consideration to keep in mind, as it means that the ACG linear primal gap contraction does not hold on a per-iteration basis.","element":"span"}],[{"id":"id-30","style":{"fontWeight":"bold"},"text":"Theorem 2.1 ","element":"span"},{"text":"(Primal gap convergence of the ACG algorithm)","element":"span"},{"style":{"fontWeight":"bold"},"text":". ","element":"span"},{"style":{"fontStyle":"italic"},"text":"(","element":"span"},{"href":"#id-27","referenceIndex":39,"style":{"fontStyle":"italic"},"text":"Lacoste-Julien & Jaggi","element":"a"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"href":"#id-27","referenceIndex":39,"style":{"fontStyle":"italic"},"text":"2015","element":"a"},{"style":{"fontStyle":"italic"},"text":", Theorem 1) Given an initial point ","element":"span"},{"style":{"height":13.99},"width":114.01,"height":34.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-6.png","element":"img","alt":" x0 ∈ X","inline":true},{"style":{"fontStyle":"italic"},"text":", the ACG algorithm applied to Problem ","element":"span"},{"text":"(","element":"span"},{"href":"#id-0","text":"1.1","element":"a"},{"text":") ","element":"span"},{"style":{"fontStyle":"italic"},"text":"satisfies after 
","element":"span"},{"style":{"height":12},"width":99.48,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-7.png","element":"img","alt":" 𝐾 ≥ 0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"iterations:","element":"span"}],[{"style":{"width":"47%"},"width":896,"height":130,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-8.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where ","element":"span"},{"style":{"height":10.8},"width":30,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-9.png","element":"img","alt":" 𝐷","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"denotes the diameter of the polytope ","element":"span"},{"style":{"height":12.4},"width":142.44,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-10.png","element":"img","alt":" X and 𝛿","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"its pyramidal width.","element":"span"}],[{"text":"The CG algorithm and its variants make heavy use of the linear approximation ","element":"span"},{"style":{"height":18.3},"width":82.68,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-11.png","element":"img","alt":"ˆ𝑙𝑘(x)","inline":true,"padRight":true},{"text":"in Equation (","element":"span"},{"href":"#id-31","text":"2.3","element":"a"},{"text":"). 
What if we consider a quadratic approximation of ","element":"span"},{"style":{"height":15.2},"width":68.98,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-12.png","element":"img","alt":" 𝑓 (x)","inline":true},{"text":", as opposed to a linear approximation?","element":"span"}],[{"id":"id-43","style":{"fontWeight":"bold"},"text":"2.2 Projected Variable-Metric algorithms","element":"span"}],[{"text":"We define the quadratic approximation of the function ","element":"span"},{"style":{"height":15.2},"width":69.38,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-13.png","element":"img","alt":" 𝑓 (x)","inline":true,"padRight":true},{"text":"around the point ","element":"span"},{"style":{"height":9.59},"width":37.74,"height":23.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-14.png","element":"img","alt":" x𝑘","inline":true,"padRight":true},{"text":"using a matrix ","element":"span"},{"style":{"height":15.14},"width":152.02,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-15.png","element":"img","alt":" 𝐻𝑘 ∈ S𝑛++","inline":true},{"text":", ","element":"span"},{"text":"denoted by ","element":"span"},{"style":{"height":18.3},"width":143.82,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-16.png","element":"img","alt":"ˆ𝑓𝑘(x) as:","inline":true}],[{"id":"id-81","style":{"width":"72%"},"width":1360,"height":82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-17.png","element":"img"}],[{"text":"Intuitively, ","element":"span"},{"style":{"height":18.3},"width":82.07,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-18.png","element":"img","alt":"ˆ𝑓𝑘(x)","inline":true,"padRight":true},{"text":"will be a good local approximation to 
","element":"span"},{"style":{"height":15.2},"width":69.4,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-19.png","element":"img","alt":" 𝑓 (x)","inline":true,"padRight":true},{"text":"around ","element":"span"},{"style":{"height":9.59},"width":37.76,"height":23.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-20.png","element":"img","alt":" x𝑘","inline":true,"padRight":true},{"text":"if ","element":"span"},{"style":{"height":12.39},"width":45.48,"height":30.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-21.png","element":"img","alt":" 𝐻𝑘","inline":true,"padRight":true},{"text":"is a good approximation to ","element":"span"},{"style":{"height":16.99},"width":140.36,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-22.png","element":"img","alt":" ∇2 𝑓 (x𝑘)","inline":true},{"text":". In this case, the quadratic approximation to ","element":"span"},{"style":{"height":18.29},"width":82.08,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-23.png","element":"img","alt":"ˆ𝑓𝑘(x)","inline":true,"padRight":true},{"text":"will contain more information about the local curvature of the function ","element":"span"},{"style":{"height":15.2},"width":68.98,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-24.png","element":"img","alt":" 𝑓 (x)","inline":true,"padRight":true},{"text":"than the linear approximation ","element":"span"},{"style":{"height":18.3},"width":82.51,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-25.png","element":"img","alt":"ˆ𝑙𝑘(x)","inline":true},{"text":". 
Methods that minimize quadratic approximations of the function ","element":"span"},{"style":{"height":15.2},"width":75.62,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-26.png","element":"img","alt":" 𝑓 (x)","inline":true,"padRight":true},{"text":"over ","element":"span"},{"text":"X ","element":"span"},{"text":"to define iterates are commonly known as ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Projected Variable-Metric ","element":"span"},{"text":"(PVM) algorithms (","element":"span"},{"href":"#id-32","referenceIndex":52,"text":"Nesterov","element":"a"},{"text":", ","element":"span"},{"href":"#id-32","referenceIndex":52,"text":"2018","element":"a"},{"text":"; ","element":"span"},{"href":"#id-33","referenceIndex":8,"text":"Ben-Tal & Nemirovskii","element":"a"},{"text":", ","element":"span"},{"href":"#id-33","referenceIndex":8,"text":"2020","element":"a"},{"text":"). These methods could, for example, set ","element":"span"},{"style":{"height":18.78},"width":936.43,"height":46.95,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-27.png","element":"img","alt":" x𝑘+1 = x𝑘 + 𝛾𝑘(argminx∈X ˆ𝑓𝑘(x) − x𝑘), with 𝛾𝑘 ∈ (0, 1].","inline":true}],[{"text":"Minimizing the approximation ","element":"span"},{"style":{"height":18.29},"width":81.78,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-28.png","element":"img","alt":"ˆ𝑓𝑘(x)","inline":true,"padRight":true},{"text":"over ","element":"span"},{"text":"X ","element":"span"},{"text":"can be interpreted as a scaled projection operation onto ","element":"span"},{"text":"X","element":"span"},{"text":", which is why these methods are considered projection-based, as opposed to the CG algorithm.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Remark 2.2. 
","element":"span"},{"text":"Minimizing ","element":"span"},{"style":{"height":18.29},"width":81.74,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-29.png","element":"img","alt":"ˆ𝑓𝑘(x)","inline":true,"padRight":true},{"text":"over ","element":"span"},{"text":"X ","element":"span"},{"text":"can be viewed as the ","element":"span"},{"style":{"height":12.39},"width":45.48,"height":30.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-30.png","element":"img","alt":" 𝐻𝑘","inline":true},{"text":"-scaled projection of ","element":"span"},{"style":{"height":16.99},"width":276.65,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-31.png","element":"img","alt":" x𝑘 − 𝐻−1∇ 𝑓 (x𝑘)","inline":true,"padRight":true},{"text":"onto ","element":"span"},{"text":"X","element":"span"},{"text":", namely:","element":"span"}],[{"id":"id-34","style":{"width":"68%"},"width":1276,"height":74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-32.png","element":"img"}],[{"text":"We can recover many well-known algorithms from the PVM formulation, for example, if we set ","element":"span"},{"style":{"height":12.39},"width":86.12,"height":30.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-33.png","element":"img","alt":" 𝐻𝑘 =","inline":true},{"style":{"height":16.96},"width":146.58,"height":42.41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-34.png","element":"img","alt":"∇2 𝑓 (x𝑘)","inline":true,"padRight":true},{"text":"in Equation (","element":"span"},{"href":"#id-34","text":"2.5","element":"a"},{"text":") we recover the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Projected Newton ","element":"span"},{"text":"algorithm. 
Alternatively, if we use ","element":"span"},{"style":{"height":13.38},"width":128.58,"height":33.45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/3-35.png","element":"img","alt":" 𝐻𝑘 = 𝐼𝑛","inline":true,"padRight":true},{"text":"we recover the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Projected Gradient Descent ","element":"span"},{"text":"(PGD) algorithm.","element":"span"}],[{"text":"2.2.1 Local convergence","element":"span"}],[{"text":"One of the most attractive features of the Projected Newton algorithm with ","element":"span"},{"style":{"height":14},"width":124.72,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-0.png","element":"img","alt":" 𝛾𝑘 = 1","inline":true,"padRight":true},{"text":"when applied to Problem (","element":"span"},{"href":"#id-0","text":"1.1","element":"a"},{"text":") is its local quadratic convergence in distance to ","element":"span"},{"style":{"height":11.38},"width":35.45,"height":28.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-1.png","element":"img","alt":" x∗","inline":true},{"text":". 
This property also extends to PVM algorithms if ","element":"span"},{"style":{"height":12.39},"width":45.48,"height":30.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-2.png","element":"img","alt":" 𝐻𝑘","inline":true,"padRight":true},{"text":"approximates ","element":"span"},{"style":{"height":16.96},"width":146.58,"height":42.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-3.png","element":"img","alt":" ∇2 𝑓 (x𝑘)","inline":true,"padRight":true},{"text":"sufficiently well as ","element":"span"},{"style":{"height":9.19},"width":36.92,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-4.png","element":"img","alt":" x𝑘","inline":true,"padRight":true},{"text":"approaches ","element":"span"},{"style":{"height":11.39},"width":34.61,"height":28.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-5.png","element":"img","alt":" x∗","inline":true},{"text":". What do we mean by sufficiently well? 
As ","element":"span"},{"style":{"height":15.2},"width":68.98,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-6.png","element":"img","alt":" 𝑓 (x)","inline":true,"padRight":true},{"text":"is strongly convex we know that for any ","element":"span"},{"style":{"height":15.54},"width":702.99,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-7.png","element":"img","alt":" 𝐻𝑘 ∈ S𝑛++ and y ∈ X, then for d = y − x𝑘:","inline":true}],[{"style":{"width":"65%"},"width":1236,"height":90,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-8.png","element":"img"}],[{"text":"for ","element":"span"},{"style":{"height":19.03},"width":1116.71,"height":47.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-9.png","element":"img","alt":" 𝜂𝑘 = max{𝜆max(𝐻−1𝑘 ∇2 𝑓 (x𝑘)), 𝜆max([∇2 𝑓 (x𝑘)]−1𝐻𝑘)} and 𝜂𝑘 ≥ 1","inline":true,"padRight":true},{"text":"(see Lemma ","element":"span"},{"href":"#id-35","text":"A.6 ","element":"a"},{"text":"in Appendix ","element":"span"},{"href":"#id-36","text":"A.1","element":"a"},{"text":"). The ","element":"span"},{"text":"parameter ","element":"span"},{"style":{"height":10.4},"width":36.83,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-10.png","element":"img","alt":" 𝜂𝑘","inline":true,"padRight":true},{"text":"can be used to measure how well ","element":"span"},{"style":{"height":12.39},"width":45.48,"height":30.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-11.png","element":"img","alt":" 𝐻𝑘","inline":true,"padRight":true},{"text":"approximates ","element":"span"},{"style":{"height":16.99},"width":140.36,"height":42.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-12.png","element":"img","alt":" ∇2 𝑓 (x𝑘)","inline":true},{"text":", and will serve as our accuracy parameter. 
The chain of inequalities shown in Equation (","element":"span"},{"href":"#id-37","text":"2.6","element":"a"},{"text":") is presented as Assumption C in ","element":"span"},{"href":"#id-38","referenceIndex":37,"text":"Karimireddy ","element":"a"},{"href":"#id-38","referenceIndex":37,"text":"et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-38","referenceIndex":37,"text":"2018a","element":"a"},{"text":"), where it is used to prove the global convergence of an Inexact Projected Variable-Metric variant. Using ","element":"span"},{"style":{"height":16.99},"width":235.9,"height":42.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-13.png","element":"img","alt":" 𝐻𝑘 = ∇2 𝑓 (x𝑘)","inline":true,"padRight":true},{"text":"we recover ","element":"span"},{"style":{"height":14},"width":106.5,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-14.png","element":"img","alt":" 𝜂𝑘 = 1","inline":true},{"text":". We assume that we have access to an oracle ","element":"span"},{"style":{"height":15.54},"width":217.66,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-15.png","element":"img","alt":" Ω : X → S𝑛++","inline":true,"padRight":true},{"text":"that returns estimates of the Hessian that satisfy:","element":"span"}],[{"id":"id-41","style":{"fontWeight":"bold"},"text":"Assumption 2 ","element":"span"},{"text":"(Accuracy of Hessian oracle ","element":"span"},{"style":{"height":16},"width":58.45,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-16.png","element":"img","alt":" Ω).","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"The oracle ","element":"span"},{"style":{"height":10.8},"width":30,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-17.png","element":"img","alt":" Ω","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"queried with a point 
","element":"span"},{"style":{"fontWeight":"bold"},"text":"x ","element":"span"},{"style":{"fontStyle":"italic"},"text":"returns a matrix ","element":"span"},{"style":{"height":10.4},"width":32,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-18.png","element":"img","alt":" 𝐻","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"with a parameter ","element":"span"},{"style":{"height":10.4},"width":20,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-19.png","element":"img","alt":" 𝜂","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"such that:","element":"span"}],[{"id":"id-37","style":{"width":"56%"},"width":1062,"height":95,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-20.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Where ","element":"span"},{"style":{"height":16.99},"width":1034.43,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-21.png","element":"img","alt":" 𝜂 = max{𝜆max(𝐻−1∇2 𝑓 (x)), 𝜆max([∇2 𝑓 (x)]−1𝐻)} and 𝜔 ≥ 0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"denotes a known constant.","element":"span"}],[{"text":"Intuitively, the accuracy of the oracle improves as the oracle is queried with points closer to ","element":"span"},{"style":{"height":11.39},"width":35.46,"height":28.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-22.png","element":"img","alt":" x∗","inline":true},{"text":". If the oracle returns ","element":"span"},{"style":{"height":16.96},"width":689.66,"height":42.41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-23.png","element":"img","alt":" Ω (x) = ∇2 𝑓 (x) for all x ∈ X then 𝜔 = 0","inline":true},{"text":". 
This assumption allows us to obtain local quadratic convergence in distance to ","element":"span"},{"style":{"height":11.39},"width":34.92,"height":28.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-24.png","element":"img","alt":" x∗ ","inline":true,"padRight":true},{"text":"for the simplest PVM algorithm, i.e., ","element":"span"},{"style":{"height":20.33},"width":696.36,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-25.png","element":"img","alt":" x𝑘+1 = x∗𝑘+1 = argminx∈X ˆ𝑓𝑘(x), as shown","inline":true,"padRight":true},{"text":"in Theorem ","element":"span"},{"href":"#id-39","text":"2.4 ","element":"a"},{"text":"(see Corollary ","element":"span"},{"href":"#id-40","text":"C.12 ","element":"a"},{"text":"in Appendix ","element":"span"},{"text":"C","element":"span"},{"text":").","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Remark 2.3. ","element":"span"},{"text":"Note that finding a matrix ","element":"span"},{"style":{"height":10.4},"width":32,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-26.png","element":"img","alt":" 𝐻","inline":true,"padRight":true},{"text":"satisfying Assumption ","element":"span"},{"href":"#id-41","text":"2 ","element":"a"},{"text":"at ","element":"span"},{"style":{"fontWeight":"bold"},"text":"x","element":"span"},{"text":", given a fixed ","element":"span"},{"style":{"height":7.6},"width":28,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-27.png","element":"img","alt":" 𝜔","inline":true,"padRight":true},{"text":"requires knowledge of a tight lower bound on ","element":"span"},{"style":{"height":14.8},"width":149.67,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-28.png","element":"img","alt":" ∥x − x∗∥.","inline":true}],[{"id":"id-39","style":{"fontWeight":"bold"},"text":"Theorem 2.4 ","element":"span"},{"text":"(Local quadratic convergence of vanilla PVM 
algorithm)","element":"span"},{"style":{"height":11.6},"width":230.65,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-29.png","element":"img","alt":". Given an 𝐿","inline":true},{"style":{"fontStyle":"italic"},"text":"-smooth and ","element":"span"},{"style":{"height":14.8},"width":172.11,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-30.png","element":"img","alt":" 𝜇-strongly","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"convex function with ","element":"span"},{"style":{"height":12.39},"width":40.07,"height":30.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-31.png","element":"img","alt":" 𝐿2","inline":true},{"style":{"fontStyle":"italic"},"text":"-Lipschitz Hessian and a convex set ","element":"span"},{"text":"X ","element":"span"},{"style":{"fontStyle":"italic"},"text":"if Assumption ","element":"span"},{"href":"#id-41","style":{"fontStyle":"italic"},"text":"2 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"is satisfied, and we set ","element":"span"},{"style":{"height":20.34},"width":514.14,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-32.png","element":"img","alt":"x𝑘+1 = x∗𝑘+1 = argminx∈X ˆ𝑓𝑘(x)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"we have for all ","element":"span"},{"style":{"height":12.4},"width":103.68,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-33.png","element":"img","alt":" 𝑘 ≥ 0:","inline":true}],[{"style":{"width":"69%"},"width":1303,"height":203,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-34.png","element":"img"}],[{"text":"2.2.2 Global convergence","element":"span"}],[{"text":"One of the key questions that remains to be answered in this section is how PVM algorithms behave globally. 
For Problem (","element":"span"},{"href":"#id-0","text":"1.1","element":"a"},{"text":") the vanilla PVM algorithm with unit step size will converge globally, and if we use bounded step sizes, or an exact line search, we can show that the primal gap contracts linearly (Theorem ","element":"span"},{"href":"#id-42","text":"2.5","element":"a"},{"text":"). The global convergence of these methods can be recast in terms of a notion related to the multiplicative stability of the Hessian, allowing for elegant proofs of convergence (","element":"span"},{"href":"#id-38","referenceIndex":37,"text":"Karimireddy et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-38","referenceIndex":37,"text":"2018a","element":"a"},{"text":").","element":"span"}],[{"id":"id-42","style":{"fontWeight":"bold"},"text":"Theorem 2.5 ","element":"span"},{"text":"(Primal gap convergence of vanilla PVM algorithm with line search)","element":"span"},{"style":{"fontWeight":"bold"},"text":". ","element":"span"},{"style":{"fontStyle":"italic"},"text":"(","element":"span"},{"href":"#id-38","referenceIndex":37,"style":{"fontStyle":"italic"},"text":"Karimireddy et al.","element":"a"},{"style":{"fontStyle":"italic"},"text":",","element":"span"}],[{"href":"#id-38","referenceIndex":37,"style":{"fontStyle":"italic"},"text":"2018a","element":"a"},{"style":{"fontStyle":"italic"},"text":", Theorem 4) Given an ","element":"span"},{"style":{"height":10.4},"width":24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-35.png","element":"img","alt":" 𝐿","inline":true},{"style":{"fontStyle":"italic"},"text":"-smooth and ","element":"span"},{"style":{"height":10.8},"width":23,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-36.png","element":"img","alt":" 𝜇","inline":true},{"style":{"fontStyle":"italic"},"text":"-strongly convex function and a convex set ","element":"span"},{"text":"X 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"then the vanilla PVM algorithm with an exact line search or with a step size ","element":"span"},{"style":{"height":20.98},"width":145.1,"height":52.45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-37.png","element":"img","alt":" 𝛾𝑘 = 𝜇𝐿𝜂𝑘 ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"guarantees for all ","element":"span"},{"style":{"height":12.4},"width":103.68,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-38.png","element":"img","alt":" 𝑘 ≥ 0:","inline":true}],[{"style":{"width":"99%"},"width":1872,"height":317,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/4-39.png","element":"img"}]]},{"heading":"3. Second-order Conditional Gradient Sliding Algorithm","paragraphs":[[{"text":"The discussion of PVM algorithms in Section ","element":"span"},{"href":"#id-43","text":"2.2 ","element":"a"},{"text":"did not address two important concerns:","element":"span"}],[{"text":"1. The PVM algorithm requires computing a scaled projection at every iteration. These projections are usually too expensive to compute to optimality. Ideally we would want to solve these scaled projection problems to a certain accuracy, but can we maintain the local quadratic convergence in distance to the optimum shown in Theorem ","element":"span"},{"href":"#id-39","text":"2.4 ","element":"a"},{"text":"when computing approximate scaled projections?","element":"span"}],[{"text":"2. 
The global convergence rate of the PVM algorithm with exact line search and perfect Hessian information (Theorem ","element":"span"},{"href":"#id-42","text":"2.5 ","element":"a"},{"text":"with ","element":"span"},{"style":{"height":14},"width":105.68,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/5-0.png","element":"img","alt":" 𝜂𝑘 = 1","inline":true},{"text":") has a worse dependence on the condition number ","element":"span"},{"style":{"height":15.2},"width":67.39,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/5-1.png","element":"img","alt":" 𝐿/𝜇","inline":true,"padRight":true},{"text":"than the convergence rate of the PGD and the ACG algorithm (see Theorem ","element":"span"},{"href":"#id-30","text":"2.1 ","element":"a"},{"text":"for the latter). Can we couple Inexact PVM steps with ACG steps and improve the global convergence rate in Theorem ","element":"span"},{"href":"#id-42","text":"2.5","element":"a"},{"text":"?","element":"span"}],[{"text":"The Second-order Conditional Gradient Sliding (SOCGS) algorithm (Algorithm ","element":"span"},{"href":"#id-44","text":"2","element":"a"},{"text":") is designed with these considerations in mind, providing global linear convergence in primal gap and local quadratic convergence in primal gap and distance to ","element":"span"},{"style":{"height":11.39},"width":35.43,"height":28.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/5-2.png","element":"img","alt":" x∗","inline":true},{"text":". The algorithm couples an independent ACG step with line search (Line ","element":"span"},{"href":"#id-44","text":"4","element":"a"},{"text":") with an Inexact PVM step with unit step size (Lines ","element":"span"},{"href":"#id-44","text":"9","element":"a"},{"text":"-","element":"span"},{"href":"#id-44","text":"12","element":"a"},{"text":"). 
At the end of each iteration we choose the step that provides the greatest primal progress (Lines ","element":"span"},{"href":"#id-44","text":"14","element":"a"},{"text":"-","element":"span"},{"href":"#id-44","text":"18","element":"a"},{"text":"). The ACG steps in Line ","element":"span"},{"href":"#id-44","text":"4 ","element":"a"},{"text":"will ensure global linear convergence in primal gap, and the Inexact PVM steps in Lines ","element":"span"},{"href":"#id-44","text":"14","element":"a"},{"text":"-","element":"span"},{"href":"#id-44","text":"18 ","element":"a"},{"text":"will provide quadratic convergence. Note that the ACG iterates in Line ","element":"span"},{"href":"#id-44","text":"4 ","element":"a"},{"text":"do not depend on the Inexact PVM steps in Lines ","element":"span"},{"href":"#id-44","text":"9","element":"a"},{"text":"-","element":"span"},{"href":"#id-44","text":"12","element":"a"},{"text":". This is because the ACG steps do not contract the primal gap on a per-iteration basis (see discussion in Section ","element":"span"},{"href":"#id-45","text":"2.1.1","element":"a"},{"text":").","element":"span"}],[{"text":"We compute the scaled projection in the Inexact PVM step (Lines ","element":"span"},{"href":"#id-44","text":"9","element":"a"},{"text":"-","element":"span"},{"href":"#id-44","text":"12","element":"a"},{"text":") using the ACG algorithm with exact line search, thereby making the SOCGS algorithm (Algorithm ","element":"span"},{"href":"#id-44","text":"2","element":"a"},{"text":") projection-free. As the function being minimized in the Inexact PVM steps is quadratic there is a closed-form expression for the optimal step size for ","element":"span"},{"style":{"height":18.27},"width":88.3,"height":45.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/5-3.png","element":"img","alt":"ˆ𝑓𝑘 (x)","inline":true,"padRight":true},{"text":"in Line ","element":"span"},{"href":"#id-44","text":"10","element":"a"},{"text":". 
","element":"span"},{"text":"The scaled projection problem is solved to an accuracy ","element":"span"},{"style":{"height":9.59},"width":36.68,"height":23.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/5-4.png","element":"img","alt":" 𝜀𝑘","inline":true,"padRight":true},{"text":"such that ","element":"span"},{"style":{"height":18.3},"width":496.61,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/5-5.png","element":"img","alt":"ˆ𝑓𝑘(˜x𝑘+1) − minx∈X ˆ𝑓𝑘 (x) ≤ 𝜀𝑘","inline":true},{"text":", using the Frank-Wolfe gap as a stopping criterion, as in the CGS algorithm (","element":"span"},{"href":"#id-11","referenceIndex":41,"text":"Lan & Zhou","element":"a"},{"text":", ","element":"span"},{"href":"#id-11","referenceIndex":41,"text":"2016","element":"a"},{"text":"). The accuracy parameter ","element":"span"},{"style":{"height":9.59},"width":36.67,"height":23.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/5-6.png","element":"img","alt":" 𝜀𝑘","inline":true,"padRight":true},{"text":"in the SOCGS algorithm depends on a lower bound on the primal gap of Problem ","element":"span"},{"href":"#id-0","text":"1.1 ","element":"a"},{"text":"which we denote by ","element":"span"},{"style":{"height":15.2},"width":112.68,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/5-7.png","element":"img","alt":" 𝑙𝑏 (x𝑘)","inline":true,"padRight":true},{"text":"that satisfies ","element":"span"},{"style":{"height":15.2},"width":423.49,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/5-8.png","element":"img","alt":" 𝑙𝑏 (x𝑘) ≤ 𝑓 (x𝑘) − 𝑓 (x∗).","inline":true}],[{"style":{"width":"99%"},"width":1872,"height":852,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/5-9.png","element":"img"}],[{"text":"Note that this guarantee also holds if we use a line search instead of the step size described above, as the line search is guaranteed to make at 
least as much progress. Computing the aforementioned quantity comes at no extra cost if ","element":"span"},{"style":{"height":10.4},"width":24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/5-10.png","element":"img","alt":" 𝐿","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":10.8},"width":30,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/5-11.png","element":"img","alt":" 𝐷","inline":true,"padRight":true},{"text":"are known, as the Frank-Wolfe vertex from Line ","element":"span"},{"href":"#id-44","text":"4 ","element":"a"},{"text":"of Algorithm ","element":"span"},{"href":"#id-44","text":"2 ","element":"a"},{"text":"can be reused. Alternatively one could use any CG variant that monotonically decreases the primal gap. It suffices to run an arbitrary number of steps ","element":"span"},{"style":{"height":7.2},"width":20,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/5-12.png","element":"img","alt":" 𝑛","inline":true,"padRight":true},{"text":"of the aforementioned variant to minimize ","element":"span"},{"style":{"height":15.2},"width":69.34,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/5-13.png","element":"img","alt":" 𝑓 (x)","inline":true,"padRight":true},{"text":"starting from ","element":"span"},{"style":{"height":9.59},"width":37.7,"height":23.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/5-14.png","element":"img","alt":" x𝑘","inline":true},{"text":", resulting in ","element":"span"},{"style":{"height":17.32},"width":115.54,"height":43.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/5-15.png","element":"img","alt":" x𝑛𝑘 ∈ X","inline":true},{"text":". 
Simply noting that ","element":"span"},{"style":{"height":17.32},"width":232.17,"height":43.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/5-16.png","element":"img","alt":" 𝑓 (x𝑛𝑘) ≥ 𝑓 (x∗)","inline":true,"padRight":true},{"text":"allows us to conclude that ","element":"span"},{"style":{"height":17.32},"width":513.25,"height":43.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/5-17.png","element":"img","alt":" 𝑓 (x𝑘) − 𝑓 (x∗) ≥ 𝑓 (x𝑘) − 𝑓 (x𝑛𝑘)","inline":true},{"text":", and ","element":"span"},{"text":"therefore a valid lower bound is ","element":"span"},{"style":{"height":17.35},"width":395.56,"height":43.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/5-18.png","element":"img","alt":" 𝑙𝑏 (x𝑘) = 𝑓 (x𝑘) − 𝑓 (x𝑛𝑘)","inline":true},{"text":". The higher the number of CG steps performed from ","element":"span"},{"style":{"height":9.59},"width":37.34,"height":23.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/5-19.png","element":"img","alt":"x𝑘","inline":true},{"text":", the tighter the resulting lower bound will be.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Remark 3.3 ","element":"span"},{"text":"(Assuming knowledge of a lower bound)","element":"span"},{"style":{"fontWeight":"bold"},"text":". 
","element":"span"},{"text":"In several machine learning applications the value of ","element":"span"},{"style":{"height":15.2},"width":85.09,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/6-0.png","element":"img","alt":" 𝑓 (x∗)","inline":true,"padRight":true},{"text":"is known a priori, such is the case of the approximate Carathéodory problem (","element":"span"},{"href":"#id-46","referenceIndex":48,"text":"Mirrokni et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-46","referenceIndex":48,"text":"2017","element":"a"},{"text":"; ","element":"span"},{"href":"#id-47","referenceIndex":14,"text":"Combettes & Pokutta","element":"a"},{"text":", ","element":"span"},{"href":"#id-47","referenceIndex":14,"text":"2023","element":"a"},{"text":") where ","element":"span"},{"style":{"height":15.2},"width":153.13,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/6-1.png","element":"img","alt":" 𝑓 (x∗) = 0","inline":true},{"text":". 
In other applications, estimating ","element":"span"},{"style":{"height":15.2},"width":85.1,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/6-2.png","element":"img","alt":" 𝑓 (x∗)","inline":true,"padRight":true},{"text":"is easier than estimating the strong convexity parameter (see (","element":"span"},{"href":"#id-48","referenceIndex":6,"text":"Barré et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-48","referenceIndex":6,"text":"2020","element":"a"},{"text":"; ","element":"span"},{"href":"#id-49","referenceIndex":5,"text":"Barré & d’Aspremont","element":"a"},{"text":", ","element":"span"},{"href":"#id-49","referenceIndex":5,"text":"2019","element":"a"},{"text":"; ","element":"span"},{"href":"#id-50","referenceIndex":3,"text":"Asi & Duchi","element":"a"},{"text":", ","element":"span"},{"href":"#id-50","referenceIndex":3,"text":"2019","element":"a"},{"text":"; ","element":"span"},{"href":"#id-51","referenceIndex":32,"text":"Hazan & Kakade","element":"a"},{"text":", ","element":"span"},{"href":"#id-51","referenceIndex":32,"text":"2019","element":"a"},{"text":") for an in-depth discussion). This allows for tight lower bounds on the primal gap.","element":"span"}],[{"id":"id-44","style":{"width":"99%"},"width":1872,"height":1233,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/6-3.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"3.1 Global convergence","element":"span"}],[{"text":"The global convergence rate in primal gap of the SOCGS algorithm (Algorithm ","element":"span"},{"href":"#id-44","text":"2","element":"a"},{"text":") is driven by the ACG steps in Line ","element":"span"},{"href":"#id-44","text":"4","element":"a"},{"text":", as such:","element":"span"}],[{"id":"id-137","style":{"height":13.99},"width":516.11,"height":34.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/6-4.png","element":"img","alt":"Theorem 3.4. 
Given x0 ∈ X","inline":true},{"style":{"fontStyle":"italic"},"text":", then the SOCGS algorithm applied to Problem ","element":"span"},{"text":"(","element":"span"},{"href":"#id-0","text":"1.1","element":"a"},{"text":") ","element":"span"},{"style":{"fontStyle":"italic"},"text":"satisfies:","element":"span"}],[{"style":{"width":"73%"},"width":1378,"height":130,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/6-5.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where ","element":"span"},{"style":{"height":10.8},"width":30,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/6-6.png","element":"img","alt":" 𝐷","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"denotes the diameter of the polytope ","element":"span"},{"style":{"height":12.4},"width":142.44,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/6-7.png","element":"img","alt":" X and 𝛿","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"its pyramidal width.","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"As at each step the SOCGS algorithm (Algorithm ","element":"span"},{"href":"#id-44","text":"2","element":"a"},{"text":") chooses between the independent ACG step (Line ","element":"span"},{"href":"#id-44","text":"4","element":"a"},{"text":") and the Inexact PVM step (Lines ","element":"span"},{"href":"#id-44","text":"9","element":"a"},{"text":"-","element":"span"},{"href":"#id-44","text":"12","element":"a"},{"text":") according to which one provides the greatest primal progress, the primal gap convergence in Theorem ","element":"span"},{"href":"#id-30","text":"2.1 ","element":"a"},{"text":"applies. 
","element":"span"},{"style":{"height":0},"width":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/6-8.png","element":"img","alt":"□","inline":true}],[{"style":{"fontWeight":"bold"},"text":"3.2 Local convergence","element":"span"}],[{"text":"Despite computing inexact scaled projections in Lines ","element":"span"},{"href":"#id-44","text":"9","element":"a"},{"text":"-","element":"span"},{"href":"#id-44","text":"12 ","element":"a"},{"text":"of Algorithm ","element":"span"},{"href":"#id-44","text":"2","element":"a"},{"text":", the Inexact PVM steps contract the distance to optimum quadratically when close enough to the optimal solution.","element":"span"}],[{"id":"id-52","style":{"height":14.4},"width":419.99,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-0.png","element":"img","alt":"Lemma 3.5. Given a 𝜇","inline":true},{"style":{"fontStyle":"italic"},"text":"-strongly convex and ","element":"span"},{"style":{"height":10.4},"width":24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-1.png","element":"img","alt":" 𝐿","inline":true},{"style":{"fontStyle":"italic"},"text":"-smooth function ","element":"span"},{"style":{"height":15.2},"width":210.79,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-2.png","element":"img","alt":" 𝑓 (x) with 𝐿2","inline":true},{"style":{"fontStyle":"italic"},"text":"-Lipschitz Hessian and a convex set ","element":"span"},{"text":"X","element":"span"},{"style":{"fontStyle":"italic"},"text":", if Assumption ","element":"span"},{"href":"#id-41","style":{"fontStyle":"italic"},"text":"2 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"is satisfied then the Inexact PVM steps in Lines ","element":"span"},{"href":"#id-44","style":{"fontStyle":"italic"},"text":"9","element":"a"},{"style":{"fontStyle":"italic"},"text":"-","element":"span"},{"href":"#id-44","style":{"fontStyle":"italic"},"text":"12 
","element":"a"},{"style":{"fontStyle":"italic"},"text":"of Algorithm ","element":"span"},{"href":"#id-44","style":{"fontStyle":"italic"},"text":"2 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"satisfy for all ","element":"span"},{"style":{"height":12.4},"width":103.69,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-3.png","element":"img","alt":"𝑘 ≥ 0:","inline":true}],[{"id":"id-55","style":{"width":"75%"},"width":1420,"height":94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-4.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where ","element":"span"},{"style":{"height":19.03},"width":1109.43,"height":47.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-5.png","element":"img","alt":" 𝜂𝑘 = max{𝜆max(𝐻−1𝑘 ∇2 𝑓 (x𝑘)), 𝜆max([∇2 𝑓 (x𝑘)]−1𝐻𝑘)} and 𝜔 ≥ 0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"denotes a constant.","element":"span"}],[{"text":"In order to take advantage of the quadratic convergence in distance to the optimum shown in Lemma ","element":"span"},{"href":"#id-52","text":"3.5","element":"a"},{"text":", we need to show that at some point the SOCGS algorithm will always choose in Lines ","element":"span"},{"href":"#id-44","text":"14","element":"a"},{"text":"-","element":"span"},{"href":"#id-44","text":"19 ","element":"a"},{"text":"the Inexact PVM step defined in Lines ","element":"span"},{"href":"#id-44","text":"9","element":"a"},{"text":"-","element":"span"},{"href":"#id-44","text":"12","element":"a"},{"text":". To be more specific, we show that the convergence in primal gap for the Inexact PVM step will also be quadratic. 
We do this by first showing that there is an iteration ","element":"span"},{"style":{"height":12},"width":99.54,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-6.png","element":"img","alt":" 𝐾 ≥ 0","inline":true,"padRight":true},{"text":"such that for all ","element":"span"},{"href":"#id-53","style":{"height":16.4},"width":692.38,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-7.png","element":"img","alt":" 𝑘 ≥ 𝐾 we have x𝑘 ∈ F (x∗) (Lemma D.8","inline":true,"padRight":true},{"text":"in Appendix ","element":"span"},{"href":"#id-54","text":"D.1","element":"a"},{"text":").","element":"span"}],[{"id":"id-56","style":{"height":14.8},"width":429.48,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-8.png","element":"img","alt":"Lemma 3.6. Given a 𝜇","inline":true},{"style":{"fontStyle":"italic"},"text":"-strongly convex and ","element":"span"},{"style":{"height":10.4},"width":24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-9.png","element":"img","alt":" 𝐿","inline":true},{"style":{"fontStyle":"italic"},"text":"-smooth function ","element":"span"},{"style":{"height":15.2},"width":213.92,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-10.png","element":"img","alt":" 𝑓 (x) with 𝐿2","inline":true},{"style":{"fontStyle":"italic"},"text":"-Lipschitz continuous Hessian and a polytope ","element":"span"},{"text":"X","element":"span"},{"style":{"fontStyle":"italic"},"text":", if Assumption ","element":"span"},{"href":"#id-28","style":{"fontStyle":"italic"},"text":"1 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"href":"#id-41","style":{"fontStyle":"italic"},"text":"2 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"are satisfied, then there is an index 
","element":"span"},{"style":{"height":12},"width":99.55,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-11.png","element":"img","alt":" 𝐾 ≥ 0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"such that for ","element":"span"},{"style":{"height":12.4},"width":100.42,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-12.png","element":"img","alt":" 𝑘 ≥ 𝐾","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"we have that ","element":"span"},{"style":{"height":15.2},"width":188.88,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-13.png","element":"img","alt":" x𝑘 ∈ F (x∗)","inline":true},{"style":{"fontStyle":"italic"},"text":", that is, both the Inexact PVM steps (Lines ","element":"span"},{"href":"#id-44","style":{"fontStyle":"italic"},"text":"9","element":"a"},{"style":{"fontStyle":"italic"},"text":"-","element":"span"},{"href":"#id-44","style":{"fontStyle":"italic"},"text":"12 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"of Algorithm ","element":"span"},{"href":"#id-44","style":{"fontStyle":"italic"},"text":"2","element":"a"},{"style":{"fontStyle":"italic"},"text":") and the ACG step (Line ","element":"span"},{"href":"#id-44","style":{"fontStyle":"italic"},"text":"4 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"of Algorithm ","element":"span"},{"href":"#id-44","style":{"fontStyle":"italic"},"text":"2","element":"a"},{"style":{"fontStyle":"italic"},"text":") are in ","element":"span"},{"style":{"height":15.2},"width":114.08,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-14.png","element":"img","alt":" F (x∗).","inline":true}],[{"text":"We can upper bound the right-hand side of Equation (","element":"span"},{"href":"#id-55","text":"3.2","element":"a"},{"text":") using strong convexity, and the left-hand side using smoothness, Lemma 
","element":"span"},{"href":"#id-56","text":"3.6 ","element":"a"},{"text":"and strict-complementarity (Assumption ","element":"span"},{"href":"#id-28","text":"1","element":"a"},{"text":"). This allows us to show that there exists an iteration after which the primal progress of the Inexact PVM steps in Lines ","element":"span"},{"href":"#id-44","text":"9","element":"a"},{"text":"-","element":"span"},{"href":"#id-44","text":"12 ","element":"a"},{"text":"will be quadratic, which ensures the local quadratic convergence of the SOCGS algorithm.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Theorem 3.7 ","element":"span"},{"text":"(Quadratic convergence in primal gap of the SOCGS algorithm)","element":"span"},{"style":{"height":14.8},"width":208.63,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-15.png","element":"img","alt":". Given a 𝜇","inline":true},{"style":{"fontStyle":"italic"},"text":"-strongly convex and ","element":"span"},{"style":{"height":10.4},"width":24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-16.png","element":"img","alt":" 𝐿","inline":true},{"style":{"fontStyle":"italic"},"text":"-smooth function ","element":"span"},{"style":{"height":15.2},"width":212.52,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-17.png","element":"img","alt":" 𝑓 (x) with 𝐿2","inline":true},{"style":{"fontStyle":"italic"},"text":"-Lipschitz Hessian and a polytope ","element":"span"},{"text":"X","element":"span"},{"style":{"fontStyle":"italic"},"text":", if Assumption ","element":"span"},{"href":"#id-28","style":{"fontStyle":"italic"},"text":"1 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"and Assumption ","element":"span"},{"href":"#id-41","style":{"fontStyle":"italic"},"text":"2 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"are satisfied, then there is a 
","element":"span"},{"style":{"height":12},"width":99.49,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-18.png","element":"img","alt":" 𝐾 ≥ 0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"such that for ","element":"span"},{"style":{"height":12.4},"width":100.35,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-19.png","element":"img","alt":" 𝑘 ≥ 𝐾","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"the iterates of the SOCGS algorithm (Algorithm ","element":"span"},{"href":"#id-44","style":{"fontStyle":"italic"},"text":"2","element":"a"},{"style":{"fontStyle":"italic"},"text":") satisfy:","element":"span"}],[{"style":{"width":"61%"},"width":1158,"height":90,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-20.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where ","element":"span"},{"style":{"height":19.02},"width":1109.43,"height":47.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-21.png","element":"img","alt":" 𝜂𝑘 = max{𝜆max(𝐻−1𝑘 ∇2 𝑓 (x𝑘)), 𝜆max([∇2 𝑓 (x𝑘)]−1𝐻𝑘)} and 𝜔 ≥ 0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"denotes a constant.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"3.3 Complexity analysis","element":"span"}],[{"text":"We defer the full details of the complexity analysis to Section ","element":"span"},{"href":"#id-57","text":"D.2 ","element":"a"},{"text":"in Appendix ","element":"span"},{"text":"D","element":"span"},{"text":". 
Throughout this section we make the simplifying assumption that we have at our disposal the tightest possible lower bound ","element":"span"},{"style":{"height":15.2},"width":162.5,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-22.png","element":"img","alt":" 𝑙𝑏(x𝑘) on","inline":true,"padRight":true},{"text":"the primal gap, that is, ","element":"span"},{"style":{"height":15.2},"width":385.72,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-23.png","element":"img","alt":" 𝑙𝑏(x𝑘) = 𝑓 (x𝑘) − 𝑓 (x∗)","inline":true,"padRight":true},{"text":"(in Remark ","element":"span"},{"href":"#id-58","text":"3.8 ","element":"a"},{"text":"we address a strategy that can be used when the primal gap is not known). Let ","element":"span"},{"style":{"height":16.99},"width":436.96,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-24.png","element":"img","alt":" 𝑟 = min{𝑟ACG, 𝑟PVM} > 0","inline":true,"padRight":true},{"text":"(where ","element":"span"},{"style":{"height":14.19},"width":88.88,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-25.png","element":"img","alt":" 𝑟ACG","inline":true,"padRight":true},{"text":"is described in Theorem ","element":"span"},{"href":"#id-59","text":"D.4 ","element":"a"},{"text":"and ","element":"span"},{"style":{"height":14.18},"width":92.42,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-26.png","element":"img","alt":"𝑟PVM","inline":true,"padRight":true},{"text":"in Corollary ","element":"span"},{"href":"#id-60","text":"D.7","element":"a"},{"text":"), ","element":"span"},{"style":{"height":15.22},"width":364.66,"height":38.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-27.png","element":"img","alt":" 𝐺 = maxx∈X ∥∇ 𝑓 (x)∥","inline":true,"padRight":true},{"text":"and 
","element":"span"},{"style":{"height":17.78},"width":740.05,"height":44.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-28.png","element":"img","alt":" 𝛽 = max{(2𝐷𝐺)1/4, (2𝐿(1 + 𝜔𝐷2)𝐷3𝐺)1/8}","inline":true},{"text":". With these considerations in mind the different oracle complexities are listed in Table ","element":"span"},{"href":"#id-61","text":"1","element":"a"},{"text":". As in the classical analysis of PVM algorithms, the SOCGS algorithm shows local quadratic convergence after a number of iterations that is independent of ","element":"span"},{"style":{"height":7.6},"width":20,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-29.png","element":"img","alt":" 𝜀","inline":true,"padRight":true},{"text":"(but dependent on ","element":"span"},{"style":{"height":16},"width":219.74,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-30.png","element":"img","alt":" 𝑓 (x) and X).","inline":true}],[{"id":"id-58","style":{"fontWeight":"bold"},"text":"Remark 3.8. ","element":"span"},{"text":"Providing a looser lower bound ","element":"span"},{"style":{"height":15.2},"width":106.46,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-31.png","element":"img","alt":" 𝑙𝑏(x𝑘)","inline":true,"padRight":true},{"text":"on the primal gap does not affect the number of first-order or Hessian oracle calls, however it can significantly increase the number of linear optimization oracle calls used to compute the Inexact PVM steps in Lines ","element":"span"},{"href":"#id-44","text":"9","element":"a"},{"text":"-","element":"span"},{"href":"#id-44","text":"12","element":"a"},{"text":". 
Note that the progress guarantee from a single ACG step that is not an away-step that drops a vertex is ","element":"span"},{"style":{"height":19.78},"width":711.25,"height":49.45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-32.png","element":"img","alt":" 𝑓 (x𝑘) − 𝑓 (x𝑘+1) ≥ 𝜇4𝐿 ( 𝛿𝐷 )2( 𝑓 (𝑥𝑘) − 𝑓 (𝑥∗))","inline":true,"padRight":true},{"text":"(see Theorem 1 in ","element":"span"},{"href":"#id-27","referenceIndex":39,"text":"Lacoste-Julien & Jaggi ","element":"a"},{"text":"(","element":"span"},{"href":"#id-27","referenceIndex":39,"text":"2015","element":"a"},{"text":")). If we use as ","element":"span"},{"style":{"height":15.2},"width":105.62,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-33.png","element":"img","alt":" 𝑙𝑏(x𝑘)","inline":true,"padRight":true},{"text":"the progress obtained from such a step (note that ","element":"span"},{"style":{"height":15.2},"width":545.06,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-34.png","element":"img","alt":" 𝑓 (x𝑘) − 𝑓 (x∗) ≥ 𝑓 (x𝑘) − 𝑓 (x𝑘+1)","inline":true},{"text":") in the complexity analysis, one can obtain after a finite number of iterations, a ","element":"span"},{"style":{"height":14.8},"width":175.14,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-35.png","element":"img","alt":" log log 1/𝜀","inline":true,"padRight":true},{"text":"complexity in terms of FO and Hessian oracle calls and ","element":"span"},{"style":{"height":15.2},"width":348.64,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/7-36.png","element":"img","alt":" log(1/𝜀) log(log 1/𝜀)","inline":true,"padRight":true},{"text":"LO calls, but with worse constants than the ones in Table 
","element":"span"},{"href":"#id-61","text":"1","element":"a"},{"text":".","element":"span"}],[{"id":"id-61","style":{"width":"94%"},"width":1768,"height":337,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/8-0.png","element":"img"}],[{"text":"Table 1: Complexity to reach an ","element":"figcaption","subtype":"caption"},{"style":{"height":7.6},"width":20,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/8-1.png","element":"img","alt":" 𝜀","inline":true},{"text":"-optimal solution to Problem (","element":"figcaption","subtype":"caption"},{"href":"#id-0","text":"1.1","element":"a","subtype":"caption"},{"text":") for the SOCGS algorithm.","element":"figcaption","subtype":"caption"}]]},{"heading":"4. Computations","paragraphs":[[{"text":"We compare the performance of the SOCGS algorithm with that of other projection-free algorithms, and that of Projected-Gradient Descent (PGD). In all experiments we compare against the vanilla CG algorithm, the ACG algorithm, the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Pairwise-Step Conditional Gradients ","element":"span"},{"text":"algorithms (PCG) and the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Lazy ACG ","element":"span"},{"text":"algorithm (","element":"span"},{"href":"#id-62","referenceIndex":9,"text":"Braun et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-62","referenceIndex":9,"text":"2017","element":"a"},{"text":") (ACG (L)). 
In the first experiment we also compare against the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Decomposition Invariant Conditional Gradient ","element":"span"},{"text":"(DICG) algorithm (","element":"span"},{"href":"#id-63","referenceIndex":24,"text":"Garber & Meshi","element":"a"},{"text":", ","element":"span"},{"href":"#id-63","referenceIndex":24,"text":"2016","element":"a"},{"text":"), the CGS algorithm (","element":"span"},{"href":"#id-11","referenceIndex":41,"text":"Lan & Zhou","element":"a"},{"text":", ","element":"span"},{"href":"#id-11","referenceIndex":41,"text":"2016","element":"a"},{"text":") and the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Stochastic Variance-Reduced Conditional Gradients ","element":"span"},{"text":"(SVRCG) algorithm (","element":"span"},{"href":"#id-64","referenceIndex":33,"text":"Hazan & Luo","element":"a"},{"text":", ","element":"span"},{"href":"#id-64","referenceIndex":33,"text":"2016","element":"a"},{"text":"). We were not able to achieve acceptable performance with the CGS algorithm in the second and third experiment and with the SVRCG algorithm in the third experiment. Lastly we also compare against the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Newton Conditional Gradients ","element":"span"},{"text":"(NCG) algorithm (","element":"span"},{"href":"#id-16","referenceIndex":46,"text":"Liu et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-16","referenceIndex":46,"text":"2022","element":"a"},{"text":") which is similar in spirit to the SOCGS algorithm, in the second and third experiment. 
One of the key features of the NCG algorithm is that it does not require an exact line search strategy, as it provides a specific step size strategy (however it requires selecting five hyperparameters and using an exact Hessian).","element":"span"}],[{"text":"In the first problem the Hessian oracle will be inexact, but will satisfy Assumption ","element":"span"},{"href":"#id-41","text":"2 ","element":"a"},{"text":"with ","element":"span"},{"style":{"height":13.2},"width":301.34,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/8-2.png","element":"img","alt":" 𝜔 = 0.1, moreover","inline":true,"padRight":true},{"text":"we will also assume knowledge of the primal gap, by first computing a solution to high accuracy. In the remaining problems the Hessian oracle will be exact, and we will assume that we do not have knowledge of the primal gap, and will use the strategy outlined in Remark ","element":"span"},{"href":"#id-58","text":"3.8","element":"a"},{"text":". In the second experiment, in addition to using the exact Hessian, we will also implement SOCGS with an LBFGS Hessian update (SOCGS LBFGS) (note that this does not satisfy Assumption ","element":"span"},{"href":"#id-41","text":"2","element":"a"},{"text":"). All the line searches that do not have a closed form solution are computed using a golden-section bounded line search between ","element":"span"},{"text":"0 ","element":"span"},{"text":"and ","element":"span"},{"text":"1","element":"span"},{"text":". The full details of the implementation can be found in Appendix ","element":"span"},{"href":"#id-65","text":"E","element":"a"},{"text":". 
In the second and third experiment we will also cap the maximum number of inner iterations to ","element":"span"},{"text":"1000 ","element":"span"},{"text":"for the SOCGS and NCG algorithms, as is done in the computational experiments of NCG and SVRCG.","element":"span"}],[{"style":{"width":"78%"},"width":1478,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/8-3.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Remark 4.1 ","element":"span"},{"text":"(Hyperparameter search for the NCG algorithm)","element":"span"},{"style":{"fontWeight":"bold"},"text":". ","element":"span"},{"text":"We tested 27 hyperparameter combinations for the NCG algorithm, and the one that provided the best performance was selected (see Appendix ","element":"span"},{"href":"#id-65","text":"E ","element":"a"},{"text":"for the full details).","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Sparse coding over the Birkhoff polytope ","element":"span"},{"text":"In this example (Figure ","element":"span"},{"href":"#id-66","text":"1","element":"a"},{"text":") we minimize the objective function ","element":"span"},{"style":{"height":18.77},"width":410.56,"height":46.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/8-4.png","element":"img","alt":" 𝑓 (𝑋) = ∑𝑚𝑖=1 ∥y𝑖 − 𝑋z𝑖∥2","inline":true},{"text":", with ","element":"span"},{"style":{"height":12.58},"width":165.36,"height":31.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/8-5.png","element":"img","alt":" 𝑋 ∈ ℝ𝑛×𝑛","inline":true},{"text":", over the Birkhoff polytope. 
This objective function is ","element":"span"},{"text":"strongly convex if the vectors ","element":"span"},{"style":{"height":9.59},"width":27.07,"height":23.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/8-6.png","element":"img","alt":" z𝑖","inline":true},{"text":", with ","element":"span"},{"style":{"height":14.4},"width":182.81,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/8-7.png","element":"img","alt":" 𝑚 ∈ [1, 𝑚]","inline":true,"padRight":true},{"text":"form a basis for ","element":"span"},{"style":{"height":10.98},"width":46.36,"height":27.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/8-8.png","element":"img","alt":" ℝ𝑛","inline":true,"padRight":true},{"text":"(See discussion in Appendix ","element":"span"},{"href":"#id-67","text":"E.1","element":"a"},{"text":"). We generate synthetic data by creating a matrix ","element":"span"},{"style":{"height":12.99},"width":160.93,"height":32.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/8-9.png","element":"img","alt":" 𝐵 ∈ ℝ𝑛×𝑛","inline":true,"padRight":true},{"text":"with ","element":"span"},{"style":{"height":11.2},"width":112.64,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/8-10.png","element":"img","alt":" 𝑛 = 80","inline":true,"padRight":true},{"text":"entries sampled from a standard normal distribution, and ","element":"span"},{"style":{"height":7.2},"width":29,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/8-11.png","element":"img","alt":" 𝑚","inline":true,"padRight":true},{"text":"vectors ","element":"span"},{"style":{"height":10.99},"width":113.88,"height":27.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/8-12.png","element":"img","alt":" x ∈ ℝ𝑛","inline":true,"padRight":true},{"text":"(with 
","element":"span"},{"style":{"height":11.2},"width":178.45,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/8-13.png","element":"img","alt":" 𝑚 = 10000","inline":true,"padRight":true},{"text":"in the first experiment and ","element":"span"},{"style":{"height":11.2},"width":198.78,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/8-14.png","element":"img","alt":" 𝑚 = 100000","inline":true,"padRight":true},{"text":"in the second), with entries sampled from a standard normal distribution, in order to form ","element":"span"},{"style":{"height":14.8},"width":279.46,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/8-15.png","element":"img","alt":" 𝑍 = {z1, · · · , z𝑚}","inline":true},{"text":". For both the experiments we verified numerically that the resulting objective function is strongly convex. The set of vectors ","element":"span"},{"style":{"height":14.82},"width":293.62,"height":37.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/8-16.png","element":"img","alt":" 𝑌 = {y1, · · · , y𝑚}","inline":true,"padRight":true},{"text":"is generated by computing ","element":"span"},{"style":{"height":13.6},"width":140.08,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/8-17.png","element":"img","alt":" y𝑖 = 𝐵z𝑖","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"height":16},"width":175.47,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/8-18.png","element":"img","alt":" 𝑖 ∈ ⟦1, 𝑚⟧","inline":true},{"text":". The starting point for all the algorithms is ","element":"span"},{"style":{"height":11.39},"width":32.76,"height":28.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/8-19.png","element":"img","alt":" 𝐼𝑛","inline":true},{"text":". 
To implement the projection operation used in PGD we use the interior point solver implemented in CVXOPT (","element":"span"},{"href":"#id-68","referenceIndex":2,"text":"Andersen et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-68","referenceIndex":2,"text":"2011","element":"a"},{"text":"), which we have found to be computationally faster than the Douglas-Rachford approach described in ","element":"span"},{"href":"#id-69","referenceIndex":13,"text":"Combettes & Pokutta ","element":"a"},{"text":"(","element":"span"},{"href":"#id-69","referenceIndex":13,"text":"2021","element":"a"},{"text":"). Note that the use of this implementation only impacts the performance with respect to time, and not with respect to iteration count.","element":"span"}],[{"style":{"height":14.8},"width":951.48,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/8-20.png","element":"img","alt":"Structured logistic regression over ℓ1 unit ball","inline":true,"padRight":true},{"text":"In this last experiment (Figure ","element":"span"},{"href":"#id-70","text":"2","element":"a"},{"text":") we minimize a function of the form ","element":"span"},{"style":{"height":29.2},"width":763.37,"height":73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/8-21.png","element":"img","alt":" 𝑓 (𝑥) = 1/𝑚 ∑𝑚𝑖=1 log(1 + 𝑒−𝑦𝑖⟨x,z𝑖⟩) + 𝜆/2 ∥x∥2","inline":true,"padRight":true},{"text":"over the ","element":"span"},{"style":{"height":13.59},"width":31.82,"height":33.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/8-22.png","element":"img","alt":" ℓ1","inline":true,"padRight":true},{"text":"unit ball with ","element":"span"},{"style":{"height":11.6},"width":143.11,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/8-23.png","element":"img","alt":" 𝜆 = 0.05","inline":true},{"text":". 
The ","element":"span"},{"text":"labels and samples used are taken from the training set of the ","element":"span"},{"style":{"fontFamily":"monospace"},"text":"gisette ","element":"span"},{"text":"(","element":"span"},{"href":"#id-71","referenceIndex":31,"text":"Guyon et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-71","referenceIndex":31,"text":"2007","element":"a"},{"text":") and the ","element":"span"},{"style":{"fontFamily":"monospace"},"text":"real-sim ","element":"span"},{"text":"(","element":"span"},{"href":"#id-72","referenceIndex":12,"text":"Chang & Lin","element":"a"},{"text":", ","element":"span"},{"href":"#id-72","referenceIndex":12,"text":"2011","element":"a"},{"text":") datasets, where ","element":"span"},{"style":{"height":11.6},"width":146.23,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/9-0.png","element":"img","alt":" 𝑛 = 5000","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":11.2},"width":156.24,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/9-1.png","element":"img","alt":" 𝑚 = 6000","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":12},"width":166.24,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/9-2.png","element":"img","alt":" 𝑛 = 72309","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":11.6},"width":176.24,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/9-3.png","element":"img","alt":" 𝑚 = 20958","inline":true,"padRight":true},{"text":"respectively. 
The starting point for all the algorithms is the vector ","element":"span"},{"style":{"height":15.2},"width":210.88,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/9-4.png","element":"img","alt":" (1, 0, · · · , 0).","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Inverse covariance estimation over spectrahedron ","element":"span"},{"text":"In the second experiment (Figure ","element":"span"},{"href":"#id-73","text":"3","element":"a"},{"text":") we minimize the function ","element":"span"},{"style":{"height":19.91},"width":846.88,"height":49.78,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/9-5.png","element":"img","alt":" 𝑓 (𝑋) = − log det(𝑋 + 𝛿𝐼𝑛) + trace (𝑆𝑋) + 𝜆2 ∥𝑋∥2𝐹","inline":true,"padRight":true},{"text":"with ","element":"span"},{"style":{"height":12.59},"width":172.78,"height":31.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/9-6.png","element":"img","alt":" 𝑋 ∈ ℝ𝑛×𝑛","inline":true,"padRight":true},{"text":"over the space of positive ","element":"span"},{"text":"semidefinite matrices of unit trace, with ","element":"span"},{"style":{"height":14.19},"width":370.72,"height":35.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/9-7.png","element":"img","alt":" 𝛿 = 10−5 and 𝜆 = 0.05","inline":true},{"text":". This feasible region is not a polytope, and so the guarantees shown in the paper do not apply as they crucially rely on Theorem ","element":"span"},{"href":"#id-30","text":"2.1","element":"a"},{"text":", and the pyramidal width of the spectrahedron is zero. However, we include the results to show the promising numerical performance of the method. 
The matrix ","element":"span"},{"style":{"height":11.2},"width":21,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/9-8.png","element":"img","alt":" 𝑆","inline":true,"padRight":true},{"text":"is generated by computing a random orthonormal basis ","element":"span"},{"style":{"height":14.8},"width":292.74,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/9-9.png","element":"img","alt":" B = {v1, · · · , v𝑚}","inline":true,"padRight":true},{"text":"in ","element":"span"},{"style":{"height":10.99},"width":54.35,"height":27.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/9-10.png","element":"img","alt":"ℝ𝑚 ","inline":true,"padRight":true},{"text":"and computing ","element":"span"},{"style":{"height":18.54},"width":438.74,"height":46.35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/9-11.png","element":"img","alt":" 𝑆 = ∑𝑖=1 𝜎𝑖v𝑖v𝑇𝑖 , where 𝜎𝑖","inline":true,"padRight":true},{"text":"is uniformly distributed between ","element":"span"},{"style":{"height":16},"width":498.99,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/9-12.png","element":"img","alt":" 0.5 and 1 for 𝑖 ∈ ⟦1, 𝑚⟧. 
The","inline":true,"padRight":true},{"text":"starting point for all the algorithms is the matrix ","element":"span"},{"style":{"height":14.8},"width":107.07,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/9-13.png","element":"img","alt":" 1/𝑛𝐼𝑛.","inline":true}],[{"id":"id-66","style":{"width":"97%"},"width":1827,"height":368,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/9-14.png","element":"img"}],[{"id":"id-74","text":"Figure 1: Birkhoff polytope: Primal gap comparison for ","element":"figcaption","subtype":"caption"},{"href":"#id-74","style":{"height":16.4},"width":728.12,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/9-15.png","element":"img","alt":" 𝑚 = 10000 (a),(b) and 𝑚 = 100000 (c),(d).","inline":true}],[{"id":"id-70","style":{"width":"97%"},"width":1833,"height":368,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/9-16.png","element":"img"}],[{"id":"id-75","text":"Figure 2: ","element":"figcaption","subtype":"caption"},{"style":{"height":13.59},"width":31.82,"height":33.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/9-17.png","element":"img","alt":" ℓ1","inline":true},{"text":"-ball: Comparison in terms of primal gap for the ","element":"figcaption","subtype":"caption"},{"style":{"fontFamily":"monospace"},"text":"gisette ","element":"figcaption","subtype":"caption"},{"href":"#id-75","text":"(a)","element":"a","subtype":"caption"},{"text":",","element":"figcaption","subtype":"caption"},{"href":"#id-70","text":"(b) ","element":"a","subtype":"caption"},{"text":"and the ","element":"figcaption","subtype":"caption"},{"style":{"fontFamily":"monospace"},"text":"real-sim 
","element":"figcaption","subtype":"caption"},{"href":"#id-70","text":"(c)","element":"a","subtype":"caption"},{"text":",","element":"figcaption","subtype":"caption"},{"href":"#id-70","text":"(d)","element":"a","subtype":"caption"}],[{"id":"id-73","style":{"width":"97%"},"width":1833,"height":535,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/9-18.png","element":"img"}],[{"id":"id-76","text":"Figure 3: Spectrahedron: Comparison in terms of primal gap for ","element":"figcaption","subtype":"caption"},{"href":"#id-76","style":{"height":16.4},"width":649.45,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/9-19.png","element":"img","alt":" 𝑛 = 100 (a),(b) and for 𝑛 = 50 (c),(d).","inline":true}]]},{"heading":"Conclusion","paragraphs":[[{"text":"This paper focuses on the minimization of a smooth and strongly convex function over a polytope in the setting where efficient access to the feasible region is limited to a linear optimization oracle and first-order information about the objective function is expensive to compute. We also assume inexact second-order information subject to an accuracy requirement.","element":"span"}],[{"text":"Given these challenges, we present the Second-order Conditional Gradient Sliding (SOCGS) algorithm, which at each iteration computes an Inexact Projected Variable-Metric (PVM) step with unit step size (using the Away-step Conditional Gradient (ACG) algorithm and an accuracy criterion that depends on a lower bound on the primal gap), and an independent ACG step with line search, and chooses the step that provides the greatest primal progress. As the algorithm relies on a linear minimization oracle, as opposed to a projection oracle, it is projection-free. 
The algorithm can be seen as the second-order analog of the Conditional Gradient Sliding algorithm (","element":"span"},{"href":"#id-11","referenceIndex":41,"text":"Lan & Zhou","element":"a"},{"text":", ","element":"span"},{"href":"#id-11","referenceIndex":41,"text":"2016","element":"a"},{"text":"), which uses Conditional Gradient steps to compute inexact Euclidean projections in Nesterov’s Accelerated Gradient Descent algorithm. After a finite number (independent of the target accuracy ","element":"span"},{"style":{"height":7.6},"width":20,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/10-0.png","element":"img","alt":" 𝜀","inline":true},{"text":") of linearly convergent iterations, the convergence rate of the SOCGS algorithm is quadratic in primal gap. Once inside this phase the SOCGS algorithm reaches an ","element":"span"},{"style":{"height":7.6},"width":20,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/10-1.png","element":"img","alt":" 𝜀","inline":true},{"text":"-optimal solution after ","element":"span"},{"style":{"height":15.22},"width":267.43,"height":38.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/10-2.png","element":"img","alt":" O (log(log 1/𝜀))","inline":true,"padRight":true},{"text":"Hessian and first-order oracle calls and ","element":"span"},{"style":{"height":15.2},"width":410.32,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/10-3.png","element":"img","alt":" O(log(1/𝜀) log(log 1/𝜀))","inline":true,"padRight":true},{"text":"linear minimization oracle calls.","element":"span"}],[{"text":"The Newton Conditional Gradient (NCG) (or Newton Frank-Wolfe) algorithm (","element":"span"},{"href":"#id-16","referenceIndex":46,"text":"Liu et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-16","referenceIndex":46,"text":"2022","element":"a"},{"text":") uses an approach that is similar in spirit to the one used in the 
SOCGS algorithm, however with a very different analysis and set of assumptions. The aforementioned algorithm minimizes a self-concordant function over a convex set by performing Inexact Newton steps using a Conditional Gradient algorithm to solve the constrained quadratic subproblems. This algorithm requires exact Hessian information, and after a finite number of iterations (independent of the target accuracy ","element":"span"},{"style":{"height":7.6},"width":20,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/10-4.png","element":"img","alt":" 𝜀","inline":true},{"text":"), the convergence rate of the NCG algorithm is linear in primal gap. Once inside this phase a ","element":"span"},{"style":{"height":7.6},"width":20,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/10-5.png","element":"img","alt":" 𝜀","inline":true},{"text":"-optimal solution is reached after ","element":"span"},{"style":{"height":15.2},"width":183.84,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/10-6.png","element":"img","alt":" O (log 1/𝜀)","inline":true,"padRight":true},{"text":"exact Hessian and first-order oracle calls and ","element":"span"},{"style":{"height":15.2},"width":138.92,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/10-7.png","element":"img","alt":" O(1/𝜀𝜈)","inline":true,"padRight":true},{"text":"linear minimization oracle calls, where ","element":"span"},{"style":{"height":7.6},"width":19,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/10-8.png","element":"img","alt":" 𝜈","inline":true,"padRight":true},{"text":"is a constant greater than one.","element":"span"}],[{"text":"The computational results show that the SOCGS algorithm outperforms other first-order projection-free algorithms and the NCG algorithm in applications where first-order information is costly to compute. 
The improved performance with respect to other first-order projection-free algorithms is due to the substantial progress per iteration provided by the Inexact PVM steps, which makes up for their higher computational cost, resulting in faster convergence with respect to time. The better performance of the SOCGS algorithm with respect to the NCG algorithm is due to the better global convergence of the SOCGS algorithm, and the use of the Away-step Conditional Gradient algorithm as a subproblem solver in the SOCGS algorithm, as opposed to the vanilla Conditional Gradient algorithm used by the NCG algorithm.","element":"span"}]]},{"heading":"Acknowledgments","paragraphs":[[{"text":"Research reported in this paper was partially supported by NSF CAREER Award CMMI-1452463. We would like to thank Gábor Braun for the helpful discussions, and the anonymous reviewers for their suggestions and comments.","element":"span"}]]},{"heading":"References","paragraphs":[[{"text":"Aharon, M., Elad, M., and Bruckstein, A. K-svd: An algorithm for designing overcomplete dictionaries for ","element":"span"},{"text":"sparse representation. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"IEEE Transactions on Signal Processing","element":"span"},{"text":", 54(11):4311–4322, 2006.","element":"span"}],[{"id":"id-68","text":"Andersen, M., Dahl, J., Liu, Z., Vandenberghe, L., Sra, S., Nowozin, S., and Wright, S. Interior-point methods ","element":"span"},{"text":"for large-scale cone programming. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Optimization for machine learning","element":"span"},{"text":", 5583, 2011.","element":"span"}],[{"id":"id-50","text":"Asi, H. and Duchi, J. C. The importance of better models in stochastic optimization. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the National Academy of Sciences","element":"span"},{"text":", 116(46):22924–22930, 2019.","element":"span"}],[{"id":"id-159","text":"Banerjee, O., Ghaoui, L. 
E., and d’Aspremont, A. Model selection through sparse maximum likelihood ","element":"span"},{"text":"estimation for multivariate gaussian or binary data. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Machine Learning Research","element":"span"},{"text":", 9(Mar):485–516, 2008.","element":"span"}],[{"id":"id-49","text":"Barré, M. and d’Aspremont, A. ","element":"span"},{"text":"Polyak steps for adaptive fast gradient method. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1906.03056","element":"span"},{"text":", 2019.","element":"span"}],[{"id":"id-48","text":"Barré, M., Taylor, A., and d’Aspremont, A. Complexity guarantees for polyak steps with momentum. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Conference on learning theory","element":"span"},{"text":", pp. 452–478. PMLR, 2020.","element":"span"}],[{"id":"id-105","text":"Beck, A. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"First-order methods in optimization","element":"span"},{"text":", volume 25. SIAM, 2017.","element":"span"}],[{"id":"id-33","text":"Ben-Tal, A. and Nemirovskii, A. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Optimization III (Spring 2020 Lecture Notes): Convex analysis, nonlinear programming theory and nonlinear programming algorithms","element":"span"},{"text":". 2020.","element":"span"}],[{"id":"id-62","text":"Braun, G., Pokutta, S., and Zink, D. Lazifying conditional gradient algorithms. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 34th International Conference on Machine Learning","element":"span"},{"text":", pp. 566–575, 2017.","element":"span"}],[{"id":"id-154","text":"Braun, G., Pokutta, S., Tu, D., and Wright, S. Blended conditonal gradients. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 36th International Conference on Machine Learning","element":"span"},{"text":", pp. 
735–743, 2019.","element":"span"}],[{"id":"id-20","text":"Carderera, A., Besançon, M., and Pokutta, S. Simple steps are all you need: Frank-wolfe and generalized ","element":"span"},{"text":"self-concordant functions. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":", 34:5390–5401, 2021.","element":"span"}],[{"id":"id-72","text":"Chang, C.-C. and Lin, C.-J. Libsvm: A library for support vector machines. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"ACM transactions on intelligent systems and technology (TIST)","element":"span"},{"text":", 2(3):1–27, 2011.","element":"span"}],[{"id":"id-69","text":"Combettes, C. W. and Pokutta, S. Complexity of linear minimization and projection on some sets. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Operations Research Letters","element":"span"},{"text":", 49(4):565–571, 2021.","element":"span"}],[{"id":"id-47","text":"Combettes, C. W. and Pokutta, S. Revisiting the approximate carathéodory problem via the frank-wolfe ","element":"span"},{"text":"algorithm. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Mathematical Programming","element":"span"},{"text":", 197(1):191–214, 2023.","element":"span"}],[{"id":"id-152","text":"Condat, L. Fast projection onto the simplex and the ","element":"span"},{"style":{"height":14.4},"width":619.78,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/11-0.png","element":"img","alt":" ℓ1 ball. Mathematical Programming","inline":true},{"text":", 158(1-2):575–585, 2016.","element":"span"}],[{"id":"id-87","text":"Diakonikolas, J., Carderera, A., and Pokutta, S. Locally accelerated conditional gradients. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Artificial Intelligence and Statistics","element":"span"},{"text":", pp. 
1737–1747, 2020.","element":"span"}],[{"id":"id-18","text":"Dvurechensky, P., Ostroukhov, P., Safin, K., Shtern, S., and Staudigl, M. Self-concordant analysis of ","element":"span"},{"text":"Frank-Wolfe algorithms. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Machine Learning","element":"span"},{"text":", pp. 2814–2824. PMLR, 2020.","element":"span"}],[{"id":"id-21","text":"Dvurechensky, P., Safin, K., Shtern, S., and Staudigl, M. Generalized self-concordant analysis of frank–wolfe ","element":"span"},{"text":"algorithms. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Mathematical Programming","element":"span"},{"text":", pp. 1–69, 2022.","element":"span"}],[{"id":"id-7","text":"Frank, M. and Wolfe, P. An algorithm for quadratic programming. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Naval Research Logistics Quarterly","element":"span"},{"text":", 3 (1-2):95–110, 1956.","element":"span"}],[{"id":"id-160","text":"Friedman, J., Hastie, T., and Tibshirani, R. Sparse inverse covariance estimation with the graphical lasso. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Biostatistics","element":"span"},{"text":", 9(3):432–441, 2008.","element":"span"}],[{"id":"id-9","text":"Futami, F., Cui, Z., Sato, I., and Sugiyama, M. Bayesian posterior approximation via greedy particle ","element":"span"},{"text":"optimization. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the AAAI Conference on Artificial Intelligence","element":"span"},{"text":", volume 33, pp. 3606–3613, 2019.","element":"span"}],[{"id":"id-147","text":"Garber, D. Revisiting frank-wolfe for polytopes: Strict complementarity and sparsity. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":", 33:18883–18893, 2020.","element":"span"}],[{"id":"id-86","text":"Garber, D. and Hazan, E. 
A linearly convergent variant of the conditional gradient algorithm under strong ","element":"span"},{"text":"convexity, with applications to online and stochastic optimization. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"SIAM Journal on Optimization","element":"span"},{"text":", 26(3): 1493–1528, 2016.","element":"span"}],[{"id":"id-63","text":"Garber, D. and Meshi, O. Linear-memory and decomposition-invariant linearly convergent conditional ","element":"span"},{"text":"gradient algorithm for structured polytopes. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems 29","element":"span"},{"text":", pp. 1001–1009, 2016.","element":"span"}],[{"id":"id-83","text":"Garber, D., Sabach, S., and Kaplan, A. Fast generalized conditional gradient method with applications to ","element":"span"},{"text":"matrix recovery problems. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1802.05581","element":"span"},{"text":", 2018.","element":"span"}],[{"id":"id-155","text":"Ghanbari, H. and Scheinberg, K. Proximal quasi-Newton methods for regularized convex optimization with ","element":"span"},{"text":"linear and accelerated sublinear convergence rates. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Computational Optimization and Applications","element":"span"},{"text":", 69(3): 597–627, 2018.","element":"span"}],[{"id":"id-13","text":"Gonçalves, M. and Oliveira, F. An inexact newton-like conditional gradient method for constrained nonlinear ","element":"span"},{"text":"systems. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Applied Numerical Mathematics","element":"span"},{"text":", 132:22–34, 2018.","element":"span"}],[{"id":"id-12","text":"Gonçalves, M. L. and Melo, J. G. A newton conditional gradient method for constrained nonlinear systems. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Computational and Applied Mathematics","element":"span"},{"text":", 311:473–483, 2017.","element":"span"}],[{"id":"id-15","text":"Gonçalves, M. L. and Oliveira, F. On the global convergence of an inexact quasi-newton conditional gradient ","element":"span"},{"text":"method for constrained nonlinear systems. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Numerical Algorithms","element":"span"},{"text":", pp. 1–23, 2019.","element":"span"}],[{"id":"id-29","text":"Guélat, J. and Marcotte, P. Some comments on Wolfe’s ‘away step’. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Mathematical Programming","element":"span"},{"text":", 35(1): 110–119, 1986.","element":"span"}],[{"id":"id-71","text":"Guyon, I., Li, J., Mader, T., Pletscher, P. A., Schneider, G., and Uhr, M. Competitive baseline methods ","element":"span"},{"text":"set new standards for the NIPS 2003 feature selection benchmark. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Pattern recognition letters","element":"span"},{"text":", 28(12): 1438–1444, 2007.","element":"span"}],[{"id":"id-51","text":"Hazan, E. and Kakade, S. Revisiting the Polyak step size. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1905.00313","element":"span"},{"text":", 2019.","element":"span"}],[{"id":"id-64","text":"Hazan, E. and Luo, H. Variance-reduced and projection-free stochastic optimization. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 33th International Conference on Machine Learning","element":"span"},{"text":", pp. 1263–1271, 2016.","element":"span"}],[{"id":"id-24","text":"Jaggi, M. Revisiting frank-wolfe: Projection-free sparse convex optimization. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 30nd International Conference on Machine Learning","element":"span"},{"text":", number CONF, pp. 
427–435, 2013.","element":"span"}],[{"id":"id-8","text":"Joulin, A., Tang, K., and Fei-Fei, L. Efficient image and video co-localization with Frank-Wolfe algorithm. ","element":"span"},{"text":"In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"European Conference on Computer Vision","element":"span"},{"text":", pp. 253–268. Springer, 2014.","element":"span"}],[{"id":"id-1","text":"Kantorovich, L. V. Functional analysis and applied mathematics. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Uspekhi Matematicheskikh Nauk","element":"span"},{"text":", 3(6): 89–185, 1948.","element":"span"}],[{"id":"id-38","text":"Karimireddy, S. P., Stich, S. U., and Jaggi, M. Global linear convergence of Newton’s method without ","element":"span"},{"text":"strong-convexity or Lipschitz gradients. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1806.00413","element":"span"},{"text":", 2018a.","element":"span"}],[{"id":"id-98","text":"Karimireddy, S. P. R., Stich, S., and Jaggi, M. Adaptive balancing of gradient and update computation times ","element":"span"},{"text":"using global geometry and approximate subproblems. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 35th International Conference on Artificial Intelligence and Statistics","element":"span"},{"text":", pp. 1204–1213, 2018b.","element":"span"}],[{"id":"id-27","text":"Lacoste-Julien, S. and Jaggi, M. On the global linear convergence of Frank-Wolfe optimization variants. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems 28","element":"span"},{"text":", pp. 496–504, 2015.","element":"span"}],[{"id":"id-88","text":"Lan, G. The complexity of large-scale convex programming under a linear optimization oracle. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1309.5550","element":"span"},{"text":", 2013.","element":"span"}],[{"id":"id-11","text":"Lan, G. and Zhou, Y. Conditional gradient sliding for convex optimization. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"SIAM Journal on Optimization","element":"span"},{"text":", 26(2):1379–1409, 2016.","element":"span"}],[{"text":"Lee, H., Battle, A., Raina, R., and Ng, A. Y. Efficient sparse coding algorithms. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems 20","element":"span"},{"text":", pp. 801–808, 2007.","element":"span"}],[{"id":"id-6","text":"Lee, J. D., Sun, Y., and Saunders, M. A. Proximal Newton-type methods for minimizing composite functions. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"SIAM Journal on Optimization","element":"span"},{"text":", 24(3):1420–1443, 2014.","element":"span"}],[{"id":"id-162","text":"Lehoucq, R. B., Sorensen, D. C., and Yang, C. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"ARPACK users’ guide: solution of large-scale eigenvalue problems with implicitly restarted Arnoldi methods","element":"span"},{"text":", volume 6. Siam, 1998.","element":"span"}],[{"id":"id-4","text":"Levitin, E. S. and Polyak, B. T. Constrained minimization methods. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"USSR Computational Mathematics and Mathematical Physics","element":"span"},{"text":", 6(5):1–50, 1966.","element":"span"}],[{"id":"id-16","text":"Liu, D., Cevher, V., and Tran-Dinh, Q. A newton frank–wolfe method for constrained self-concordant ","element":"span"},{"text":"minimization. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Global Optimization","element":"span"},{"text":", pp. 1–27, 2022.","element":"span"}],[{"id":"id-148","text":"Mairal, J., Bach, F., Ponce, J., and Sapiro, G. 
Online learning for matrix factorization and sparse coding. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Machine Learning Research","element":"span"},{"text":", 11(Jan):19–60, 2010.","element":"span"}],[{"id":"id-46","text":"Mirrokni, V., Leme, R. P., Vladu, A., and Wong, S. C.-w. Tight bounds for approximate carathéodory and ","element":"span"},{"text":"beyond. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 34th International Conference on Machine Learning","element":"span"},{"text":", pp. 2440–2448, 2017.","element":"span"}],[{"id":"id-14","text":"Morini, B. Convergence behaviour of inexact newton methods. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Mathematics of Computation","element":"span"},{"text":", 68(228): 1605–1613, 1999.","element":"span"}],[{"id":"id-149","text":"Nemirovski, A. Interior point polynomial time methods in convex programming. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Lecture notes","element":"span"},{"text":", 2004.","element":"span"}],[{"id":"id-3","text":"Nesterov, Y. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Introductory lectures on convex optimization: A basic course","element":"span"},{"text":", volume 87. Springer Science & Business Media, 2013.","element":"span"}],[{"id":"id-32","text":"Nesterov, Y. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Lectures on convex optimization","element":"span"},{"text":", volume 137. Springer, 2018.","element":"span"}],[{"id":"id-2","text":"Nesterov, Y. and Nemirovskii, A. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Interior-point polynomial algorithms in convex programming","element":"span"},{"text":", volume 13. Siam, 1994.","element":"span"}],[{"id":"id-165","text":"Nocedal, J. and Wright, S. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Numerical optimization","element":"span"},{"text":". 
Springer Science & Business Media, 2006.","element":"span"}],[{"id":"id-17","text":"Ochs, P. and Malitsky, Y. Model function based conditional gradient method with armijo-like line search. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 36th International Conference on Machine Learning","element":"span"},{"text":", pp. 4891–4900, 2019.","element":"span"}],[{"id":"id-23","text":"Pedregosa, F., Negiar, G., Askari, A., and Jaggi, M. Linearly convergent frank-wolfe with backtracking ","element":"span"},{"text":"line-search. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Artificial Intelligence and Statistics","element":"span"},{"text":", pp. 1–10. PMLR, 2020.","element":"span"}],[{"id":"id-153","text":"Rao, N., Shah, P., and Wright, S. Forward–backward greedy algorithms for atomic norm regularization. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"IEEE Transactions on Signal Processing","element":"span"},{"text":", 63(21):5798–5811, 2015.","element":"span"}],[{"id":"id-5","text":"Scheinberg, K. and Tang, X. Practical inexact proximal quasi-Newton method with global complexity analysis. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Mathematical Programming","element":"span"},{"text":", 160(1-2):495–529, 2016.","element":"span"}],[{"id":"id-10","text":"Schmidt, M., Berg, E., Friedlander, M., and Murphy, K. Optimizing costly functions with simple constraints: ","element":"span"},{"text":"A limited-memory projected quasi-Newton algorithm. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 12th International Conference on Artificial Intelligence and Statistics","element":"span"},{"text":", pp. 456–463, 2009.","element":"span"}],[{"id":"id-22","text":"Wolfe, P. Convergence theory in nonlinear programming. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Integer and nonlinear programming","element":"span"},{"text":", pp. 
1–36, 1970.","element":"span"}],[{"id":"id-161","text":"Yuan, M. and Lin, Y. Model selection and estimation in the gaussian graphical model. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Biometrika","element":"span"},{"text":", 94(1): 19–35, 2007.","element":"span"}],[{"id":"id-19","text":"Zhao, R. and Freund, R. M. Analysis of the frank–wolfe method for convex composite optimization involving ","element":"span"},{"text":"a logarithmically-homogeneous barrier. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Mathematical Programming","element":"span"},{"text":", pp. 1–41, 2022.","element":"span"}],[{"style":{"width":"73%"},"width":1386,"height":167,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-0.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Outline. ","element":"span"},{"text":"The appendix of the paper is organized as follows:","element":"span"}],[{"text":"• Section ","element":"span"},{"text":"A ","element":"span"},{"text":"presents the notation and definitions used throughout the appendix, as well as useful material pertaining to the Hessian approximation.","element":"span"}],[{"text":"• Section ","element":"span"},{"text":"B ","element":"span"},{"text":"contains background information about the Conditional Gradients algorithm, pseudocode for the vanilla Conditional Gradients algorithm and the Away-step Conditional Gradients algorithm and theoretical information about the convergence of the Away-step Conditional Gradients algorithm.","element":"span"}],[{"text":"• Section ","element":"span"},{"text":"C ","element":"span"},{"text":"presents information about the vanilla Projected Variable-Metric algorithm, its global linear convergence with exact line search or a bounded stepsize, and its quadratic local convergence in distance to the optimum with unit step size.","element":"span"}],[{"text":"• Section ","element":"span"},{"text":"D ","element":"span"},{"text":"contains the proof of global linear 
and local quadratic convergence in primal gap of the Second-order Conditional Gradient Sliding algorithm, as well as an oracle complexity analysis.","element":"span"}],[{"text":"• Section ","element":"span"},{"href":"#id-65","text":"E ","element":"a"},{"text":"presents a detailed description of the numerical experiments performed.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Appendix A. Notation and Preliminaries","element":"span"}],[{"text":"We denote the norm of a vector ","element":"span"},{"style":{"height":20},"width":311.16,"height":50,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-1.png","element":"img","alt":" v as ∥v∥ =√︁⟨v, v⟩","inline":true},{"text":", and the norm of a matrix ","element":"span"},{"style":{"height":14.83},"width":519.36,"height":37.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-2.png","element":"img","alt":" 𝐴 as ∥𝐴∥ = maxv≠0 ∥𝐴v∥ /∥v∥.","inline":true,"padRight":true},{"text":"Let ","element":"span"},{"style":{"height":15.13},"width":58.47,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-3.png","element":"img","alt":" S𝑛++ ","inline":true,"padRight":true},{"text":"denote the set of symmetric positive definite matrices in ","element":"span"},{"style":{"height":15.88},"width":312.51,"height":39.7,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-4.png","element":"img","alt":" ℝ𝑛×𝑛 and let ∥·∥𝐻 ","inline":true,"padRight":true},{"text":"denote the matrix norm ","element":"span"},{"text":"defined by ","element":"span"},{"style":{"height":15.14},"width":134.56,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-5.png","element":"img","alt":" 𝐻 ∈ S𝑛++","inline":true},{"text":", that is, for a given vector ","element":"span"},{"style":{"fontWeight":"bold"},"text":"v ","element":"span"},{"text":"the norm defined by 
","element":"span"},{"style":{"height":20},"width":679.84,"height":50,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-6.png","element":"img","alt":" 𝐻 is ∥v∥𝐻 =√︁⟨v, 𝐻v⟩. We use vmin (𝐻)","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":15.2},"width":149.95,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-7.png","element":"img","alt":" vmax (𝐻)","inline":true,"padRight":true},{"text":"to refer to the eigenvectors of unit norm associated with the minimum and maximum eigenvalues, denoted by ","element":"span"},{"style":{"height":15.2},"width":386.5,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-8.png","element":"img","alt":" 𝜆min (𝐻) and 𝜆max (𝐻)","inline":true,"padRight":true},{"text":"respectively, of the matrix ","element":"span"},{"style":{"height":15.14},"width":134.57,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-9.png","element":"img","alt":" 𝐻 ∈ S𝑛++","inline":true},{"text":". Similarly, we use ","element":"span"},{"style":{"height":15.2},"width":350.03,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-10.png","element":"img","alt":" 𝜆𝑖 (𝐻) with 𝑖 ∈ [1, 𝑛]","inline":true,"padRight":true},{"text":"to refer to the ","element":"span"},{"style":{"height":10.8},"width":12,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-11.png","element":"img","alt":" 𝑖","inline":true},{"text":"-th largest eigenvalue of the matrix ","element":"span"},{"style":{"height":15.54},"width":617.45,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-12.png","element":"img","alt":" 𝐻 ∈ S𝑛++. 
Let 𝜎min(𝐻) and 𝜎max(𝐻)","inline":true,"padRight":true},{"text":"denote the minimum ","element":"span"},{"text":"and maximum singular values of the matrix ","element":"span"},{"style":{"height":10.4},"width":32,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-13.png","element":"img","alt":" 𝐻","inline":true},{"text":". We denote the open ball of radius ","element":"span"},{"style":{"height":11.2},"width":90.91,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-14.png","element":"img","alt":" 𝑟 > 0","inline":true,"padRight":true},{"text":"centered at ","element":"span"},{"style":{"fontWeight":"bold"},"text":"x ","element":"span"},{"text":"as ","element":"span"},{"style":{"height":15.2},"width":116.51,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-15.png","element":"img","alt":"B(x, 𝑟)","inline":true},{"text":". Let ","element":"span"},{"text":"int ","element":"span"},{"text":"(X) ","element":"span"},{"text":"and ","element":"span"},{"text":"rel","element":"span"},{"style":{"fontStyle":"italic"},"text":". ","element":"span"},{"text":"int ","element":"span"},{"text":"(X) ","element":"span"},{"text":"represent the interior and the relative interior of the set ","element":"span"},{"text":"X","element":"span"},{"text":", respectively. Given a function ","element":"span"},{"style":{"height":15.2},"width":246.52,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-16.png","element":"img","alt":" 𝑓 (x) : ℝ𝑛 → ℝ","inline":true},{"text":", we say that the function is:","element":"span"}],[{"style":{"height":16.4},"width":347.83,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-17.png","element":"img","alt":"Definition A.1 (𝜇","inline":true},{"text":"-strongly convex function)","element":"span"},{"style":{"fontWeight":"bold"},"text":". 
","element":"span"},{"text":"The function is ","element":"span"},{"style":{"height":10.8},"width":23,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-18.png","element":"img","alt":" 𝜇","inline":true},{"text":"-strongly convex over ","element":"span"},{"text":"X ","element":"span"},{"text":"if there exists a ","element":"span"},{"style":{"height":14.4},"width":91.52,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-19.png","element":"img","alt":"𝜇 > 0","inline":true,"padRight":true},{"text":"such that:","element":"span"}],[{"style":{"width":"40%"},"width":755,"height":72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-20.png","element":"img"}],[{"text":"for all ","element":"span"},{"style":{"height":14.8},"width":147,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-21.png","element":"img","alt":" x, y ∈ X.","inline":true}],[{"style":{"height":16.4},"width":340.82,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-22.png","element":"img","alt":"Definition A.2 (𝐿","inline":true},{"text":"-smooth function)","element":"span"},{"style":{"fontWeight":"bold"},"text":". 
","element":"span"},{"text":"The function is ","element":"span"},{"style":{"height":10.4},"width":24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-23.png","element":"img","alt":" 𝐿","inline":true},{"text":"-smooth over ","element":"span"},{"text":"X ","element":"span"},{"text":"if there exists a ","element":"span"},{"style":{"height":11.2},"width":93.34,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-24.png","element":"img","alt":" 𝐿 > 0","inline":true,"padRight":true},{"text":"such that:","element":"span"}],[{"style":{"width":"40%"},"width":757,"height":81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-25.png","element":"img"}],[{"text":"for all ","element":"span"},{"style":{"height":14.8},"width":147,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-26.png","element":"img","alt":" x, y ∈ X.","inline":true}],[{"text":"A simple schematic representation of the bounds provided by convexity, ","element":"span"},{"style":{"height":10.8},"width":23,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-27.png","element":"img","alt":" 𝜇","inline":true},{"text":"-strong convexity and ","element":"span"},{"style":{"height":10.4},"width":24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-28.png","element":"img","alt":" 𝐿","inline":true},{"text":"-smoothness can be seen in Figure ","element":"span"},{"href":"#id-77","text":"4","element":"a"},{"text":".","element":"span"}],[{"style":{"height":16},"width":350.81,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-29.png","element":"img","alt":"Definition A.3 (𝐿2","inline":true},{"text":"-Lipschitz continuous Hessian)","element":"span"},{"style":{"fontWeight":"bold"},"text":". 
","element":"span"},{"text":"The function has a ","element":"span"},{"style":{"height":12.39},"width":40.07,"height":30.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-30.png","element":"img","alt":" 𝐿2","inline":true},{"text":"-Lipschitz continuous Hessian over ","element":"span"},{"text":"X ","element":"span"},{"text":"if there exists a ","element":"span"},{"style":{"height":13.19},"width":111.15,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-31.png","element":"img","alt":" 𝐿2 > 0","inline":true,"padRight":true},{"text":"such that:","element":"span"}],[{"style":{"width":"29%"},"width":557,"height":50,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-32.png","element":"img"}],[{"text":"for all ","element":"span"},{"style":{"height":14.8},"width":147,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-33.png","element":"img","alt":" x, y ∈ X.","inline":true}],[{"id":"id-91","style":{"fontWeight":"bold"},"text":"Definition A.4 ","element":"span"},{"text":"(Normal cone of ","element":"span"},{"text":"X","element":"span"},{"text":")","element":"span"},{"style":{"fontWeight":"bold"},"text":". 
","element":"span"},{"text":"We define the normal cone of the set ","element":"span"},{"text":"X ","element":"span"},{"text":"at point ","element":"span"},{"style":{"fontWeight":"bold"},"text":"x","element":"span"},{"text":", denoted by ","element":"span"},{"style":{"height":15.2},"width":124.72,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-34.png","element":"img","alt":" 𝑁X (x),","inline":true,"padRight":true},{"text":"as:","element":"span"}],[{"style":{"width":"48%"},"width":908,"height":120,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/14-35.png","element":"img"}],[{"id":"id-77","style":{"width":"58%"},"width":1097,"height":566,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/15-0.png","element":"img"}],[{"text":"Figure 4: The red line depict the quadratic upper bound from ","element":"figcaption","subtype":"caption"},{"style":{"height":10.4},"width":24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/15-1.png","element":"img","alt":" 𝐿","inline":true},{"text":"-smoothness, the blue line depicts the quadratic lower bound provided by ","element":"figcaption","subtype":"caption"},{"style":{"height":10.8},"width":23,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/15-2.png","element":"img","alt":" 𝜇","inline":true},{"text":"-strong convexity, the green line depicts the linear lower bound provided by convexity.","element":"figcaption","subtype":"caption"}],[{"style":{"fontWeight":"bold"},"text":"A.1 Hessian Approximation Accuracy","element":"span"}],[{"id":"id-78","style":{"fontWeight":"bold"},"text":"Lemma A.5. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Let ","element":"span"},{"style":{"height":15.14},"width":197.48,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/15-3.png","element":"img","alt":" 𝑃, 𝑄 ∈ S𝑛++","inline":true},{"style":{"fontStyle":"italic"},"text":". 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"The solution to the fractional quadratic program ","element":"span"},{"style":{"height":21.43},"width":355.08,"height":53.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/15-4.png","element":"img","alt":" maxu∈ℝ𝑛 ∥u∥2𝑄 /∥u∥2𝑃","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is given by the the largest eigenvalue of the symmetric positive definite matrix ","element":"span"},{"style":{"height":16.99},"width":216.28,"height":42.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/15-5.png","element":"img","alt":" 𝑃−1/2𝑄𝑃−1/2","inline":true},{"style":{"fontStyle":"italic"},"text":", that is, ","element":"span"},{"style":{"height":19.24},"width":337.53,"height":48.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/15-6.png","element":"img","alt":"𝜆max�𝑃−1/2𝑄𝑃−1/2�","inline":true},{"style":{"fontStyle":"italic"},"text":", which in turn is equal to ","element":"span"},{"style":{"height":18.44},"width":213.75,"height":46.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/15-7.png","element":"img","alt":" 𝜆max�𝑃−1𝑄�","inline":true},{"style":{"fontStyle":"italic"},"text":". 
Moreover, the solution to the fractional quadratic program ","element":"span"},{"style":{"height":21.43},"width":347.34,"height":53.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/15-8.png","element":"img","alt":" minu∈ℝ𝑛 ∥u∥2𝑄 /∥u∥2𝑃 ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is given by the smallest eigenvalue of the symmetric positive definite matrix ","element":"span"},{"style":{"height":16.99},"width":230.15,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/15-9.png","element":"img","alt":" 𝑃−1/2𝑄𝑃−1/2,","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"that is, ","element":"span"},{"style":{"height":19.24},"width":331.52,"height":48.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/15-10.png","element":"img","alt":" 𝜆min(𝑃−1/2𝑄𝑃−1/2)","inline":true},{"style":{"fontStyle":"italic"},"text":", which in turn is equal to ","element":"span"},{"style":{"height":18.44},"width":222.61,"height":46.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/15-11.png","element":"img","alt":" 𝜆min(𝑃−1𝑄).","inline":true}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"Writing out the expression for the quadratic program we have that:","element":"span"}],[{"style":{"width":"34%"},"width":647,"height":552,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/15-12.png","element":"img"}],[{"text":"Moreover, note that as ","element":"span"},{"style":{"height":14},"width":144.51,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/15-13.png","element":"img","alt":" 𝑃 and 𝑄","inline":true,"padRight":true},{"text":"are positive definite, ","element":"span"},{"style":{"height":19.24},"width":599.42,"height":48.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/15-14.png","element":"img","alt":" 𝜆max(𝑃−1/2𝑄𝑃−1/2) = 𝜆max(𝑃−1𝑄)","inline":true},{"text":". 
The second claim follows using a very similar reasoning. ","element":"span"},{"style":{"height":0},"width":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/15-15.png","element":"img","alt":"□","inline":true}],[{"id":"id-35","style":{"fontWeight":"bold"},"text":"Lemma A.6. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Given two matrices ","element":"span"},{"style":{"height":15.14},"width":176.28,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/15-16.png","element":"img","alt":" 𝑃, 𝑄 ∈ S𝑛++","inline":true},{"style":{"fontStyle":"italic"},"text":", then for all ","element":"span"},{"style":{"height":10.99},"width":128.81,"height":27.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/15-17.png","element":"img","alt":" v ∈ ℝ𝑛:","inline":true}],[{"style":{"width":"61%"},"width":1154,"height":90,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/15-18.png","element":"img"}],[{"style":{"height":19.2},"width":811.61,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/15-19.png","element":"img","alt":"with 𝜂 = max�𝜆max�𝑃−1𝑄� , 𝜆max�𝑄−1𝑃��≥ 1.","inline":true}],[{"style":{"height":15.2},"width":292.55,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-0.png","element":"img","alt":"Proof. Let 𝜆𝑖 (𝑃)","inline":true,"padRight":true},{"text":"denote the ","element":"span"},{"style":{"height":10.8},"width":12,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-1.png","element":"img","alt":" 𝑖","inline":true},{"text":"-th eigenvalue of matrix ","element":"span"},{"style":{"height":10.8},"width":25,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-2.png","element":"img","alt":" 𝑃","inline":true},{"text":". 
Note that as ","element":"span"},{"style":{"height":14},"width":143.64,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-3.png","element":"img","alt":" 𝑃 and 𝑄","inline":true,"padRight":true},{"text":"are positive definite ","element":"span"},{"style":{"height":13.79},"width":140.7,"height":34.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-4.png","element":"img","alt":" 𝑃−1 and","inline":true},{"style":{"height":16.19},"width":67.51,"height":40.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-5.png","element":"img","alt":"𝑄−1 ","inline":true,"padRight":true},{"text":"are well-defined, furthermore ","element":"span"},{"style":{"height":16.19},"width":280.28,"height":40.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-6.png","element":"img","alt":" 𝑃−1𝑄 and 𝑄−1𝑃","inline":true,"padRight":true},{"text":"are also positive definite, as the eigenvalues of ","element":"span"},{"style":{"height":16.19},"width":161.22,"height":40.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-7.png","element":"img","alt":" 𝑃−1𝑄 are","inline":true,"padRight":true},{"text":"the same as those of the symmetric positive definite matrix ","element":"span"},{"style":{"height":16.99},"width":216.28,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-8.png","element":"img","alt":" 𝑃−1/2𝑄𝑃−1/2","inline":true},{"text":", and the eigenvalues of ","element":"span"},{"style":{"height":16.19},"width":226.1,"height":40.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-9.png","element":"img","alt":" 𝑄−1𝑃 are the","inline":true,"padRight":true},{"text":"same as those of the symmetric positive definite matrix ","element":"span"},{"style":{"height":16.99},"width":221.58,"height":42.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-10.png","element":"img","alt":" 
𝑄−1/2𝑃𝑄−1/2","inline":true},{"text":". In order to show that ","element":"span"},{"style":{"height":14.4},"width":264.75,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-11.png","element":"img","alt":" 𝜂 ≥ 1 note that","inline":true,"padRight":true},{"text":"if ","element":"span"},{"style":{"height":19.24},"width":681.15,"height":48.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-12.png","element":"img","alt":" 𝜆max�𝑄−1/2𝑃𝑄−1/2� = 𝜆max�𝑄−1𝑃� ≤ 1","inline":true},{"text":", then ","element":"span"},{"style":{"height":19.24},"width":647.28,"height":48.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-13.png","element":"img","alt":" 𝜆𝑖�𝑄−1/2𝑃𝑄−1/2� = 𝜆𝑖�𝑄−1𝑃� ∈ (0, 1]","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"height":16},"width":161.39,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-14.png","element":"img","alt":" 𝑖 ∈ ⟦1, 𝑛⟧","inline":true},{"text":", and therefore the eigenvalues of its inverse satisfy ","element":"span"},{"style":{"height":18.44},"width":522.55,"height":46.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-15.png","element":"img","alt":" 𝜆𝑖�(𝑄−1𝑃)−1� = 𝜆𝑖�𝑃−1𝑄� ≥ 1","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"height":16},"width":158.4,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-16.png","element":"img","alt":" 𝑖 ∈ ⟦1, 𝑛⟧","inline":true},{"text":". 
Conversely, if ","element":"span"},{"style":{"height":18.44},"width":286.51,"height":46.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-17.png","element":"img","alt":"𝜆max�𝑃−1𝑄� ≤ 1","inline":true},{"text":", the same reasoning applies, and ","element":"span"},{"style":{"height":18.44},"width":522.95,"height":46.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-18.png","element":"img","alt":" 𝜆𝑖�𝑄−1𝑃� ≥ 1 for all 𝑖 ∈ ⟦1, 𝑛⟧","inline":true},{"text":". Note that the definition of ","element":"span"},{"style":{"height":10.4},"width":20,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-19.png","element":"img","alt":"𝜂","inline":true,"padRight":true},{"text":"together with Lemma ","element":"span"},{"href":"#id-78","text":"A.5 ","element":"a"},{"text":"implies that ","element":"span"},{"style":{"height":21.78},"width":1158.64,"height":54.45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-20.png","element":"img","alt":"1𝜂 = min�𝜆min�𝑃−1𝑄� , 𝜆𝑚𝑖𝑛�𝑄−1𝑃��≤ 𝜆min�𝑃−1𝑄� = 𝜆max�𝑄−1𝑃�.","inline":true,"padRight":true},{"text":"Focusing on the first inequality on Equation (","element":"span"},{"href":"#id-0","text":"A.1","element":"a"},{"text":") and plugging in the value of ","element":"span"},{"style":{"height":14.4},"width":180.34,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-21.png","element":"img","alt":" 𝜂 leads to:","inline":true}],[{"style":{"width":"24%"},"width":468,"height":416,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-22.png","element":"img"}],[{"text":"Focusing on the second inequality of Equation (","element":"span"},{"href":"#id-0","text":"A.1","element":"a"},{"text":") and noting that ","element":"span"},{"style":{"height":19.2},"width":680.53,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-23.png","element":"img","alt":" 𝜂 = max�𝜆max�𝑃−1𝑄� , 
𝜆max�𝑄−1𝑃��≥","inline":true},{"style":{"height":18.44},"width":213.75,"height":46.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-24.png","element":"img","alt":"𝜆max�𝑃−1𝑄�","inline":true,"padRight":true},{"text":"we have that:","element":"span"}],[{"style":{"width":"24%"},"width":468,"height":377,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-25.png","element":"img"}],[{"text":"Which completes the proof. ","element":"span"},{"style":{"height":0},"width":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-26.png","element":"img","alt":"□","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Remark A.7. ","element":"span"},{"text":"Given two matrices ","element":"span"},{"style":{"height":15.14},"width":176.27,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-27.png","element":"img","alt":" 𝑃, 𝑄 ∈ S𝑛++","inline":true},{"text":", then for all ","element":"span"},{"style":{"height":10.99},"width":126.8,"height":27.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-28.png","element":"img","alt":" v ∈ ℝ𝑛:","inline":true}],[{"style":{"width":"100%"},"width":1874,"height":296,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-29.png","element":"img"}],[{"text":"If we define the ellipsoid ","element":"span"},{"style":{"height":19.2},"width":444.82,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-30.png","element":"img","alt":" E𝑃 =�v ∈ ℝ𝑛 | v𝑇𝑃v ≤ 1�","inline":true},{"text":"for ","element":"span"},{"style":{"height":15.14},"width":128.96,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-31.png","element":"img","alt":" 𝑃 ∈ S𝑛++","inline":true},{"text":", we can interpret the value of 
","element":"span"},{"style":{"height":10.4},"width":20,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-32.png","element":"img","alt":" 𝜂","inline":true,"padRight":true},{"text":"as being ","element":"span"},{"text":"the smallest value that ensures that ","element":"span"},{"style":{"height":16.21},"width":518.91,"height":40.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-33.png","element":"img","alt":" E𝑃/𝜂 ⊆ E𝑄 ⊆ E𝜂𝑃 for 𝑄 ∈ S𝑛++ ","inline":true,"padRight":true},{"text":"(see Figure ","element":"span"},{"href":"#id-79","text":"5","element":"a"},{"text":").","element":"span"}],[{"text":"The following corollary will allow us to bound the maximum and minimum eigenvalue of the approximation ","element":"span"},{"style":{"height":12.39},"width":45.48,"height":30.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-34.png","element":"img","alt":"𝐻𝑘","inline":true,"padRight":true},{"text":"in terms of the maximum and minimum eigenvalue of ","element":"span"},{"style":{"height":16.99},"width":494.47,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-35.png","element":"img","alt":" ∇2 𝑓 (x𝑘) and 𝜂𝑘 for all 𝑘 ≥ 0","inline":true},{"text":", which will be useful in the proofs to follow.","element":"span"}],[{"id":"id-80","style":{"fontWeight":"bold"},"text":"Corollary A.8. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Given two matrices ","element":"span"},{"style":{"height":15.14},"width":176.28,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-36.png","element":"img","alt":" 𝑃, 𝑄 ∈ S𝑛++","inline":true},{"style":{"fontStyle":"italic"},"text":", we have that:","element":"span"}],[{"style":{"width":"31%"},"width":588,"height":199,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-37.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where ","element":"span"},{"style":{"height":19.2},"width":768.06,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-38.png","element":"img","alt":" 𝜂 = max�𝜆max�𝑃−1𝑄� , 𝜆max�𝑄−1𝑃�� ≥ 1","inline":true},{"style":{"fontStyle":"italic"},"text":". ","element":"span"},{"style":{"fontStyle":"italic"},"text":"This allows us to conclude that ","element":"span"},{"style":{"height":24.02},"width":314.54,"height":60.05,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-39.png","element":"img","alt":"𝜆min(𝑃)𝜂 𝐼𝑛 ⪯ 𝑄 ⪯","inline":true},{"style":{"height":15.2},"width":224.56,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/16-40.png","element":"img","alt":"𝜂𝜆max (𝑃) 𝐼𝑛.","inline":true}],[{"id":"id-79","style":{"width":"60%"},"width":1124,"height":826,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/17-0.png","element":"img"}],[{"text":"Figure 5: Given ","element":"figcaption","subtype":"caption"},{"style":{"height":15.13},"width":176.28,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/17-1.png","element":"img","alt":" 𝑃, 𝑄 ∈ S𝑛++","inline":true},{"text":", we can always find an ","element":"figcaption","subtype":"caption"},{"style":{"height":16.21},"width":532.91,"height":40.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/17-2.png","element":"img","alt":" 𝜂 such that E𝑃/𝜂 ⊆ 
E𝑄 ⊆ E𝜂𝑃.","inline":true}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"Let ","element":"span"},{"style":{"height":15.2},"width":142.84,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/17-3.png","element":"img","alt":" vmin (𝑄)","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":15.2},"width":148.85,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/17-4.png","element":"img","alt":" vmax (𝑄)","inline":true,"padRight":true},{"text":"denote the eigenvectors of unit length associated with the minimum and maximum eigenvalue of ","element":"span"},{"style":{"height":13.6},"width":29,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/17-5.png","element":"img","alt":" 𝑄","inline":true},{"text":", denoted by ","element":"span"},{"style":{"height":15.2},"width":143.9,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/17-6.png","element":"img","alt":" 𝜆min (𝑄)","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":15.2},"width":149.9,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/17-7.png","element":"img","alt":" 𝜆max (𝑄)","inline":true,"padRight":true},{"text":"respectively. 
As ","element":"span"},{"style":{"height":15.14},"width":176.33,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/17-8.png","element":"img","alt":" 𝑃, 𝑄 ∈ S𝑛++","inline":true,"padRight":true},{"text":"from Lemma ","element":"span"},{"href":"#id-35","text":"A.6 ","element":"a"},{"text":"we have that:","element":"span"}],[{"style":{"width":"64%"},"width":1199,"height":92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/17-9.png","element":"img"}],[{"text":"On the other hand, using similar arguments we have:","element":"span"}],[{"style":{"width":"62%"},"width":1169,"height":53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/17-10.png","element":"img"}],[{"text":"Moving on to the bound for ","element":"span"},{"style":{"height":15.2},"width":313.65,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/17-11.png","element":"img","alt":" 𝜆max (𝑄) we have:","inline":true}],[{"style":{"width":"63%"},"width":1191,"height":92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/17-12.png","element":"img"}],[{"text":"Similarly, we have that:","element":"span"}],[{"style":{"width":"65%"},"width":1226,"height":53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/17-13.png","element":"img"}],[{"text":"Combining these bounds completes the proof. 
","element":"span"},{"style":{"height":0},"width":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/17-14.png","element":"img","alt":"□","inline":true}],[{"text":"Particularizing Corollary ","element":"span"},{"href":"#id-80","text":"A.8 ","element":"a"},{"text":"with ","element":"span"},{"style":{"height":13.6},"width":140.83,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/17-15.png","element":"img","alt":" 𝑄 = 𝐻𝑘","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":16.99},"width":229.77,"height":42.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/17-16.png","element":"img","alt":" 𝑃 = ∇2 𝑓 (x𝑘)","inline":true,"padRight":true},{"text":"allows us to conclude that ","element":"span"},{"style":{"height":15.2},"width":163.84,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/17-17.png","element":"img","alt":" 𝜇/𝜂𝑘𝐼𝑛 ⪯","inline":true},{"style":{"height":14.18},"width":198.52,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/17-18.png","element":"img","alt":"𝐻𝑘 ⪯ 𝜂𝑘𝐿𝐼𝑛","inline":true},{"text":", and so the quadratic approximation ","element":"span"},{"style":{"height":18.3},"width":81.9,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/17-19.png","element":"img","alt":"ˆ𝑓𝑘(x)","inline":true,"padRight":true},{"text":"in Equation (","element":"span"},{"href":"#id-81","text":"2.4","element":"a"},{"text":") will be ","element":"span"},{"style":{"height":15.2},"width":77.48,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/17-20.png","element":"img","alt":" 𝜇/𝜂𝑘","inline":true},{"text":"-strongly convex and 
","element":"span"},{"style":{"height":14.4},"width":217.54,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/17-21.png","element":"img","alt":"𝜂𝑘𝐿-smooth.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Appendix B. The Conditional Gradients algorithm","element":"span"}],[{"text":"We define the linear approximation of the function ","element":"span"},{"style":{"height":15.2},"width":68.98,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/18-0.png","element":"img","alt":" 𝑓 (x)","inline":true,"padRight":true},{"text":"around the point ","element":"span"},{"style":{"height":9.99},"width":100.23,"height":24.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/18-1.png","element":"img","alt":" x𝑘 as:","inline":true}],[{"style":{"width":"65%"},"width":1225,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/18-2.png","element":"img"}],[{"text":"At each iteration the vanilla ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Conditional Gradients ","element":"span"},{"text":"(CG) algorithm (","element":"span"},{"href":"#id-4","referenceIndex":45,"text":"Levitin & Polyak","element":"a"},{"text":", ","element":"span"},{"href":"#id-4","referenceIndex":45,"text":"1966","element":"a"},{"text":"; ","element":"span"},{"href":"#id-7","referenceIndex":19,"text":"Frank & Wolfe","element":"a"},{"text":", ","element":"span"},{"href":"#id-7","referenceIndex":19,"text":"1956","element":"a"},{"text":"; ","element":"span"},{"href":"#id-24","referenceIndex":34,"text":"Jaggi","element":"a"},{"text":", ","element":"span"},{"href":"#id-24","referenceIndex":34,"text":"2013","element":"a"},{"text":") (Algorithm ","element":"span"},{"href":"#id-82","text":"3","element":"a"},{"text":") takes steps defined as 
","element":"span"},{"style":{"height":18.78},"width":917.44,"height":46.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/18-3.png","element":"img","alt":" x𝑘+1 = x𝑘 + 𝛾𝑘(argminx∈X ˆ𝑙𝑘(x) − x𝑘) with 𝛾𝑘 ∈ (0, 1].","inline":true,"padRight":true},{"text":"As the iterates are formed as convex combinations of points in ","element":"span"},{"text":"X ","element":"span"},{"text":"there is no need for projections onto ","element":"span"},{"text":"X","element":"span"},{"text":", making the algorithm ","element":"span"},{"style":{"fontStyle":"italic"},"text":"projection-free","element":"span"},{"text":".","element":"span"}],[{"id":"id-82","style":{"width":"99%"},"width":1872,"height":413,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/18-4.png","element":"img"}],[{"text":"A useful quantity that can readily be computed in all CG steps is ","element":"span"},{"style":{"height":15.2},"width":301.76,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/18-5.png","element":"img","alt":" ⟨∇ 𝑓 (x𝑘), x𝑘 − v𝑘⟩","inline":true},{"text":", known as the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Frank-Wolfe gap","element":"span"},{"text":", which provides an upper bound on the primal gap. If ","element":"span"},{"style":{"height":14.58},"width":203.56,"height":36.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/18-6.png","element":"img","alt":" x∗ ∈ argmin","inline":true}],[{"style":{"width":"70%"},"width":1324,"height":115,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/18-7.png","element":"img"}],[{"text":"where the last inequality follows from the convexity of ","element":"span"},{"style":{"height":15.2},"width":68.56,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/18-8.png","element":"img","alt":" 𝑓 (x)","inline":true},{"text":". This quantity is often used as a stopping criterion when running the CG algorithm. 
The CG algorithm has seen renewed interest from the Machine Learning community, as several machine learning problems can be phrased as constrained optimization problems with feasible regions onto which it is hard to project (","element":"span"},{"href":"#id-8","referenceIndex":35,"text":"Joulin et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-8","referenceIndex":35,"text":"2014","element":"a"},{"text":"; ","element":"span"},{"href":"#id-9","referenceIndex":21,"text":"Futami et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-9","referenceIndex":21,"text":"2019","element":"a"},{"text":"; ","element":"span"},{"href":"#id-83","referenceIndex":25,"text":"Garber et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-83","referenceIndex":25,"text":"2018","element":"a"},{"text":").","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"B.1 Global Convergence","element":"span"}],[{"text":"The CG algorithm with exact line search converges linearly in primal gap when applied to Problem (","element":"span"},{"href":"#id-0","text":"1.1","element":"a"},{"text":") when ","element":"span"},{"style":{"height":15.2},"width":197.44,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/18-9.png","element":"img","alt":" x∗ ∈ int (X)","inline":true,"padRight":true},{"text":"(","element":"span"},{"href":"#id-29","referenceIndex":30,"text":"Guélat & Marcotte","element":"a"},{"text":", ","element":"span"},{"href":"#id-29","referenceIndex":30,"text":"1986","element":"a"},{"text":"). 
However, when ","element":"span"},{"style":{"height":15.22},"width":264.64,"height":38.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/18-10.png","element":"img","alt":" x∗ ∈ X \\ int (X)","inline":true,"padRight":true},{"text":"the algorithm suffers from a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"zig-zagging ","element":"span"},{"text":"phenomenon - as the iterates get closer to ","element":"span"},{"style":{"height":11.39},"width":35.14,"height":28.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/18-11.png","element":"img","alt":" x∗","inline":true,"padRight":true},{"text":"the directions provided by the algorithm start to become close to perpendicular to the gradient (Figure ","element":"span"},{"href":"#id-84","text":"6a","element":"a"},{"text":"). This is remedied by using ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Away-steps ","element":"span"},{"text":"(Algorithm ","element":"span"},{"href":"#id-85","text":"5","element":"a"},{"text":"), which result in the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Away-step Conditional Gradient ","element":"span"},{"text":"(ACG) algorithm (Algorithm ","element":"span"},{"href":"#id-25","text":"4","element":"a"},{"text":", Figure ","element":"span"},{"href":"#id-84","text":"6b","element":"a"},{"text":") (","element":"span"},{"href":"#id-22","referenceIndex":60,"text":"Wolfe","element":"a"},{"text":", ","element":"span"},{"href":"#id-22","referenceIndex":60,"text":"1970","element":"a"},{"text":"), which converges linearly in primal gap regardless of the location of ","element":"span"},{"style":{"height":11.39},"width":35.41,"height":28.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/18-12.png","element":"img","alt":" x∗","inline":true,"padRight":true},{"text":"when using exact line search (","element":"span"},{"href":"#id-27","referenceIndex":39,"text":"Lacoste-Julien & Jaggi","element":"a"},{"text":", 
","element":"span"},{"href":"#id-27","referenceIndex":39,"text":"2015","element":"a"},{"text":") or a step size strategy dependent on ","element":"span"},{"style":{"height":16.4},"width":53.43,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/18-13.png","element":"img","alt":" 𝐿 (","inline":true},{"href":"#id-23","referenceIndex":56,"text":"Pedregosa et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-23","referenceIndex":56,"text":"2020","element":"a"},{"text":").","element":"span"}],[{"id":"id-25","style":{"width":"99%"},"width":1872,"height":381,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/18-14.png","element":"img"}],[{"text":"The ACG algorithm maintains what is called an ","element":"span"},{"style":{"height":15.2},"width":408.82,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/18-15.png","element":"img","alt":" active set S𝑘 ⊆ vert (X)","inline":true,"padRight":true},{"text":"which represents the potentially non-unique set of vertices of ","element":"span"},{"style":{"height":15.2},"width":466.49,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/18-16.png","element":"img","alt":" X such that x𝑘 ∈ conv (S𝑘)","inline":true},{"text":". 
Associated with this active set ","element":"span"},{"style":{"height":13.59},"width":42.37,"height":33.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/18-17.png","element":"img","alt":" S𝑘","inline":true,"padRight":true},{"text":"we have a set of barycentric coordinates ","element":"span"},{"style":{"height":13.19},"width":43.04,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/18-18.png","element":"img","alt":" λ𝑘","inline":true,"padRight":true},{"text":"such that if we denote by ","element":"span"},{"style":{"height":15.2},"width":231.01,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/18-19.png","element":"img","alt":" λ𝑘(u) ∈ [0, 1]","inline":true,"padRight":true},{"text":"the element of ","element":"span"},{"style":{"height":13.19},"width":43.04,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/18-20.png","element":"img","alt":" λ𝑘","inline":true,"padRight":true},{"text":"associated with ","element":"span"},{"style":{"height":13.59},"width":109.21,"height":33.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/18-21.png","element":"img","alt":" u ∈ S𝑘","inline":true,"padRight":true},{"text":"we have that ","element":"span"},{"style":{"height":16.96},"width":1228.48,"height":42.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/18-22.png","element":"img","alt":" x𝑘 = �u∈S𝑘 λ𝑘(u)u, with �u∈S𝑘 λ𝑘(u) = 1 and λ𝑘(u) ≥ 0 for all u ∈ S𝑘.","inline":true}],[{"id":"id-84","style":{"width":"87%"},"width":1632,"height":775,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/19-0.png","element":"img"}],[{"text":"Figure 6: Qualitative performance comparison of the CG and the ACG 
algorithm.","element":"figcaption","subtype":"caption"}],[{"style":{"width":"99%"},"width":1872,"height":164,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/19-1.png","element":"img"}],[{"style":{"height":15.71},"width":500.17,"height":39.26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/19-2.png","element":"img","alt":"1 v ← argminv∈X ⟨∇ 𝑓 (x) , v⟩","inline":true},{"style":{"height":15.7},"width":503.98,"height":39.26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/19-3.png","element":"img","alt":"2 a ← argmaxv∈S ⟨∇ 𝑓 (x) , v⟩","inline":true},{"id":"id-127","style":{"height":15.22},"width":728.34,"height":38.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/19-4.png","element":"img","alt":"3 if ⟨∇ 𝑓 (x), x − v⟩ ≥ ⟨∇ 𝑓 (x), a − x⟩ then","inline":true},{"style":{"height":14},"width":458.18,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/19-5.png","element":"img","alt":"4 d ← x − v, 𝛾max ← 1","inline":true},{"style":{"height":11.2},"width":110.78,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/19-6.png","element":"img","alt":"5 else","inline":true}],[{"style":{"height":15.22},"width":703.59,"height":38.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/19-7.png","element":"img","alt":"6 d ← a − x, 𝛾max ← λ(a)/(1 − λ(a))","inline":true},{"style":{"height":11.2},"width":110.44,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/19-8.png","element":"img","alt":"7 end","inline":true}],[{"id":"id-128","style":{"height":17.7},"width":580.67,"height":44.26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/19-9.png","element":"img","alt":"8 𝛾 ← argmin𝛾∈[0,𝛾max] 𝑓 (x + 𝛾d)","inline":true},{"style":{"height":14},"width":240.54,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/19-10.png","element":"img","alt":"9 x′ ← x + 
𝛾d","inline":true},{"style":{"height":15.22},"width":732.13,"height":38.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/19-11.png","element":"img","alt":"10 if ⟨∇ 𝑓 (x), x − v⟩ ≥ ⟨∇ 𝑓 (x), a − x⟩ then","inline":true},{"style":{"height":14.4},"width":352.34,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/19-12.png","element":"img","alt":"11 if 𝛾 = 1 then","inline":true}],[{"style":{"height":14.8},"width":336.71,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/19-13.png","element":"img","alt":"12 S′ ← {v}","inline":true},{"style":{"height":11.2},"width":188.59,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/19-14.png","element":"img","alt":"13 else","inline":true}],[{"style":{"height":14.8},"width":409.38,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/19-15.png","element":"img","alt":"14 S′ ← S ∪ {v}","inline":true},{"style":{"height":11.2},"width":188.25,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/19-16.png","element":"img","alt":"15 end","inline":true}],[{"style":{"height":15.22},"width":680.79,"height":38.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/19-17.png","element":"img","alt":"16 λ′(u) ← (1 − 𝛾) λ(u) if u ∈ S \\ v","inline":true},{"style":{"height":15.22},"width":539.46,"height":38.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/19-18.png","element":"img","alt":"17 λ′(v) ← (1 − 𝛾) λ(v) + 𝛾","inline":true},{"style":{"height":11.2},"width":128.82,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/19-19.png","element":"img","alt":"18 else","inline":true}],[{"style":{"height":14.4},"width":414.82,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/19-20.png","element":"img","alt":"19 if 𝛾 = 𝛾max 
then","inline":true},{"style":{"height":15.2},"width":399.43,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/19-21.png","element":"img","alt":"20 S′ ← S \\ {a}","inline":true},{"style":{"height":11.2},"width":188.59,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/19-22.png","element":"img","alt":"21 else","inline":true}],[{"style":{"height":12},"width":306.76,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/19-23.png","element":"img","alt":"22 S′ ← S","inline":true},{"style":{"height":11.2},"width":188.25,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/19-24.png","element":"img","alt":"23 end","inline":true}],[{"style":{"height":15.22},"width":676.92,"height":38.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/19-25.png","element":"img","alt":"24 λ′(u) ← (1 + 𝛾) λ(u) if u ∈ S \\ a","inline":true},{"style":{"height":15.23},"width":535.04,"height":38.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/19-26.png","element":"img","alt":"25 λ′(a) ← (1 + 𝛾) λ(a) − 𝛾","inline":true},{"style":{"height":11.2},"width":128.48,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/19-27.png","element":"img","alt":"26 end","inline":true}],[{"id":"id-85","text":"In general one of the easiest ways to maintain the active set is to build a list of previously used vertices ","element":"span"},{"text":"and a list of associated barycentric coordinates. If the Frank-Wolfe step adds a new vertex ","element":"span"},{"style":{"fontWeight":"bold"},"text":"v ","element":"span"},{"text":"that is not ","element":"span"},{"text":"already in ","element":"span"},{"text":"S ","element":"span"},{"text":"it is added to the list of vertices and its associated barycentric coordinate is added to the list of barycentric coordinates. 
If the vertex ","element":"span"},{"style":{"fontWeight":"bold"},"text":"v ","element":"span"},{"text":"is already contained in the list that maintains ","element":"span"},{"text":"S","element":"span"},{"text":", its existing barycentric coordinate is updated in the appropriate list. Note that the barycentric coordinates of the points ","element":"span"},{"text":"S \\ {","element":"span"},{"style":{"fontWeight":"bold"},"text":"v","element":"span"},{"text":"} ","element":"span"},{"text":"are also updated at each iteration. The away-steps in Algorithm ","element":"span"},{"href":"#id-85","text":"5 ","element":"a"},{"text":"cannot add new vertices, only remove them from the active set. This type of step also requires updating the barycentric coordinates of the points ","element":"span"},{"text":"S \\ {","element":"span"},{"style":{"fontWeight":"bold"},"text":"a","element":"span"},{"text":"}","element":"span"},{"text":". For both Frank-Wolfe and away-steps a vertex is removed from the list of vertices and the associated barycentric coordinate removed from the list of coordinates if the value of the barycentric coordinate is zero.","element":"span"}],[{"text":"The first proof of asymptotic linear convergence of the ACG algorithm relied on the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"strict complementarity ","element":"span"},{"text":"of the problem in Equation (","element":"span"},{"href":"#id-0","text":"1.1","element":"a"},{"text":") (shown in Assumption ","element":"span"},{"href":"#id-28","text":"1","element":"a"},{"text":"), which we will also use in the convergence proof of the SOCGS algorithm.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Assumption 1 ","element":"span"},{"text":"(Strict Complementarity)","element":"span"},{"style":{"fontWeight":"bold"},"text":". 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"We have that ","element":"span"},{"style":{"height":15.2},"width":348.53,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/20-0.png","element":"img","alt":" ⟨∇ 𝑓 (x∗) , x − x∗⟩ = 0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"if and only if ","element":"span"},{"style":{"height":15.2},"width":187.84,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/20-1.png","element":"img","alt":" x ∈ F (x∗).","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Remark B.1. ","element":"span"},{"text":"Assumption ","element":"span"},{"href":"#id-28","text":"1 ","element":"a"},{"text":"automatically holds if ","element":"span"},{"style":{"height":15.2},"width":196.74,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/20-2.png","element":"img","alt":" x∗ ∈ int (X)","inline":true},{"text":", that is, if ","element":"span"},{"style":{"height":11.39},"width":34.61,"height":28.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/20-3.png","element":"img","alt":" x∗ ","inline":true,"padRight":true},{"text":"is in the strict interior of ","element":"span"},{"text":"X","element":"span"},{"text":". 
In this case the polytope is fully-dimensional and itself the optimal face, so no off-optimal-face vertices exist.","element":"span"}],[{"text":"If Assumption ","element":"span"},{"href":"#id-28","text":"1 ","element":"a"},{"text":"is satisfied the iterates of the ACG algorithm reach ","element":"span"},{"style":{"height":15.2},"width":106.46,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/20-4.png","element":"img","alt":" F (x∗)","inline":true,"padRight":true},{"text":"in a finite number of steps, remaining in ","element":"span"},{"style":{"height":15.2},"width":106.46,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/20-5.png","element":"img","alt":" F (x∗)","inline":true,"padRight":true},{"text":"for all subsequent iterations (","element":"span"},{"href":"#id-29","referenceIndex":30,"text":"Guélat & Marcotte","element":"a"},{"text":", ","element":"span"},{"href":"#id-29","referenceIndex":30,"text":"1986","element":"a"},{"text":"). When inside ","element":"span"},{"style":{"height":15.2},"width":106.46,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/20-6.png","element":"img","alt":" F (x∗)","inline":true},{"text":", the iterates of the ACG algorithm contract the primal gap linearly. This analysis was later significantly extended to provide an explicit global linear convergence rate in primal gap (Theorem ","element":"span"},{"href":"#id-30","text":"2.1","element":"a"},{"text":"), by making use of the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"pyramidal width ","element":"span"},{"text":"of the polytope ","element":"span"},{"text":"X ","element":"span"},{"text":"(","element":"span"},{"href":"#id-27","referenceIndex":39,"text":"Lacoste-Julien & Jaggi","element":"a"},{"text":", ","element":"span"},{"href":"#id-27","referenceIndex":39,"text":"2015","element":"a"},{"text":"). 
With the pyramidal width one can derive a primal progress guarantee for all steps taken by the ACG algorithm except \"bad\" away-steps that reduce the cardinality of the active set ","element":"span"},{"style":{"height":13.59},"width":42.37,"height":33.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/20-7.png","element":"img","alt":" S𝑘","inline":true},{"text":", that is when ","element":"span"},{"style":{"height":15.23},"width":602.72,"height":38.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/20-8.png","element":"img","alt":" ⟨∇ 𝑓 (x𝑘), x𝑘 − v⟩ < ⟨∇ 𝑓 (x𝑘), a − x𝑘⟩","inline":true,"padRight":true},{"text":"and the step size satisfies ","element":"span"},{"style":{"height":10.4},"width":167.62,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/20-9.png","element":"img","alt":"𝛾𝑘 = 𝛾max","inline":true,"padRight":true},{"text":"in Algorithm ","element":"span"},{"href":"#id-85","text":"5","element":"a"},{"text":". This cannot happen more than ","element":"span"},{"style":{"height":14.8},"width":99.84,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/20-10.png","element":"img","alt":" ⌊𝐾/2⌋","inline":true,"padRight":true},{"text":"times when running the ACG algorithm for ","element":"span"},{"style":{"height":10.4},"width":29,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/20-11.png","element":"img","alt":" 𝐾","inline":true,"padRight":true},{"text":"iterations (as the algorithm cannot drop more vertices with away-steps than it has picked up with Frank-Wolfe steps). 
This is an important consideration to keep in mind, as it means that the ACG primal gap contraction does not hold on a per-iteration basis.","element":"span"}],[{"id":"id-112","style":{"fontWeight":"bold"},"text":"Theorem B.2 ","element":"span"},{"text":"(Primal gap convergence of the ACG algorithm (Algorithm ","element":"span"},{"href":"#id-25","text":"4","element":"a"},{"text":"))","element":"span"},{"style":{"fontWeight":"bold"},"text":". ","element":"span"},{"style":{"fontStyle":"italic"},"text":"(","element":"span"},{"href":"#id-27","referenceIndex":39,"style":{"fontStyle":"italic"},"text":"Lacoste-Julien & Jaggi","element":"a"},{"style":{"fontStyle":"italic"},"text":",","element":"span"}],[{"href":"#id-27","referenceIndex":39,"style":{"fontStyle":"italic"},"text":"2015","element":"a"},{"style":{"fontStyle":"italic"},"text":", Theorem 1) Given an ","element":"span"},{"style":{"height":10.4},"width":24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/20-12.png","element":"img","alt":" 𝐿","inline":true},{"style":{"fontStyle":"italic"},"text":"-smooth and ","element":"span"},{"style":{"height":10.8},"width":23,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/20-13.png","element":"img","alt":" 𝜇","inline":true},{"style":{"fontStyle":"italic"},"text":"-strongly convex function ","element":"span"},{"style":{"height":15.2},"width":68.64,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/20-14.png","element":"img","alt":" 𝑓 (x)","inline":true},{"style":{"fontStyle":"italic"},"text":", a polytope ","element":"span"},{"text":"X ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and an initial point ","element":"span"},{"style":{"height":13.99},"width":114.01,"height":34.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/20-15.png","element":"img","alt":"x0 ∈ X","inline":true},{"style":{"fontStyle":"italic"},"text":", the ACG algorithm satisfies 
after ","element":"span"},{"style":{"height":12},"width":99.48,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/20-16.png","element":"img","alt":" 𝐾 ≥ 0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"iterations:","element":"span"}],[{"style":{"width":"47%"},"width":896,"height":130,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/20-17.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where ","element":"span"},{"style":{"height":10.8},"width":30,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/20-18.png","element":"img","alt":" 𝐷","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"denotes the diameter of the polytope ","element":"span"},{"style":{"height":12.4},"width":142.44,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/20-19.png","element":"img","alt":" X and 𝛿","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"its pyramidal width.","element":"span"}],[{"text":"See also (","element":"span"},{"href":"#id-86","referenceIndex":23,"text":"Garber & Hazan","element":"a"},{"text":", ","element":"span"},{"href":"#id-86","referenceIndex":23,"text":"2016","element":"a"},{"text":"; ","element":"span"},{"href":"#id-87","referenceIndex":16,"text":"Diakonikolas et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-87","referenceIndex":16,"text":"2020","element":"a"},{"text":") for work on linearly convergent CG algorithms, and (","element":"span"},{"href":"#id-24","referenceIndex":34,"text":"Jaggi","element":"a"},{"text":", ","element":"span"},{"href":"#id-24","referenceIndex":34,"text":"2013","element":"a"},{"text":"; ","element":"span"},{"href":"#id-88","referenceIndex":40,"text":"Lan","element":"a"},{"text":", ","element":"span"},{"href":"#id-88","referenceIndex":40,"text":"2013","element":"a"},{"text":") for strong lower bounds that limit the linear convergence 
that can be achieved with algorithms that only access the feasible region through a linear optimization oracle.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Appendix C. Projected Variable-Metric algorithms","element":"span"}],[{"text":"In this section we provide theoretical context for the Projected Variable-Metric (PVM) algorithm (Algorithm ","element":"span"},{"href":"#id-89","text":"6","element":"a"},{"text":"), and we present several well-known results that will be helpful in motivating the SOCGS algorithm.","element":"span"}],[{"id":"id-89","style":{"width":"99%"},"width":1872,"height":409,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-0.png","element":"img"}],[{"text":"At each iteration the PVM algorithm builds a quadratic approximation of the original function ","element":"span"},{"style":{"height":15.2},"width":69.41,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-1.png","element":"img","alt":" 𝑓 (x)","inline":true},{"text":", and moves towards the point that minimizes this approximation over ","element":"span"},{"text":"X","element":"span"},{"text":". 
Formally, we denote the quadratic approximation of ","element":"span"},{"style":{"height":15.54},"width":506.43,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-2.png","element":"img","alt":" 𝑓 (x) at x𝑘 using 𝐻𝑘 ∈ S𝑛++ as:","inline":true}],[{"style":{"width":"72%"},"width":1360,"height":82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-3.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":12.39},"width":45.48,"height":30.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-4.png","element":"img","alt":" 𝐻𝑘","inline":true,"padRight":true},{"text":"is an approximation to the Hessian ","element":"span"},{"style":{"height":16.96},"width":146.58,"height":42.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-5.png","element":"img","alt":" ∇2 𝑓 (x𝑘)","inline":true},{"text":". In order to measure how well ","element":"span"},{"style":{"height":12.39},"width":45.48,"height":30.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-6.png","element":"img","alt":" 𝐻𝑘","inline":true,"padRight":true},{"text":"approximates ","element":"span"},{"style":{"height":16.96},"width":146.58,"height":42.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-7.png","element":"img","alt":"∇2 𝑓 (x𝑘)","inline":true,"padRight":true},{"text":"we note that for any ","element":"span"},{"style":{"height":15.53},"width":495.75,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-8.png","element":"img","alt":" 𝐻𝑘 ∈ S𝑛++ and all y ∈ X that:","inline":true}],[{"style":{"width":"72%"},"width":1360,"height":91,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-9.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":19.02},"width":989.28,"height":47.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-10.png","element":"img","alt":" 
𝜂𝑘 = max{𝜆max(𝐻−1𝑘 ∇2 𝑓 (x𝑘)), 𝜆max([∇2 𝑓 (x𝑘)]−1𝐻𝑘)} ≥ 1","inline":true,"padRight":true},{"text":"(see Lemma ","element":"span"},{"href":"#id-35","text":"A.6 ","element":"a"},{"text":"in Appendix ","element":"span"},{"href":"#id-36","text":"A.1","element":"a"},{"text":"). We will ","element":"span"},{"text":"use the value of ","element":"span"},{"style":{"height":10.4},"width":36.84,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-11.png","element":"img","alt":" 𝜂𝑘","inline":true,"padRight":true},{"text":"to measure the accuracy of how well ","element":"span"},{"style":{"height":12.39},"width":45.48,"height":30.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-12.png","element":"img","alt":" 𝐻𝑘","inline":true,"padRight":true},{"text":"approximates ","element":"span"},{"style":{"height":16.99},"width":140.11,"height":42.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-13.png","element":"img","alt":" ∇2 𝑓 (x𝑘)","inline":true},{"text":". For example, an ","element":"span"},{"style":{"height":14},"width":105.56,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-14.png","element":"img","alt":" 𝜂𝑘 = 1","inline":true,"padRight":true},{"text":"means that ","element":"span"},{"style":{"height":16.99},"width":234.52,"height":42.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-15.png","element":"img","alt":" 𝐻𝑘 = ∇2 𝑓 (x𝑘)","inline":true},{"text":". 
If we were to use ","element":"span"},{"style":{"height":13.38},"width":128.58,"height":33.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-16.png","element":"img","alt":" 𝐻𝑘 = 𝐼𝑛 ","inline":true,"padRight":true},{"text":"we would have that ","element":"span"},{"style":{"height":15.23},"width":319.6,"height":38.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-17.png","element":"img","alt":" 𝜂𝑘 = max {𝐿, 1/𝜇}.","inline":true}],[{"text":"Just as the steps taken by the Projected Gradient Descent (PGD) algorithm can be interpreted in terms of Euclidean projection operators, the steps taken by the PVM algorithm in Line ","element":"span"},{"href":"#id-89","text":"2 ","element":"a"},{"text":"of Algorithm ","element":"span"},{"href":"#id-89","text":"6 ","element":"a"},{"text":"can be interpreted in terms of scaled projection operators, where the norm of the projection operator is defined by ","element":"span"},{"style":{"height":15.14},"width":152.02,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-18.png","element":"img","alt":"𝐻𝑘 ∈ S𝑛++","inline":true},{"text":". 
Let ","element":"span"},{"style":{"height":19.43},"width":287.81,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-19.png","element":"img","alt":" Π𝐻X (x) : ℝ𝑛 → X","inline":true,"padRight":true},{"text":"denote the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"scaled projection ","element":"span"},{"text":"of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"x ","element":"span"},{"text":"onto ","element":"span"},{"text":"X ","element":"span"},{"text":"using the matrix norm ","element":"span"},{"style":{"height":15.31},"width":75.92,"height":38.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-20.png","element":"img","alt":" ∥·∥𝐻","inline":true},{"text":", more ","element":"span"},{"text":"concretely ","element":"span"},{"style":{"height":21.02},"width":546.27,"height":52.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-21.png","element":"img","alt":" Π𝐻X (x)def= argminy∈X12 ∥y − x∥2𝐻","inline":true},{"text":". We have that:","element":"span"}],[{"style":{"width":"71%"},"width":1343,"height":73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-22.png","element":"img"}],[{"id":"id-92","style":{"fontWeight":"bold"},"text":"Remark C.1 ","element":"span"},{"text":"(First-order optimality condition for PVM subproblems)","element":"span"},{"style":{"fontWeight":"bold"},"text":". 
","element":"span"},{"text":"The solution to the problem in Line ","element":"span"},{"href":"#id-89","text":"2 ","element":"a"},{"text":"of Algorithm ","element":"span"},{"href":"#id-89","text":"6 ","element":"a"},{"text":"(also shown in Equation (","element":"span"},{"href":"#id-90","text":"C.3","element":"a"},{"text":")), that is, ","element":"span"},{"style":{"height":20.34},"width":402.44,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-23.png","element":"img","alt":" ˜x∗𝑘+1 = argminx∈X ˆ𝑓𝑘 (x)","inline":true,"padRight":true},{"text":"satisfies for all ","element":"span"},{"style":{"height":12.4},"width":104.98,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-24.png","element":"img","alt":" z ∈ X:","inline":true}],[{"id":"id-90","style":{"width":"35%"},"width":665,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-25.png","element":"img"}],[{"text":"In both the PGD and the PVM algorithm the only point that is invariant under the steps taken by the algorithms is ","element":"span"},{"style":{"height":11.39},"width":34.89,"height":28.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-26.png","element":"img","alt":" x∗","inline":true},{"text":". That is, in the case of the PGD algorithm we have that ","element":"span"},{"style":{"height":20.05},"width":352.74,"height":50.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-27.png","element":"img","alt":" Π𝐼𝑛X (x − ∇ 𝑓 (x)) = x∗","inline":true,"padRight":true},{"text":"if and only if ","element":"span"},{"style":{"height":11.39},"width":102.34,"height":28.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-28.png","element":"img","alt":"x = x∗","inline":true},{"text":". 
Similarly, in the case of the PVM algorithm we have that ","element":"span"},{"style":{"height":15.11},"width":68.52,"height":37.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-29.png","element":"img","alt":" Π𝐻𝑘","inline":true}],[{"style":{"width":"73%"},"width":1379,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-30.png","element":"img"}],[{"id":"id-94","style":{"fontWeight":"bold"},"text":"Lemma C.2. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Given a matrix ","element":"span"},{"style":{"height":15.14},"width":134.57,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-31.png","element":"img","alt":" 𝐻 ∈ S𝑛++","inline":true},{"style":{"fontStyle":"italic"},"text":", for any ","element":"span"},{"style":{"height":12.4},"width":96.2,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-32.png","element":"img","alt":" x ∈ X","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"height":15.2},"width":180.26,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-33.png","element":"img","alt":" d ∈ 𝑁X (x)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"(where ","element":"span"},{"style":{"height":15.2},"width":111.46,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-34.png","element":"img","alt":" 𝑁X (x)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"represents the normal ","element":"span"},{"style":{"fontStyle":"italic"},"text":"cone of ","element":"span"},{"text":"X ","element":"span"},{"style":{"fontStyle":"italic"},"text":"at ","element":"span"},{"style":{"fontWeight":"bold"},"text":"x","element":"span"},{"style":{"fontStyle":"italic"},"text":", see Definition 
","element":"span"},{"href":"#id-91","style":{"fontStyle":"italic"},"text":"A.4","element":"a"},{"style":{"fontStyle":"italic"},"text":") we have that:","element":"span"}],[{"style":{"width":"64%"},"width":1217,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/21-35.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"From the definition of the normal cone, given a ","element":"span"},{"style":{"height":15.2},"width":368.31,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-0.png","element":"img","alt":" x ∈ X and d ∈ 𝑁X (x)","inline":true,"padRight":true},{"text":"we know that for all ","element":"span"},{"style":{"height":14.8},"width":96.69,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-1.png","element":"img","alt":" y ∈ X","inline":true}],[{"style":{"width":"100%"},"width":1874,"height":669,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-2.png","element":"img"}],[{"text":"for all ","element":"span"},{"style":{"height":14.8},"width":96.86,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-3.png","element":"img","alt":" y ∈ X","inline":true},{"text":". 
This means that the closest point to ","element":"span"},{"style":{"height":13.79},"width":154.14,"height":34.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-4.png","element":"img","alt":" x + 𝐻−1d","inline":true,"padRight":true},{"text":"that is in ","element":"span"},{"text":"X","element":"span"},{"text":", when we measure the distance in the ","element":"span"},{"style":{"height":10.4},"width":32,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-5.png","element":"img","alt":" 𝐻","inline":true,"padRight":true},{"text":"norm, is given by ","element":"span"},{"style":{"fontWeight":"bold"},"text":"x ","element":"span"},{"text":"itself, i.e., ","element":"span"},{"style":{"height":13.39},"width":57.68,"height":33.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-6.png","element":"img","alt":" Π𝐻","inline":true}],[{"text":"holds for ","element":"span"},{"style":{"height":11.39},"width":1720.39,"height":28.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-7.png","element":"img","alt":" 𝐻 = 𝐼𝑛. □","inline":true}],[{"id":"id-96","style":{"fontWeight":"bold"},"text":"Lemma C.3. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Given a matrix ","element":"span"},{"style":{"height":15.54},"width":316.59,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-8.png","element":"img","alt":" 𝐻 ∈ S𝑛++, an x ∈ X","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"satisfies:","element":"span"}],[{"id":"id-93","style":{"width":"61%"},"width":1148,"height":50,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-9.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"if and only if ","element":"span"},{"style":{"height":15.68},"width":598.54,"height":39.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-10.png","element":"img","alt":" x = x∗ where x∗ = argminx∈X 𝑓 (x).","inline":true}],[{"style":{"height":16},"width":181.02,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-11.png","element":"img","alt":"Proof. (⇒","inline":true},{"text":") Using the first-order optimality conditions for the scaled projection problem, shown in Remark ","element":"span"},{"href":"#id-92","text":"C.1","element":"a"},{"text":", and particularizing for ","element":"span"},{"style":{"height":17.02},"width":225.39,"height":42.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-12.png","element":"img","alt":" ˜x∗𝑘+1 = x𝑘 = x","inline":true,"padRight":true},{"text":"we have that for all ","element":"span"},{"style":{"height":12.4},"width":105.04,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-13.png","element":"img","alt":" z ∈ X:","inline":true}],[{"style":{"width":"71%"},"width":1331,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-14.png","element":"img"}],[{"text":"which holds true if and only if 
","element":"span"},{"style":{"height":11.39},"width":102.47,"height":28.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-15.png","element":"img","alt":" x = x∗","inline":true},{"text":", as Equation (","element":"span"},{"href":"#id-93","text":"C.9","element":"a"},{"text":") represents the first-order optimality conditions for Problem ","element":"span"},{"href":"#id-0","text":"1.1","element":"a"},{"text":", of which ","element":"span"},{"style":{"height":11.39},"width":35.03,"height":28.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-16.png","element":"img","alt":" x∗ ","inline":true,"padRight":true},{"text":"is the unique optimal solution.","element":"span"}],[{"text":"(","element":"span"},{"style":{"height":8.4},"width":41,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-17.png","element":"img","alt":"⇐","inline":true},{"text":") Assume that ","element":"span"},{"style":{"height":11.39},"width":102.78,"height":28.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-18.png","element":"img","alt":" x = x∗","inline":true},{"text":", then ","element":"span"},{"style":{"height":15.2},"width":327.02,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-19.png","element":"img","alt":" −∇ 𝑓 (x∗) ∈ 𝑁X (x∗)","inline":true},{"text":". By the application of Lemma ","element":"span"},{"href":"#id-94","text":"C.2 ","element":"a"},{"text":"we have that for any ","element":"span"},{"style":{"height":15.14},"width":134.56,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-20.png","element":"img","alt":"𝐻 ∈ S𝑛++ ","inline":true,"padRight":true},{"text":"then it holds that ","element":"span"},{"style":{"height":19.42},"width":1414.21,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-21.png","element":"img","alt":" x = Π𝐻X(x − 𝐻−1∇ 𝑓 (x)). 
□","inline":true}],[{"text":"Another interesting property of the PVM algorithm is the fact that the direction ","element":"span"},{"style":{"height":17.03},"width":153.27,"height":42.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-22.png","element":"img","alt":" ˜x∗𝑘+1 − x𝑘","inline":true,"padRight":true},{"text":"in Line ","element":"span"},{"href":"#id-89","text":"3 ","element":"a"},{"text":"of ","element":"span"},{"text":"Algorithm ","element":"span"},{"href":"#id-89","text":"6 ","element":"a"},{"text":"is a descent direction regardless of how well ","element":"span"},{"style":{"height":15.14},"width":152.07,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-23.png","element":"img","alt":" 𝐻𝑘 ∈ S𝑛++ ","inline":true,"padRight":true},{"text":"approximates the Hessian ","element":"span"},{"style":{"height":16.99},"width":268.43,"height":42.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-24.png","element":"img","alt":" ∇2 𝑓 (x𝑘), this is","inline":true,"padRight":true},{"text":"formalized in Lemma ","element":"span"},{"href":"#id-95","text":"C.4","element":"a"},{"text":". Note that despite this, we cannot guarantee that ","element":"span"},{"style":{"height":17.24},"width":268.07,"height":43.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-25.png","element":"img","alt":" 𝑓 (˜x∗𝑘+1) ≤ 𝑓 (x𝑘)","inline":true},{"text":", which is why ","element":"span"},{"text":"to ensure primal progress at each iteration a line search or a bounded step size is often used in Line ","element":"span"},{"href":"#id-89","text":"3 ","element":"a"},{"text":"of Algorithm ","element":"span"},{"href":"#id-89","text":"6","element":"a"},{"text":".","element":"span"}],[{"id":"id-95","style":{"fontWeight":"bold"},"text":"Lemma C.4 ","element":"span"},{"text":"(Descent property of Projected Variable-Metric directions)","element":"span"},{"style":{"fontWeight":"bold"},"text":". 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"(","element":"span"},{"href":"#id-33","referenceIndex":8,"style":{"fontStyle":"italic"},"text":"Ben-Tal & Nemirovskii","element":"a"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"href":"#id-33","referenceIndex":8,"style":{"fontStyle":"italic"},"text":"2020","element":"a"},{"style":{"fontStyle":"italic"},"text":", Section 7.2.1) If ","element":"span"},{"style":{"height":15.32},"width":368.26,"height":38.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-26.png","element":"img","alt":" 𝐻𝑘 ∈ S𝑛++ and x𝑘 ≠ x∗","inline":true},{"style":{"fontStyle":"italic"},"text":", then the directions given by ","element":"span"},{"style":{"height":20.34},"width":689.83,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-27.png","element":"img","alt":" ˜x∗𝑘+1 − x𝑘, where ˜x∗𝑘+1 = argminx∈X ˆ𝑓𝑘(x)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"are descent directions at point ","element":"span"},{"style":{"height":9.59},"width":37.34,"height":23.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-28.png","element":"img","alt":" x𝑘","inline":true},{"style":{"fontStyle":"italic"},"text":", i.e., they satisfy ","element":"span"},{"style":{"height":19.67},"width":439.45,"height":49.18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-29.png","element":"img","alt":"⟨−∇ 𝑓 (x𝑘), ˜x∗𝑘+1 − x𝑘⟩ > 0.","inline":true}],[{"style":{"fontStyle":"italic"},"text":"Proof. 
","element":"span"},{"text":"Using the first-order optimality conditions shown in Remark ","element":"span"},{"href":"#id-92","text":"C.1 ","element":"a"},{"text":"for the scaled projection subproblem and particularizing for ","element":"span"},{"style":{"height":9.59},"width":114.76,"height":23.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-30.png","element":"img","alt":" z = x𝑘:","inline":true}],[{"style":{"width":"38%"},"width":717,"height":62,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-31.png","element":"img"}],[{"text":"Where the last strict inequality follows from the fact that we have assumed that ","element":"span"},{"style":{"height":13.78},"width":123.13,"height":34.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-32.png","element":"img","alt":" x𝑘 ≠ x∗","inline":true},{"text":", and consequently ","element":"span"},{"style":{"height":17.03},"width":157.44,"height":42.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-33.png","element":"img","alt":"˜x∗𝑘+1 ≠ x𝑘","inline":true,"padRight":true},{"text":"by application of Lemma ","element":"span"},{"href":"#id-96","text":"C.3","element":"a"},{"text":", and the assumption that ","element":"span"},{"style":{"height":24.89},"width":735.94,"height":62.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/22-34.png","element":"img","alt":" 𝐻𝑘 ∈ S𝑛++, thus ∥˜x∗𝑘+1 − x𝑘∥2𝐻𝑘 > 0. □","inline":true}],[{"style":{"fontWeight":"bold"},"text":"C.1 Global Convergence","element":"span"}],[{"text":"The global primal gap convergence of the PVM algorithm (Algorithm ","element":"span"},{"href":"#id-89","text":"6","element":"a"},{"text":") with bounded step sizes is a well-known result that we reproduce here for completeness, as we will compare this global convergence rate with that of other first-order optimization algorithms. 
In order to prove it, we review Lemma ","element":"span"},{"href":"#id-97","text":"C.5 ","element":"a"},{"text":"which will be used in the global convergence proof.","element":"span"}],[{"id":"id-97","style":{"fontWeight":"bold"},"text":"Lemma C.5. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"(","element":"span"},{"href":"#id-98","referenceIndex":38,"style":{"fontStyle":"italic"},"text":"Karimireddy et al.","element":"a"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"href":"#id-98","referenceIndex":38,"style":{"fontStyle":"italic"},"text":"2018b","element":"a"},{"style":{"fontStyle":"italic"},"text":", Lemma 9) Given a convex domain ","element":"span"},{"text":"X ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"height":15.14},"width":157.83,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/23-0.png","element":"img","alt":" 𝐻𝑘 ∈ S𝑛++","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"then for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"constants ","element":"span"},{"style":{"height":12.4},"width":576.62,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/23-1.png","element":"img","alt":" 𝛼 > 0 and 𝜈 > 0 such that 𝛼𝜈 ≥ 1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"we have that:","element":"span"}],[{"style":{"width":"88%"},"width":1665,"height":98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/23-2.png","element":"img"}],[{"text":"With the previous Lemma at hand, we can prove the global linear convergence in primal gap of the PVM algorithm with bounded step size when minimizing a ","element":"span"},{"style":{"height":10.8},"width":23,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/23-3.png","element":"img","alt":" 𝜇","inline":true},{"text":"-strongly convex and 
","element":"span"},{"style":{"height":10.4},"width":24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/23-4.png","element":"img","alt":" 𝐿","inline":true},{"text":"-smooth function over a convex set ","element":"span"},{"text":"X","element":"span"},{"text":".","element":"span"}],[{"id":"id-101","style":{"fontWeight":"bold"},"text":"Theorem C.6 ","element":"span"},{"text":"(Global convergence of Projected Variable-Metric algorithm with bounded step size.)","element":"span"},{"style":{"fontWeight":"bold"},"text":". ","element":"span"},{"style":{"fontStyle":"italic"},"text":"(","element":"span"},{"href":"#id-38","referenceIndex":37,"style":{"fontStyle":"italic"},"text":"Karimireddy et al.","element":"a"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"href":"#id-38","referenceIndex":37,"style":{"fontStyle":"italic"},"text":"2018a","element":"a"},{"style":{"fontStyle":"italic"},"text":", Theorem 4) Given an ","element":"span"},{"style":{"height":10.4},"width":24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/23-5.png","element":"img","alt":" 𝐿","inline":true},{"style":{"fontStyle":"italic"},"text":"-smooth and ","element":"span"},{"style":{"height":10.8},"width":23,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/23-6.png","element":"img","alt":" 𝜇","inline":true},{"style":{"fontStyle":"italic"},"text":"-strongly convex function and a convex set","element":"span"}],[{"text":"X ","element":"span"},{"style":{"fontStyle":"italic"},"text":"then the Projected Variable-Metric algorithm (Algorithm ","element":"span"},{"href":"#id-89","style":{"fontStyle":"italic"},"text":"6","element":"a"},{"style":{"fontStyle":"italic"},"text":") with a step size ","element":"span"},{"style":{"height":20.98},"width":149.24,"height":52.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/23-7.png","element":"img","alt":" 𝛾𝑘 ≤ 
𝜇𝐿𝜂𝑘","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"guarantees for all","element":"span"}],[{"style":{"width":"70%"},"width":1324,"height":177,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/23-8.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where the parameter ","element":"span"},{"style":{"height":10.4},"width":36.83,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/23-9.png","element":"img","alt":" 𝜂𝑘","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"measures how well ","element":"span"},{"style":{"height":12.39},"width":45.48,"height":30.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/23-10.png","element":"img","alt":" 𝐻𝑘","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"approximates ","element":"span"},{"style":{"height":16.99},"width":154.2,"height":42.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/23-11.png","element":"img","alt":" ∇2 𝑓 (x𝑘).","inline":true}],[{"style":{"fontStyle":"italic"},"text":"Proof. 
","element":"span"},{"text":"The iterate ","element":"span"},{"style":{"height":9.59},"width":70.73,"height":23.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/23-12.png","element":"img","alt":" x𝑘+1","inline":true,"padRight":true},{"text":"can be rewritten as:","element":"span"}],[{"style":{"width":"75%"},"width":1408,"height":96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/23-13.png","element":"img"}],[{"text":"Using ","element":"span"},{"style":{"height":10.4},"width":24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/23-14.png","element":"img","alt":" 𝐿","inline":true},{"text":"-smoothness and the ","element":"span"},{"style":{"height":10.8},"width":23,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/23-15.png","element":"img","alt":" 𝜇","inline":true},{"text":"-strong convexity of the function ","element":"span"},{"style":{"height":14.4},"width":23,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/23-16.png","element":"img","alt":" 𝑓","inline":true,"padRight":true},{"text":"we can write:","element":"span"}],[{"style":{"height":35.33},"width":1532.22,"height":88.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/23-17.png","element":"img","alt":"𝑓 (x𝑘+1) − 𝑓 (x𝑘) ≤ ⟨∇ 𝑓 (x𝑘), x𝑘+1 − x𝑘⟩ + 𝐿2𝜇 ∥x𝑘+1 − x𝑘∥2∇2 𝑓 (x𝑘 ) (C.12)","inline":true,"padRight":true},{"text":"≤ ⟨∇ ","element":"span"},{"style":{"height":35.32},"width":1168.75,"height":88.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/23-18.png","element":"img","alt":" 𝑓 (x𝑘), x𝑘+1 − x𝑘⟩ + 𝐿𝜂𝑘2𝜇 ∥x𝑘+1 − x𝑘∥2𝐻𝑘 (C.13)","inline":true,"padRight":true},{"text":"≤ ⟨∇ ","element":"span"},{"style":{"height":35.72},"width":1168.75,"height":89.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/23-19.png","element":"img","alt":" 𝑓 (x𝑘), x𝑘+1 − x𝑘⟩ + 12𝛾𝑘∥x𝑘+1 − x𝑘∥2𝐻𝑘 
(C.14)","inline":true}],[{"id":"id-99","style":{"width":"66%"},"width":1252,"height":98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/23-20.png","element":"img"}],[{"text":"Where the second inequality follows from Equation (","element":"span"},{"href":"#id-37","text":"2.6","element":"a"},{"text":") (which in turn is a consequence of ","element":"span"},{"style":{"height":15.14},"width":152.07,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/23-21.png","element":"img","alt":" 𝐻𝑘 ∈ S𝑛++","inline":true},{"text":") and ","element":"span"},{"text":"the third inequality follows from the fact that ","element":"span"},{"style":{"height":20.98},"width":149.24,"height":52.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/23-22.png","element":"img","alt":" 𝛾𝑘 ≤ 𝜇𝐿𝜂𝑘 ","inline":true,"padRight":true},{"text":". Applying Lemma ","element":"span"},{"href":"#id-97","text":"C.5 ","element":"a"},{"text":"to Equation (","element":"span"},{"href":"#id-99","text":"C.15","element":"a"},{"text":") and noting ","element":"span"},{"text":"that as ","element":"span"},{"style":{"height":15.14},"width":152.04,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/23-23.png","element":"img","alt":" 𝐻𝑘 ∈ S𝑛++ ","inline":true,"padRight":true},{"text":"we can apply Equation (","element":"span"},{"href":"#id-37","text":"2.6","element":"a"},{"text":") and transform the minimization problem involving ","element":"span"},{"style":{"height":17.12},"width":180.42,"height":42.81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/23-24.png","element":"img","alt":" ∥x − x𝑘∥𝐻𝑘","inline":true}],[{"text":"to one that involves ","element":"span"},{"style":{"height":17.72},"width":255.11,"height":44.29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-0.png","element":"img","alt":" ∥x − x𝑘∥∇2 𝑓 (x𝑘 )","inline":true},{"text":". 
Continuing with the chain of inequalities:","element":"span"}],[{"id":"id-100","style":{"width":"85%"},"width":1598,"height":658,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-1.png","element":"img"}],[{"text":"We obtain Equation (","element":"span"},{"href":"#id-100","text":"C.17","element":"a"},{"text":") by applying Lemma ","element":"span"},{"href":"#id-97","text":"C.5","element":"a"},{"text":", and Equation (","element":"span"},{"href":"#id-100","text":"C.18","element":"a"},{"text":") from applying Lemma ","element":"span"},{"href":"#id-35","text":"A.6 ","element":"a"},{"text":"to the norm term in Equation (","element":"span"},{"href":"#id-100","text":"C.17","element":"a"},{"text":"), which allows us to use that ","element":"span"},{"href":"#id-100","style":{"height":22.07},"width":886.47,"height":55.18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-2.png","element":"img","alt":" 1/𝜂𝑘 ∥x − x𝑘∥2𝐻𝑘 ≤ ∥x − x𝑘∥2∇2 𝑓 (x𝑘 ). Equation (C.19)","inline":true,"padRight":true},{"text":"follows from plugging in ","element":"span"},{"style":{"height":15.2},"width":384.88,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-3.png","element":"img","alt":" x = (1 − 𝛾𝑘)x𝑘 + 𝛾𝑘x∗","inline":true,"padRight":true},{"text":"into Equation (","element":"span"},{"href":"#id-100","text":"C.18","element":"a"},{"text":") (as of course ","element":"span"},{"style":{"height":12.4},"width":125.58,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-4.png","element":"img","alt":" x∗ ∈ X","inline":true},{"text":"). 
We obtain Equation (","element":"span"},{"href":"#id-100","text":"C.20","element":"a"},{"text":") by considering that ","element":"span"},{"style":{"height":14},"width":126,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-5.png","element":"img","alt":" 𝛾𝑘 ≤ 1","inline":true},{"text":", and Equation (","element":"span"},{"href":"#id-100","text":"C.21","element":"a"},{"text":") from the ","element":"span"},{"style":{"height":10.8},"width":23,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-6.png","element":"img","alt":" 𝜇","inline":true},{"text":"-strong convexity and ","element":"span"},{"style":{"height":10.4},"width":24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-7.png","element":"img","alt":" 𝐿","inline":true},{"text":"-smoothness of the function ","element":"span"},{"style":{"height":15.2},"width":68.98,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-8.png","element":"img","alt":" 𝑓 (x)","inline":true},{"text":". 
Reordering the previous expression leads to:","element":"span"}],[{"style":{"width":"70%"},"width":1326,"height":185,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-9.png","element":"img"}],[{"text":"As the exact line search strategy makes at least as much progress as choosing any ","element":"span"},{"style":{"height":20.98},"width":149.25,"height":52.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-10.png","element":"img","alt":" 𝛾𝑘 ≤ 𝜇𝐿𝜂𝑘 ","inline":true,"padRight":true},{"text":", the bound in ","element":"span"},{"text":"Theorem ","element":"span"},{"href":"#id-101","text":"C.6 ","element":"a"},{"text":"also holds for the Projected Variable-Metric algorithm (Algorithm ","element":"span"},{"href":"#id-89","text":"6","element":"a"},{"text":") with exact line search.","element":"span"}],[{"id":"id-102","style":{"fontWeight":"bold"},"text":"Corollary C.7 ","element":"span"},{"text":"(Global convergence of Projected Variable-Metric algorithm with exact line search or ","element":"span"},{"style":{"height":20.98},"width":161.48,"height":52.45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-11.png","element":"img","alt":"𝛾𝑘 = 𝜇𝐿𝜂𝑘","inline":true,"padRight":true},{"text":")","element":"span"},{"style":{"fontWeight":"bold"},"text":". 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Given an ","element":"span"},{"style":{"height":10.4},"width":24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-12.png","element":"img","alt":" 𝐿","inline":true},{"style":{"fontStyle":"italic"},"text":"-smooth and ","element":"span"},{"style":{"height":10.8},"width":23,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-13.png","element":"img","alt":" 𝜇","inline":true},{"style":{"fontStyle":"italic"},"text":"-strongly convex function and a convex set ","element":"span"},{"text":"X ","element":"span"},{"style":{"fontStyle":"italic"},"text":"then the Projected ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Variable-Metric algorithm (Algorithm ","element":"span"},{"href":"#id-89","style":{"fontStyle":"italic"},"text":"6","element":"a"},{"style":{"fontStyle":"italic"},"text":") with an exact line search or with a step size ","element":"span"},{"style":{"height":20.98},"width":145.1,"height":52.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-14.png","element":"img","alt":" 𝛾𝑘 = 𝜇𝐿𝜂𝑘 ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"guarantees for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"all ","element":"span"},{"style":{"height":12.4},"width":103.68,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-15.png","element":"img","alt":" 𝑘 ≥ 0:","inline":true}],[{"style":{"width":"42%"},"width":799,"height":121,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-16.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where the parameter ","element":"span"},{"style":{"height":10.4},"width":36.83,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-17.png","element":"img","alt":" 𝜂𝑘","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"measures how well 
","element":"span"},{"style":{"height":12.39},"width":45.48,"height":30.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-18.png","element":"img","alt":" 𝐻𝑘","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"approximates ","element":"span"},{"style":{"height":16.99},"width":154.2,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-19.png","element":"img","alt":" ∇2 𝑓 (x𝑘).","inline":true}],[{"text":"As was mentioned in Lemma ","element":"span"},{"href":"#id-95","text":"C.4 ","element":"a"},{"text":"the direction ","element":"span"},{"style":{"height":17.03},"width":153.19,"height":42.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-20.png","element":"img","alt":" ˜x∗𝑘+1 − x𝑘","inline":true,"padRight":true},{"text":"in Line ","element":"span"},{"href":"#id-89","text":"3 ","element":"a"},{"text":"of Algorithm ","element":"span"},{"href":"#id-89","text":"6 ","element":"a"},{"text":"is a descent direction","element":"span"},{"text":"regardless of how well ","element":"span"},{"style":{"height":15.14},"width":152.12,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-21.png","element":"img","alt":" 𝐻𝑘 ∈ S𝑛++ ","inline":true,"padRight":true},{"text":"approximates the Hessian ","element":"span"},{"style":{"height":16.99},"width":139.75,"height":42.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-22.png","element":"img","alt":" ∇2 𝑓 (x𝑘)","inline":true},{"text":". 
However, as we can see in Theorem ","element":"span"},{"href":"#id-101","text":"C.6 ","element":"a"},{"text":"and Corollary ","element":"span"},{"href":"#id-102","text":"C.7","element":"a"},{"text":", if we pick a matrix ","element":"span"},{"style":{"height":15.14},"width":155.17,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-23.png","element":"img","alt":" 𝐻𝑘 ∈ S𝑛++","inline":true,"padRight":true},{"text":"that approximates the Hessian ","element":"span"},{"style":{"height":16.99},"width":140.36,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-24.png","element":"img","alt":" ∇2 𝑓 (x𝑘)","inline":true,"padRight":true},{"text":"well, that is, we ","element":"span"},{"text":"have an ","element":"span"},{"style":{"height":10.4},"width":36.83,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-25.png","element":"img","alt":" 𝜂𝑘","inline":true,"padRight":true},{"text":"close to 1, we will be able to guarantee more primal progress per step when using an exact line search or bounded step sizes.","element":"span"}],[{"text":"One of the key consequences of Corollary ","element":"span"},{"href":"#id-102","text":"C.7 ","element":"a"},{"text":"is that even if we run the PVM algorithm with an exact line search and we use ","element":"span"},{"style":{"height":16.99},"width":234.49,"height":42.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-26.png","element":"img","alt":" 𝐻𝑘 = ∇2 𝑓 (x𝑘)","inline":true,"padRight":true},{"text":"(which is equivalent to ","element":"span"},{"style":{"height":17.39},"width":585.68,"height":43.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-27.png","element":"img","alt":" 𝜂𝑘 = 1), we need O(𝐿³/𝜇³ log 1/𝜀)","inline":true,"padRight":true},{"text":"iterations to reach an 
","element":"span"},{"style":{"height":7.6},"width":20,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-28.png","element":"img","alt":" 𝜀","inline":true},{"text":"-optimal solution to Problem (","element":"span"},{"href":"#id-0","text":"1.1","element":"a"},{"text":"). This stands in contrast to the PGD algorithm, which requires ","element":"span"},{"style":{"height":15.2},"width":252.78,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-29.png","element":"img","alt":"O(𝐿/𝜇 log 1/𝜀)","inline":true,"padRight":true},{"text":"iterations, or ","element":"span"},{"style":{"height":14.4},"width":693.81,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-30.png","element":"img","alt":" Nesterov’s Projected Gradient Descent","inline":true,"padRight":true},{"text":"(NPGD) algorithm, which requires","element":"span"}],[{"style":{"height":20},"width":234.59,"height":50,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-31.png","element":"img","alt":"O(√(𝐿/𝜇) log 1/𝜀)","inline":true,"padRight":true},{"text":"iterations to reach an ","element":"span"},{"style":{"height":7.6},"width":20,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-32.png","element":"img","alt":" 𝜀","inline":true},{"text":"-optimal solution. 
Note that with a small modification of the proof in Theorem ","element":"span"},{"href":"#id-101","text":"C.6 ","element":"a"},{"text":"we can recover the same rate for the PGD algorithm and the PVM algorithm with ","element":"span"},{"style":{"height":13.38},"width":142.44,"height":33.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/24-33.png","element":"img","alt":" 𝐻𝑘 = 𝐼𝑛.","inline":true,"padRight":true},{"text":"This is expected, as in this case the algorithms are equivalent, except for the bounded step size strategy.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Theorem C.8 ","element":"span"},{"text":"(Global convergence of Projected Variable-Metric algorithm with bounded step size and ","element":"span"},{"style":{"height":15.6},"width":373.9,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/25-0.png","element":"img","alt":"𝐻𝑘 = 𝐼𝑛). Given an 𝐿","inline":true},{"style":{"fontStyle":"italic"},"text":"-smooth and ","element":"span"},{"style":{"height":10.8},"width":23,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/25-1.png","element":"img","alt":" 𝜇","inline":true},{"style":{"fontStyle":"italic"},"text":"-strongly convex function and a convex set ","element":"span"},{"text":"X ","element":"span"},{"style":{"fontStyle":"italic"},"text":"then the Projected Variable-Metric algorithm (Algorithm ","element":"span"},{"href":"#id-89","style":{"fontStyle":"italic"},"text":"6","element":"a"},{"style":{"fontStyle":"italic"},"text":") with a step size ","element":"span"},{"style":{"height":19.38},"width":482.08,"height":48.45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/25-2.png","element":"img","alt":" 𝛾𝑘 ≤ min{1, 1/𝐿} and 𝐻𝑘 = 𝐼𝑛","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"guarantees for all 
","element":"span"},{"style":{"height":12.4},"width":103.68,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/25-3.png","element":"img","alt":" 𝑘 ≥ 0:","inline":true}],[{"style":{"width":"69%"},"width":1308,"height":74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/25-4.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"The proof mirrors that of Theorem ","element":"span"},{"href":"#id-101","text":"C.6","element":"a"},{"text":", and so we only give a brief outline. The iterate ","element":"span"},{"style":{"height":13.59},"width":200.19,"height":33.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/25-5.png","element":"img","alt":" x𝑘+1 can be","inline":true,"padRight":true},{"text":"rewritten as:","element":"span"}],[{"style":{"width":"75%"},"width":1405,"height":96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/25-6.png","element":"img"}],[{"text":"Using ","element":"span"},{"style":{"height":10.4},"width":24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/25-7.png","element":"img","alt":" 𝐿","inline":true},{"text":"-smoothness we can write:","element":"span"}],[{"style":{"height":31.73},"width":1530.1,"height":79.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/25-8.png","element":"img","alt":"𝑓 (x𝑘+1) − 𝑓 (x𝑘) ≤ ⟨∇ 𝑓 (x𝑘), x𝑘+1 − x𝑘⟩ + 𝐿2 ∥x𝑘+1 − x𝑘∥2 (C.24)","inline":true,"padRight":true},{"text":"≤ ⟨∇ ","element":"span"},{"style":{"height":35.72},"width":1166.63,"height":89.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/25-9.png","element":"img","alt":" 𝑓 (x𝑘), x𝑘+1 − x𝑘⟩ + 12𝛾𝑘∥x𝑘+1 − x𝑘∥2 (C.25)","inline":true}],[{"id":"id-103","style":{"width":"66%"},"width":1250,"height":374,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/25-10.png","element":"img"}],[{"text":"Where Equation 
(","element":"span"},{"href":"#id-103","text":"C.25","element":"a"},{"text":") follows from ","element":"span"},{"style":{"height":19.38},"width":269.56,"height":48.45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/25-11.png","element":"img","alt":" 𝛾𝑘 ≤ min{1, 1/𝐿}","inline":true,"padRight":true},{"text":"and Equation (","element":"span"},{"href":"#id-103","text":"C.26","element":"a"},{"text":") follows from Equation (","element":"span"},{"href":"#id-103","text":"C.23","element":"a"},{"text":"). ","element":"span"},{"text":"Applying Lemma ","element":"span"},{"href":"#id-97","text":"C.5 ","element":"a"},{"text":"to Equation (","element":"span"},{"href":"#id-103","text":"C.26","element":"a"},{"text":") leads to Equation (","element":"span"},{"href":"#id-103","text":"C.27","element":"a"},{"text":"). Equation (","element":"span"},{"href":"#id-103","text":"C.28","element":"a"},{"text":") follows from plugging in ","element":"span"},{"style":{"height":15.2},"width":359.22,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/25-12.png","element":"img","alt":" x = (1 − 𝛾𝑘)x𝑘 + 𝛾𝑘x∗ ","inline":true,"padRight":true},{"text":"into Equation (","element":"span"},{"href":"#id-103","text":"C.27","element":"a"},{"text":") (as of course ","element":"span"},{"style":{"height":12.4},"width":111.85,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/25-13.png","element":"img","alt":" x∗ ∈ X","inline":true},{"text":"). Lastly, in Equation (","element":"span"},{"href":"#id-103","text":"C.29","element":"a"},{"text":") we have used ","element":"span"},{"style":{"height":10.8},"width":23,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/25-14.png","element":"img","alt":"𝜇","inline":true},{"text":"-strong convexity and the fact that 
","element":"span"},{"style":{"height":19.38},"width":259.05,"height":48.45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/25-15.png","element":"img","alt":" 𝛾𝑘 ≤ min{1, 1/𝐿}","inline":true},{"text":". Reordering the terms of the previous inequality completes the ","element":"span"},{"text":"proof. ","element":"span"},{"style":{"height":0},"width":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/25-16.png","element":"img","alt":"□","inline":true}],[{"style":{"fontWeight":"bold"},"text":"C.2 Local Convergence","element":"span"}],[{"text":"Despite the lackluster convergence rate in primal gap shown in Theorem ","element":"span"},{"href":"#id-101","text":"C.6","element":"a"},{"text":", the PVM algorithm can achieve quadratic convergence in distance to the optimum when the iterates are close enough to the optimum and the Hessian approximations are accurate enough. We first review a series of results that will allow us to prove the local quadratic convergence of the PVM algorithm. One of the key properties that is often used in the convergence proof of the PGD algorithm is the non-expansiveness of the Euclidean projection operator onto a convex set ","element":"span"},{"text":"X","element":"span"},{"text":", denoted by ","element":"span"},{"style":{"height":20.05},"width":59.22,"height":50.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/25-17.png","element":"img","alt":" Π𝐼𝑛X ","inline":true,"padRight":true},{"text":". 
In the local convergence proof of the PVM algorithm we use a generalization ","element":"span"},{"text":"of the aforementioned fact, that is, the scaled projection operator onto a convex set ","element":"span"},{"text":"X","element":"span"},{"text":", denoted by ","element":"span"},{"style":{"height":21.31},"width":184.59,"height":53.27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/25-18.png","element":"img","alt":" Π𝐻𝑘X where","inline":true},{"style":{"height":15.14},"width":134.56,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/25-19.png","element":"img","alt":"𝐻 ∈ S𝑛++","inline":true},{"text":", is also non-expansive (see Lemma ","element":"span"},{"href":"#id-104","text":"C.9","element":"a"},{"text":").","element":"span"}],[{"id":"id-104","style":{"fontWeight":"bold"},"text":"Lemma C.9. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"(","element":"span"},{"href":"#id-105","referenceIndex":7,"style":{"fontStyle":"italic"},"text":"Beck","element":"a"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"href":"#id-105","referenceIndex":7,"style":{"fontStyle":"italic"},"text":"2017","element":"a"},{"style":{"fontStyle":"italic"},"text":")[Theorem 6.42] Given a ","element":"span"},{"style":{"height":15.14},"width":134.6,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/25-20.png","element":"img","alt":" 𝐻 ∈ S𝑛++","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and a convex set ","element":"span"},{"text":"X","element":"span"},{"style":{"fontStyle":"italic"},"text":", the scaled projection is a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"contraction mapping (it is firmly-nonexpansive) in the ","element":"span"},{"style":{"height":10.4},"width":150.45,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/25-21.png","element":"img","alt":" 
𝐻-norm:","inline":true}],[{"style":{"width":"99%"},"width":1871,"height":265,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/25-22.png","element":"img"}],[{"id":"id-106","style":{"fontWeight":"bold"},"text":"Lemma C.10. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"(","element":"span"},{"href":"#id-32","referenceIndex":52,"style":{"fontStyle":"italic"},"text":"Nesterov","element":"a"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"href":"#id-32","referenceIndex":52,"style":{"fontStyle":"italic"},"text":"2018","element":"a"},{"style":{"fontStyle":"italic"},"text":")[Lemma 4.1.1] If a twice differentiable function ","element":"span"},{"style":{"height":14.4},"width":137.98,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/25-23.png","element":"img","alt":" 𝑓 has 𝐿2","inline":true},{"style":{"fontStyle":"italic"},"text":"-Lipschitz continuous Hessian over ","element":"span"},{"text":"X ","element":"span"},{"style":{"fontStyle":"italic"},"text":"then for all ","element":"span"},{"style":{"height":14.8},"width":149,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/25-24.png","element":"img","alt":" x, y ∈ X:","inline":true}],[{"style":{"width":"45%"},"width":846,"height":81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/25-25.png","element":"img"}],[{"text":"With the results from Lemma ","element":"span"},{"href":"#id-104","text":"C.9 ","element":"a"},{"text":"and Lemma ","element":"span"},{"href":"#id-106","text":"C.10 ","element":"a"},{"text":"we can formalize the local convergence of the PVM algorithm.","element":"span"}],[{"id":"id-108","style":{"fontWeight":"bold"},"text":"Lemma C.11 ","element":"span"},{"text":"(Local convergence of Projected Variable-Metric 
algorithm)","element":"span"},{"style":{"height":11.2},"width":220.76,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/26-0.png","element":"img","alt":". Given an 𝐿","inline":true},{"style":{"fontStyle":"italic"},"text":"-smooth and ","element":"span"},{"style":{"height":14.4},"width":170.55,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/26-1.png","element":"img","alt":" 𝜇-strongly","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"convex function with ","element":"span"},{"style":{"height":12.39},"width":40.06,"height":30.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/26-2.png","element":"img","alt":" 𝐿2","inline":true},{"style":{"fontStyle":"italic"},"text":"-Lipschitz Hessian and a compact convex set ","element":"span"},{"text":"X","element":"span"},{"style":{"fontStyle":"italic"},"text":", if ","element":"span"},{"style":{"height":28.71},"width":336.12,"height":71.78,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/26-3.png","element":"img","alt":" ˜x∗𝑘+1 = argminx∈X ˆ𝑓𝑘(x)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"then for all ","element":"span"},{"style":{"height":12.4},"width":103.69,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/26-4.png","element":"img","alt":"𝑘 ≥ 0:","inline":true}],[{"style":{"width":"100%"},"width":1875,"height":1283,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/26-5.png","element":"img"}],[{"text":"The last inequality is a consequence of the ","element":"span"},{"style":{"height":10.8},"width":23,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/26-6.png","element":"img","alt":" 𝜇","inline":true},{"text":"-strong convexity of 
","element":"span"},{"style":{"height":14.4},"width":23,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/26-7.png","element":"img","alt":" 𝑓","inline":true,"padRight":true},{"text":"which ensures that ","element":"span"},{"style":{"height":16.99},"width":328.56,"height":42.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/26-8.png","element":"img","alt":" ∇2 𝑓 (x𝑘)−1 ⪯ 𝜇−1𝐼𝑛","inline":true},{"text":". Using the fact that the Hessian is ","element":"span"},{"style":{"height":12.39},"width":40.07,"height":30.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/26-9.png","element":"img","alt":" 𝐿2","inline":true},{"text":"-Lipschitz and applying Lemma ","element":"span"},{"href":"#id-106","text":"C.10 ","element":"a"},{"text":"and using the ","element":"span"},{"style":{"height":10.4},"width":24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/26-10.png","element":"img","alt":" 𝐿","inline":true},{"text":"-smoothness of ","element":"span"},{"style":{"height":14.4},"width":23,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/26-11.png","element":"img","alt":" 𝑓","inline":true,"padRight":true},{"text":"leads to:","element":"span"}],[{"id":"id-107","style":{"width":"76%"},"width":1428,"height":100,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/26-12.png","element":"img"}],[{"text":"Using Lemma ","element":"span"},{"href":"#id-35","text":"A.6 ","element":"a"},{"text":"along with the ","element":"span"},{"style":{"height":10.8},"width":23,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/26-13.png","element":"img","alt":" 𝜇","inline":true},{"text":"-strong convexity of ","element":"span"},{"style":{"height":14.4},"width":23,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/26-14.png","element":"img","alt":" 𝑓","inline":true,"padRight":true},{"text":"and 
reordering the expression shown in Equation (","element":"span"},{"href":"#id-107","text":"C.35","element":"a"},{"text":") completes the proof. ","element":"span"},{"style":{"height":0},"width":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/26-15.png","element":"img","alt":"□","inline":true}],[{"text":"As we can see, even if the scaled projection subproblems are solved to optimality we arrive at a convergence rate for","element":"span"},{"style":{"height":20.1},"width":186.77,"height":50.25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/26-16.png","element":"img","alt":"∥˜x∗𝑘+1 − x∗∥","inline":true,"padRight":true},{"text":"that is linear-quadratic in terms of ","element":"span"},{"style":{"height":14.8},"width":156.02,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/26-17.png","element":"img","alt":" ∥x𝑘 − x∗∥","inline":true},{"text":", and we do not obtain local quadratic ","element":"span"},{"text":"convergence without additional assumptions on how well ","element":"span"},{"style":{"height":12.39},"width":45.48,"height":30.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/26-18.png","element":"img","alt":" 𝐻𝑘","inline":true,"padRight":true},{"text":"approximates ","element":"span"},{"style":{"height":16.99},"width":140.35,"height":42.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/26-19.png","element":"img","alt":" ∇2 𝑓 (x𝑘)","inline":true},{"text":", due to ","element":"span"},{"style":{"height":14},"width":104.95,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/26-20.png","element":"img","alt":" 𝜂𝑘 − 1","inline":true,"padRight":true},{"text":"in the second term in Equation (","element":"span"},{"href":"#id-107","text":"C.35","element":"a"},{"text":"). 
This can be remedied with Assumption ","element":"span"},{"href":"#id-41","text":"2","element":"a"},{"text":":","element":"span"}],[{"id":"id-40","style":{"fontWeight":"bold"},"text":"Corollary C.12. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"If in addition to the conditions described in Lemma ","element":"span"},{"href":"#id-108","style":{"fontStyle":"italic"},"text":"C.11 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"we also assume that Assumption ","element":"span"},{"href":"#id-41","style":{"fontStyle":"italic"},"text":"2 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"is satisfied, we have:","element":"span"}],[{"style":{"width":"70%"},"width":1322,"height":121,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/26-21.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where ","element":"span"},{"style":{"height":12},"width":98.21,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/26-22.png","element":"img","alt":" 𝜔 ≥ 0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is described in Equation ","element":"span"},{"text":"(","element":"span"},{"href":"#id-37","text":"2.7","element":"a"},{"text":")","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"}],[{"text":"Even though","element":"span"},{"style":{"height":20.1},"width":186.76,"height":50.25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/27-0.png","element":"img","alt":"∥˜x∗𝑘+1 − x∗∥","inline":true,"padRight":true},{"text":"may converge quadratically, what we are interested in is the quadratic ","element":"span"},{"text":"convergence of ","element":"span"},{"style":{"height":14.8},"width":188.32,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/27-1.png","element":"img","alt":" ∥x𝑘+1 − x∗∥","inline":true},{"text":", formed as 
","element":"span"},{"style":{"height":17.24},"width":576.32,"height":43.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/27-2.png","element":"img","alt":" x𝑘+1 = x𝑘 + 𝛾𝑘(˜x∗𝑘+1 − x𝑘), that is:","inline":true}],[{"id":"id-109","style":{"width":"72%"},"width":1352,"height":179,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/27-3.png","element":"img"}],[{"text":"We can see from Equation (","element":"span"},{"href":"#id-109","text":"C.39","element":"a"},{"text":") that we will only have the desired convergence rate if ","element":"span"},{"style":{"height":15.22},"width":371.8,"height":38.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/27-4.png","element":"img","alt":" (1 − 𝛾𝑘) ≤ 𝛽 ∥x𝑘 − x∗∥","inline":true,"padRight":true},{"text":"for some ","element":"span"},{"style":{"height":14},"width":90.96,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/27-5.png","element":"img","alt":" 𝛽 ≥ 0","inline":true},{"text":", that is, we either need to set ","element":"span"},{"style":{"height":13.6},"width":106.06,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/27-6.png","element":"img","alt":" 𝛾𝑘 = 1","inline":true},{"text":", or select a step size strategy that makes ","element":"span"},{"style":{"height":10.4},"width":37.35,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/27-7.png","element":"img","alt":" 𝛾𝑘","inline":true,"padRight":true},{"text":"converge to ","element":"span"},{"text":"1 ","element":"span"},{"text":"fast enough.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Appendix D. 
Second-order Conditional Gradient Sliding","element":"span"}],[{"text":"In Section ","element":"span"},{"href":"#id-54","text":"D.1 ","element":"a"},{"text":"we prove that the Inexact PVM steps (Lines ","element":"span"},{"href":"#id-44","text":"9","element":"a"},{"text":"-","element":"span"},{"href":"#id-44","text":"12 ","element":"a"},{"text":"in Algorithm ","element":"span"},{"href":"#id-44","text":"2","element":"a"},{"text":") that the SOCGS algorithm computes contract the distance to the optimum and the primal gap quadratically when close enough to ","element":"span"},{"style":{"height":11.39},"width":35.18,"height":28.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/28-0.png","element":"img","alt":" x∗","inline":true},{"text":", by carefully choosing the ","element":"span"},{"style":{"height":9.59},"width":36.67,"height":23.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/28-1.png","element":"img","alt":" 𝜀𝑘","inline":true},{"text":"-parameter at each iteration. 
First, we review the SOCGS from the main body of the text (shown in Algorithm ","element":"span"},{"href":"#id-110","text":"7","element":"a"},{"text":"), and then we review a key result in Lemma ","element":"span"},{"href":"#id-111","text":"D.1 ","element":"a"},{"text":"that measures the accuracy of the Hessian matrix approximation ","element":"span"},{"style":{"height":10.4},"width":32,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/28-2.png","element":"img","alt":" 𝐻","inline":true,"padRight":true},{"text":"as we approach ","element":"span"},{"style":{"height":11.39},"width":35.03,"height":28.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/28-3.png","element":"img","alt":" x∗","inline":true},{"text":", which will be used in the convergence proofs.","element":"span"}],[{"id":"id-110","style":{"width":"99%"},"width":1872,"height":1233,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/28-4.png","element":"img"}],[{"text":"The algorithm couples an independent ACG step with line search (Line ","element":"span"},{"href":"#id-110","text":"4","element":"a"},{"text":") with an Inexact PVM step with unit step size (Lines ","element":"span"},{"href":"#id-110","text":"9","element":"a"},{"text":"-","element":"span"},{"href":"#id-110","text":"12","element":"a"},{"text":"). At the end of each iteration we choose the step that provides the greatest primal progress (Lines ","element":"span"},{"href":"#id-110","text":"14","element":"a"},{"text":"-","element":"span"},{"href":"#id-110","text":"18","element":"a"},{"text":"). 
The ACG steps in Line ","element":"span"},{"href":"#id-110","text":"4 ","element":"a"},{"text":"will ensure global linear convergence in primal gap, and the Inexact PVM steps in Lines ","element":"span"},{"href":"#id-110","text":"14","element":"a"},{"text":"-","element":"span"},{"href":"#id-110","text":"18 ","element":"a"},{"text":"will provide quadratic convergence.","element":"span"}],[{"text":"Note that the ACG iterates in Line ","element":"span"},{"href":"#id-110","text":"4 ","element":"a"},{"text":"do not depend on the Inexact PVM steps in Lines ","element":"span"},{"href":"#id-110","text":"9","element":"a"},{"text":"-","element":"span"},{"href":"#id-110","text":"12","element":"a"},{"text":". This is because the ACG steps do not contract the primal gap on a per-iteration basis, and if the active sets of the ACG steps in Line ","element":"span"},{"href":"#id-110","text":"4 ","element":"a"},{"text":"were to be modified using the active set of the PVM steps in Lines ","element":"span"},{"href":"#id-110","text":"9","element":"a"},{"text":"-","element":"span"},{"href":"#id-110","text":"12","element":"a"},{"text":", this would break the proof of linear convergence in Theorem ","element":"span"},{"href":"#id-112","text":"B.2 ","element":"a"},{"text":"for the ACG algorithm. The proof in Theorem ","element":"span"},{"href":"#id-112","text":"B.2 ","element":"a"},{"text":"crucially relies on the fact that at each iteration of the ACG algorithm we can pick up or drop at most one vertex from the active set, whereas a PVM step may have dropped or picked up multiple vertices from the active set. 
The line search in the ACG step (Line ","element":"span"},{"href":"#id-110","text":"4","element":"a"},{"text":") can be substituted with a step size strategy that requires knowledge of the ","element":"span"},{"style":{"height":10.4},"width":24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/28-5.png","element":"img","alt":" 𝐿","inline":true},{"text":"-smoothness parameter of ","element":"span"},{"style":{"height":16.4},"width":100.53,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/28-6.png","element":"img","alt":" 𝑓 (x) (","inline":true},{"href":"#id-23","referenceIndex":56,"text":"Pedregosa et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-23","referenceIndex":56,"text":"2020","element":"a"},{"text":").","element":"span"}],[{"text":"We compute the scaled projection in the Inexact PVM step (Lines ","element":"span"},{"href":"#id-110","text":"14","element":"a"},{"text":"-","element":"span"},{"href":"#id-110","text":"18","element":"a"},{"text":") using the ACG algorithm with exact line search, as the objective function is quadratic, thereby making the SOCGS algorithm (Algorithm ","element":"span"},{"href":"#id-110","text":"7","element":"a"},{"text":") projection-free. As the function being minimized in the Inexact PVM steps is quadratic there is a closed-form expression for the optimal step size in Line ","element":"span"},{"href":"#id-110","text":"10","element":"a"},{"text":". 
The scaled projection problem is solved to an accuracy ","element":"span"},{"style":{"height":9.59},"width":36.68,"height":23.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/28-7.png","element":"img","alt":" 𝜀𝑘","inline":true,"padRight":true},{"text":"such that ","element":"span"},{"style":{"height":18.3},"width":498.01,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/28-8.png","element":"img","alt":"ˆ𝑓𝑘(˜x𝑘+1) − minx∈X ˆ𝑓𝑘 (x) ≤ 𝜀𝑘","inline":true},{"text":", using the Frank-Wolfe gap as a stopping criterion, as in the CGS algorithm (","element":"span"},{"href":"#id-11","referenceIndex":41,"text":"Lan & Zhou","element":"a"},{"text":", ","element":"span"},{"href":"#id-11","referenceIndex":41,"text":"2016","element":"a"},{"text":"). The accuracy parameter ","element":"span"},{"style":{"height":9.59},"width":36.67,"height":23.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/28-9.png","element":"img","alt":" 𝜀𝑘","inline":true,"padRight":true},{"text":"in the SOCGS algorithm depends on a lower bound on the primal gap of Problem ","element":"span"},{"href":"#id-0","text":"1.1 ","element":"a"},{"text":"which we denote by ","element":"span"},{"style":{"height":15.2},"width":112.68,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/28-10.png","element":"img","alt":" 𝑙𝑏 (x𝑘)","inline":true,"padRight":true},{"text":"that satisfies ","element":"span"},{"style":{"height":15.2},"width":423.5,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/28-11.png","element":"img","alt":" 𝑙𝑏 (x𝑘) ≤ 𝑓 (x𝑘) − 𝑓 (x∗).","inline":true}],[{"id":"id-111","style":{"height":14.8},"width":429.92,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/29-0.png","element":"img","alt":"Lemma D.1. 
Given a 𝜇","inline":true},{"style":{"fontStyle":"italic"},"text":"-strongly convex and ","element":"span"},{"style":{"height":10.4},"width":24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/29-1.png","element":"img","alt":" 𝐿","inline":true},{"style":{"fontStyle":"italic"},"text":"-smooth function ","element":"span"},{"style":{"height":15.2},"width":68.56,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/29-2.png","element":"img","alt":" 𝑓 (x)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and a convex set ","element":"span"},{"text":"X","element":"span"},{"style":{"fontStyle":"italic"},"text":", then for any ","element":"span"},{"style":{"height":12.4},"width":95.71,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/29-3.png","element":"img","alt":" x ∈ X","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and any matrix ","element":"span"},{"style":{"height":15.14},"width":134.56,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/29-4.png","element":"img","alt":" 𝐻 ∈ S𝑛++ ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"that satisfies Assumption ","element":"span"},{"href":"#id-41","style":{"fontStyle":"italic"},"text":"2 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"at ","element":"span"},{"style":{"fontWeight":"bold"},"text":"x ","element":"span"},{"style":{"fontStyle":"italic"},"text":"we have that:","element":"span"}],[{"style":{"width":"66%"},"width":1255,"height":82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/29-5.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Similarly, we also have 
that:","element":"span"}],[{"id":"id-113","style":{"width":"64%"},"width":1208,"height":51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/29-6.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"We can bound the term on the left-hand side of Equation (","element":"span"},{"href":"#id-113","text":"D.1","element":"a"},{"text":") as:","element":"span"}],[{"id":"id-114","style":{"width":"75%"},"width":1405,"height":244,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/29-7.png","element":"img"}],[{"text":"We obtain Equation (","element":"span"},{"href":"#id-114","text":"D.4","element":"a"},{"text":") from the fact that the spectral norm of a matrix is submultiplicative, and both matrices are square. The inequality shown in Equation (","element":"span"},{"href":"#id-114","text":"D.6","element":"a"},{"text":") follows from ","element":"span"},{"style":{"height":15.14},"width":146.93,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/29-8.png","element":"img","alt":" 𝐻 ∈ S𝑛++","inline":true,"padRight":true},{"text":"and Corollary ","element":"span"},{"href":"#id-80","text":"A.8","element":"a"},{"text":". ","element":"span"},{"text":"Proceeding similarly, we can also bound the previous quantity as:","element":"span"}],[{"id":"id-115","style":{"width":"71%"},"width":1339,"height":115,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/29-9.png","element":"img"}],[{"text":"Where the inequality in Equation (","element":"span"},{"href":"#id-115","text":"D.8","element":"a"},{"text":") follows from the fact that ","element":"span"},{"style":{"height":13.6},"width":90.72,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/29-10.png","element":"img","alt":" 𝜂 ≥ 1","inline":true},{"text":". 
Putting together these bounds, we have that:","element":"span"}],[{"style":{"width":"67%"},"width":1268,"height":81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/29-11.png","element":"img"}],[{"text":"Each of the terms in the maximization operator in the previous equation can be written as:","element":"span"}],[{"id":"id-116","style":{"width":"72%"},"width":1353,"height":209,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/29-12.png","element":"img"}],[{"text":"Where the equality in Equation (","element":"span"},{"href":"#id-116","text":"D.10","element":"a"},{"text":") follows from the fact that the maximum singular value of a square matrix is equal to the maximum absolute value of the eigenvalues of the matrix. This allows us to write:","element":"span"}],[{"id":"id-117","style":{"width":"109%"},"width":2043,"height":791,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/29-13.png","element":"img"}],[{"style":{"width":"84%"},"width":1588,"height":119,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/30-0.png","element":"img"}],[{"text":"This means that for all ","element":"span"},{"style":{"height":12},"width":151.67,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/30-1.png","element":"img","alt":" 1 ≤ 𝑖 ≤ 𝑛","inline":true,"padRight":true},{"text":"we have that:","element":"span"}],[{"style":{"width":"103%"},"width":1943,"height":97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/30-2.png","element":"img"}],[{"text":"Which allows us to write Equation (","element":"span"},{"href":"#id-117","text":"D.12","element":"a"},{"text":") as:","element":"span"}],[{"style":{"width":"79%"},"width":1498,"height":209,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/30-3.png","element":"img"}],[{"text":"Which immediately leads 
to:","element":"span"}],[{"id":"id-118","style":{"width":"79%"},"width":1497,"height":284,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/30-4.png","element":"img"}],[{"text":"Where Equation (","element":"span"},{"href":"#id-118","text":"D.15","element":"a"},{"text":") follows from the definition of ","element":"span"},{"style":{"height":10.4},"width":20,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/30-5.png","element":"img","alt":" 𝜂","inline":true,"padRight":true},{"text":"and Equation (","element":"span"},{"href":"#id-118","text":"D.16","element":"a"},{"text":") follows from Assumption ","element":"span"},{"href":"#id-41","text":"2","element":"a"},{"text":". Putting this all together allows us to write:","element":"span"}],[{"id":"id-119","style":{"width":"33%"},"width":636,"height":81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/30-6.png","element":"img"}],[{"text":"The claim shown in Equation (","element":"span"},{"href":"#id-113","text":"D.2","element":"a"},{"text":") follows from very similar reasoning, with the only difference that:","element":"span"}],[{"style":{"width":"81%"},"width":1519,"height":50,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/30-7.png","element":"img"}],[{"text":"The maximization term on the right-hand side of Equation (","element":"span"},{"href":"#id-119","text":"D.17","element":"a"},{"text":") can be bounded exactly as in the first claim. 
","element":"span"},{"style":{"height":0},"width":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/30-8.png","element":"img","alt":"□","inline":true}],[{"id":"id-54","style":{"fontWeight":"bold"},"text":"D.1 Inexact Projected Variable-Metric steps","element":"span"}],[{"text":"We first begin by showing that if the PVM steps are computed inexactly using the error criterion shown in the SOCGS algorithm (Line ","element":"span"},{"href":"#id-44","text":"7 ","element":"a"},{"text":"of Algorithm ","element":"span"},{"href":"#id-44","text":"2","element":"a"},{"text":") they still achieve local quadratic convergence in distance to the optimum.","element":"span"}],[{"id":"id-123","style":{"fontWeight":"bold"},"text":"Lemma D.2. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Given a ","element":"span"},{"style":{"height":10.8},"width":23,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/30-9.png","element":"img","alt":" 𝜇","inline":true},{"style":{"fontStyle":"italic"},"text":"-strongly convex function ","element":"span"},{"style":{"height":15.2},"width":69.4,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/30-10.png","element":"img","alt":" 𝑓 (x)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and a compact convex set ","element":"span"},{"text":"X","element":"span"},{"style":{"fontStyle":"italic"},"text":", if ","element":"span"},{"style":{"height":13.41},"width":70.73,"height":33.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/30-11.png","element":"img","alt":" ˜x𝑘+1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"denotes an ","element":"span"},{"style":{"height":9.59},"width":36.67,"height":23.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/30-12.png","element":"img","alt":"𝜀𝑘","inline":true},{"style":{"fontStyle":"italic"},"text":"-optimal solution to 
","element":"span"},{"style":{"height":20.34},"width":405.9,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/30-13.png","element":"img","alt":" ˜x∗𝑘+1 = argminx∈X ˆ𝑓𝑘 (x)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"where ","element":"span"},{"style":{"height":16.99},"width":428.75,"height":42.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/30-14.png","element":"img","alt":" 𝜀𝑘 = (𝑙𝑏(x𝑘)/∥∇ 𝑓 (x𝑘)∥)4","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"height":15.2},"width":106.46,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/30-15.png","element":"img","alt":" 𝑙𝑏(x𝑘)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"denotes a lower ","element":"span"},{"style":{"fontStyle":"italic"},"text":"bound on the primal gap such that ","element":"span"},{"style":{"height":15.2},"width":494.15,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/30-16.png","element":"img","alt":" 𝑙𝑏(x𝑘) ≤ 𝑓 (x𝑘) − 𝑓 (x∗) then:","inline":true}],[{"style":{"width":"30%"},"width":574,"height":121,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/30-17.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where the parameter ","element":"span"},{"style":{"height":19.02},"width":988.62,"height":47.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/30-18.png","element":"img","alt":" 𝜂𝑘 = max{𝜆max(𝐻−1𝑘 ∇2 𝑓 (x𝑘)), 𝜆max([∇2 𝑓 (x𝑘)]−1𝐻𝑘)} ≥ 1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"measures how well ","element":"span"},{"style":{"height":13.2},"width":189.02,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/30-19.png","element":"img","alt":" 𝐻𝑘 approx-","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"imates 
","element":"span"},{"style":{"height":16.99},"width":154.2,"height":42.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/30-20.png","element":"img","alt":" ∇2 𝑓 (x𝑘).","inline":true}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"By the strong convexity of ","element":"span"},{"style":{"height":18.69},"width":261.51,"height":46.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/30-21.png","element":"img","alt":"ˆ𝑓𝑘 (as 𝐻𝑘 ∈ S𝑛++","inline":true},{"text":") we have that:","element":"span"}],[{"id":"id-120","style":{"width":"75%"},"width":1416,"height":344,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/30-22.png","element":"img"}],[{"text":"The inequality in Equation (","element":"span"},{"href":"#id-120","text":"D.20","element":"a"},{"text":") follows from Corollary ","element":"span"},{"href":"#id-80","text":"A.8 ","element":"a"},{"text":"and the one in Equation (","element":"span"},{"href":"#id-120","text":"D.21","element":"a"},{"text":") from the first-order optimality conditions for the scaled projection problem, of which ","element":"span"},{"style":{"height":17.03},"width":70.73,"height":42.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/31-0.png","element":"img","alt":" ˜x∗𝑘+1 ","inline":true,"padRight":true},{"text":"is the exact solution. Rearranging ","element":"span"},{"text":"the previous expression allows us to conclude that","element":"span"},{"style":{"height":20.56},"width":445.06,"height":51.41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/31-1.png","element":"img","alt":"∥˜x𝑘+1 − ˜x∗𝑘+1∥ ≤ √(2𝜂𝑘𝜀𝑘/𝜇)","inline":true},{"text":". 
If we plug in the value of ","element":"span"},{"style":{"height":13.19},"width":85.66,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/31-2.png","element":"img","alt":" 𝜀𝑘 in","inline":true,"padRight":true},{"text":"the previous bound:","element":"span"}],[{"id":"id-121","style":{"width":"69%"},"width":1306,"height":673,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/31-3.png","element":"img"}],[{"text":"Where the inequality in Equation (","element":"span"},{"href":"#id-121","text":"D.24","element":"a"},{"text":") follows from the fact that ","element":"span"},{"style":{"height":15.2},"width":112.68,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/31-4.png","element":"img","alt":" 𝑙𝑏 (x𝑘)","inline":true,"padRight":true},{"text":"is a lower bound on the primal gap, the one in Equation (","element":"span"},{"href":"#id-121","text":"D.25","element":"a"},{"text":") follows from the convexity of ","element":"span"},{"style":{"height":15.2},"width":69.4,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/31-5.png","element":"img","alt":" 𝑓 (x)","inline":true,"padRight":true},{"text":"and the last inequality, in Equation (","element":"span"},{"href":"#id-121","text":"D.26","element":"a"},{"text":"), follows from the Cauchy-Schwarz inequality. 
","element":"span"},{"style":{"height":0},"width":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/31-6.png","element":"img","alt":"□","inline":true}],[{"text":"Using the previous bound along with Corollary ","element":"span"},{"href":"#id-40","text":"C.12 ","element":"a"},{"text":"we can show that the iterates will converge quadratically in distance to the optimum (Lemma ","element":"span"},{"href":"#id-122","text":"D.3","element":"a"},{"text":"), despite not solving the problems to optimality.","element":"span"}],[{"id":"id-122","style":{"fontWeight":"bold"},"text":"Lemma D.3 ","element":"span"},{"text":"(Quadratic convergence in distance to the optimum of the Inexact Projected Variable-Metric (PVM) steps)","element":"span"},{"style":{"height":14.4},"width":196.36,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/31-7.png","element":"img","alt":". Given a 𝜇","inline":true},{"style":{"fontStyle":"italic"},"text":"-strongly convex and ","element":"span"},{"style":{"height":10.4},"width":24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/31-8.png","element":"img","alt":" 𝐿","inline":true},{"style":{"fontStyle":"italic"},"text":"-smooth function ","element":"span"},{"style":{"height":15.2},"width":205.72,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/31-9.png","element":"img","alt":" 𝑓 (x) with 𝐿2","inline":true},{"style":{"fontStyle":"italic"},"text":"-Lipschitz Hessian and a compact convex set ","element":"span"},{"style":{"height":14.4},"width":401.02,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/31-10.png","element":"img","alt":" X, let ˜𝑥𝑘+1 denote an 𝜀𝑘","inline":true},{"style":{"fontStyle":"italic"},"text":"-optimal solution to ","element":"span"},{"style":{"height":20.34},"width":948.21,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/31-11.png","element":"img","alt":" 
˜x∗𝑘+1 = argminx∈X ˆ𝑓𝑘 (x) where 𝜀𝑘 = (𝑙𝑏(x𝑘)/∥∇ 𝑓 (x𝑘)∥)4","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"height":15.2},"width":106.47,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/31-12.png","element":"img","alt":" 𝑙𝑏(x𝑘)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"denotes a lower bound on the primal gap such that ","element":"span"},{"style":{"height":15.2},"width":395.42,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/31-13.png","element":"img","alt":" 𝑙𝑏(x𝑘) ≤ 𝑓 (x𝑘) − 𝑓 (x∗)","inline":true},{"style":{"fontStyle":"italic"},"text":", if Assumption ","element":"span"},{"href":"#id-41","style":{"fontStyle":"italic"},"text":"2 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"is satisfied then:","element":"span"}],[{"id":"id-124","style":{"width":"75%"},"width":1420,"height":94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/31-14.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where the parameter ","element":"span"},{"style":{"height":19.03},"width":988.62,"height":47.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/31-15.png","element":"img","alt":" 𝜂𝑘 = max{𝜆max(𝐻−1𝑘 ∇2 𝑓 (x𝑘)), 𝜆max([∇2 𝑓 (x𝑘)]−1𝐻𝑘)} ≥ 1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"measures how well ","element":"span"},{"style":{"height":13.2},"width":189.02,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/31-16.png","element":"img","alt":" 𝐻𝑘 approx-","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"imates ","element":"span"},{"style":{"height":16.99},"width":261.84,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/31-17.png","element":"img","alt":" ∇2 𝑓 (x𝑘) and 𝜔","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is defined in Assumption 
","element":"span"},{"href":"#id-41","style":{"fontStyle":"italic"},"text":"2","element":"a"},{"style":{"fontStyle":"italic"},"text":".","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"Using the triangle inequality yields:","element":"span"}],[{"style":{"width":"51%"},"width":964,"height":303,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/31-18.png","element":"img"}],[{"text":"Where the second inequality follows from using the bounds shown in Corollary ","element":"span"},{"href":"#id-40","text":"C.12 ","element":"a"},{"text":"and Lemma ","element":"span"},{"href":"#id-123","text":"D.2","element":"a"},{"text":". ","element":"span"},{"style":{"height":0},"width":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/31-19.png","element":"img","alt":"□","inline":true}],[{"text":"The SOCGS algorithm chooses at each iteration between the ACG step and the Inexact PVM step according to which one provides more progress in primal gap (Lines ","element":"span"},{"href":"#id-44","text":"14","element":"a"},{"text":"-","element":"span"},{"href":"#id-44","text":"18 ","element":"a"},{"text":"of Algorithm ","element":"span"},{"href":"#id-44","text":"2","element":"a"},{"text":"). Therefore we need to translate the local rate in distance to the optimum of the PVM algorithm in Lemma ","element":"span"},{"href":"#id-122","text":"D.3 ","element":"a"},{"text":"to one in primal gap. 
It is immediate to see that we can upper bound the right-hand side of Equation (","element":"span"},{"href":"#id-124","text":"D.27","element":"a"},{"text":") using ","element":"span"},{"style":{"height":10.8},"width":23,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/31-20.png","element":"img","alt":"𝜇","inline":true},{"text":"-strong convexity, as:","element":"span"}],[{"style":{"width":"29%"},"width":545,"height":90,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/31-21.png","element":"img"}],[{"text":"However, when we try to lower bound the norm that appears on the left-hand side of Equation (","element":"span"},{"href":"#id-124","text":"D.27","element":"a"},{"text":") using ","element":"span"},{"style":{"height":10.4},"width":24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-0.png","element":"img","alt":"𝐿","inline":true},{"text":"-smoothness we arrive at:","element":"span"}],[{"id":"id-125","style":{"width":"77%"},"width":1461,"height":97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-1.png","element":"img"}],[{"text":"The only term preventing us from expressing the left-hand side of Equation (","element":"span"},{"href":"#id-125","text":"D.28","element":"a"},{"text":") solely in terms of primal gap values is ","element":"span"},{"style":{"height":15.22},"width":353.61,"height":38.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-2.png","element":"img","alt":" − ⟨∇ 𝑓 (x∗), ˜x𝑘+1 − x∗⟩","inline":true},{"text":". 
As by Assumption ","element":"span"},{"href":"#id-28","text":"1 ","element":"a"},{"text":"for any ","element":"span"},{"style":{"height":15.2},"width":173.16,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-3.png","element":"img","alt":" x ∈ F (x∗)","inline":true,"padRight":true},{"text":"we have that ","element":"span"},{"style":{"height":15.2},"width":359.05,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-4.png","element":"img","alt":" ⟨∇ 𝑓 (x∗) , x − x∗⟩ = 0,","inline":true,"padRight":true},{"text":"if we can show that from some point onward the iterates ","element":"span"},{"style":{"height":15.2},"width":368.56,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-5.png","element":"img","alt":" ˜x𝑘+1 remain in F (x∗)","inline":true},{"text":", we will be able to conclude that ","element":"span"},{"style":{"height":15.22},"width":397.75,"height":38.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-6.png","element":"img","alt":"⟨∇ 𝑓 (x∗), ˜x𝑘+1 − x∗⟩ = 0.","inline":true}],[{"text":"The main tool that we will use for the analysis is based on the idea that for points ","element":"span"},{"style":{"height":9.19},"width":36.96,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-7.png","element":"img","alt":" x𝑘","inline":true,"padRight":true},{"text":"sufficiently close to ","element":"span"},{"style":{"height":11.39},"width":35.31,"height":28.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-8.png","element":"img","alt":"x∗","inline":true},{"text":", when we minimize ","element":"span"},{"style":{"height":18.3},"width":81.93,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-9.png","element":"img","alt":"ˆ𝑓𝑘(x)","inline":true,"padRight":true},{"text":"over ","element":"span"},{"text":"X ","element":"span"},{"text":"using the ACG algorithm, the 
iterates ","element":"span"},{"style":{"height":13.41},"width":70.73,"height":33.53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-10.png","element":"img","alt":" ˜x𝑘+1","inline":true,"padRight":true},{"text":"of the algorithm will reach ","element":"span"},{"style":{"height":15.2},"width":100.24,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-11.png","element":"img","alt":"F (x∗)","inline":true,"padRight":true},{"text":"in a finite number of iterations, remaining in ","element":"span"},{"style":{"height":15.2},"width":100.24,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-12.png","element":"img","alt":" F (x∗)","inline":true,"padRight":true},{"text":"for all subsequent iterations, that is, the ACG algorithm \"identifies\" the optimal face while computing the Inexact PVM steps. This is a variation of the proof originally presented in ","element":"span"},{"href":"#id-29","referenceIndex":30,"text":"Guélat & Marcotte ","element":"a"},{"text":"(","element":"span"},{"href":"#id-29","referenceIndex":30,"text":"1986","element":"a"},{"text":"), which was used to show for the first time that the ACG algorithm asymptotically converges linearly in primal gap when minimizing a strongly convex and smooth function over a polytope. We reproduce the original proof here, as it will be useful in the technical results to come.","element":"span"}],[{"id":"id-59","style":{"fontWeight":"bold"},"text":"Theorem D.4 ","element":"span"},{"text":"(Identification of the optimal face)","element":"span"},{"style":{"fontWeight":"bold"},"text":". 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"(","element":"span"},{"href":"#id-29","referenceIndex":30,"style":{"fontStyle":"italic"},"text":"Guélat & Marcotte","element":"a"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"href":"#id-29","referenceIndex":30,"style":{"fontStyle":"italic"},"text":"1986","element":"a"},{"style":{"fontStyle":"italic"},"text":")[Theorem 5] Given a strongly convex and smooth function ","element":"span"},{"style":{"height":15.2},"width":69.41,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-13.png","element":"img","alt":" 𝑓 (x)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and a polytope ","element":"span"},{"text":"X","element":"span"},{"style":{"fontStyle":"italic"},"text":", if Assumption ","element":"span"},{"href":"#id-28","style":{"fontStyle":"italic"},"text":"1 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"is satisfied, then there is a ","element":"span"},{"style":{"height":14.19},"width":163.37,"height":35.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-14.png","element":"img","alt":" 𝑟ACG > 0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"such that for ","element":"span"},{"style":{"height":19.43},"width":422.71,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-15.png","element":"img","alt":" xACG𝑘 ∈ B(x∗, 𝑟ACG) ∩ X","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"height":19.43},"width":239.34,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-16.png","element":"img","alt":" xACG𝑘 ∉ F (x∗)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"then the ACG algorithm (Algorithm ","element":"span"},{"href":"#id-25","style":{"fontStyle":"italic"},"text":"4","element":"a"},{"style":{"fontStyle":"italic"},"text":") with exact line 
search satisfies that ","element":"span"},{"style":{"height":19.42},"width":774.13,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-17.png","element":"img","alt":" |SACG𝑘+1 | < |SACG𝑘 | and SACG𝑘 \\ SACG𝑘+1 ∉ F (x∗)","inline":true},{"style":{"fontStyle":"italic"},"text":". That is, the ACG algorithm performs an away-step that drops a vertex from ","element":"span"},{"style":{"height":19.43},"width":101.62,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-18.png","element":"img","alt":" SACG𝑘","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"that is not a vertex of the optimal face ","element":"span"},{"style":{"height":15.2},"width":99.72,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-19.png","element":"img","alt":" F (x∗)","inline":true},{"style":{"fontStyle":"italic"},"text":". Moreover, there is a ","element":"span"},{"style":{"height":14.99},"width":173.39,"height":37.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-20.png","element":"img","alt":" 𝐾ACG ≥ 0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"such that for ","element":"span"},{"style":{"height":14.99},"width":173.68,"height":37.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-21.png","element":"img","alt":" 𝑘 ≥ 𝐾ACG ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"we have that ","element":"span"},{"style":{"height":19.43},"width":255.1,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-22.png","element":"img","alt":" xACG𝑘 ∈ F (x∗).","inline":true}],[{"style":{"fontStyle":"italic"},"text":"Proof. 
","element":"span"},{"text":"The proof starts by showing that there is an index ","element":"span"},{"style":{"height":11.6},"width":95.85,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-23.png","element":"img","alt":" 𝑇 ≥ 0","inline":true,"padRight":true},{"text":"such that for ","element":"span"},{"style":{"height":12.4},"width":95.36,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-24.png","element":"img","alt":" 𝑘 ≥ 𝑇","inline":true,"padRight":true},{"text":"all the steps taken by the ACG algorithm will be away-steps that reduce the cardinality of the active set if ","element":"span"},{"style":{"height":19.43},"width":237.91,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-25.png","element":"img","alt":" xACG𝑘 ∉ F (x∗)","inline":true},{"text":". Let ","element":"span"},{"style":{"height":13.19},"width":98.83,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-26.png","element":"img","alt":" 𝑟𝑖 > 0","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":11.2},"width":88.06,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-27.png","element":"img","alt":" 𝑐 > 0","inline":true,"padRight":true},{"text":"be such that:","element":"span"}],[{"id":"id-126","style":{"width":"83%"},"width":1560,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-28.png","element":"img"}],[{"text":"Taking ","element":"span"},{"style":{"height":18.8},"width":388.02,"height":46.99,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-29.png","element":"img","alt":" 𝑟ACG = minv𝑖 ∈vert(X) 𝑟𝑖","inline":true},{"text":", we know by strong convexity that there is an index ","element":"span"},{"style":{"height":11.6},"width":95.86,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-30.png","element":"img","alt":" 𝑇 
≥ 0","inline":true,"padRight":true},{"text":"such that for ","element":"span"},{"style":{"height":12.4},"width":95.36,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-31.png","element":"img","alt":" 𝑘 ≥ 𝑇","inline":true,"padRight":true},{"text":"we have that ","element":"span"},{"style":{"height":19.43},"width":418.96,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-32.png","element":"img","alt":" xACG𝑘 ∈ B(x∗, 𝑟ACG) ∩ X","inline":true},{"text":". Furthermore, suppose that ","element":"span"},{"style":{"height":19.43},"width":237.39,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-33.png","element":"img","alt":" xACG𝑘 ∉ F (x∗)","inline":true},{"text":", then we have that:","element":"span"}],[{"style":{"width":"64%"},"width":1213,"height":264,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-34.png","element":"img"}],[{"text":"Where the left-hand side follows from Equation (","element":"span"},{"href":"#id-126","text":"D.30","element":"a"},{"text":") and the right-hand side from Equation (","element":"span"},{"href":"#id-126","text":"D.29","element":"a"},{"text":"). 
As ","element":"span"},{"style":{"height":19.43},"width":935.24,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-35.png","element":"img","alt":"xACG𝑘 ∉ F (x∗), then SACG𝑘 ∩ vert(X) \\ vert(F (x∗)) ≠ ∅","inline":true},{"text":", as the active set ","element":"span"},{"style":{"height":19.43},"width":99.28,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-36.png","element":"img","alt":" SACG𝑘","inline":true,"padRight":true},{"text":"must include vertices that are not in the optimal face ","element":"span"},{"style":{"height":15.2},"width":99.39,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-37.png","element":"img","alt":" F (x∗)","inline":true,"padRight":true},{"text":"(otherwise we would have ","element":"span"},{"style":{"height":19.43},"width":238.14,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-38.png","element":"img","alt":" xACG𝑘 ∈ F (x∗)","inline":true},{"text":"). 
This means that the ACG algorithm in Line ","element":"span"},{"href":"#id-127","text":"3 ","element":"a"},{"text":"of Algorithm ","element":"span"},{"href":"#id-85","text":"5 ","element":"a"},{"text":"will choose an away-step with a vertex ","element":"span"},{"style":{"height":19.42},"width":586.43,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-39.png","element":"img","alt":" v𝑖 ∈ SACG𝑘 ∩ vert(X) \\ vert(F (x∗))","inline":true},{"text":", and not a Frank-Wolfe step with a vertex ","element":"span"},{"style":{"height":16.39},"width":285.98,"height":40.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-40.png","element":"img","alt":" v 𝑗 ∈ vert(F (x∗))","inline":true},{"text":", for iterations ","element":"span"},{"style":{"height":12.4},"width":95.36,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-41.png","element":"img","alt":" 𝑘 ≥ 𝑇","inline":true},{"text":". We denote the vertex chosen in the away-step by ","element":"span"},{"style":{"height":19.43},"width":581.66,"height":48.57,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-42.png","element":"img","alt":" v ∈ SACG𝑘 ∩ vert(X) \\ vert(F (x∗))","inline":true},{"text":", and we remark that ","element":"span"},{"style":{"height":19.43},"width":225.52,"height":48.57,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-43.png","element":"img","alt":" d = xACG𝑘 − v","inline":true,"padRight":true},{"text":"is a descent direction at ","element":"span"},{"style":{"height":19.43},"width":91,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-44.png","element":"img","alt":"xACG𝑘","inline":true,"padRight":true},{"text":", and so the exact line search will output a step size ","element":"span"},{"style":{"height":15.2},"width":234.25,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-45.png","element":"img","alt":" 
𝛾𝑘 ∈ (0, 𝛾max]","inline":true},{"text":". The proof proceeds by showing that we must have that ","element":"span"},{"href":"#id-128","style":{"height":14.4},"width":335.94,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-46.png","element":"img","alt":" 𝛾𝑘 = 𝛾max in Line 8","inline":true,"padRight":true},{"text":"of Algorithm ","element":"span"},{"href":"#id-85","text":"5 ","element":"a"},{"text":"for iterations ","element":"span"},{"style":{"height":12.4},"width":95.38,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/32-47.png","element":"img","alt":" 𝑘 ≥ 𝑇","inline":true},{"text":". Using proof by contradiction, we","element":"span"}],[{"text":"assume that ","element":"span"},{"style":{"height":11.6},"width":170.22,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-0.png","element":"img","alt":" 𝛾𝑘 < 𝛾max","inline":true,"padRight":true},{"text":"and we apply the first-order optimality conditions for the exact line search:","element":"span"}],[{"style":{"height":20.15},"width":1418.94,"height":50.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-1.png","element":"img","alt":"0 =⟨d, ∇ 𝑓 (xACG𝑘+1 )⟩ (D.31)","inline":true},{"style":{"height":20.15},"width":1387.95,"height":50.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-2.png","element":"img","alt":"=⟨xACG𝑘+1 − v, ∇ 𝑓 (xACG𝑘+1 )⟩+⟨xACG𝑘 − xACG𝑘+1 , ∇ 𝑓 (xACG𝑘+1 )⟩ (D.32)","inline":true},{"style":{"height":20.15},"width":1387.95,"height":50.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-3.png","element":"img","alt":"=⟨xACG𝑘+1 − v, ∇ 𝑓 (xACG𝑘+1 )⟩− 𝛾𝑘⟨d, ∇ 𝑓 (xACG𝑘+1 )⟩ (D.33)","inline":true},{"id":"id-129","style":{"height":20.15},"width":1387.95,"height":50.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-4.png","element":"img","alt":"=⟨xACG𝑘+1 − v, ∇ 𝑓 (xACG𝑘+1 )⟩ 
(D.34)","inline":true},{"id":"id-130","style":{"height":16.4},"width":1387.11,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-5.png","element":"img","alt":"< −𝑐. (D.35)","inline":true}],[{"text":"Which is the desired contradiction as ","element":"span"},{"style":{"height":11.2},"width":89.79,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-6.png","element":"img","alt":" 𝑐 > 0","inline":true},{"text":". The equality in Equation (","element":"span"},{"href":"#id-129","text":"D.34","element":"a"},{"text":") is due to","element":"span"},{"style":{"height":19.67},"width":319.34,"height":49.18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-7.png","element":"img","alt":"⟨d, ∇ 𝑓 (xACG𝑘+1 )⟩= 0","inline":true,"padRight":true},{"text":"because of the optimality conditions of the exact line search and the inequality in Equation (","element":"span"},{"href":"#id-130","text":"D.35","element":"a"},{"text":") is due to","element":"span"},{"style":{"height":19.67},"width":481.14,"height":49.18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-8.png","element":"img","alt":"⟨xACG𝑘+1 − v, ∇ 𝑓 (xACG𝑘+1 )⟩≤ −𝑐","inline":true,"padRight":true},{"text":"as ","element":"span"},{"style":{"height":15.2},"width":437.1,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-9.png","element":"img","alt":" v ∈ vert(X) \\ vert(F (x∗))","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":19.43},"width":343.54,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-10.png","element":"img","alt":" xACG𝑘+1 ∈ B(x∗, 𝑟ACG)","inline":true,"padRight":true},{"text":"(thus Equation (","element":"span"},{"href":"#id-126","text":"D.30","element":"a"},{"text":") holds). 
This proves that we must have ","element":"span"},{"style":{"height":19.43},"width":1192.93,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-11.png","element":"img","alt":" 𝛾𝑘 = 𝛾max and |SACG𝑘 | > |SACG𝑘+1 |. While 𝑘 ≥ 𝑇 and xACG𝑘 ∉ F (x∗) the","inline":true,"padRight":true},{"text":"ACG algorithm will drop a vertex ","element":"span"},{"style":{"height":19.43},"width":489.69,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-12.png","element":"img","alt":" SACG𝑘 ∩vert(X)\\vert(F (x∗))","inline":true,"padRight":true},{"text":"using an away-step. As ","element":"span"},{"style":{"height":19.43},"width":120.1,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-13.png","element":"img","alt":" |SACG𝑘 |","inline":true,"padRight":true},{"text":"is finite, we will have for some ","element":"span"},{"style":{"height":14.19},"width":175.49,"height":35.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-14.png","element":"img","alt":" 𝐾ACG > 𝑇","inline":true,"padRight":true},{"text":"that ","element":"span"},{"style":{"height":20.42},"width":596.38,"height":51.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-15.png","element":"img","alt":" SACG𝐾ACG ∩ vert(X) \\ vert(F (x∗)) = ∅","inline":true},{"text":", and therefore ","element":"span"},{"style":{"height":20.42},"width":365.51,"height":51.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-16.png","element":"img","alt":" SACG𝐾ACG ⊆ vert(F (x∗))","inline":true},{"text":". This ","element":"span"},{"text":"is equivalent to ","element":"span"},{"style":{"height":20.42},"width":253.92,"height":51.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-17.png","element":"img","alt":" xACG𝐾ACG ∈ F (x∗)","inline":true},{"text":". 
Lastly, using Equation (","element":"span"},{"href":"#id-126","text":"D.29","element":"a"},{"text":") and ","element":"span"},{"style":{"height":19.18},"width":713.67,"height":47.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-18.png","element":"img","alt":" SACG𝐾 ∩ vert(X) \\ vert(F (x∗)) = ∅ we can","inline":true,"padRight":true},{"text":"show that the ACG algorithm will not perform any Frank-Wolfe steps with vertices ","element":"span"},{"style":{"height":15.2},"width":433.16,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-19.png","element":"img","alt":" v ∈ vert(X) \\ vert(F (x∗))","inline":true,"padRight":true},{"text":"for ","element":"span"},{"style":{"height":17.39},"width":1824.04,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-20.png","element":"img","alt":" 𝑘 ≥ 𝐾ACG, and so x𝑘 ∈ F (x∗). □","inline":true}],[{"text":"The consequence of Theorem ","element":"span"},{"href":"#id-59","text":"D.4 ","element":"a"},{"text":"is that after a finite number of iterations ","element":"span"},{"style":{"height":14.99},"width":171.53,"height":37.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-21.png","element":"img","alt":" 𝐾ACG ≥ 0","inline":true,"padRight":true},{"text":"the iterates of the ACG algorithm applied to Problem (","element":"span"},{"href":"#id-0","text":"1.1","element":"a"},{"text":") are \"stuck\" in the face ","element":"span"},{"style":{"height":15.2},"width":99.78,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-22.png","element":"img","alt":" F (x∗)","inline":true},{"text":", that is, we have that ","element":"span"},{"style":{"height":19.43},"width":238.9,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-23.png","element":"img","alt":" xACG𝑘 ∈ F (x∗)","inline":true,"padRight":true},{"text":"for all 
","element":"span"},{"style":{"height":14.99},"width":171.35,"height":37.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-24.png","element":"img","alt":" 𝑘 ≥ 𝐾ACG","inline":true},{"text":". The SOCGS algorithm (Algorithm ","element":"span"},{"href":"#id-44","text":"2","element":"a"},{"text":") uses the ACG algorithm to inexactly solve the scaled projection problem of the PVM steps in Lines ","element":"span"},{"href":"#id-44","text":"14","element":"a"},{"text":"-","element":"span"},{"href":"#id-44","text":"18 ","element":"a"},{"text":"of Algorithm ","element":"span"},{"href":"#id-44","text":"2","element":"a"},{"text":". The function being minimized in these steps is not ","element":"span"},{"style":{"height":15.2},"width":69.4,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-25.png","element":"img","alt":" 𝑓 (x)","inline":true},{"text":", but rather an approximation ","element":"span"},{"style":{"height":18.3},"width":82.08,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-26.png","element":"img","alt":"ˆ𝑓𝑘(x)","inline":true,"padRight":true},{"text":"that changes at each iteration. 
However for points sufficiently close to ","element":"span"},{"style":{"height":11.39},"width":34.69,"height":28.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-27.png","element":"img","alt":" x∗ ","inline":true,"padRight":true},{"text":"we show in Theorem ","element":"span"},{"href":"#id-131","text":"D.5 ","element":"a"},{"text":"that the ACG steps that solve the scaled projection problem of the PVM steps (in Lines ","element":"span"},{"href":"#id-44","text":"9","element":"a"},{"text":"-","element":"span"},{"href":"#id-44","text":"12 ","element":"a"},{"text":"of Algorithm ","element":"span"},{"href":"#id-44","text":"2","element":"a"},{"text":") will also get \"stuck\" to ","element":"span"},{"style":{"height":15.2},"width":99.4,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-28.png","element":"img","alt":" F (x∗)","inline":true},{"text":", that is, there is a ","element":"span"},{"style":{"height":12.4},"width":187.46,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-29.png","element":"img","alt":" 𝐾 ≥ 0 such","inline":true,"padRight":true},{"text":"that we will have that ","element":"span"},{"style":{"height":15.2},"width":463.98,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-30.png","element":"img","alt":" ˜x𝑘+1 ∈ F (x∗) for all 𝑘 ≥ 𝐾.","inline":true}],[{"id":"id-131","style":{"height":15.2},"width":436.2,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-31.png","element":"img","alt":"Theorem D.5. 
Let 𝑓 (x)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"be a strongly convex and smooth function with Lipschitz continuous Hessian and ","element":"span"},{"text":"X ","element":"span"},{"style":{"fontStyle":"italic"},"text":"be a polytope such that Assumption ","element":"span"},{"href":"#id-28","style":{"fontStyle":"italic"},"text":"1 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"is satisfied. We denote the quadratic approximation of ","element":"span"},{"style":{"height":15.2},"width":69.33,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-32.png","element":"img","alt":" 𝑓 (x)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"at ","element":"span"},{"style":{"height":9.59},"width":37.7,"height":23.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-33.png","element":"img","alt":" x𝑘","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"as ","element":"span"},{"style":{"height":20.87},"width":699.15,"height":52.18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-34.png","element":"img","alt":"ˆ𝑓𝑘(x) = ⟨∇ 𝑓 (x𝑘), x𝑘 − x⟩ + 1/2 ∥x𝑘 − x∥2𝐻𝑘","inline":true},{"style":{"fontStyle":"italic"},"text":", where ","element":"span"},{"style":{"height":12.39},"width":45.48,"height":30.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-35.png","element":"img","alt":" 𝐻𝑘","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"satisfies Assumption ","element":"span"},{"href":"#id-41","style":{"fontStyle":"italic"},"text":"2","element":"a"},{"style":{"fontStyle":"italic"},"text":". 
Assume that we use the ACG ","element":"span"},{"style":{"fontStyle":"italic"},"text":"algorithm (Algorithm ","element":"span"},{"href":"#id-25","style":{"fontStyle":"italic"},"text":"4","element":"a"},{"style":{"fontStyle":"italic"},"text":") with exact line search to minimize ","element":"span"},{"style":{"height":18.3},"width":81.98,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-36.png","element":"img","alt":"ˆ𝑓𝑘(x)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"over ","element":"span"},{"text":"X","element":"span"},{"style":{"fontStyle":"italic"},"text":", and denote the iterate generated by this algorithm at iteration ","element":"span"},{"style":{"height":8.8},"width":12,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-37.png","element":"img","alt":" 𝑡","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"as ","element":"span"},{"style":{"height":18.42},"width":70.74,"height":46.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-38.png","element":"img","alt":" ˜x𝑡𝑘+1","inline":true},{"style":{"fontStyle":"italic"},"text":", then there is a ","element":"span"},{"style":{"height":11.2},"width":87.75,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-39.png","element":"img","alt":" 𝑟 > 0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"such that if ","element":"span"},{"style":{"height":19.22},"width":514.47,"height":48.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-40.png","element":"img","alt":" {x𝑘, ˜x𝑡𝑘+1, ˜x𝑡+1𝑘+1} ⊂ B(x∗, 𝑟) ∩ X","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"height":21.02},"width":993.24,"height":52.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-41.png","element":"img","alt":"˜x𝑡𝑘+1 ∉ F (x∗) then | ˜S𝑡+1𝑘+1| < | ˜S𝑡𝑘+1| and ˜S𝑡𝑘+1 
\\ ˜S𝑡+1𝑘+1 ∉ F (x∗)","inline":true},{"style":{"fontStyle":"italic"},"text":". That is, at iteration ","element":"span"},{"style":{"height":8.8},"width":12,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-42.png","element":"img","alt":" 𝑡","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"the ACG algorithm drops a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"vertex from the active set ","element":"span"},{"style":{"height":21.02},"width":75.77,"height":52.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-43.png","element":"img","alt":"˜S𝑡𝑘+1 ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"that is not a vertex of the optimal face ","element":"span"},{"style":{"height":15.2},"width":114.08,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-44.png","element":"img","alt":" F (x∗).","inline":true}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"This proof relies on the same concepts as the proof of Theorem ","element":"span"},{"href":"#id-59","text":"D.4 ","element":"a"},{"text":"from ","element":"span"},{"href":"#id-29","referenceIndex":30,"text":"Guélat & Marcotte ","element":"a"},{"text":"(","element":"span"},{"href":"#id-29","referenceIndex":30,"text":"1986","element":"a"},{"text":"). 
Let ","element":"span"},{"style":{"height":16.54},"width":298.26,"height":41.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-45.png","element":"img","alt":" 𝑟∗𝑖 > 0 and 𝑐∗ > 0","inline":true,"padRight":true},{"text":"be such that:","element":"span"}],[{"style":{"height":28.53},"width":1785.66,"height":71.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-46.png","element":"img","alt":"�v𝑖 − x, ∇ 𝑓 (x∗) + ∇2 𝑓 (x∗)(x − x∗)�≥ −𝑐2 if ∥x − x∗∥ ≤ 𝑟∗𝑖 and v𝑖 ∈ vert(F (x∗)) (D.36)","inline":true},{"style":{"height":19.75},"width":1785.66,"height":49.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-47.png","element":"img","alt":"�v𝑖 − x, ∇ 𝑓 (x∗) + ∇2 𝑓 (x∗)(x − x∗)�≥ 𝑐 if ∥x − x∗∥ ≤ 𝑟∗𝑖 and v𝑖 ∈ vert(X) \\ vert(F (x∗)). (D.37)","inline":true}],[{"text":"Where ","element":"span"},{"style":{"height":16.99},"width":433.56,"height":42.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-48.png","element":"img","alt":" ∇ 𝑓 (x∗) + ∇2 𝑓 (x∗)(x − x∗)","inline":true,"padRight":true},{"text":"is the gradient of the quadratic approximation at ","element":"span"},{"style":{"height":11.39},"width":35.46,"height":28.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-49.png","element":"img","alt":" x∗","inline":true,"padRight":true},{"text":"using ","element":"span"},{"style":{"height":16.99},"width":136.77,"height":42.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-50.png","element":"img","alt":" ∇2 𝑓 (x∗)","inline":true,"padRight":true},{"text":"(note that the minimizer of this quadratic approximation is ","element":"span"},{"style":{"height":11.39},"width":34.76,"height":28.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-51.png","element":"img","alt":" x∗ ","inline":true,"padRight":true},{"text":"and that this approximation is strongly convex and smooth). 
We have that:","element":"span"}],[{"id":"id-132","style":{"width":"88%"},"width":1651,"height":243,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/33-52.png","element":"img"}],[{"text":"The term shown in Equation (","element":"span"},{"href":"#id-132","text":"D.39","element":"a"},{"text":") can be bounded using the triangle inequality and the fact that the Hessian of ","element":"span"},{"style":{"height":15.2},"width":166.49,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/34-0.png","element":"img","alt":" 𝑓 (x) is 𝐿2","inline":true},{"text":"-Lipschitz:","element":"span"}],[{"style":{"width":"86%"},"width":1620,"height":147,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/34-1.png","element":"img"}],[{"text":"The term shown in Equation (","element":"span"},{"href":"#id-132","text":"D.40","element":"a"},{"text":"), can be bounded using the triangle inequality and Lemma ","element":"span"},{"href":"#id-111","text":"D.1","element":"a"},{"text":", leading to:","element":"span"}],[{"style":{"width":"85%"},"width":1600,"height":114,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/34-2.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":16.59},"width":225.84,"height":41.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/34-3.png","element":"img","alt":" 1 + 𝜔𝐷2 ≥ 𝜂𝑘","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"height":12.4},"width":90.76,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/34-4.png","element":"img","alt":" 𝑘 ≥ 0","inline":true,"padRight":true},{"text":"from Assumption ","element":"span"},{"href":"#id-41","text":"2","element":"a"},{"text":". 
Lastly, the term in Equation (","element":"span"},{"href":"#id-132","text":"D.41","element":"a"},{"text":") can be bounded using the triangle inequality and the ","element":"span"},{"style":{"height":12.39},"width":40.07,"height":30.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/34-5.png","element":"img","alt":" 𝐿2","inline":true},{"text":"-Lipschitz continuity of the Hessian, which allows us to write:","element":"span"}],[{"style":{"width":"82%"},"width":1546,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/34-6.png","element":"img"}],[{"text":"Using these bounds we have:","element":"span"}],[{"id":"id-133","style":{"width":"86%"},"width":1618,"height":424,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/34-7.png","element":"img"}],[{"text":"Where we note that ","element":"span"},{"style":{"height":20.88},"width":701.5,"height":52.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/34-8.png","element":"img","alt":"ˆ𝑓𝑘(x) = ⟨∇ 𝑓 (x𝑘), x − x𝑘⟩ + 1/2 ∥x − x𝑘∥2𝐻𝑘","inline":true},{"text":". 
Using the bound in Equation (","element":"span"},{"href":"#id-133","text":"D.50","element":"a"},{"text":") along ","element":"span"},{"text":"with Equations (","element":"span"},{"href":"#id-132","text":"D.36","element":"a"},{"text":")-(","element":"span"},{"href":"#id-132","text":"D.37","element":"a"},{"text":"), and setting ","element":"span"},{"style":{"height":18.44},"width":724.89,"height":46.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/34-9.png","element":"img","alt":" 𝐶 = �3𝐿2/2 + 𝐿𝜔𝐷(1 + 𝜔𝐷2)� 𝐷2 we have:","inline":true}],[{"id":"id-135","style":{"width":"92%"},"width":1742,"height":163,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/34-10.png","element":"img"}],[{"text":"Let ","element":"span"},{"style":{"height":16.54},"width":344.36,"height":41.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/34-11.png","element":"img","alt":" 𝑟∗ = minv𝑖 ∈vert(X) 𝑟∗𝑖","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":15.22},"width":347.58,"height":38.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/34-12.png","element":"img","alt":" 𝑟 = min {𝑟∗, 𝑐/(4𝐶)}","inline":true,"padRight":true},{"text":"and assume that ","element":"span"},{"style":{"height":15.2},"width":310.04,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/34-13.png","element":"img","alt":" x𝑘 ∈ B (x∗, 𝑟) ∩ X","inline":true,"padRight":true},{"text":"(we know by strong ","element":"span"},{"text":"convexity that there is an index ","element":"span"},{"style":{"height":11.6},"width":95.85,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/34-14.png","element":"img","alt":" 𝑇 ≥ 0","inline":true,"padRight":true},{"text":"such that for ","element":"span"},{"style":{"height":12.4},"width":95.36,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/34-15.png","element":"img","alt":" 𝑘 ≥ 
𝑇","inline":true,"padRight":true},{"text":"the iterates ","element":"span"},{"style":{"height":9.19},"width":36.92,"height":22.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/34-16.png","element":"img","alt":" x𝑘","inline":true,"padRight":true},{"text":"of the SOCGS algorithm (Algorithm ","element":"span"},{"href":"#id-44","text":"2","element":"a"},{"text":") will be in the aforementioned ball). If ","element":"span"},{"style":{"height":18.42},"width":333.1,"height":46.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/34-17.png","element":"img","alt":" ˜x𝑡𝑘+1 ∈ B (x∗, 𝑟) ∩ X","inline":true,"padRight":true},{"text":"then the bounds in Equations (","element":"span"},{"href":"#id-134","text":"D.51","element":"a"},{"text":")-(","element":"span"},{"href":"#id-135","text":"D.52","element":"a"},{"text":") hold, ","element":"span"},{"text":"as","element":"span"},{"style":{"height":20.26},"width":268.87,"height":50.65,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/34-18.png","element":"img","alt":"��˜x𝑡𝑘+1 − x∗�� ≤ 𝑟∗","inline":true},{"text":", this leads to:","element":"span"}],[{"id":"id-134","style":{"width":"80%"},"width":1506,"height":291,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/34-19.png","element":"img"}],[{"text":"Where the inequality in Equation (","element":"span"},{"href":"#id-134","text":"D.53","element":"a"},{"text":") follows from Equation (","element":"span"},{"href":"#id-135","text":"D.52","element":"a"},{"text":"), the inequality in Equation (","element":"span"},{"href":"#id-134","text":"D.54","element":"a"},{"text":") from the fact that ","element":"span"},{"style":{"height":15.22},"width":420.83,"height":38.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/34-20.png","element":"img","alt":" ∥x𝑘 − x∗∥ < 𝑟 ≤ 𝑐/(4𝐶)","inline":true,"padRight":true},{"text":"and the last inequality from Equation 
(","element":"span"},{"href":"#id-134","text":"D.51","element":"a"},{"text":"). Therefore if ","element":"span"},{"style":{"height":18.42},"width":217.02,"height":46.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/34-21.png","element":"img","alt":"˜x𝑡𝑘+1 ∉ F (x∗)","inline":true,"padRight":true},{"text":"the ACG algorithm will take an away-step with a vertex ","element":"span"},{"style":{"height":21.01},"width":558.1,"height":52.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/34-22.png","element":"img","alt":" v ∈ ˜S𝑡𝑘+1 ∩ vert(X) \\ vert(F (x∗))","inline":true,"padRight":true},{"text":"and ","element":"span"},{"text":"direction ","element":"span"},{"style":{"height":21.02},"width":1190.33,"height":52.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/34-23.png","element":"img","alt":" d = ˜x𝑡𝑘+1 − v (where ˜S𝑡𝑘+1 ∩ vert(X) \\ vert(F (x∗)) ≠ ∅ as ˜x𝑡𝑘+1 ∉ F (x∗)","inline":true},{"text":"). Similarly as in the proof of ","element":"span"},{"text":"Theorem ","element":"span"},{"href":"#id-59","text":"D.4","element":"a"},{"text":", we show that ","element":"span"},{"style":{"height":10.4},"width":169.26,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-0.png","element":"img","alt":" 𝛾𝑘 = 𝛾max","inline":true,"padRight":true},{"text":"if ","element":"span"},{"style":{"height":19.22},"width":336.63,"height":48.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-1.png","element":"img","alt":" ˜x𝑡+1𝑘+1 ∈ B (x∗, 𝑟) ∩ X","inline":true},{"text":". We use proof by contradiction, and assume ","element":"span"},{"text":"that ","element":"span"},{"style":{"height":11.6},"width":170.22,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-2.png","element":"img","alt":" 𝛾𝑘 < 𝛾max","inline":true},{"text":". 
Using the optimality of the line search:","element":"span"}],[{"id":"id-136","style":{"width":"73%"},"width":1376,"height":556,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-3.png","element":"img"}],[{"text":"The inequality in Equation (","element":"span"},{"href":"#id-136","text":"D.60","element":"a"},{"text":") follows from Equation (","element":"span"},{"href":"#id-135","text":"D.52","element":"a"},{"text":"), as","element":"span"},{"style":{"height":20.26},"width":349.89,"height":50.66,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-4.png","element":"img","alt":"��˜x𝑡+1𝑘+1 − x∗�� < 𝑟 ≤ 𝑟∗","inline":true},{"text":", and the one in ","element":"span"},{"text":"Equation (","element":"span"},{"href":"#id-136","text":"D.61","element":"a"},{"text":") follows from ","element":"span"},{"style":{"height":15.22},"width":390.28,"height":38.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-5.png","element":"img","alt":" ∥x𝑘 − x∗∥ < 𝑟 ≤ 𝑐/(4𝐶)","inline":true},{"text":". This is the desired contradiction, and we must therefore have that ","element":"span"},{"style":{"height":10.4},"width":166.75,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-6.png","element":"img","alt":" 𝛾𝑘 = 𝛾max","inline":true},{"text":". 
This means that ","element":"span"},{"style":{"height":21.02},"width":245.96,"height":52.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-7.png","element":"img","alt":" | ˜S𝑡𝑘+1| > | ˜S𝑡+1𝑘+1|","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":21.02},"width":436.84,"height":52.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-8.png","element":"img","alt":" ˜S𝑡𝑘+1 \\ ˜S𝑡+1𝑘+1 ∉ vert(F (x∗))","inline":true},{"text":", or stated equivalently, ","element":"span"},{"text":"the ACG algorithm has dropped one of the vertices in its active set ","element":"span"},{"style":{"height":21.02},"width":75.76,"height":52.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-9.png","element":"img","alt":"˜S𝑡𝑘+1 ","inline":true,"padRight":true},{"text":"that is not present in ","element":"span"},{"style":{"height":15.2},"width":222.3,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-10.png","element":"img","alt":" F (x∗). □","inline":true}],[{"text":"One of the key requirements in Theorem ","element":"span"},{"href":"#id-131","text":"D.5 ","element":"a"},{"text":"is that ","element":"span"},{"style":{"height":19.22},"width":529,"height":48.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-11.png","element":"img","alt":" {x𝑘, ˜x𝑡𝑘+1, ˜x𝑡+1𝑘+1} ⊂ B(x∗, 𝑟) ∩ X","inline":true},{"text":". 
As the SOCGS ","element":"span"},{"text":"algorithm (Algorithm ","element":"span"},{"href":"#id-44","text":"2","element":"a"},{"text":") decreases the primal gap of Problem (","element":"span"},{"href":"#id-0","text":"1.1","element":"a"},{"text":") at least linearly (Theorem ","element":"span"},{"href":"#id-137","text":"3.4","element":"a"},{"text":"), we can guarantee by strong convexity that there is an index ","element":"span"},{"style":{"height":11.6},"width":99.48,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-12.png","element":"img","alt":" 𝐾 ≥ 0","inline":true,"padRight":true},{"text":"after which for ","element":"span"},{"style":{"height":12.4},"width":100.36,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-13.png","element":"img","alt":" 𝑘 ≥ 𝐾","inline":true,"padRight":true},{"text":"we have that ","element":"span"},{"style":{"height":15.2},"width":293.43,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-14.png","element":"img","alt":" x𝑘 ∈ B(x∗, 𝑟)∩X.","inline":true,"padRight":true},{"text":"But in order for Theorem ","element":"span"},{"href":"#id-131","text":"D.5 ","element":"a"},{"text":"to apply for all ACG iterations in Line ","element":"span"},{"href":"#id-44","text":"10","element":"a"},{"text":", when computing the Inexact PVM step, we also need to ensure that ","element":"span"},{"style":{"height":18.42},"width":327.81,"height":46.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-15.png","element":"img","alt":" ˜x𝑡𝑘+1 ∈ B(x∗, 𝑟) ∩ X","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"height":12},"width":83.93,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-16.png","element":"img","alt":" 𝑡 ≥ 0","inline":true},{"text":". 
In the next Lemma we show that ","element":"span"},{"style":{"height":21.01},"width":505.01,"height":52.53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-17.png","element":"img","alt":"��˜x𝑡𝑘+1 − x∗�� ≤ O(∥x𝑘 − x∗∥1/2)","inline":true},{"text":", allowing us to claim that for any ","element":"span"},{"style":{"height":11.2},"width":87.69,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-18.png","element":"img","alt":" 𝑟 > 0","inline":true,"padRight":true},{"text":"we can ensure that","element":"span"},{"style":{"height":20.26},"width":316.19,"height":50.66,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-19.png","element":"img","alt":"��˜x𝑡𝑘+1 − x∗�� ≤ 𝑟 for","inline":true,"padRight":true},{"text":"small enough ","element":"span"},{"style":{"height":14.8},"width":168.96,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-20.png","element":"img","alt":" ∥x𝑘 − x∗∥.","inline":true}],[{"id":"id-140","style":{"fontWeight":"bold"},"text":"Lemma D.6. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Given a ","element":"span"},{"style":{"height":10.8},"width":23,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-21.png","element":"img","alt":" 𝜇","inline":true},{"style":{"fontStyle":"italic"},"text":"-strongly convex and ","element":"span"},{"style":{"height":10.4},"width":24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-22.png","element":"img","alt":" 𝐿","inline":true},{"style":{"fontStyle":"italic"},"text":"-smooth function ","element":"span"},{"style":{"height":15.2},"width":69.4,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-23.png","element":"img","alt":" 𝑓 (x)","inline":true},{"style":{"fontStyle":"italic"},"text":", a polytope ","element":"span"},{"text":"X","element":"span"},{"style":{"fontStyle":"italic"},"text":", and a quadratic approximation ","element":"span"},{"style":{"height":18.3},"width":81.34,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-24.png","element":"img","alt":"ˆ𝑓𝑘(x)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"that satisfies Assumption ","element":"span"},{"href":"#id-41","style":{"fontStyle":"italic"},"text":"2","element":"a"},{"style":{"fontStyle":"italic"},"text":", let ","element":"span"},{"style":{"height":18.42},"width":70.74,"height":46.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-25.png","element":"img","alt":" ˜x𝑡𝑘+1 ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"denote the iterate obtained after applying ","element":"span"},{"style":{"height":14.8},"width":165.19,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-26.png","element":"img","alt":" 𝑡 steps of","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"the ACG algorithm (Line 
","element":"span"},{"href":"#id-44","style":{"fontStyle":"italic"},"text":"10 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"of Algorithm ","element":"span"},{"href":"#id-44","style":{"fontStyle":"italic"},"text":"2","element":"a"},{"style":{"fontStyle":"italic"},"text":") to minimize ","element":"span"},{"style":{"height":18.3},"width":213.8,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-27.png","element":"img","alt":"ˆ𝑓𝑘(x) over X","inline":true},{"style":{"fontStyle":"italic"},"text":", starting from ","element":"span"},{"style":{"height":19.02},"width":155.26,"height":47.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-28.png","element":"img","alt":" ˜x0𝑘+1 = x𝑘","inline":true},{"style":{"fontStyle":"italic"},"text":", then for any ","element":"span"},{"style":{"height":12},"width":96.2,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-29.png","element":"img","alt":"𝑡 ≥ 0:","inline":true}],[{"style":{"width":"78%"},"width":1469,"height":550,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-30.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Proof. 
","element":"span"},{"text":"By the triangle inequality we have:","element":"span"}],[{"id":"id-138","style":{"width":"68%"},"width":1287,"height":50,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/35-31.png","element":"img"}],[{"text":"The first term in Equation (","element":"span"},{"href":"#id-138","text":"D.63","element":"a"},{"text":") can be bounded as follows:","element":"span"}],[{"id":"id-139","style":{"width":"80%"},"width":1517,"height":950,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/36-0.png","element":"img"}],[{"text":"Where Equation (","element":"span"},{"href":"#id-139","text":"D.65","element":"a"},{"text":") follows from the fact that the ACG algorithm decreases the primal gap at each iteration ","element":"span"},{"style":{"height":8.8},"width":12,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/36-1.png","element":"img","alt":" 𝑡","inline":true,"padRight":true},{"text":"and Equation (","element":"span"},{"href":"#id-139","text":"D.68","element":"a"},{"text":") is obtained by applying the Cauchy-Schwarz inequality to the first term in Equation (","element":"span"},{"href":"#id-139","text":"D.67","element":"a"},{"text":") and using the fact that ","element":"span"},{"style":{"height":24.89},"width":335.08,"height":62.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/36-2.png","element":"img","alt":" −��˜x∗𝑘+1 − x𝑘��2𝐻𝑘 ≤ 0","inline":true},{"text":". Moreover, in Equation (","element":"span"},{"href":"#id-139","text":"D.69","element":"a"},{"text":") we have set ","element":"span"},{"style":{"height":15.22},"width":371.46,"height":38.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/36-3.png","element":"img","alt":"𝐺 = maxx∈X ∥∇ 𝑓 (x)∥","inline":true},{"text":". 
Note that the","element":"span"},{"style":{"height":20.1},"width":186.77,"height":50.25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/36-4.png","element":"img","alt":"∥˜x∗𝑘+1 − x∗∥","inline":true,"padRight":true},{"text":"term appearing in Equations (","element":"span"},{"href":"#id-138","text":"D.63","element":"a"},{"text":") and (","element":"span"},{"href":"#id-139","text":"D.70","element":"a"},{"text":") can be ","element":"span"},{"text":"bounded using Corollary ","element":"span"},{"href":"#id-40","text":"C.12","element":"a"},{"text":", which results in","element":"span"},{"style":{"height":20.1},"width":475.81,"height":50.25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/36-5.png","element":"img","alt":"∥˜x∗𝑘+1 − x∗∥ ≤ O(∥x𝑘 − x∗∥2)","inline":true},{"text":". Combining the bound shown in ","element":"span"},{"text":"Equation (","element":"span"},{"href":"#id-139","text":"D.70","element":"a"},{"text":") with the bound in Lemma ","element":"span"},{"href":"#id-122","text":"D.3 ","element":"a"},{"text":"allows us to conclude that:","element":"span"}],[{"style":{"width":"78%"},"width":1469,"height":440,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/36-6.png","element":"img"}],[{"text":"With Lemma ","element":"span"},{"text":"D.6 ","element":"span"},{"text":"we can guarantee that for any radius ","element":"span"},{"style":{"height":10.8},"width":87.69,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/36-7.png","element":"img","alt":" 𝑟 > 0","inline":true},{"text":", there is a ","element":"span"},{"href":"#id-140","style":{"height":32.95},"width":1870.76,"height":82.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/36-8.png","element":"img","alt":" 𝐾 ≥ 0 such that ˜x𝑡𝑘+1 ∈ B(x∗, 𝑟) ∩ X for all 𝑘 ≥ 𝐾 and all 𝑡 ≥ 0","inline":true},{"text":". 
With this, we can move on to prove that after a finite number of iterations ","element":"span"},{"style":{"height":11.6},"width":99.48,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/36-9.png","element":"img","alt":" 𝐾 ≥ 0","inline":true,"padRight":true},{"text":"we can guarantee that ","element":"span"},{"style":{"height":15.2},"width":431.68,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/36-10.png","element":"img","alt":" x𝑘 ∈ F (x∗) for all 𝑘 ≥ 𝐾.","inline":true}],[{"id":"id-60","style":{"fontWeight":"bold"},"text":"Corollary D.7. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Given a strongly convex and smooth function ","element":"span"},{"style":{"height":15.2},"width":68.6,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/36-11.png","element":"img","alt":" 𝑓 (x)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"with Lipschitz continuous Hessian and a polytope ","element":"span"},{"text":"X","element":"span"},{"style":{"fontStyle":"italic"},"text":", if Assumptions ","element":"span"},{"href":"#id-28","style":{"fontStyle":"italic"},"text":"1 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"href":"#id-41","style":{"fontStyle":"italic"},"text":"2 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"are satisfied, then there is a ","element":"span"},{"style":{"height":13.79},"width":165.08,"height":34.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/36-12.png","element":"img","alt":" 𝑟PVM > 0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"such that if ","element":"span"},{"style":{"height":16.99},"width":370.92,"height":42.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/36-13.png","element":"img","alt":" x𝑘 ∈ B(x∗, 𝑟PVM) ∩ X","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and for any 
","element":"span"},{"style":{"height":12},"width":83.26,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/36-14.png","element":"img","alt":" 𝑡 ≥ 0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"we have that ","element":"span"},{"style":{"height":21.02},"width":1012.67,"height":52.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/36-15.png","element":"img","alt":" ˜x𝑡𝑘+1 ∉ F (x∗) then | ˜S𝑡+1𝑘+1| < | ˜S𝑡𝑘+1| and ˜S𝑡𝑘+1 \\ ˜S𝑡+1𝑘+1 ∉ F (x∗).","inline":true}],[{"style":{"height":14.4},"width":280.8,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/36-16.png","element":"img","alt":"Proof. Let 𝑟 > 0","inline":true,"padRight":true},{"text":"be the radius in Theorem ","element":"span"},{"href":"#id-131","text":"D.5 ","element":"a"},{"text":"such that if ","element":"span"},{"style":{"height":21.02},"width":856.55,"height":52.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/36-17.png","element":"img","alt":" {x𝑘, ˜x𝑡𝑘+1, ˜x𝑡+1𝑘+1} ⊂ B(x∗, 𝑟) ∩ X then | ˜S𝑡+1𝑘+1| < | ˜S𝑡𝑘+1|","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":21.02},"width":345.31,"height":52.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/36-18.png","element":"img","alt":"˜S𝑡𝑘+1 \\ ˜S𝑡+1𝑘+1 ∉ F (x∗)","inline":true},{"text":". 
Since we want this to hold for all ","element":"span"},{"style":{"height":12},"width":89.87,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/36-19.png","element":"img","alt":" 𝑡 ≥ 0","inline":true,"padRight":true},{"text":"for a given ","element":"span"},{"style":{"height":9.59},"width":37.76,"height":23.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/36-20.png","element":"img","alt":" x𝑘","inline":true},{"text":", we need to ensure that ","element":"span"},{"style":{"height":18.42},"width":331.28,"height":46.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/36-21.png","element":"img","alt":"˜x𝑡𝑘+1 ∈ B(x∗, 𝑟) ∩ X","inline":true,"padRight":true},{"text":"for ","element":"span"},{"style":{"height":12},"width":86.41,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/36-22.png","element":"img","alt":" 𝑡 ≥ 0","inline":true},{"text":". This can be accomplished with Lemma ","element":"span"},{"href":"#id-140","text":"D.6","element":"a"},{"text":", which allows us to ensure that ","element":"span"},{"text":"there is a ","element":"span"},{"style":{"height":14.19},"width":163.27,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/36-23.png","element":"img","alt":" 𝑟PVM > 0","inline":true,"padRight":true},{"text":"such that for any ","element":"span"},{"style":{"height":17.39},"width":370.33,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/36-24.png","element":"img","alt":" x𝑘 ∈ B(x∗, 𝑟PVM) ∩ X","inline":true,"padRight":true},{"text":"we have that ","element":"span"},{"style":{"height":19.22},"width":514.58,"height":48.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/36-25.png","element":"img","alt":" {x𝑘, ˜x𝑡𝑘+1, ˜x𝑡+1𝑘+1} ⊂ B(x∗, 𝑟) ∩ X","inline":true,"padRight":true},{"text":"for ","element":"span"},{"text":"all 
","element":"span"},{"style":{"height":12},"width":1831.48,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/36-26.png","element":"img","alt":" 𝑡 ≥ 0. □","inline":true}],[{"id":"id-53","style":{"fontWeight":"bold"},"text":"Corollary D.8. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Given a strongly convex and smooth function ","element":"span"},{"style":{"height":15.2},"width":68.6,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-0.png","element":"img","alt":" 𝑓 (x)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"with Lipschitz continuous Hessian and a polytope ","element":"span"},{"text":"X","element":"span"},{"style":{"fontStyle":"italic"},"text":", if Assumptions ","element":"span"},{"href":"#id-28","style":{"fontStyle":"italic"},"text":"1 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"href":"#id-41","style":{"fontStyle":"italic"},"text":"2 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"are satisfied, then there is a ","element":"span"},{"style":{"height":11.2},"width":98.85,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-1.png","element":"img","alt":" 𝐾 > 0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"such that for all ","element":"span"},{"style":{"height":12.4},"width":100.4,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-2.png","element":"img","alt":" 𝑘 ≥ 𝐾","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"the iterates of the SOCGS algorithm (Algorithm ","element":"span"},{"href":"#id-44","style":{"fontStyle":"italic"},"text":"2","element":"a"},{"style":{"fontStyle":"italic"},"text":") satisfy that ","element":"span"},{"style":{"height":15.2},"width":200.49,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-3.png","element":"img","alt":" x𝑘 ∈ F 
(x∗).","inline":true}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"By Theorem ","element":"span"},{"href":"#id-59","text":"D.4 ","element":"a"},{"text":"we know that there is a ","element":"span"},{"style":{"height":14.99},"width":171.53,"height":37.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-4.png","element":"img","alt":" 𝐾ACG ≥ 0","inline":true,"padRight":true},{"text":"such that for ","element":"span"},{"style":{"height":14.99},"width":171.35,"height":37.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-5.png","element":"img","alt":" 𝑘 ≥ 𝐾ACG ","inline":true,"padRight":true},{"text":"we have that ","element":"span"},{"style":{"height":19.42},"width":251.41,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-6.png","element":"img","alt":" xACG𝑘 ∈ F (x∗).","inline":true,"padRight":true},{"text":"Moreover, from Corollary ","element":"span"},{"href":"#id-60","text":"D.7 ","element":"a"},{"text":"we know that there is a radius ","element":"span"},{"style":{"height":14.19},"width":165.25,"height":35.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-7.png","element":"img","alt":" 𝑟PVM > 0","inline":true,"padRight":true},{"text":"such that if ","element":"span"},{"style":{"height":17.39},"width":373.36,"height":43.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-8.png","element":"img","alt":" x𝑘 ∈ B(x∗, 𝑟PVM) ∩ X","inline":true,"padRight":true},{"text":"then ","element":"span"},{"style":{"height":19.22},"width":514.58,"height":48.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-9.png","element":"img","alt":" {x𝑘, ˜x𝑡𝑘+1, ˜x𝑡+1𝑘+1} ⊂ B(x∗, 𝑟) ∩ X","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"height":12},"width":83.26,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-10.png","element":"img","alt":" 𝑡 ≥ 
0","inline":true},{"text":", where ","element":"span"},{"style":{"height":11.2},"width":87.69,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-11.png","element":"img","alt":" 𝑟 > 0","inline":true,"padRight":true},{"text":"is the radius in Theorem ","element":"span"},{"href":"#id-131","text":"D.5","element":"a"},{"text":". As the SOCGS ","element":"span"},{"text":"algorithm contracts the primal gap at least linearly, there is a ","element":"span"},{"style":{"height":14.99},"width":175.11,"height":37.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-12.png","element":"img","alt":" 𝐾PVM ≥ 0","inline":true,"padRight":true},{"text":"after which we can guarantee that ","element":"span"},{"style":{"height":17.39},"width":689.78,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-13.png","element":"img","alt":"x𝑘 ∈ B(x∗, 𝑟PVM) ∩ X for all 𝑘 ≥ 𝐾PVM.","inline":true},{"text":"Assume that ","element":"span"},{"style":{"height":16.99},"width":427.8,"height":42.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-14.png","element":"img","alt":" 𝐾′ = max{𝐾ACG, 𝐾PVM}","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":15.2},"width":208.71,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-15.png","element":"img","alt":" x𝐾′ ∉ F (x∗)","inline":true},{"text":". 
Then for all subsequent iterations ","element":"span"},{"style":{"height":12.4},"width":121.57,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-16.png","element":"img","alt":" 𝑘 ≥ 𝐾′","inline":true,"padRight":true},{"text":"we either choose the ACG step (Line ","element":"span"},{"href":"#id-44","text":"18 ","element":"a"},{"text":"in Algorithm ","element":"span"},{"href":"#id-44","text":"2","element":"a"},{"text":") and have that ","element":"span"},{"style":{"height":19.43},"width":370.23,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-17.png","element":"img","alt":" x𝑘+1 = xACG𝑘+1 ∈ F (x∗)","inline":true,"padRight":true},{"text":"and the claim is true, or we choose the Inexact PVM step (Line ","element":"span"},{"href":"#id-44","text":"15 ","element":"a"},{"text":"in Algorithm ","element":"span"},{"href":"#id-44","text":"2","element":"a"},{"text":") and have that ","element":"span"},{"style":{"height":14.8},"width":226.14,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-18.png","element":"img","alt":" |S𝑘| > |S𝑘+1|","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":15.2},"width":649.06,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-19.png","element":"img","alt":"|S𝑘| \\ |S𝑘+1| ∈ (vert(X) \\ vert(F (x∗)))","inline":true,"padRight":true},{"text":"by Theorem ","element":"span"},{"href":"#id-131","text":"D.5","element":"a"},{"text":". 
The latter case can only happen a finite number of times before ","element":"span"},{"style":{"height":15.2},"width":194.6,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-20.png","element":"img","alt":" x𝐾 ∈ F (x∗)","inline":true,"padRight":true},{"text":"for some ","element":"span"},{"style":{"height":10.4},"width":124.16,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-21.png","element":"img","alt":" 𝐾 > 𝐾′","inline":true},{"text":", as ","element":"span"},{"style":{"height":14.8},"width":82.28,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-22.png","element":"img","alt":" |S𝐾′|","inline":true,"padRight":true},{"text":"is finite. Thereafter we will have that ","element":"span"},{"style":{"height":15.2},"width":186.65,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-23.png","element":"img","alt":" x𝑘 ∈ F (x∗)","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"height":11.6},"width":99.7,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-24.png","element":"img","alt":"𝑘 > 𝐾","inline":true,"padRight":true},{"text":"(as Theorem ","element":"span"},{"href":"#id-59","text":"D.4 ","element":"a"},{"text":"and Theorem ","element":"span"},{"href":"#id-131","text":"D.5 ","element":"a"},{"text":"will still hold). 
","element":"span"},{"style":{"height":0},"width":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-25.png","element":"img","alt":"□","inline":true}],[{"text":"This allows us to conclude in the next theorem that the quadratic convergence in distance to the optimum of the Inexact PVM steps translates into quadratic convergence in the primal gap for the SOCGS algorithm.","element":"span"}],[{"id":"id-142","style":{"fontWeight":"bold"},"text":"Theorem D.9 ","element":"span"},{"text":"(Quadratic convergence in primal gap of the SOCGS algorithm)","element":"span"},{"style":{"height":14.8},"width":207.58,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-26.png","element":"img","alt":". Given a 𝜇","inline":true},{"style":{"fontStyle":"italic"},"text":"-strongly convex and ","element":"span"},{"style":{"height":10.4},"width":24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-27.png","element":"img","alt":" 𝐿","inline":true},{"style":{"fontStyle":"italic"},"text":"-smooth function ","element":"span"},{"style":{"height":15.2},"width":212.52,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-28.png","element":"img","alt":" 𝑓 (x) with 𝐿2","inline":true},{"style":{"fontStyle":"italic"},"text":"-Lipschitz Hessian and a polytope ","element":"span"},{"text":"X","element":"span"},{"style":{"fontStyle":"italic"},"text":", if Assumption ","element":"span"},{"href":"#id-28","style":{"fontStyle":"italic"},"text":"1 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"and Assumption ","element":"span"},{"href":"#id-41","style":{"fontStyle":"italic"},"text":"2 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"are satisfied, then there is a ","element":"span"},{"style":{"height":12},"width":99.49,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-29.png","element":"img","alt":" 𝐾 ≥ 
0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"such that for ","element":"span"},{"style":{"height":12.4},"width":100.35,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-30.png","element":"img","alt":" 𝑘 ≥ 𝐾","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"the iterates of the SOCGS algorithm (Algorithm ","element":"span"},{"href":"#id-44","style":{"fontStyle":"italic"},"text":"2","element":"a"},{"style":{"fontStyle":"italic"},"text":") satisfy:","element":"span"}],[{"style":{"width":"61%"},"width":1158,"height":89,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-31.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where the parameter ","element":"span"},{"style":{"height":19.03},"width":988.62,"height":47.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-32.png","element":"img","alt":" 𝜂𝑘 = max{𝜆max(𝐻−1𝑘 ∇2 𝑓 (x𝑘)), 𝜆max([∇2 𝑓 (x𝑘)]−1𝐻𝑘)} ≥ 1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"measures how well ","element":"span"},{"style":{"height":13.2},"width":189.02,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-33.png","element":"img","alt":" 𝐻𝑘 approx-","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"imates ","element":"span"},{"style":{"height":16.99},"width":261.84,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-34.png","element":"img","alt":" ∇2 𝑓 (x𝑘) and 𝜔","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is defined in Assumption ","element":"span"},{"href":"#id-41","style":{"fontStyle":"italic"},"text":"2","element":"a"},{"style":{"fontStyle":"italic"},"text":".","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"Proof. 
","element":"span"},{"text":"From Corollary ","element":"span"},{"href":"#id-53","text":"D.8 ","element":"a"},{"text":"we know that there is an index ","element":"span"},{"style":{"height":12},"width":105.14,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-35.png","element":"img","alt":" 𝐾 ≥ 0","inline":true,"padRight":true},{"text":"such that for ","element":"span"},{"style":{"height":12.4},"width":106.02,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-36.png","element":"img","alt":" 𝑘 ≥ 𝐾","inline":true,"padRight":true},{"text":"we know that the Inexact PVM iterates and the ACG iterates will be contained in ","element":"span"},{"style":{"height":15.2},"width":100.24,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-37.png","element":"img","alt":" F (x∗)","inline":true},{"text":". This allows us to convert the quadratic convergence in distance to the optimum in Lemma ","element":"span"},{"href":"#id-122","text":"D.3 ","element":"a"},{"text":"for the Inexact PVM steps to a quadratic convergence in primal gap. Using strong-convexity we can bound ","element":"span"},{"style":{"height":18.01},"width":557.98,"height":45.03,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-38.png","element":"img","alt":" ∥x𝑘 − x∗∥2 ≤ 2/𝜇( 𝑓 (x𝑘) − 𝑓 (x∗))","inline":true},{"text":". 
Using ","element":"span"},{"style":{"height":10.4},"width":24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-39.png","element":"img","alt":" 𝐿","inline":true},{"text":"-smoothness along with the strict complementarity assumption (Assumption ","element":"span"},{"href":"#id-28","text":"1","element":"a"},{"text":") and the fact that ","element":"span"},{"style":{"height":15.2},"width":218.82,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-40.png","element":"img","alt":"˜x𝑘+1 ∈ F (x∗)","inline":true,"padRight":true},{"text":"leads to ","element":"span"},{"style":{"height":18.01},"width":634.62,"height":45.02,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-41.png","element":"img","alt":" ∥˜x𝑘+1 − x∗∥2 ≥ 2/𝐿( 𝑓 (˜x𝑘+1) − 𝑓 (x∗))","inline":true},{"text":". Plugging these bounds into the convergence in distance to the optimum from Lemma ","element":"span"},{"href":"#id-122","text":"D.3 ","element":"a"},{"text":"results in:","element":"span"}],[{"id":"id-141","style":{"width":"80%"},"width":1517,"height":90,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-42.png","element":"img"}],[{"text":"As the SOCGS algorithm contracts the primal gap at least linearly (see Theorem ","element":"span"},{"href":"#id-137","text":"3.4","element":"a"},{"text":"), then for small enough ","element":"span"},{"style":{"height":15.2},"width":211.88,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-43.png","element":"img","alt":" 𝑓 (x𝑘)− 𝑓 (x∗)","inline":true,"padRight":true},{"text":"with ","element":"span"},{"style":{"height":12.4},"width":100.45,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-44.png","element":"img","alt":" 𝑘 ≥ 𝐾","inline":true,"padRight":true},{"text":"we know that the quadratic convergence shown in Equation (","element":"span"},{"href":"#id-141","text":"D.71","element":"a"},{"text":") for the Inexact 
PVM steps in Line ","element":"span"},{"href":"#id-44","text":"9","element":"a"},{"text":"-","element":"span"},{"href":"#id-44","text":"12 ","element":"a"},{"text":"will provide more primal progress than the ACG steps in Line ","element":"span"},{"href":"#id-44","text":"4","element":"a"},{"text":". Therefore the Inexact PVM steps will be chosen in Line ","element":"span"},{"href":"#id-44","text":"14 ","element":"a"},{"text":"and we will have that:","element":"span"}],[{"style":{"width":"80%"},"width":1514,"height":143,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-45.png","element":"img"}],[{"id":"id-57","style":{"fontWeight":"bold"},"text":"D.2 Complexity Analysis","element":"span"}],[{"text":"Throughout this section we make the simplifying assumption that we have at our disposal the tightest possible lower bound ","element":"span"},{"style":{"height":15.2},"width":106.46,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-46.png","element":"img","alt":" 𝑙𝑏(x𝑘)","inline":true,"padRight":true},{"text":"on the primal gap, that is, ","element":"span"},{"style":{"height":15.2},"width":395.84,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-47.png","element":"img","alt":" 𝑙𝑏(x𝑘) = 𝑓 (x𝑘) − 𝑓 (x∗)","inline":true},{"text":". Providing a looser lower bound on the primal gap does not affect the number of first-order or Hessian oracle calls, however it can significantly increase the number of linear optimization oracle calls used to compute the Inexact PVM steps. 
Let ","element":"span"},{"style":{"height":19.36},"width":448.3,"height":48.39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-48.png","element":"img","alt":" 𝑟 = min{𝑟ACG, 𝑟PVM} > 0","inline":true},{"text":", where ","element":"span"},{"style":{"height":14.19},"width":88.88,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-49.png","element":"img","alt":" 𝑟ACG","inline":true,"padRight":true},{"text":"is described in Theorem ","element":"span"},{"href":"#id-59","text":"D.4 ","element":"a"},{"text":"and ","element":"span"},{"style":{"height":14.19},"width":92.43,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-50.png","element":"img","alt":" 𝑟PVM","inline":true,"padRight":true},{"text":"in Corollary ","element":"span"},{"href":"#id-60","text":"D.7","element":"a"},{"text":". Note that ","element":"span"},{"style":{"height":7.2},"width":17,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-51.png","element":"img","alt":" 𝑟","inline":true,"padRight":true},{"text":"is independent of the target accuracy ","element":"span"},{"style":{"height":7.6},"width":20,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/37-52.png","element":"img","alt":" 𝜀","inline":true},{"text":". For ease of exposition we can divide the behaviour of the SOCGS algorithm (Algorithm ","element":"span"},{"href":"#id-44","text":"2","element":"a"},{"text":") into three phases:","element":"span"}],[{"text":"1. 
","element":"span"},{"style":{"height":19.42},"width":916.52,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-0.png","element":"img","alt":" Phase 1: x𝑘 ∉ B(x∗, 𝑟) ∩ X or xACG𝑘 ∉ B(x∗, 𝑟) ∩ X.","inline":true,"padRight":true},{"text":"In this phase the SOCGS algorithm will contract the primal gap at least linearly, as dictated by Theorem ","element":"span"},{"href":"#id-137","text":"3.4","element":"a"},{"text":". Using strong-convexity we can upper bound the number of iterations needed until ","element":"span"},{"style":{"height":19.42},"width":365.62,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-1.png","element":"img","alt":" {x𝑘, xACG𝑘 } ∈ B(x∗, 𝑟)","inline":true},{"text":", which marks the end of this first phase.","element":"span"}],[{"text":"2. ","element":"span"},{"style":{"height":19.43},"width":1128.4,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-2.png","element":"img","alt":" Phase 2: {x𝑘, xACG𝑘 } ∈ B(x∗, 𝑟) ∩ X and {x𝑘, xACG𝑘 } ∉ F (x∗).","inline":true,"padRight":true},{"text":"The primal gap convergence of the SOCGS algorithm in this phase is also at least linear, and the convergence bound of Theorem ","element":"span"},{"href":"#id-137","text":"3.4 ","element":"a"},{"text":"still holds. However in this phase, the ACG steps in Line ","element":"span"},{"href":"#id-44","text":"4 ","element":"a"},{"text":"and the ACG steps used to compute the Inexact PVM iterates in Lines ","element":"span"},{"href":"#id-44","text":"9","element":"a"},{"text":"-","element":"span"},{"href":"#id-44","text":"12 ","element":"a"},{"text":"will drop any vertices in their respective active sets that are not in ","element":"span"},{"style":{"height":15.2},"width":99.4,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-3.png","element":"img","alt":"F (x∗)","inline":true},{"text":". 
That is, if ","element":"span"},{"style":{"height":19.43},"width":1465.54,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-4.png","element":"img","alt":" xACG𝑘 ∈ B(x∗, 𝑟) ∩ X \\ F (x∗) then |SACG𝑘 | > |SACG𝑘+1 | and SACG𝑘 \\ SACG𝑘+1 ∉ vert(F (x∗)).","inline":true,"padRight":true},{"text":"Similarly, if ","element":"span"},{"style":{"height":15.2},"width":440.74,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-5.png","element":"img","alt":" x𝑘 ∈ B(x∗, 𝑟) ∩ X \\ F (x∗)","inline":true,"padRight":true},{"text":"then ","element":"span"},{"style":{"height":13.41},"width":70.73,"height":33.53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-6.png","element":"img","alt":" ˜x𝑘+1","inline":true,"padRight":true},{"text":"in Line ","element":"span"},{"href":"#id-44","text":"13 ","element":"a"},{"text":"in Algorithm ","element":"span"},{"href":"#id-44","text":"2 ","element":"a"},{"text":"satisfies after exiting the while loop in Lines ","element":"span"},{"href":"#id-44","text":"9","element":"a"},{"text":"-","element":"span"},{"href":"#id-44","text":"12 ","element":"a"},{"text":"that ","element":"span"},{"style":{"height":18.41},"width":219.47,"height":46.04,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-7.png","element":"img","alt":" |S𝑘| > | ˜S𝑘+1|","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":18.81},"width":416.74,"height":47.04,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-8.png","element":"img","alt":" S𝑘 \\ ˜S𝑘+1 ⊄ vert(F (x∗))","inline":true},{"text":". 
As the cardinality of both active sets is finite, after a finite number of iterations we must have that ","element":"span"},{"style":{"height":19.43},"width":521.99,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-9.png","element":"img","alt":" {x𝑘, xACG𝑘 } ∈ B(x∗, 𝑟) ∩ F (x∗),","inline":true,"padRight":true},{"text":"which marks the end of this phase.","element":"span"}],[{"text":"3. ","element":"span"},{"style":{"height":19.42},"width":735.76,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-10.png","element":"img","alt":" Phase 3: {x𝑘, xACG𝑘 } ∈ B(x∗, 𝑟) ∩ F (x∗).","inline":true,"padRight":true},{"text":"In this final phase the SOCGS algorithm has a quadratic convergence rate in primal gap, as shown in Theorem ","element":"span"},{"href":"#id-142","text":"D.9","element":"a"},{"text":". Once ","element":"span"},{"style":{"height":19.43},"width":519.69,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-11.png","element":"img","alt":" {x𝑘, xACG𝑘 } ∈ B(x∗, 𝑟) ∩ F (x∗)","inline":true,"padRight":true},{"text":"the ACG steps in Line ","element":"span"},{"href":"#id-44","text":"4 ","element":"a"},{"text":"and in Lines ","element":"span"},{"href":"#id-44","text":"9","element":"a"},{"text":"-","element":"span"},{"href":"#id-44","text":"12 ","element":"a"},{"text":"will not pick up any vertices in ","element":"span"},{"style":{"height":15.2},"width":369.92,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-12.png","element":"img","alt":" vert(X) \\ vert(F (x∗))","inline":true},{"text":", and the iterates will remain in ","element":"span"},{"style":{"height":15.2},"width":278.98,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-13.png","element":"img","alt":" B(x∗, 𝑟) ∩ F (x∗)","inline":true,"padRight":true},{"text":"for all subsequent steps.","element":"span"}],[{"text":"As in the classical 
analysis of PVM and Newton algorithms, the SOCGS algorithm shows local quadratic convergence (in primal gap and distance to the optimum) after a number of iterations that is independent of ","element":"span"},{"style":{"height":7.6},"width":20,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-14.png","element":"img","alt":" 𝜀","inline":true,"padRight":true},{"text":"(but dependent on ","element":"span"},{"style":{"height":15.2},"width":69.4,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-15.png","element":"img","alt":" 𝑓 (x)","inline":true,"padRight":true},{"text":"and ","element":"span"},{"text":"X","element":"span"},{"text":"). The SOCGS algorithm makes use of three different types of oracle calls, namely, Hessian, first-order and linear optimization oracle calls. The Hessian oracle is called once per iteration (in Line ","element":"span"},{"href":"#id-44","text":"5","element":"a"},{"text":"), while the first-order oracle is called at most twice (to compute the independent ACG step in Line ","element":"span"},{"href":"#id-44","text":"4 ","element":"a"},{"text":"and to build the quadratic approximation in Line ","element":"span"},{"href":"#id-44","text":"6","element":"a"},{"text":"). 
The linear minimization oracle will be called once in Line ","element":"span"},{"href":"#id-44","text":"4 ","element":"a"},{"text":"for the independent ACG step and potentially multiple times in Line ","element":"span"},{"href":"#id-44","text":"10 ","element":"a"},{"text":"while computing the Inexact PVM step.","element":"span"}],[{"text":"In order to study the number of linear optimization oracle calls needed to achieve a ","element":"span"},{"style":{"height":7.6},"width":20,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-16.png","element":"img","alt":" 𝜀","inline":true},{"text":"-optimal solution to Problem (","element":"span"},{"href":"#id-0","text":"1.1","element":"a"},{"text":") we first review the convergence of the Frank-Wolfe gap of the ACG algorithm, which is used as a stopping criterion in the SOCGS algorithm to compute the Inexact PVM steps (Line ","element":"span"},{"href":"#id-44","text":"9 ","element":"a"},{"text":"in Algorithm ","element":"span"},{"href":"#id-44","text":"2","element":"a"},{"text":").","element":"span"}],[{"id":"id-143","style":{"fontWeight":"bold"},"text":"Theorem D.10 ","element":"span"},{"text":"(Convergence of the Frank-Wolfe gap of the ACG algorithm)","element":"span"},{"style":{"fontWeight":"bold"},"text":". 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"(","element":"span"},{"href":"#id-27","referenceIndex":39,"style":{"fontStyle":"italic"},"text":"Lacoste-Julien & Jaggi","element":"a"},{"style":{"fontStyle":"italic"},"text":",","element":"span"}],[{"href":"#id-27","referenceIndex":39,"style":{"fontStyle":"italic"},"text":"2015","element":"a"},{"style":{"fontStyle":"italic"},"text":", Theorem 2) Given a ","element":"span"},{"style":{"height":10.8},"width":23,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-17.png","element":"img","alt":" 𝜇","inline":true},{"style":{"fontStyle":"italic"},"text":"-strongly convex and ","element":"span"},{"style":{"height":10.4},"width":24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-18.png","element":"img","alt":" 𝐿","inline":true},{"style":{"fontStyle":"italic"},"text":"-smooth function ","element":"span"},{"style":{"height":15.2},"width":69.4,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-19.png","element":"img","alt":" 𝑓 (x)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and a polytope ","element":"span"},{"text":"X","element":"span"},{"style":{"fontStyle":"italic"},"text":", then for any ","element":"span"},{"style":{"height":12.4},"width":90.76,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-20.png","element":"img","alt":"𝑘 ≥ 0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"the ACG algorithm satisfies:","element":"span"}],[{"style":{"width":"70%"},"width":1317,"height":121,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-21.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where ","element":"span"},{"style":{"height":10.8},"width":30,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-22.png","element":"img","alt":" 
𝐷","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"denotes the diameter of the polytope ","element":"span"},{"text":"X","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"}],[{"style":{"width":"99%"},"width":1872,"height":133,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-23.png","element":"img"}],[{"text":"The number of outer iterations needed for ","element":"span"},{"style":{"height":19.43},"width":91.84,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-24.png","element":"img","alt":" xACG𝑘","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":9.59},"width":37.77,"height":23.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-25.png","element":"img","alt":" x𝑘","inline":true,"padRight":true},{"text":"to reach ","element":"span"},{"style":{"height":15.2},"width":211.19,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-26.png","element":"img","alt":" B(x∗, 𝑟) ∩ X","inline":true,"padRight":true},{"text":"can be upper bounded using strong convexity. As ","element":"span"},{"style":{"height":18.01},"width":504.24,"height":45.03,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-27.png","element":"img","alt":" 𝑓 (x) − 𝑓 (x∗) ≥ 𝜇/2 ∥x − x∗∥2","inline":true,"padRight":true},{"text":"then if ","element":"span"},{"style":{"height":16.99},"width":374.78,"height":42.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-28.png","element":"img","alt":" 𝑓 (x) − 𝑓 (x∗) ≤ 𝜇/2𝑟2","inline":true,"padRight":true},{"text":"we can conclude that ","element":"span"},{"style":{"height":15.2},"width":282.96,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-29.png","element":"img","alt":" x ∈ B(x∗, 𝑟) ∩ X","inline":true},{"text":". 
As the iterates ","element":"span"},{"style":{"height":9.59},"width":37.76,"height":23.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-30.png","element":"img","alt":" x𝑘","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":19.42},"width":91.84,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-31.png","element":"img","alt":" xACG𝑘","inline":true,"padRight":true},{"text":"have a primal gap convergence that is at least linear (see Theorem ","element":"span"},{"href":"#id-137","text":"3.4 ","element":"a"},{"text":"and Theorem ","element":"span"},{"href":"#id-30","text":"2.1 ","element":"a"},{"text":"respectively) then the number of iterations ","element":"span"},{"style":{"height":12.39},"width":36.44,"height":30.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-32.png","element":"img","alt":" 𝑇1","inline":true,"padRight":true},{"text":"needed to ensure that ","element":"span"},{"style":{"height":19.43},"width":678.31,"height":48.57,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-33.png","element":"img","alt":"{x𝑘, xACG𝑘 } ∈ B(x∗, 𝑟) ∩ X for all 𝑘 ≥ 𝑇1","inline":true,"padRight":true},{"text":"can be upper bounded by:","element":"span"}],[{"id":"id-144","style":{"width":"67%"},"width":1267,"height":105,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-34.png","element":"img"}],[{"text":"Where we have used the primal gap convergence of Theorem ","element":"span"},{"href":"#id-137","text":"3.4 ","element":"a"},{"text":"and ","element":"span"},{"style":{"height":10.8},"width":23,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-35.png","element":"img","alt":" 𝜇","inline":true},{"text":"-strong convexity. 
If we denote by ","element":"span"},{"style":{"height":13.99},"width":70.48,"height":34.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/38-36.png","element":"img","alt":" 𝑁𝑘,1","inline":true,"padRight":true},{"text":"the number of inner ACG steps in Line ","element":"span"},{"href":"#id-44","text":"10 ","element":"a"},{"text":"that we need to take to satisfy the exit criterion shown in Line ","element":"span"},{"href":"#id-44","text":"9","element":"a"}],[{"text":"of Algorithm ","element":"span"},{"href":"#id-44","text":"2 ","element":"a"},{"text":"at iteration ","element":"span"},{"style":{"height":11.6},"width":19,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/39-0.png","element":"img","alt":" 𝑘","inline":true,"padRight":true},{"text":"during this phase, and we use Theorem ","element":"span"},{"href":"#id-143","text":"D.10 ","element":"a"},{"text":"we have that:","element":"span"}],[{"style":{"width":"96%"},"width":1815,"height":530,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/39-1.png","element":"img"}],[{"text":"The inequality follows from the fact that for ","element":"span"},{"style":{"height":15.2},"width":282.07,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/39-2.png","element":"img","alt":" x𝑘 ∉ B(x∗, 𝑟) ∩X","inline":true,"padRight":true},{"text":"we can bound ","element":"span"},{"style":{"height":16.99},"width":519.5,"height":42.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/39-3.png","element":"img","alt":" 𝜇𝑟2/2 ≤ 𝑓 (x𝑘) − 𝑓 (x∗), and the","inline":true,"padRight":true},{"text":"fact that ","element":"span"},{"style":{"height":24.89},"width":1710.52,"height":62.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/39-4.png","element":"img","alt":"ˆ𝑓𝑘(x𝑘) − ˆ𝑓𝑘(x∗𝑘+1) = ⟨−∇ 𝑓 (x𝑘), x𝑘 − x∗𝑘+1⟩ − 1/2 ∥x𝑘 − x∗𝑘+1∥2𝐻𝑘 ≤ ∥∇ 𝑓 (x𝑘)∥ ∥x𝑘 − x∗𝑘+1∥ ≤ ∥∇ 𝑓 (x𝑘)∥ 
𝐷.","inline":true,"padRight":true},{"text":"If we denote:","element":"span"}],[{"style":{"width":"62%"},"width":1180,"height":65,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/39-5.png","element":"img"}],[{"text":"then, using the fact that ","element":"span"},{"style":{"height":16.59},"width":226.75,"height":41.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/39-6.png","element":"img","alt":" 𝜂𝑘 ≤ 1 + 𝜔𝐷2","inline":true},{"text":", we can bound the number of inner ACG steps in Line ","element":"span"},{"href":"#id-44","text":"10 ","element":"a"},{"text":"needed for any iteration ","element":"span"},{"style":{"height":12.4},"width":90.76,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/39-7.png","element":"img","alt":" 𝑘 ≥ 0","inline":true,"padRight":true},{"text":"in the first phase such that ","element":"span"},{"style":{"height":15.2},"width":353.72,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/39-8.png","element":"img","alt":" x𝑘 ∉ B(x∗, 𝑟) ∩ X as:","inline":true}],[{"id":"id-145","style":{"width":"69%"},"width":1296,"height":121,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/39-9.png","element":"img"}],[{"text":"As the SOCGS algorithm calls the Hessian oracle once, and the first-order oracle at most twice per iteration we can upper bound the total number of first-order and Hessian oracle calls using the bound shown in Equation (","element":"span"},{"href":"#id-144","text":"D.72","element":"a"},{"text":"). Combining the aforementioned bound with the bound on the total number of linear minimization oracle calls per iteration in Equation (","element":"span"},{"href":"#id-145","text":"D.76","element":"a"},{"text":") we can bound the total number of linear minimization oracle calls. 
Therefore in this phase we will need:","element":"span"}],[{"style":{"width":"99%"},"width":1872,"height":338,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/39-10.png","element":"img"}],[{"text":"In this phase we can guarantee that if ","element":"span"},{"style":{"height":19.43},"width":496.41,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/39-11.png","element":"img","alt":" xACG𝑘 ∈ B(x∗, 𝑟) ∩ X \\ F (x∗)","inline":true,"padRight":true},{"text":"then the ACG step in Line ","element":"span"},{"href":"#id-44","text":"4 ","element":"a"},{"text":"will be an away-step that reduces the cardinality of the active set ","element":"span"},{"style":{"height":19.43},"width":99.28,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/39-12.png","element":"img","alt":" SACG𝑘","inline":true,"padRight":true},{"text":", satisfying that ","element":"span"},{"style":{"height":19.43},"width":293.2,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/39-13.png","element":"img","alt":" |SACG𝑘 | > |SACG𝑘+1 |","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":19.43},"width":483.18,"height":48.57,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/39-14.png","element":"img","alt":"SACG𝑘 \\ SACG𝑘+1 ∉ vert(F (x∗))","inline":true},{"text":". 
Similarly, if ","element":"span"},{"style":{"height":15.2},"width":432.56,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/39-15.png","element":"img","alt":" x𝑘 ∈ B(x∗, 𝑟) ∩ X \\ F (x∗)","inline":true,"padRight":true},{"text":"then the ACG steps in Line ","element":"span"},{"href":"#id-44","text":"10 ","element":"a"},{"text":"will also be away-steps that reduce the cardinality of the active set ","element":"span"},{"style":{"height":13.59},"width":42.37,"height":33.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/39-16.png","element":"img","alt":" S𝑘","inline":true},{"text":", that is, after exiting the while loop in Line ","element":"span"},{"href":"#id-44","text":"12 ","element":"a"},{"text":"of Algorithm ","element":"span"},{"href":"#id-44","text":"2 ","element":"a"},{"text":"we have that ","element":"span"},{"style":{"height":18.41},"width":214.47,"height":46.04,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/39-17.png","element":"img","alt":" |S𝑘| > | ˜S𝑘+1|","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":18.81},"width":409.74,"height":47.04,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/39-18.png","element":"img","alt":" S𝑘 \\ ˜S𝑘+1 ⊄ vert(F (x∗))","inline":true},{"text":". 
This behaviour will continue until ","element":"span"},{"style":{"height":19.59},"width":563.78,"height":48.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/39-19.png","element":"img","alt":"xACG𝑘 ∈ F (x∗) and ˜x𝑡+1𝑘+1 ∈ F (x∗).","inline":true}],[{"text":"Therefore we need to bound the number of vertices that have to be dropped from both ","element":"span"},{"style":{"height":19.42},"width":99.28,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/39-20.png","element":"img","alt":" SACG𝑘","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":13.59},"width":42.37,"height":33.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/39-21.png","element":"img","alt":" S𝑘","inline":true,"padRight":true},{"text":"in order for ","element":"span"},{"style":{"height":19.43},"width":352.86,"height":48.57,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/39-22.png","element":"img","alt":" SACG𝑘 ⊆ vert(F (x∗))","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":15.2},"width":297.28,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/39-23.png","element":"img","alt":" S𝑘 ⊆ vert(F (x∗))","inline":true},{"text":". 
The ACG algorithm in Line ","element":"span"},{"href":"#id-44","text":"4 ","element":"a"},{"text":"will have picked up at most ","element":"span"},{"style":{"height":12.39},"width":36.44,"height":30.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/39-24.png","element":"img","alt":" 𝑇1","inline":true,"padRight":true},{"text":"vertices in the first phase (as each iteration can only add one vertex to ","element":"span"},{"style":{"height":14.59},"width":99.28,"height":36.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/39-25.png","element":"img","alt":" SACG","inline":true,"padRight":true},{"text":"in Line ","element":"span"},{"href":"#id-44","text":"4","element":"a"},{"text":"), on the other hand, the PVM steps in Lines ","element":"span"},{"href":"#id-44","text":"9","element":"a"},{"text":"-","element":"span"},{"href":"#id-44","text":"12 ","element":"a"},{"text":"will have picked up at most ","element":"span"},{"style":{"height":20.77},"width":163.22,"height":51.93,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/39-26.png","element":"img","alt":"∑𝑇1𝑘=1 𝑁𝑘,1","inline":true,"padRight":true},{"text":"vertices. 
As once inside the ","element":"span"},{"text":"ball all ACG steps (both in Line ","element":"span"},{"href":"#id-44","text":"4 ","element":"a"},{"text":"and Lines ","element":"span"},{"href":"#id-44","text":"9","element":"a"},{"text":"-","element":"span"},{"href":"#id-44","text":"12","element":"a"},{"text":") reduce the cardinality of the active set, and using the bounds in Equation (","element":"span"},{"href":"#id-144","text":"D.72","element":"a"},{"text":") and (","element":"span"},{"href":"#id-145","text":"D.76","element":"a"},{"text":"), we will need:","element":"span"}],[{"style":{"width":"88%"},"width":1653,"height":120,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/39-27.png","element":"img"}],[{"text":"We now need to bound the number of first-order oracle calls needed to drop the aforementioned vertices. The ACG algorithm in Line ","element":"span"},{"href":"#id-44","text":"4 ","element":"a"},{"text":"will need to call the first-order oracle at most ","element":"span"},{"style":{"height":12.39},"width":36.44,"height":30.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/40-0.png","element":"img","alt":" 𝑇1","inline":true,"padRight":true},{"text":"times. On the other hand, we need to bound the number of vertices that the PVM steps will drop per first-order oracle call in Lines ","element":"span"},{"href":"#id-44","text":"9","element":"a"},{"text":"-","element":"span"},{"href":"#id-44","text":"12","element":"a"},{"text":", for which we will use the following Lemma:","element":"span"}],[{"style":{"height":16.99},"width":661.5,"height":42.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/40-1.png","element":"img","alt":"Lemma D.11. 
If 𝑓 (x𝑘) − 𝑓 (x∗) ≤ 4𝜇2 ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"then the Inexact PVM steps in Lines ","element":"span"},{"href":"#id-44","style":{"fontStyle":"italic"},"text":"9","element":"a"},{"style":{"fontStyle":"italic"},"text":"-","element":"span"},{"href":"#id-44","style":{"fontStyle":"italic"},"text":"12 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"of Algorithm ","element":"span"},{"href":"#id-44","style":{"fontStyle":"italic"},"text":"2 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"will perform at least one ACG step in Line ","element":"span"},{"href":"#id-44","style":{"fontStyle":"italic"},"text":"10","element":"a"},{"style":{"fontStyle":"italic"},"text":".","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"We use proof by contradiction, and we assume that to compute the Inexact PVM step to the necessary accuracy we did not perform any ACG steps in Line ","element":"span"},{"href":"#id-44","text":"10","element":"a"},{"text":", that is:","element":"span"}],[{"style":{"width":"42%"},"width":796,"height":303,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/40-2.png","element":"img"}],[{"text":"Where the last inequality follows from convexity. 
","element":"span"},{"text":"Using the previous chain of inequalities along with ","element":"span"},{"style":{"height":18.01},"width":544.67,"height":45.02,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/40-3.png","element":"img","alt":"𝑓 (x𝑘) − 𝑓 (x∗) ≤ ∥∇ 𝑓 (x𝑘)∥2 /2𝜇","inline":true,"padRight":true},{"text":"from ","element":"span"},{"style":{"height":10.8},"width":23,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/40-4.png","element":"img","alt":" 𝜇","inline":true},{"text":"-strong convexity we have that ","element":"span"},{"style":{"height":16.99},"width":351.89,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/40-5.png","element":"img","alt":" 𝑓 (x𝑘) − 𝑓 (x∗) > 4𝜇2","inline":true},{"text":", which is the desired contradiction. ","element":"span"},{"style":{"height":0},"width":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/40-6.png","element":"img","alt":"□","inline":true,"padRight":true},{"text":"We assume that ","element":"span"},{"style":{"height":16.4},"width":140.54,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/40-7.png","element":"img","alt":" 𝑟 < √8𝜇","inline":true},{"text":", which allows us to claim that the primal gap for any point ","element":"span"},{"style":{"height":34.33},"width":1864.63,"height":85.82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/40-8.png","element":"img","alt":" x𝑘 ∈ B(x∗, 𝑟) satisfies𝑓 (x𝑘) − 𝑓 (x∗) ≤ 4𝜇2 ","inline":true,"padRight":true},{"text":"(otherwise it simply takes a constant number of iterations to achieve this once in ","element":"span"},{"style":{"height":15.2},"width":144.96,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/40-9.png","element":"img","alt":" B(x∗, 𝑟),","inline":true,"padRight":true},{"text":"as the primal gap contracts at least linearly). 
Therefore in this phase we will need:","element":"span"}],[{"style":{"width":"99%"},"width":1872,"height":188,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/40-10.png","element":"img"}],[{"text":"Let ","element":"span"},{"style":{"height":10.4},"width":26,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/40-11.png","element":"img","alt":" 𝑇","inline":true,"padRight":true},{"text":"denote the first iteration of the final phase, where ","element":"span"},{"style":{"height":19.18},"width":512.34,"height":47.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/40-12.png","element":"img","alt":" {x𝑇, xACG𝑇 } ∈ B(x∗, 𝑟) ∩ F (x∗)","inline":true,"padRight":true},{"text":"and the quadratic rate dominates over the linear rate. Using the quadratic convergence in primal gap shown in Theorem ","element":"span"},{"href":"#id-142","text":"D.9 ","element":"a"},{"text":"we have that:","element":"span"}],[{"style":{"width":"80%"},"width":1506,"height":239,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/40-13.png","element":"img"}],[{"text":"Where we have used the fact that by Assumption ","element":"span"},{"href":"#id-41","text":"2 ","element":"a"},{"text":"we have that ","element":"span"},{"style":{"height":17.61},"width":534.57,"height":44.02,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/40-14.png","element":"img","alt":" 𝜂𝑘 ≤ 1 + 𝜔 ∥x𝑘 − x∗∥2 ≤ 1 + 𝜔𝑟2","inline":true},{"text":". 
Therefore in order to reach a ","element":"span"},{"style":{"height":7.6},"width":20,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/40-15.png","element":"img","alt":" 𝜀","inline":true},{"text":"-optimal solution starting from this phase we need:","element":"span"}],[{"style":{"width":"74%"},"width":1393,"height":98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/40-16.png","element":"img"}],[{"text":"Where we have only included the dependence on ","element":"span"},{"style":{"height":7.6},"width":20,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/40-17.png","element":"img","alt":" 𝜀","inline":true,"padRight":true},{"text":"for notational convenience. If we denote by ","element":"span"},{"style":{"height":13.99},"width":70.48,"height":34.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/40-18.png","element":"img","alt":" 𝑁𝑘,3","inline":true,"padRight":true},{"text":"the number of inner ACG steps in Line ","element":"span"},{"href":"#id-44","text":"10 ","element":"a"},{"text":"that we need to take to satisfy the exit criterion shown in Line ","element":"span"},{"href":"#id-44","text":"9 ","element":"a"},{"text":"of Algorithm ","element":"span"},{"href":"#id-44","text":"2 ","element":"a"},{"text":"at iteration ","element":"span"},{"style":{"height":11.6},"width":19,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/40-19.png","element":"img","alt":" 𝑘","inline":true,"padRight":true},{"text":"during this last phase and we use the fact that ","element":"span"},{"style":{"height":15.2},"width":291.98,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/40-20.png","element":"img","alt":" 𝑓 (x𝑘) − 𝑓 (x∗) ≥ 𝜀","inline":true,"padRight":true},{"text":"for all suboptimal iterates, resulting 
in:","element":"span"}],[{"style":{"width":"65%"},"width":1218,"height":121,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/40-21.png","element":"img"}],[{"text":"Therefore combining the bound on the total number of iterations in this phase with the bound on the number of linear minimization oracle calls per iteration we need:","element":"span"}],[{"style":{"width":"86%"},"width":1622,"height":120,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/40-22.png","element":"img"}],[{"text":"The results for all these phases can be seen in Table ","element":"span"},{"href":"#id-146","text":"2","element":"a"},{"text":".","element":"span"}],[{"id":"id-146","style":{"width":"80%"},"width":1500,"height":367,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/41-0.png","element":"img"}],[{"text":"Table 2: Oracle complexity to reach an ","element":"figcaption","subtype":"caption"},{"style":{"height":7.6},"width":20,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/41-1.png","element":"img","alt":" 𝜀","inline":true},{"text":"-optimal solution to Problem ","element":"figcaption","subtype":"caption"},{"href":"#id-0","text":"1.1 ","element":"a","subtype":"caption"},{"text":"for the SOCGS algorithm (Algorithm ","element":"figcaption","subtype":"caption"},{"href":"#id-44","text":"2","element":"a","subtype":"caption"},{"text":").","element":"figcaption","subtype":"caption"}],[{"style":{"fontWeight":"bold"},"text":"Remark D.12. 
","element":"span"},{"text":"The constant ","element":"span"},{"style":{"height":7.2},"width":17,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/41-2.png","element":"img","alt":" 𝑟","inline":true,"padRight":true},{"text":"is an invariant of the function and feasible region under consideration and has been used in a similar fashion in (","element":"span"},{"href":"#id-22","referenceIndex":60,"text":"Wolfe","element":"a"},{"text":", ","element":"span"},{"href":"#id-22","referenceIndex":60,"text":"1970","element":"a"},{"text":"; ","element":"span"},{"href":"#id-29","referenceIndex":30,"text":"Guélat & Marcotte","element":"a"},{"text":", ","element":"span"},{"href":"#id-29","referenceIndex":30,"text":"1986","element":"a"},{"text":") and more recently in (","element":"span"},{"href":"#id-147","referenceIndex":22,"text":"Garber","element":"a"},{"text":", ","element":"span"},{"href":"#id-147","referenceIndex":22,"text":"2020","element":"a"},{"text":"), and although unknown, still makes the convergence analysis and complexity estimate conceptually useful, as it adds at most a constant number of iterations independent of ","element":"span"},{"style":{"height":7.6},"width":31.56,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/41-3.png","element":"img","alt":" 𝜀.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Remark D.13. 
","element":"span"},{"text":"Note that for simplicity we are implicitly assuming in the complexity analysis that the last iterate of the SOCGS algorithm at the end of Phase 2 satisfies ","element":"span"},{"style":{"height":18.05},"width":767.24,"height":45.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/41-4.png","element":"img","alt":" 𝑓 (x𝑘) − 𝑓 (x∗) ≤ [𝐿𝜂𝑘/(2𝜇4)(√8𝜇(1 +√𝐿𝜔) +","inline":true},{"style":{"height":18.71},"width":179.36,"height":46.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/41-5.png","element":"img","alt":"√𝜂𝑘𝐿2)]−2","inline":true},{"text":", as otherwise the convergence guarantee in Theorem ","element":"span"},{"href":"#id-142","text":"D.9 ","element":"a"},{"text":"does not provide a contraction. If this is not the case at the end of Phase 2, then after an additional finite number of linearly convergent iterations in primal gap, the iterates will indeed satisfy ","element":"span"},{"style":{"height":19.73},"width":956.27,"height":49.33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/41-6.png","element":"img","alt":" 𝑓 (x𝑘) − 𝑓 (x∗) ≤ [𝐿𝜂𝑘/(2𝜇4)(√8𝜇(1 +√𝐿𝜔) + √𝜂𝑘𝐿2)]−2","inline":true},{"text":", after which the complexity analysis from Phase 3 will apply.","element":"span"}],[{"id":"id-65","style":{"fontWeight":"bold"},"text":"Appendix E. Computational Results","element":"span"}],[{"text":"In this section we compare the performance of the SOCGS algorithm with that of other first-order projection-free algorithms for several problems of interest. 
In the first problem the Hessian oracle will be inexact, but will satisfy Assumption ","element":"span"},{"href":"#id-41","text":"2 ","element":"a"},{"text":"with ","element":"span"},{"style":{"height":11.2},"width":125.46,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/41-7.png","element":"img","alt":" 𝜔 = 0.1","inline":true},{"text":", moreover we will also assume knowledge of the primal gap, by first computing a solution to high accuracy. In the remaining problems the Hessian oracle will be exact, and we will assume that we do not have knowledge of the primal gap, and will use the strategy outlined in Remark ","element":"span"},{"href":"#id-58","text":"3.8","element":"a"},{"text":". In the second experiment, in addition to using the exact Hessian, we will also implement SOCGS with an LBFGS Hessian update (SOCGS LBFGS) (note that this does not satisfy Assumption ","element":"span"},{"href":"#id-41","text":"2","element":"a"},{"text":"). In the second and third experiment we will also cap the maximum number of inner iterations for the SOCGS and NCG algorithms, as is done in the computational experiments of NCG and SVRCG.","element":"span"}],[{"text":"In all three experiments we compare the performance of the SOCGS algorithm with the vanilla Conditional Gradients algorithm (denoted by CG), the Away-Step and Pairwise-Step Conditional Gradients algorithms (ACG and PCG), the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Lazy Away-Step Conditional Gradients algorithm ","element":"span"},{"text":"(","element":"span"},{"href":"#id-62","referenceIndex":9,"text":"Braun et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-62","referenceIndex":9,"text":"2017","element":"a"},{"text":") (ACG (L)). In the first problem the Hessian oracle will be inexact, but will satisfy Assumption ","element":"span"},{"href":"#id-41","text":"2","element":"a"},{"text":". 
In the remaining problems the Hessian oracle will be exact.","element":"span"}],[{"text":"In the first experiment we also compare the performance of the algorithm with the Decomposition Invariant Conditional Gradient (DICG) algorithm (","element":"span"},{"href":"#id-63","referenceIndex":24,"text":"Garber & Meshi","element":"a"},{"text":", ","element":"span"},{"href":"#id-63","referenceIndex":24,"text":"2016","element":"a"},{"text":"), as the feasible region is a ","element":"span"},{"style":{"height":14.4},"width":258.78,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/41-8.png","element":"img","alt":" 0 − 1 polytope.","inline":true}],[{"text":"We also compare against the Conditional Gradient Sliding (CGS) algorithm (","element":"span"},{"href":"#id-11","referenceIndex":41,"text":"Lan & Zhou","element":"a"},{"text":", ","element":"span"},{"href":"#id-11","referenceIndex":41,"text":"2016","element":"a"},{"text":") in the first experiment. This algorithm was also used in the second and third experiment, however the results were not competitive with the ones obtained for the other algorithms, both in terms of iteration count and wall-clock time, and so the CGS results are not included in the images for the second and third experiment.","element":"span"}],[{"text":"Additionally, in the first experiment we also compare against the Stochastic Variance-Reduced Conditional Gradients (SVRCG) algorithm (","element":"span"},{"href":"#id-64","referenceIndex":33,"text":"Hazan & Luo","element":"a"},{"text":", ","element":"span"},{"href":"#id-64","referenceIndex":33,"text":"2016","element":"a"},{"text":"), as we can take stochastic first-order oracles of the objective function in question. 
The third experiment has an objective function that is also amenable to stochastic first-order oracle calls, however the results obtained were not competitive with the other algorithms, both in terms of iteration count and wall-clock time, and so the results for this algorithm were not included in the images for the third experiment.","element":"span"}],[{"text":"In the second and third experiments, which use an exact second-order oracle, we also compare the performance against the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Newton Conditional Gradients ","element":"span"},{"text":"(NCG) algorithm in ","element":"span"},{"href":"#id-16","referenceIndex":46,"text":"Liu et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-16","referenceIndex":46,"text":"2022","element":"a"},{"text":") which is similar in spirit to the SOCGS algorithm. One of the key features of this algorithm is that it does not require an exact line search strategy, as it provides a specific step size strategy (however it requires selecting five hyperparameters), and it does not require estimating an upper bound on the primal gap.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Remark E.1 ","element":"span"},{"text":"(Hyperparameter search for the NCG algorithm)","element":"span"},{"style":{"fontWeight":"bold"},"text":". ","element":"span"},{"text":"We tested 27 hyperparameters for the NCG algorithm, and the one that provided the best performance was selected. 
The parameters used (see (","element":"span"},{"href":"#id-16","referenceIndex":46,"text":"Liu et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-16","referenceIndex":46,"text":"2022","element":"a"},{"text":") for their meaning) were combinations of ","element":"span"},{"style":{"height":14.8},"width":1065.47,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/42-0.png","element":"img","alt":" 𝐶1 ∈ {0.1, 0.25, 0.4}, 𝛿 ∈ {0.01, 0.5, 0.99} and 𝐶 ∈ {1.1, 1.5, 2}.","inline":true,"padRight":true},{"text":"The two remaining hyperparameters were chosen as ","element":"span"},{"style":{"height":21.78},"width":290.62,"height":54.45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/42-1.png","element":"img","alt":" 𝛽 = 1/2 (1 − 1/(2 − 1/𝐶))","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":24.02},"width":436.42,"height":60.05,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/42-2.png","element":"img","alt":" 𝜎 = 1/(𝐶(1−𝛽)) + 𝛽/((1−2𝛽)(1−𝛽)²)","inline":true,"padRight":true},{"text":"so as ","element":"span"},{"text":"to satisfy the requirements in Theorem 4.2 in (","element":"span"},{"href":"#id-16","referenceIndex":46,"text":"Liu et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-16","referenceIndex":46,"text":"2022","element":"a"},{"text":"). The hyperparameters that gave the best performance were ","element":"span"},{"style":{"height":15.2},"width":901.32,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/42-3.png","element":"img","alt":" 𝜎 = 0.96, 𝛽 = 1/6.0, 𝐶 = 2.0, 𝐶1 = 0.25 and 𝛿 = 0.99.","inline":true}],[{"text":"One of the key challenges that we found when implementing the NCG algorithm is the management of the active set. 
Starting from a given point ","element":"span"},{"style":{"height":9.59},"width":37.21,"height":23.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/42-4.png","element":"img","alt":" x𝑘","inline":true,"padRight":true},{"text":"the algorithm builds a quadratic approximation and performs a series of CG variant steps until the algorithm reaches a certain Frank-Wolfe gap (like in the SOCGS algorithm), which we denote by ","element":"span"},{"style":{"height":19.43},"width":92.28,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/42-5.png","element":"img","alt":" ˜xNCG𝑘","inline":true,"padRight":true},{"text":". At that point the algorithm either takes a step with ","element":"span"},{"style":{"height":16.4},"width":261.61,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/42-6.png","element":"img","alt":" 𝛾𝑘 = 1 (what is","inline":true,"padRight":true},{"text":"called a full step), or it takes a step size ","element":"span"},{"style":{"height":13.6},"width":107.81,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/42-7.png","element":"img","alt":" 𝛾𝑘 ≠ 1","inline":true,"padRight":true},{"text":"(which is called a damped step). In the former case the active set and the barycentric coordinates used for ","element":"span"},{"style":{"height":9.59},"width":70.94,"height":23.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/42-8.png","element":"img","alt":" x𝑘+1","inline":true,"padRight":true},{"text":"are simply those of ","element":"span"},{"style":{"height":19.43},"width":92.28,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/42-9.png","element":"img","alt":" ˜xNCG𝑘","inline":true,"padRight":true},{"text":", which is the point returned by the CG variant steps. 
In the latter case, however, we set ","element":"span"},{"style":{"height":19.43},"width":656.56,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/42-10.png","element":"img","alt":" x𝑘+1 = x𝑘 + 𝛾𝑘(˜xNCG𝑘 − x𝑘) with 𝛾𝑘 ≠ 1","inline":true},{"text":", and we need to combine the active sets and barycentric coordinates of the points ","element":"span"},{"style":{"height":9.59},"width":37.76,"height":23.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/42-11.png","element":"img","alt":" x𝑘","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":19.43},"width":92.29,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/42-12.png","element":"img","alt":" ˜xNCG𝑘","inline":true,"padRight":true},{"text":"to form ","element":"span"},{"style":{"height":9.59},"width":71.16,"height":23.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/42-13.png","element":"img","alt":" x𝑘+1","inline":true},{"text":". This is a computationally expensive task in general, as the CG variant can drop and pick-up an arbitrary number of vertices going from ","element":"span"},{"style":{"height":9.59},"width":37.76,"height":23.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/42-14.png","element":"img","alt":" x𝑘","inline":true,"padRight":true},{"text":"to ","element":"span"},{"style":{"height":19.43},"width":92.29,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/42-15.png","element":"img","alt":" ˜xNCG𝑘","inline":true,"padRight":true},{"text":", and we need to reconcile the two active sets and barycentric coordinates. 
This process involves checking if each vertex in the active set of ","element":"span"},{"style":{"height":19.43},"width":92.28,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/42-16.png","element":"img","alt":" ˜xNCG𝑘","inline":true,"padRight":true},{"text":"is in the active set of ","element":"span"},{"style":{"height":9.19},"width":36.92,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/42-17.png","element":"img","alt":" x𝑘","inline":true},{"text":", and vice-versa. When the dimensionality of the problem and the cardinality of the active set are high this can become too costly. That is why in general this algorithm is easiest to implement with CG variants that do not maintain an active set, like the vanilla CG algorithm or the DICG algorithm. We have chosen to use the vanilla CG algorithm in our implementation, as it gave good performance. Note however that there are simple feasible regions where updating the active set and the barycentric coordinates is trivial, like in the probability simplex.","element":"span"}],[{"text":"The experiments were run on a laptop with Windows 10, an Intel Core i7 2.4GHz CPU and 6GB RAM.","element":"span"}],[{"id":"id-67","style":{"fontWeight":"bold"},"text":"E.1 Sparse Coding over the Birkhoff Polytope","element":"span"}],[{"text":"Given a set of ","element":"span"},{"style":{"height":7.2},"width":29,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/42-18.png","element":"img","alt":" 𝑚","inline":true,"padRight":true},{"text":"input data points ","element":"span"},{"style":{"height":16.98},"width":511.8,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/42-19.png","element":"img","alt":" 𝑌 = [y1, · · · , y𝑚] with y𝑖 ∈ ℝ𝑑","inline":true},{"text":", sparse dictionary learning attempts to find a dictionary 
","element":"span"},{"style":{"height":14.19},"width":158.67,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/42-20.png","element":"img","alt":" 𝑋 ∈ ℝ𝑑×𝑛 ","inline":true,"padRight":true},{"text":"and a sparse representation ","element":"span"},{"style":{"height":14.4},"width":502.18,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/42-21.png","element":"img","alt":" 𝑍 = [z1, · · · , z𝑚] with z𝑖 ∈ ℝ𝑛 ","inline":true,"padRight":true},{"text":"that minimizes:","element":"span"}],[{"style":{"width":"63%"},"width":1195,"height":119,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/42-22.png","element":"img"}],[{"text":"Where ","element":"span"},{"style":{"height":21.34},"width":717.13,"height":53.35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/42-23.png","element":"img","alt":" C = {𝑋 ∈ ℝ𝑑×𝑛 | �𝑛𝑗=1 𝑋2𝑗,𝑖 ≤ 1, ∀𝑖 ∈ [1, 𝑑]}","inline":true,"padRight":true},{"text":"is the set of matrices with columns with ","element":"span"},{"style":{"height":13.59},"width":31.82,"height":33.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/42-24.png","element":"img","alt":" ℓ2","inline":true,"padRight":true},{"text":"norm less than ","element":"span"},{"text":"one. This problem is of interest as many signal processing tasks see performance boosts when given a learned dictionary ","element":"span"},{"style":{"height":10.4},"width":29,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/42-25.png","element":"img","alt":" 𝑋","inline":true,"padRight":true},{"text":"that is able to give a sparse representation (","element":"span"},{"href":"#id-148","referenceIndex":47,"text":"Mairal et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-148","referenceIndex":47,"text":"2010","element":"a"},{"text":"), as opposed to a predefined dictionary obtained from Fourier or wavelet transforms. 
The elements in this learned dictionary are not required to be orthogonal, and they can form an undercomplete or an overcomplete dictionary.","element":"span"}],[{"style":{"width":"100%"},"width":1876,"height":528,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/42-26.png","element":"img"}],[{"text":"The gradient of ","element":"span"},{"style":{"height":15.2},"width":77.61,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-0.png","element":"img","alt":" 𝑓 (𝑋)","inline":true,"padRight":true},{"text":"amounts to computing ","element":"span"},{"style":{"height":18.54},"width":482.39,"height":46.35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-1.png","element":"img","alt":" ∇ 𝑓 (𝑋) = ∑𝑚𝑖=1 −2(y𝑖 −𝑋z𝑖)z𝑇𝑖 ","inline":true,"padRight":true},{"text":"and the Hessian is given by the block ","element":"span"},{"text":"diagonal matrix ","element":"span"},{"style":{"height":19.21},"width":1115.32,"height":48.03,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-2.png","element":"img","alt":" ∇2 𝑓 (𝑋) ∈ ℝ𝑛2×𝑛2 with ∇2 𝑓 (𝑋) = diag [𝐵, · · · , 𝐵] where 𝐵 ∈ ℝ𝑛×𝑛 ","inline":true,"padRight":true},{"text":"has the form ","element":"span"},{"style":{"height":18.54},"width":235.6,"height":46.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-3.png","element":"img","alt":" 𝐵 = ∑𝑚𝑖=1 z𝑖z𝑇𝑖 .","inline":true,"padRight":true},{"text":"Therefore ","element":"span"},{"style":{"height":11.2},"width":25,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-4.png","element":"img","alt":" 𝐵","inline":true,"padRight":true},{"text":"will be positive definite as long as we can form a basis for ","element":"span"},{"style":{"height":10.99},"width":46.35,"height":27.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-5.png","element":"img","alt":" ℝ𝑛 ","inline":true,"padRight":true},{"text":"with the vectors 
","element":"span"},{"style":{"height":14.4},"width":327.95,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-6.png","element":"img","alt":" z𝑖, with 𝑖 ∈ [1, 𝑚].","inline":true,"padRight":true},{"text":"This is verified numerically. As the eigenvalues of a block-diagonal matrix are the eigenvalues of the blocks that form the diagonal, and as we verify that ","element":"span"},{"style":{"height":11.2},"width":25,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-7.png","element":"img","alt":" 𝐵","inline":true,"padRight":true},{"text":"is positive definite, the function ","element":"span"},{"style":{"height":15.2},"width":77.62,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-8.png","element":"img","alt":" 𝑓 (𝑋)","inline":true,"padRight":true},{"text":"is ","element":"span"},{"style":{"height":10.8},"width":23,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-9.png","element":"img","alt":" 𝜇","inline":true},{"text":"-strongly convex and ","element":"span"},{"style":{"height":10.4},"width":24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-10.png","element":"img","alt":" 𝐿","inline":true},{"text":"-smooth. The complexity of the gradient computation scales as ","element":"span"},{"style":{"height":16.99},"width":142.41,"height":42.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-11.png","element":"img","alt":" O(𝑚𝑛2).","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Remark E.2 ","element":"span"},{"text":"(On the complexity of linear oracles for the Birkhoff polytope)","element":"span"},{"style":{"fontWeight":"bold"},"text":". 
","element":"span"},{"text":"Solving an LP exactly over the Birkhoff polytope using the Hungarian algorithm (from combinatorial optimization) has complexity ","element":"span"},{"style":{"height":16.99},"width":98.43,"height":42.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-12.png","element":"img","alt":" O(𝑛3)","inline":true},{"text":". Thus it is more expensive to compute the gradient ","element":"span"},{"style":{"height":15.2},"width":111.41,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-13.png","element":"img","alt":" ∇ 𝑓 (𝑋)","inline":true,"padRight":true},{"text":"than it is to solve an LP over the Birkhoff polytope if ","element":"span"},{"style":{"height":14.4},"width":179.02,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-14.png","element":"img","alt":" 𝑚 is large.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Remark E.3 ","element":"span"},{"text":"(On the complexity of projection oracles for the Birkhoff polytope)","element":"span"},{"style":{"fontWeight":"bold"},"text":". ","element":"span"},{"text":"There are no known algorithms to compute exact projections onto the Birkhoff polytope, and as such projections onto this feasible region have to be computed approximately. For example, if we use an interior-point method to compute a projection onto the Birkhoff polytope, the projection is computed to a certain accuracy (say ","element":"span"},{"style":{"height":11.24},"width":23.13,"height":28.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-15.png","element":"img","alt":" ˆ𝜀","inline":true},{"text":"), and as such the complexity will depend on a ","element":"span"},{"style":{"height":14.8},"width":120.1,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-16.png","element":"img","alt":" log 1/ˆ𝜀","inline":true,"padRight":true},{"text":"term. 
Moreover, to represent the constraints of the Birkhoff polytope we need ","element":"span"},{"style":{"height":13.79},"width":36.44,"height":34.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-17.png","element":"img","alt":" 𝑛2 ","inline":true,"padRight":true},{"text":"linear inequality constraints and ","element":"span"},{"style":{"height":10.8},"width":101.38,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-18.png","element":"img","alt":" 2𝑛 − 1","inline":true,"padRight":true},{"text":"linear equality constraints. We can get rid of the equality constraints by adding ","element":"span"},{"style":{"height":15.2},"width":154.79,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-19.png","element":"img","alt":" 2(2𝑛 − 1)","inline":true,"padRight":true},{"text":"inequality constraints. We can transform the projection problem with a quadratic objective function and linear inequality constraints into a problem with a linear objective function and quadratic/linear inequality constraints using standard optimization techniques. This means that we have ","element":"span"},{"style":{"height":16.99},"width":98.43,"height":42.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-20.png","element":"img","alt":" O(𝑛2)","inline":true,"padRight":true},{"text":"inequality constraints, and the dimensionality of our problem is ","element":"span"},{"style":{"height":13.79},"width":36.44,"height":34.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-21.png","element":"img","alt":" 𝑛2","inline":true},{"text":". 
If we use a path following interior-point method, and we use the complexity guarantee from Equation 10.12 in (","element":"span"},{"href":"#id-149","referenceIndex":50,"text":"Nemirovski","element":"a"},{"text":", ","element":"span"},{"href":"#id-149","referenceIndex":50,"text":"2004","element":"a"},{"text":") the resulting complexity to reach an ","element":"span"},{"style":{"height":11.24},"width":23.13,"height":28.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-22.png","element":"img","alt":" ˆ𝜀","inline":true},{"text":"-optimal solution is ","element":"span"},{"style":{"height":17.39},"width":222.38,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-23.png","element":"img","alt":" O(𝑛7 log 1/ˆ𝜀)","inline":true},{"text":". Note that in the complexity guarantee in the reference, the ambient dimension is ","element":"span"},{"style":{"height":7.2},"width":20,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-24.png","element":"img","alt":" 𝑛","inline":true},{"text":", whereas in our case it is ","element":"span"},{"style":{"height":13.79},"width":36.44,"height":34.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-25.png","element":"img","alt":" 𝑛2","inline":true},{"text":", and the number of constraints is ","element":"span"},{"style":{"height":13.79},"width":36.45,"height":34.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-26.png","element":"img","alt":" 𝑛2","inline":true,"padRight":true},{"text":"as opposed to ","element":"span"},{"style":{"height":7.2},"width":29,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-27.png","element":"img","alt":" 𝑚","inline":true},{"text":". 
The cost of these projection oracles justifies the use of conditional gradient algorithms to minimize convex functions over the Birkhoff polytope.","element":"span"}],[{"text":"We generate synthetic data by creating a matrix ","element":"span"},{"style":{"height":12.99},"width":360.63,"height":32.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-28.png","element":"img","alt":" 𝐵 ∈ ℝ𝑛×𝑛 with 𝑛 = 80","inline":true,"padRight":true},{"text":"and entries sampled from a standard normal distribution, and ","element":"span"},{"style":{"height":7.2},"width":29,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-29.png","element":"img","alt":" 𝑚","inline":true,"padRight":true},{"text":"vectors ","element":"span"},{"style":{"height":10.99},"width":116.21,"height":27.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-30.png","element":"img","alt":" x ∈ ℝ𝑛","inline":true},{"text":", with entries sampled from a standard normal distribution, in order to form ","element":"span"},{"style":{"height":14.8},"width":279.38,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-31.png","element":"img","alt":" 𝑍 = {z1, · · · , z𝑚}","inline":true},{"text":". 
The set of vectors ","element":"span"},{"style":{"height":14.82},"width":286.98,"height":37.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-32.png","element":"img","alt":" 𝑌 = {y1, · · · , y𝑚}","inline":true,"padRight":true},{"text":"is generated by computing ","element":"span"},{"style":{"height":13.6},"width":133.4,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-33.png","element":"img","alt":" y𝑖 = 𝐵z𝑖","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"height":16},"width":179.18,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-34.png","element":"img","alt":" 𝑖 ∈ ⟦1, 𝑚⟧.","inline":true}],[{"text":"Let us denote the Frobenius norm by ","element":"span"},{"style":{"height":18.78},"width":70.34,"height":46.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-35.png","element":"img","alt":" ∥·∥2𝐹","inline":true},{"text":", and the uniform distribution between ","element":"span"},{"style":{"height":7.6},"width":20,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-36.png","element":"img","alt":" 𝑎","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":11.6},"width":20,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-37.png","element":"img","alt":" 𝑏","inline":true,"padRight":true},{"text":"as ","element":"span"},{"style":{"height":15.2},"width":129.27,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-38.png","element":"img","alt":" U(𝑎, 𝑏)","inline":true},{"text":". 
","element":"span"},{"text":"In this problem the Hessian oracle will return a matrix ","element":"span"},{"style":{"height":18.77},"width":615.06,"height":46.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-39.png","element":"img","alt":" 𝐻𝑘 = ∇2 𝑓 (𝑋𝑘) + 𝛽𝑘𝜔 ∥𝑋𝑘 − 𝑋∗∥2𝐹 𝐼𝑛","inline":true},{"text":", where ","element":"span"},{"style":{"height":14},"width":78.38,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-40.png","element":"img","alt":" 𝛽𝑘 ∈","inline":true},{"style":{"height":18.77},"width":987.6,"height":46.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-41.png","element":"img","alt":"U(−𝜆max(∇2 𝑓 (𝑋𝑘))/(𝜔 ∥𝑋𝑘 − 𝑋∗∥2𝐹 + 1), 𝜆min(∇2 𝑓 (𝑋𝑘))).","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Remark E.4. ","element":"span"},{"text":"The approximate matrix ","element":"span"},{"style":{"height":18.77},"width":703.95,"height":46.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-42.png","element":"img","alt":" 𝐻𝑘 = ∇2 𝑓 (𝑋𝑘) + 𝛽𝑘𝜔 ∥𝑋𝑘 − 𝑋∗∥2𝐹 𝐼𝑛 with:","inline":true}],[{"style":{"width":"69%"},"width":1302,"height":121,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-43.png","element":"img"}],[{"text":"satisfies Assumption ","element":"span"},{"href":"#id-41","text":"2","element":"a"},{"text":".","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"Proof. 
","element":"span"},{"text":"To see this note that ","element":"span"},{"style":{"height":19.03},"width":941.43,"height":47.57,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-44.png","element":"img","alt":" 𝜂𝑘 = max{𝜆max(𝐻−1𝑘 ∇2 𝑓 (𝑋𝑘)), 𝜆max([∇2 𝑓 (𝑋𝑘)]−1𝐻𝑘)}","inline":true,"padRight":true},{"text":"and if we plug in the ","element":"span"},{"text":"approximation for the Hessian we have that:","element":"span"}],[{"style":{"height":18.99},"width":1593.42,"height":47.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-45.png","element":"img","alt":"𝜆max([∇2 𝑓 (𝑋𝑘)]−1𝐻𝑘) = 𝜆max([∇2 𝑓 (𝑋𝑘)]−1(∇2 𝑓 (𝑋𝑘) + 𝛽𝑘𝜔 ∥𝑋𝑘 − 𝑋∗∥2𝐹 𝐼𝑛)) (E.3)","inline":true},{"style":{"height":18.99},"width":1198.29,"height":47.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-46.png","element":"img","alt":"= 1 + 𝛽𝑘𝜔 ∥𝑋𝑘 − 𝑋∗∥2𝐹 𝜆max([∇2 𝑓 (𝑋𝑘)]−1) (E.4)","inline":true},{"style":{"height":18.99},"width":1198.29,"height":47.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/43-47.png","element":"img","alt":"= 1 + 𝛽𝑘𝜔 ∥𝑋𝑘 − 𝑋∗∥2𝐹 /𝜆min(∇2 𝑓 (𝑋𝑘)). 
(E.5)","inline":true}],[{"text":"On the other hand:","element":"span"}],[{"style":{"width":"101%"},"width":1895,"height":609,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/44-0.png","element":"img"}],[{"text":"The results for ","element":"span"},{"style":{"height":11.2},"width":180.5,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/44-1.png","element":"img","alt":" 𝑚 = 10000","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":11.2},"width":200.82,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/44-2.png","element":"img","alt":" 𝑚 = 100000","inline":true,"padRight":true},{"text":"can be seen in Figure ","element":"span"},{"href":"#id-150","text":"7 ","element":"a"},{"text":"and Figure ","element":"span"},{"href":"#id-151","text":"8 ","element":"a"},{"text":"respectively. In both cases, the initial point used for all the algorithms is the identity matrix ","element":"span"},{"style":{"height":12.59},"width":70.96,"height":31.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/44-3.png","element":"img","alt":" 𝐼𝑛×𝑛","inline":true},{"text":". We can see that the SOCGS algorithm (with the DICG algorithm as a subproblem solver for the PVM steps) outperforms all the other algorithms being considered for both moderate to high values of ","element":"span"},{"style":{"height":7.2},"width":29,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/44-4.png","element":"img","alt":" 𝑚","inline":true},{"text":". The performance of the SVRCG algorithm improves relative to the other algorithms as we increase the value of ","element":"span"},{"style":{"height":7.2},"width":29,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/44-5.png","element":"img","alt":" 𝑚","inline":true},{"text":", as expected. 
We use the original implementation of the CGS algorithm for strongly-convex and smooth functions shown in ","element":"span"},{"href":"#id-11","referenceIndex":41,"text":"Lan & Zhou ","element":"a"},{"text":"(","element":"span"},{"href":"#id-11","referenceIndex":41,"text":"2016","element":"a"},{"text":"), which uses CG to solve the Euclidean projection subproblems that arise in Nesterov’s Accelerated Gradient Descent. The poor performance of the CGS algorithm can be explained with the fact that the CG algorithm does not contract the Frank-Wolfe gap linearly in general, and the accuracy to which the subproblems are solved increases with each iteration, and so at some point the subproblems become very computationally expensive to solve.","element":"span"}],[{"style":{"width":"64%"},"width":1217,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/44-6.png","element":"img"}],[{"text":"Given a binary classification task with ","element":"span"},{"style":{"height":7.2},"width":29,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/44-7.png","element":"img","alt":" 𝑚","inline":true,"padRight":true},{"text":"labels ","element":"span"},{"style":{"height":14.8},"width":299.55,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/44-8.png","element":"img","alt":" 𝑌 = {y1, · · · , y𝑚}","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":7.2},"width":29,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/44-9.png","element":"img","alt":" 𝑚","inline":true,"padRight":true},{"text":"samples ","element":"span"},{"style":{"height":14.8},"width":291.82,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/44-10.png","element":"img","alt":" 𝑍 = {z1, · · · , z𝑚}","inline":true,"padRight":true},{"text":"with 
","element":"span"},{"style":{"height":15.2},"width":695.03,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/44-11.png","element":"img","alt":"𝑦𝑖 ∈ {−1, 1} and z𝑖 ∈ ℝ𝑛 for all 𝑖 ∈ [1, 𝑚]","inline":true},{"text":", we wish to solve:","element":"span"}],[{"style":{"width":"45%"},"width":845,"height":108,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/44-12.png","element":"img"}],[{"text":"where ","element":"span"},{"text":"X ","element":"span"},{"text":"is the ","element":"span"},{"style":{"height":13.59},"width":31.82,"height":33.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/44-13.png","element":"img","alt":" ℓ1","inline":true,"padRight":true},{"text":"unit ball centered at the origin and ","element":"span"},{"style":{"height":14.8},"width":141.48,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/44-14.png","element":"img","alt":" 𝜆 = 1/𝑚","inline":true},{"text":". Although projecting into the ","element":"span"},{"style":{"height":13.59},"width":31.82,"height":33.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/44-15.png","element":"img","alt":" ℓ1","inline":true,"padRight":true},{"text":"ball has complexity ","element":"span"},{"href":"#id-152","referenceIndex":15,"style":{"height":16.4},"width":343.96,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/44-16.png","element":"img","alt":" O(𝑛) (Condat, 2016","inline":true},{"text":"), and so projections are cheap, this feasible region is often used to compare the performance of projection-free algorithms between each other (see ","element":"span"},{"href":"#id-27","referenceIndex":39,"text":"Lacoste-Julien & Jaggi ","element":"a"},{"text":"(","element":"span"},{"href":"#id-27","referenceIndex":39,"text":"2015","element":"a"},{"text":"); ","element":"span"},{"href":"#id-153","referenceIndex":57,"text":"Rao 
","element":"a"},{"href":"#id-153","referenceIndex":57,"text":"et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-153","referenceIndex":57,"text":"2015","element":"a"},{"text":"); ","element":"span"},{"href":"#id-154","referenceIndex":10,"text":"Braun et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-154","referenceIndex":10,"text":"2019","element":"a"},{"text":")). Solving a linear program over the ","element":"span"},{"style":{"height":13.59},"width":31.82,"height":33.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/44-17.png","element":"img","alt":" ℓ1","inline":true,"padRight":true},{"text":"ball also has complexity ","element":"span"},{"style":{"height":15.2},"width":189.12,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/44-18.png","element":"img","alt":" O(𝑛). This","inline":true,"padRight":true},{"text":"experiment was also considered in ","element":"span"},{"href":"#id-155","referenceIndex":26,"text":"Ghanbari & Scheinberg ","element":"a"},{"text":"(","element":"span"},{"href":"#id-155","referenceIndex":26,"text":"2018","element":"a"},{"text":") and ","element":"span"},{"href":"#id-5","referenceIndex":58,"text":"Scheinberg & Tang ","element":"a"},{"text":"(","element":"span"},{"href":"#id-5","referenceIndex":58,"text":"2016","element":"a"},{"text":") to compare the performance of several Proximal Quasi-Newton methods in the context of minimization with a projection oracle. 
The gradient of the objective function has the form given by:","element":"span"}],[{"style":{"width":"30%"},"width":566,"height":108,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/44-19.png","element":"img"}],[{"text":"The Hessian of the objective function can be written as:","element":"span"}],[{"id":"id-156","style":{"width":"72%"},"width":1359,"height":111,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/44-20.png","element":"img"}],[{"text":"Note that the ","element":"span"},{"style":{"height":16.99},"width":252.56,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/44-21.png","element":"img","alt":" ∇2 𝑓 (x) ∈ ℝ𝑛×𝑛 ","inline":true,"padRight":true},{"text":"in Equation (","element":"span"},{"href":"#id-156","text":"E.10","element":"a"},{"text":"), and so for large ","element":"span"},{"style":{"height":7.2},"width":20,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/44-22.png","element":"img","alt":" 𝑛","inline":true,"padRight":true},{"text":"even storing the Hessian might become problematic. 
However, the quadratic approximation does not need to store the matrix, as the function ","element":"span"},{"style":{"height":18.3},"width":81.38,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/44-23.png","element":"img","alt":"ˆ𝑓𝑘(x)","inline":true}],[{"text":"can be written as:","element":"span"}],[{"style":{"width":"72%"},"width":1364,"height":369,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/45-0.png","element":"img"}],[{"text":"Which means that the gradient of ","element":"span"},{"style":{"height":18.3},"width":81.65,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/45-1.png","element":"img","alt":"ˆ𝑓𝑘(x)","inline":true,"padRight":true},{"text":"is given by:","element":"span"}],[{"style":{"width":"60%"},"width":1134,"height":108,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/45-2.png","element":"img"}],[{"text":"When computing the Inexact PVM steps we compute ","element":"span"},{"style":{"height":15.2},"width":122.1,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/45-3.png","element":"img","alt":" ∇ 𝑓 (x𝑘)","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":17.8},"width":552.75,"height":44.51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/45-4.png","element":"img","alt":" 1/((1 + 𝑒−𝑦𝑖⟨x𝑘,z𝑖⟩)(1 + 𝑒𝑦𝑖⟨x𝑘,z𝑖⟩))","inline":true,"padRight":true},{"text":"for each ","element":"span"},{"style":{"height":14.4},"width":155.07,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/45-5.png","element":"img","alt":"𝑖 ∈ [1, 𝑚]","inline":true,"padRight":true},{"text":"at the beginning of the iteration, as these quantities do not change for a fixed ","element":"span"},{"style":{"height":11.6},"width":19,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/45-6.png","element":"img","alt":" 𝑘","inline":true},{"text":". 
This significantly decreases the time it takes to compute an ACG step with ","element":"span"},{"href":"#id-110","style":{"height":18.3},"width":279.94,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/45-7.png","element":"img","alt":" ∇ ˆ𝑓𝑘(x) in Line 4","inline":true,"padRight":true},{"text":"of Algorithm ","element":"span"},{"href":"#id-110","text":"7","element":"a"},{"text":", as we only perform operations with transcendental functions once at the beginning of the PVM step. Moreover, as in the previous numerical experiments, we can find a closed-form expression for the line search, that is:","element":"span"}],[{"style":{"width":"57%"},"width":1074,"height":174,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/45-8.png","element":"img"}],[{"text":"Where we only need to compute a series of inner products with quantities that in many cases we have already pre-computed in previous operations and stored. This makes line searches with ","element":"span"},{"style":{"height":18.3},"width":81.91,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/45-9.png","element":"img","alt":"ˆ𝑓𝑘(x)","inline":true,"padRight":true},{"text":"significantly cheaper than line searches with ","element":"span"},{"style":{"height":15.2},"width":82.24,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/45-10.png","element":"img","alt":" 𝑓 (x).","inline":true}],[{"text":"The labels and samples used are taken from the training set of the ","element":"span"},{"style":{"fontFamily":"monospace"},"text":"gisette ","element":"span"},{"text":"(","element":"span"},{"href":"#id-71","referenceIndex":31,"text":"Guyon et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-71","referenceIndex":31,"text":"2007","element":"a"},{"text":") (Figure ","element":"span"},{"href":"#id-157","text":"9","element":"a"},{"text":") and the 
","element":"span"},{"style":{"fontFamily":"monospace"},"text":"real-sim ","element":"span"},{"text":"(","element":"span"},{"href":"#id-72","referenceIndex":12,"text":"Chang & Lin","element":"a"},{"text":", ","element":"span"},{"href":"#id-72","referenceIndex":12,"text":"2011","element":"a"},{"text":") (Figure ","element":"span"},{"href":"#id-158","text":"10","element":"a"},{"text":") dataset, where ","element":"span"},{"style":{"height":11.6},"width":715.68,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/45-11.png","element":"img","alt":" 𝑛 = 5000 and 𝑚 = 6000 and 𝑛 = 72309 and","inline":true},{"style":{"height":11.2},"width":174.28,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/45-12.png","element":"img","alt":"𝑚 = 20958","inline":true},{"text":", respectively. Figure ","element":"span"},{"href":"#id-70","text":"2 ","element":"a"},{"text":"shows the performance of Algorithm ","element":"span"},{"href":"#id-44","text":"2 ","element":"a"},{"text":"with the Lazy Away-Step Conditional Gradient algorithm (","element":"span"},{"href":"#id-154","referenceIndex":10,"text":"Braun et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-154","referenceIndex":10,"text":"2019","element":"a"},{"text":"). We also limit the maximum number of inner iterations that the SOCGS algorithm and the NCG algorithm perform at each outer iteration to ","element":"span"},{"text":"1000","element":"span"},{"text":". In this last example we substituted the step size strategy of the NCG algorithm with a line search, as otherwise we were not getting comparable performance to the other algorithms using the step size strategy defined in ","element":"span"},{"href":"#id-16","referenceIndex":46,"text":"Liu et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-16","referenceIndex":46,"text":"2022","element":"a"},{"text":"). 
We use a golden-section bounded line search for all the line searches for which we cannot find a closed-form solution.","element":"span"}],[{"text":"The results for this experiment can be seen in Figure ","element":"span"},{"href":"#id-157","text":"9 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-158","text":"10","element":"a"},{"text":". The initial point used for all the algorithms is the vector ","element":"span"},{"style":{"height":15.2},"width":297.55,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/45-13.png","element":"img","alt":" x0 = (1, 0, · · · , 0)","inline":true},{"text":". We can see that the SOCGS algorithm (with the AFW algorithm as a subproblem solver for the PVM steps) and the NCG algorithm outperform all the other algorithms, with the SOCGS performing better than the NCG algorithm. The quadratic approximation in this example is easier to evaluate than the original function, as we only need to perform operations with transcendental functions once when we build the approximation, reusing these quantities for all remaining inner iterations. Like in the previous two examples, the SOCGS algorithm and the NCG algorithm benefit from the fact that there is a closed-form solution to the step size at each inner iteration when computing the PVM steps, and so avoid a potentially expensive golden section line search.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"E.3 Inverse covariance estimation over spectrahedron","element":"span"}],[{"text":"In many applications the relationships between variables can be modeled with the use of undirected graphical models, such is the case for example in gene expression problems, where the goal is to find out which groups of genes are responsible for producing a certain outcome, given a gene dataset. 
When the underlying distribution of these variables is Gaussian, the problem of determining the relationship between variables boils down to finding patterns of zeros in the inverse covariance matrix ","element":"span"},{"style":{"height":13.39},"width":62.69,"height":33.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/45-14.png","element":"img","alt":" Σ−1","inline":true,"padRight":true},{"text":"of the distribution. A common approach to solving this problem relies on finding a ","element":"span"},{"style":{"height":13.59},"width":31.82,"height":33.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/45-15.png","element":"img","alt":" ℓ1","inline":true},{"text":"-regularized maximum likelihood estimator of ","element":"span"},{"style":{"height":13.39},"width":62.69,"height":33.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/45-16.png","element":"img","alt":" Σ−1","inline":true},{"text":", so as to ","element":"span"},{"text":"encourage sparsity, over the positive definite cone (","element":"span"},{"href":"#id-159","referenceIndex":4,"text":"Banerjee et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-159","referenceIndex":4,"text":"2008","element":"a"},{"text":"; ","element":"span"},{"href":"#id-160","referenceIndex":20,"text":"Friedman et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-160","referenceIndex":20,"text":"2008","element":"a"},{"text":"), this is often called the Graphical Lasso.","element":"span"}],[{"text":"Several optimization algorithms have been used to tackle this problem, such as interior point methods (","element":"span"},{"href":"#id-161","referenceIndex":61,"text":"Yuan & Lin","element":"a"},{"text":", ","element":"span"},{"href":"#id-161","referenceIndex":61,"text":"2007","element":"a"},{"text":"), block coordinate descent or accelerated first-order algorithms (","element":"span"},{"href":"#id-159","referenceIndex":4,"text":"Banerjee et 
al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-159","referenceIndex":4,"text":"2008","element":"a"},{"text":"), coordinate descent algorithms (","element":"span"},{"href":"#id-160","referenceIndex":20,"text":"Friedman et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-160","referenceIndex":20,"text":"2008","element":"a"},{"text":") and even projected limited-memory quasi-Newton algorithms (","element":"span"},{"href":"#id-10","referenceIndex":59,"text":"Schmidt et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-10","referenceIndex":59,"text":"2009","element":"a"},{"text":"). We solve a variation of the Graphical Lasso problem over the space of positive semidefinite matrices of unit trace, that is:","element":"span"}],[{"style":{"width":"73%"},"width":1372,"height":113,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-0.png","element":"img"}],[{"text":"Where ","element":"span"},{"style":{"height":11.2},"width":90.96,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-1.png","element":"img","alt":" 𝛿 > 0","inline":true,"padRight":true},{"text":"is a small constant that we add to make the problem smooth, ","element":"span"},{"style":{"height":18.74},"width":428.22,"height":46.85,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-2.png","element":"img","alt":" 𝑆 = ∑𝑁𝑖=1(z𝑖 − 𝜇)(z𝑖 − 𝜇)𝑇","inline":true,"padRight":true},{"text":"is the ","element":"span"},{"text":"empirical covariance matrix of a set of datapoints ","element":"span"},{"style":{"height":14.8},"width":281.81,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-3.png","element":"img","alt":" 𝑍 = {z1, · · · , z𝑁 }","inline":true,"padRight":true},{"text":"drawn from a Gaussian distribution with 
","element":"span"},{"style":{"height":13.38},"width":130.53,"height":33.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-4.png","element":"img","alt":"z𝑖 ∈ ℝ𝑚","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":11.2},"width":91.8,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-5.png","element":"img","alt":" 𝜆 > 0","inline":true,"padRight":true},{"text":"is a regularization parameter. This feasible region (known as the spectrahedron) is not a polytope, and so the guarantees shown in the paper do not apply as they crucially rely on Theorem ","element":"span"},{"href":"#id-30","text":"2.1","element":"a"},{"text":". However, we include the results to show the promising numerical performance of the method. Evaluating ","element":"span"},{"style":{"height":15.2},"width":77.62,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-6.png","element":"img","alt":"𝑓 (𝑋)","inline":true,"padRight":true},{"text":"has complexity ","element":"span"},{"style":{"height":16.99},"width":98.42,"height":42.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-7.png","element":"img","alt":" O(𝑛3)","inline":true,"padRight":true},{"text":"if we compute the determinant with a LU decomposition, and evaluating the gradient ","element":"span"},{"style":{"height":16.99},"width":537.94,"height":42.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-8.png","element":"img","alt":" ∇ 𝑓 (𝑋) = −(𝑋 + 𝛿𝐼𝑛)−1 + 𝑆 + 𝜆𝑋","inline":true,"padRight":true},{"text":"has complexity ","element":"span"},{"style":{"height":16.99},"width":98.42,"height":42.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-9.png","element":"img","alt":" O(𝑛3)","inline":true},{"text":", dominated by the matrix inversion. 
Solving the linear program ","element":"span"},{"style":{"height":18.63},"width":651.95,"height":46.58,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-10.png","element":"img","alt":" min𝑌 ∈X�𝑛𝑖, 𝑗=1(∇ 𝑓 (𝑋) ⊗ 𝑌)𝑖, 𝑗, where ⊗","inline":true,"padRight":true},{"text":"denotes the Hadamard product, amounts to finding ","element":"span"},{"text":"the largest eigenvector of ","element":"span"},{"style":{"height":15.2},"width":137.15,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-11.png","element":"img","alt":" −∇ 𝑓 (𝑋)","inline":true},{"text":". We do this approximately by using the Implicitly Restarted Lanczos algorithm (","element":"span"},{"href":"#id-162","referenceIndex":44,"text":"Lehoucq et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-162","referenceIndex":44,"text":"1998","element":"a"},{"text":") (implemented in ","element":"span"},{"style":{"fontFamily":"monospace"},"text":"eigsh ","element":"span"},{"text":"in the ","element":"span"},{"style":{"fontFamily":"monospace"},"text":"scipy.spars","element":"span"},{"href":"#id-110","style":{"fontFamily":"monospace"},"text":"e.l","element":"a"},{"style":{"fontFamily":"monospace"},"text":"inalg ","element":"span"},{"text":"librar","element":"span"},{"href":"#id-110","text":"y).","element":"a"}],[{"text":"The quadratic approximation ","element":"span"},{"style":{"height":18.3},"width":90.29,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-12.png","element":"img","alt":"ˆ𝑓𝑘(𝑋)","inline":true,"padRight":true},{"text":"of ","element":"span"},{"style":{"height":15.2},"width":77.61,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-13.png","element":"img","alt":" 𝑓 (𝑋)","inline":true,"padRight":true},{"text":"that the PVM steps in Line ","element":"span"},{"href":"#id-110","text":"10 ","element":"a"},{"text":"of Algorithm ","element":"span"},{"href":"#id-110","text":"7 
","element":"a"},{"text":"uses can be written as:","element":"span"}],[{"style":{"height":19.75},"width":1565.46,"height":49.37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-14.png","element":"img","alt":"ˆ𝑓𝑘(𝑋) = trace ��−(𝑋𝑘 + 𝛿𝐼𝑛)−1 + 𝑆 + 𝜆𝑋𝑘� (𝑋 − 𝑋𝑘)� (E.12)","inline":true,"padRight":true},{"text":"+ ","element":"span"},{"text":"1","element":"span"},{"text":"2","element":"span"}],[{"id":"id-163","style":{"width":"77%"},"width":1456,"height":144,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-15.png","element":"img"}],[{"text":"This allows us to write the gradient ","element":"span"},{"style":{"height":18.3},"width":124.08,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-16.png","element":"img","alt":" ∇ ˆ𝑓𝑘(𝑋)","inline":true,"padRight":true},{"text":"of the quadratic approximation as:","element":"span"}],[{"style":{"width":"60%"},"width":1130,"height":47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-17.png","element":"img"}],[{"text":"The complexity of evaluating the gradient of ","element":"span"},{"style":{"height":18.3},"width":323.56,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-18.png","element":"img","alt":"ˆ𝑓𝑘(𝑋) is also O(𝑛3)","inline":true},{"text":", dominated by the matrix inversion and the matrix multiplication operations. In practice, we only invert the matrix ","element":"span"},{"style":{"height":16.98},"width":209.63,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-19.png","element":"img","alt":" (𝑋𝑘 + 𝛿𝐼𝑛)−1 ","inline":true,"padRight":true},{"text":"once per iteration when we form the quadratic approximation in Line ","element":"span"},{"href":"#id-110","text":"6 ","element":"a"},{"text":"of Algorithm ","element":"span"},{"href":"#id-110","text":"7","element":"a"},{"text":". 
Nevertheless, this means that the complexity of computing ","element":"span"},{"style":{"height":18.29},"width":328.18,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-20.png","element":"img","alt":" ∇ 𝑓 (𝑋) and ∇ ˆ𝑓𝑘(𝑋)","inline":true,"padRight":true},{"text":"is the same, so in this respect there is no advantage to using the quadratic approximation. However, for the quadratic approximation ","element":"span"},{"style":{"height":18.3},"width":90.29,"height":45.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-21.png","element":"img","alt":"ˆ𝑓𝑘(𝑋)","inline":true,"padRight":true},{"text":"we can find a closed-form expression for the optimal step size when moving along a direction ","element":"span"},{"style":{"height":10.8},"width":30,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-22.png","element":"img","alt":" 𝐷","inline":true},{"text":". It suffices to take the derivative of ","element":"span"},{"style":{"height":18.3},"width":278.68,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-23.png","element":"img","alt":"ˆ𝑓𝑘(𝑋 + 𝛾𝐷) with","inline":true,"padRight":true},{"text":"respect to ","element":"span"},{"style":{"height":10.4},"width":22,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-24.png","element":"img","alt":" 𝛾","inline":true,"padRight":true},{"text":"using the expression shown in Equation (","element":"span"},{"href":"#id-163","text":"E.15","element":"a"},{"text":") and set the derivative to zero. 
This leads to:","element":"span"}],[{"id":"id-164","style":{"width":"73%"},"width":1385,"height":135,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-25.png","element":"img"}],[{"text":"If we use a golden section search to perform a line search over the original function ","element":"span"},{"style":{"height":15.2},"width":77.62,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-26.png","element":"img","alt":" 𝑓 (𝑋)","inline":true,"padRight":true},{"text":"to compute the optimal step size we will potentially need to evaluate ","element":"span"},{"style":{"height":15.2},"width":77.61,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-27.png","element":"img","alt":" 𝑓 (𝑋)","inline":true,"padRight":true},{"text":"multiple times, and each evaluation has complexity ","element":"span"},{"style":{"height":16.98},"width":98.42,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-28.png","element":"img","alt":"O(𝑛3)","inline":true},{"text":". On the other hand, to compute the exact line search for ","element":"span"},{"style":{"height":18.29},"width":90.29,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-29.png","element":"img","alt":"ˆ𝑓𝑘(𝑋)","inline":true,"padRight":true},{"text":"we only need to evaluate the expression in Equation (","element":"span"},{"href":"#id-164","text":"E.16","element":"a"},{"text":") once, with complexity ","element":"span"},{"style":{"height":16.99},"width":98.42,"height":42.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-30.png","element":"img","alt":" O(𝑛3)","inline":true},{"text":". 
This makes the line search operation with ","element":"span"},{"style":{"height":18.3},"width":90.29,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-31.png","element":"img","alt":"ˆ𝑓𝑘(𝑋)","inline":true,"padRight":true},{"text":"significantly cheaper than the line search with ","element":"span"},{"style":{"height":15.2},"width":77.61,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-32.png","element":"img","alt":" 𝑓 (𝑋)","inline":true},{"text":", and makes the ACG iterations in Line ","element":"span"},{"href":"#id-110","text":"10 ","element":"a"},{"text":"of Algorithm ","element":"span"},{"href":"#id-110","text":"7 ","element":"a"},{"text":"significantly cheaper than the iterations in Line ","element":"span"},{"href":"#id-110","text":"18 ","element":"a"},{"text":"of Algorithm ","element":"span"},{"href":"#id-110","text":"7","element":"a"},{"text":".","element":"span"}],[{"text":"The matrix ","element":"span"},{"style":{"height":11.2},"width":21,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-33.png","element":"img","alt":" 𝑆","inline":true,"padRight":true},{"text":"is generated by computing a random orthonormal basis ","element":"span"},{"style":{"height":14.8},"width":306.5,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-34.png","element":"img","alt":" B = {v1, · · · , v𝑚}","inline":true,"padRight":true},{"text":"in ","element":"span"},{"style":{"height":10.99},"width":54.36,"height":27.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-35.png","element":"img","alt":" ℝ𝑚","inline":true,"padRight":true},{"text":"and computing ","element":"span"},{"style":{"height":18.36},"width":279.28,"height":45.9,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-36.png","element":"img","alt":" 𝑆 = �𝑖=1 𝜎𝑖v1v𝑇1","inline":true,"padRight":true},{"text":", where 
","element":"span"},{"style":{"height":9.59},"width":33.67,"height":23.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-37.png","element":"img","alt":" 𝜎𝑖","inline":true,"padRight":true},{"text":"is uniformly distributed between ","element":"span"},{"text":"0","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"},{"text":"5 ","element":"span"},{"text":"and ","element":"span"},{"text":"1 ","element":"span"},{"text":"for ","element":"span"},{"style":{"height":14.4},"width":164.76,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/46-38.png","element":"img","alt":" 𝑖 ∈ [1, 𝑚]","inline":true},{"text":". We use ","element":"span"},{"style":{"height":11.6},"width":139.55,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/47-0.png","element":"img","alt":"𝜆 = 0.05","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":14.19},"width":142.69,"height":35.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/47-1.png","element":"img","alt":" 𝛿 = 10−5","inline":true,"padRight":true},{"text":"in the experiments. We also limit the maximum number of inner iterations that the SOCGS algorithm and the NCG algorithm perform at each outer iteration to ","element":"span"},{"text":"1000","element":"span"},{"text":". We use a golden-section bounded line search for all the line searches for which we cannot find a closed-form solution.","element":"span"}],[{"text":"We also implemented an LBFGS algorithm to build an approximate Hessian from first order information from previous iterations. This is specially useful if we cannot find an analytical expression to the exact Hessian, or its matrix-vector products. 
Note however that the matrix outputted by the LBFGS algorithm does not satisfy Assumption ","element":"span"},{"href":"#id-41","text":"2","element":"a"},{"text":", and so the best we can hope for is for the linear-quadratic convergence in primal gap of the SOCGS algorithm. The implementation used stores the Hessian approximation in outer-product form, and so does not explicitly store the full Hessian matrix, as that could be computationally prohibitive (see Section 7.2 in ","element":"span"},{"href":"#id-165","referenceIndex":54,"text":"Nocedal & Wright ","element":"a"},{"text":"(","element":"span"},{"href":"#id-165","referenceIndex":54,"text":"2006","element":"a"},{"text":")).","element":"span"}],[{"text":"The results for this experiment can be seen in Figures ","element":"span"},{"href":"#id-166","text":"11 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-167","text":"12","element":"a"},{"text":". The initial point for all the algorithms is the matrix ","element":"span"},{"style":{"height":14.8},"width":93.49,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/47-2.png","element":"img","alt":" 1/𝑛𝐼𝑛","inline":true},{"text":". We can see that the SOCGS (with the PCG algorithm as a subproblem solver for the PVM steps) and the NCG algorithm outperform all the other algorithms, with the SOCGS performing better than the NCG algorithm. Note that in this case the main advantage that the SOCGS and the NCG algorithms have over all the other algorithms is the fact that there is a closed-form solution to the step size at each inner iteration when computing the PVM steps. 
As discussed earlier, the complexity of evaluating the original function ","element":"span"},{"style":{"height":15.2},"width":77.62,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/47-3.png","element":"img","alt":" 𝑓 (𝑋)","inline":true,"padRight":true},{"text":"is the same as that of evaluating ","element":"span"},{"style":{"height":18.3},"width":90.29,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/47-4.png","element":"img","alt":"ˆ𝑓𝑘(𝑋)","inline":true},{"text":". The SOCGS algorithm that uses the LBFGS algorithm to build up an approximate Hessian also performs well in terms of iterations and in terms of time, despite Assumption ","element":"span"},{"href":"#id-41","text":"2 ","element":"a"},{"text":"not holding in this case.","element":"span"}],[{"id":"id-150","style":{"width":"87%"},"width":1635,"height":2026,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/48-0.png","element":"img"}],[{"text":"Figure 7: ","element":"figcaption","subtype":"caption"},{"id":"id-168","style":{"fontWeight":"bold"},"text":"Sparse Coding over the Birkhoff polytope: ","element":"figcaption","subtype":"caption"},{"text":"Algorithm comparison for ","element":"figcaption","subtype":"caption"},{"style":{"height":16.4},"width":358.08,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/48-1.png","element":"img","alt":" 𝑚 = 10, 000 (medium","inline":true,"padRight":true},{"text":"size) samples in terms of primal gap ","element":"figcaption","subtype":"caption"},{"href":"#id-150","text":"(a)","element":"a","subtype":"caption"},{"text":",","element":"figcaption","subtype":"caption"},{"href":"#id-150","text":"(b)","element":"a","subtype":"caption"},{"text":", Frank-Wolfe gap ","element":"figcaption","subtype":"caption"},{"href":"#id-168","text":"(c)","element":"a","subtype":"caption"},{"text":",","element":"figcaption","subtype":"caption"},{"href":"#id-150","text":"(d) 
","element":"a","subtype":"caption"},{"text":"and distance to the optimum ","element":"figcaption","subtype":"caption"},{"href":"#id-168","text":"(e)","element":"a","subtype":"caption"},{"text":",","element":"figcaption","subtype":"caption"},{"href":"#id-150","text":"(f)","element":"a","subtype":"caption"},{"text":".","element":"figcaption","subtype":"caption"}],[{"id":"id-151","style":{"width":"87%"},"width":1637,"height":2026,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/49-0.png","element":"img"}],[{"text":"Figure 8: ","element":"figcaption","subtype":"caption"},{"id":"id-169","style":{"fontWeight":"bold"},"text":"Sparse Coding over the Birkhoff polytope: ","element":"figcaption","subtype":"caption"},{"text":"Algorithm comparison for ","element":"figcaption","subtype":"caption"},{"style":{"height":13.6},"width":214.04,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/49-1.png","element":"img","alt":" 𝑚 = 100, 000","inline":true,"padRight":true},{"text":"(large size) samples in terms of primal gap ","element":"figcaption","subtype":"caption"},{"href":"#id-151","text":"(a)","element":"a","subtype":"caption"},{"text":",","element":"figcaption","subtype":"caption"},{"href":"#id-151","text":"(b)","element":"a","subtype":"caption"},{"text":", Frank-Wolfe gap ","element":"figcaption","subtype":"caption"},{"href":"#id-169","text":"(c)","element":"a","subtype":"caption"},{"text":",","element":"figcaption","subtype":"caption"},{"href":"#id-151","text":"(d) ","element":"a","subtype":"caption"},{"text":"and distance to the optimum 
","element":"figcaption","subtype":"caption"},{"href":"#id-169","text":"(e)","element":"a","subtype":"caption"},{"text":",","element":"figcaption","subtype":"caption"},{"href":"#id-151","text":"(f)","element":"a","subtype":"caption"},{"text":".","element":"figcaption","subtype":"caption"}],[{"id":"id-157","style":{"width":"87%"},"width":1637,"height":2026,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/50-0.png","element":"img"}],[{"text":"Figure 9: ","element":"figcaption","subtype":"caption"},{"id":"id-170","style":{"height":14.8},"width":950.42,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/50-1.png","element":"img","alt":" Structured Logistic Regression over ℓ1 unit ball:","inline":true,"padRight":true},{"text":"Algorithm comparison in terms of primal gap ","element":"figcaption","subtype":"caption"},{"href":"#id-157","text":"(a)","element":"a","subtype":"caption"},{"text":",","element":"figcaption","subtype":"caption"},{"href":"#id-157","text":"(b)","element":"a","subtype":"caption"},{"text":", Frank-Wolfe gap ","element":"figcaption","subtype":"caption"},{"href":"#id-170","text":"(c)","element":"a","subtype":"caption"},{"text":",","element":"figcaption","subtype":"caption"},{"href":"#id-157","text":"(d) ","element":"a","subtype":"caption"},{"text":"and distance to the optimum ","element":"figcaption","subtype":"caption"},{"href":"#id-170","text":"(e)","element":"a","subtype":"caption"},{"text":",","element":"figcaption","subtype":"caption"},{"href":"#id-157","text":"(f) ","element":"a","subtype":"caption"},{"text":"for the ","element":"figcaption","subtype":"caption"},{"style":{"fontFamily":"monospace"},"text":"gissette ","element":"figcaption","subtype":"caption"},{"text":"(","element":"figcaption","subtype":"caption"},{"href":"#id-71","referenceIndex":31,"text":"Guyon ","element":"a","subtype":"caption"},{"href":"#id-71","referenceIndex":31,"text":"et al.","element":"a","subtype":"caption"},{"text":", 
","element":"figcaption","subtype":"caption"},{"href":"#id-71","referenceIndex":31,"text":"2007","element":"a","subtype":"caption"},{"text":") dataset, where ","element":"figcaption","subtype":"caption"},{"style":{"height":11.6},"width":403.78,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/50-2.png","element":"img","alt":" 𝑛 = 5000 and 𝑚 = 6000.","inline":true}],[{"id":"id-158","style":{"width":"87%"},"width":1637,"height":2026,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/51-0.png","element":"img"}],[{"id":"id-171","text":"Figure 10: ","element":"figcaption","subtype":"caption"},{"id":"id-172","style":{"height":14.4},"width":938.82,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/51-1.png","element":"img","alt":" Structured Logistic Regression over ℓ1 unit ball:","inline":true,"padRight":true},{"text":"Algorithm comparison in terms of primal gap ","element":"figcaption","subtype":"caption"},{"href":"#id-158","text":"(a)","element":"a","subtype":"caption"},{"text":",","element":"figcaption","subtype":"caption"},{"href":"#id-158","text":"(b)","element":"a","subtype":"caption"},{"text":", Frank-Wolfe gap ","element":"figcaption","subtype":"caption"},{"href":"#id-171","text":"(c)","element":"a","subtype":"caption"},{"text":",","element":"figcaption","subtype":"caption"},{"href":"#id-158","text":"(d) ","element":"a","subtype":"caption"},{"text":"and distance to the optimum ","element":"figcaption","subtype":"caption"},{"href":"#id-172","text":"(e)","element":"a","subtype":"caption"},{"text":",","element":"figcaption","subtype":"caption"},{"href":"#id-158","text":"(f) ","element":"a","subtype":"caption"},{"text":"for the ","element":"figcaption","subtype":"caption"},{"style":{"fontFamily":"monospace"},"text":"real-sim ","element":"figcaption","subtype":"caption"},{"text":"(","element":"figcaption","subtype":"caption"},{"href":"#id-72","referenceIndex":12,"text":"Chang 
","element":"a","subtype":"caption"},{"href":"#id-72","referenceIndex":12,"text":"& Lin","element":"a","subtype":"caption"},{"text":", ","element":"figcaption","subtype":"caption"},{"href":"#id-72","referenceIndex":12,"text":"2011","element":"a","subtype":"caption"},{"text":") dataset, where ","element":"figcaption","subtype":"caption"},{"style":{"height":12},"width":443.64,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/51-2.png","element":"img","alt":" 𝑛 = 72309 and 𝑚 = 20958.","inline":true}],[{"id":"id-166","style":{"width":"87%"},"width":1635,"height":2026,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/52-0.png","element":"img"}],[{"id":"id-173","text":"Figure 11: ","element":"figcaption","subtype":"caption"},{"id":"id-174","style":{"fontWeight":"bold"},"text":"Inverse covariance estimation over spectrahedron: ","element":"figcaption","subtype":"caption"},{"text":"Algorithm comparison for ","element":"figcaption","subtype":"caption"},{"style":{"height":11.6},"width":172.35,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/52-1.png","element":"img","alt":" 𝑛 = 100 in","inline":true,"padRight":true},{"text":"terms of primal gap ","element":"figcaption","subtype":"caption"},{"href":"#id-166","text":"(a)","element":"a","subtype":"caption"},{"text":",","element":"figcaption","subtype":"caption"},{"href":"#id-166","text":"(b)","element":"a","subtype":"caption"},{"text":", Frank-Wolfe gap ","element":"figcaption","subtype":"caption"},{"href":"#id-173","text":"(c)","element":"a","subtype":"caption"},{"text":",","element":"figcaption","subtype":"caption"},{"href":"#id-166","text":"(d) ","element":"a","subtype":"caption"},{"text":"and distance to the optimum 
","element":"figcaption","subtype":"caption"},{"href":"#id-174","text":"(e)","element":"a","subtype":"caption"},{"text":",","element":"figcaption","subtype":"caption"},{"href":"#id-166","text":"(f)","element":"a","subtype":"caption"},{"text":".","element":"figcaption","subtype":"caption"}],[{"id":"id-167","style":{"width":"87%"},"width":1637,"height":2026,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/53-0.png","element":"img"}],[{"id":"id-175","text":"Figure 12: ","element":"figcaption","subtype":"caption"},{"id":"id-176","style":{"fontWeight":"bold"},"text":"Inverse covariance estimation over spectrahedron: ","element":"figcaption","subtype":"caption"},{"text":"Algorithm comparison for ","element":"figcaption","subtype":"caption"},{"style":{"height":11.6},"width":106.38,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.08907/images/53-1.png","element":"img","alt":" 𝑛 = 50","inline":true,"padRight":true},{"text":"in terms of primal gap ","element":"figcaption","subtype":"caption"},{"href":"#id-167","text":"(a)","element":"a","subtype":"caption"},{"text":",","element":"figcaption","subtype":"caption"},{"href":"#id-167","text":"(b)","element":"a","subtype":"caption"},{"text":", Frank-Wolfe gap ","element":"figcaption","subtype":"caption"},{"href":"#id-175","text":"(c)","element":"a","subtype":"caption"},{"text":",","element":"figcaption","subtype":"caption"},{"href":"#id-167","text":"(d) ","element":"a","subtype":"caption"},{"text":"and distance to the optimum ","element":"figcaption","subtype":"caption"},{"href":"#id-176","text":"(e)","element":"a","subtype":"caption"},{"text":",","element":"figcaption","subtype":"caption"},{"href":"#id-167","text":"(f)","element":"a","subtype":"caption"},{"text":".","element":"figcaption","subtype":"caption"}]]}],"_version":"3.3.4"},"paperNode":"$1b:props:children:props:children:0:props:product"}]]]}]}]