1b:["$","$L29",null,{"isWhiteLabelled":false,"children":["$","$Lb",null,{"pt":{"compact":0,"expanded":3},"children":[["$","$L2a",null,{"noStar":true,"publisher":true,"task":true,"params":true,"size":"xl","product":{"id":"eyJwYXBlcklEIjoiMTkwMS4xMTUxOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","updated":"2019-10-11T07:32:04.000Z","paperID":"1901.11518","published":"2019-01-31T18:40:48.000Z","authors":"[\"Dongruo Zhou\",\"Quanquan Gu\"]","title":"Stochastic Recursive Variance-Reduced Cubic Regularization Methods","scoreTrending":null,"summary":"$2b","lastCheckedForCode":"2022-09-05T22:21:32.389Z","links":[{"id":"eyJ1cmwiOiJodHRwczovL3BhcGVyc3dpdGhjb2RlLmNvbS9wYXBlci9zdG9jaGFzdGljLXJlY3Vyc2l2ZS12YXJpYW5jZS1yZWR1Y2VkLWN1YmljIn0=","type":"pwc","url":"https://paperswithcode.com/paper/stochastic-recursive-variance-reduced-cubic","data":"{\"date\":\"2024-09-04T20:15:24.475Z\"}"}],"reposConnection":{"edges":[]},"models":[],"tags":[],"summaries":[],"emailsConnection":{"edges":[{"author":"quanquan 
gu","node":{"id":"eyJhZGRyZXNzIjoicWd1QGNzLnVjbGEuZWR1In0=","address":"qgu@cs.ucla.edu","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggreg
ate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}
},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[{"avatar":"https://avatars.githubusercontent.com/u/22385378?v=4","username":"uclaml"}],"scholar":[{"thirdPartyID":"GU9HgNAAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiJjYzUyZTQ5NC05ZmQ2LTQ0NTUtYWRlYS1kOTIyNmQ4MjFmNTAifQ==","name":"quanquan 
gu","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMTgxMS4wODg4OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1811.08888"},{"id":"eyJwYXBlcklEIjoiMjEwMS4wMjE5NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2101.02195"},{"id":"eyJwYXBlcklEIjoiMjQwMS4wMTMzNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2401.01335"},{"id":"eyJwYXBlcklEIjoiMTIwMi4zNzI1IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1202.3725"},{"id":"eyJwYXBlcklEIjoiMTgwOC4wNTY3MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1808.05671"},{"id":"eyJwYXBlcklEIjoiMTgwNi4wNjc2MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1806.06763"},{"id":"eyJwYXBlcklEIjoiMTkwMi4wMTM4NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1902.01384"},{"id":"eyJwYXBlcklEIjoiMTkxMS4xMjM2MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1911.12360"},{"id":"eyJwYXBlcklEIjoiMjExMi4wODMwNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2112.08304"},{"id":"eyJwYXBlcklEIjoiMTgwNi4wNzgwOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1806.07808"},{"id":"eyJwYXBlcklEIjoiMTkxMi4wMTE5OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1912.01198"},{"id":"eyJwYXBlcklEIjoiMTkwNS4xMzIxMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1905.13210"},{"id":"eyJwYXBlcklEIjoiMTQxMi44NzI5IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1412.8729"},{"id":"eyJwYXBlcklEIjoiMTcwNy4wNjYxOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1707.06618"},{"id":"eyJwYXBlcklEIjoiMTgwNi4wNzgxMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1806.07811"},{"id":"eyJwYXBlcklEIjoiMjQwNS4wMDY3NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2405.00675"},{"id":"eyJwYXBlcklEIjoiMTkwNi4wNDY4OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1906.04688"},{"id":"eyJwYXBlcklEIjoiMjMxMS4wNDIwNSIsInB
1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2311.04205"},{"id":"eyJwYXBlcklEIjoiMjAwNi4xMzE2NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.13165"},{"id":"eyJwYXBlcklEIjoiMjAxMi4wODUwNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2012.08507"},{"id":"eyJwYXBlcklEIjoiMjAwNS4wMTM1MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2005.01350"},{"id":"eyJwYXBlcklEIjoiMTYxMi4wOTI5NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1612.09297"},{"id":"eyJwYXBlcklEIjoiMTkwOS4wODYxMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1909.08610"},{"id":"eyJwYXBlcklEIjoiMjAxMS4xMTU2NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2011.11566"},{"id":"eyJwYXBlcklEIjoiMTkxMS4wNzMyMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1911.07323"},{"id":"eyJwYXBlcklEIjoiMjMxMC4wODM5MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2310.08391"},{"id":"eyJwYXBlcklEIjoiMjAxMi4wMTc4MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2012.01780"},{"id":"eyJwYXBlcklEIjoiMTgwMi4wNDc5NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1802.04796"},{"id":"eyJwYXBlcklEIjoiMjAxMC4wMDgyNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.00827"},{"id":"eyJwYXBlcklEIjoiMTcwNC4wNjI1NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1704.06256"},{"id":"eyJwYXBlcklEIjoiMjIwOC4wMjgxMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2208.02813"},{"id":"eyJwYXBlcklEIjoiMTkwMS4xMTIyNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1901.11224"},{"id":"eyJwYXBlcklEIjoiMTkxMi4wNDUxMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1912.04511"},{"id":"eyJwYXBlcklEIjoiMTUwMi4wMjM0NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1502.02347"},{"id":"eyJwYXBlcklEIjoiMTkxMS4wNDQ2MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1911.04462"},{"id":"e
yJwYXBlcklEIjoiMTgxMS4xMDgyOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1811.10828"},{"id":"eyJwYXBlcklEIjoiMjEwOC4xMTM3MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2108.11371"},{"id":"eyJwYXBlcklEIjoiMTkxMC4xMzY1OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1910.13659"},{"id":"eyJwYXBlcklEIjoiMjIwMi4wNjUyNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2202.06526"},{"id":"eyJwYXBlcklEIjoiMTgwNi4wODc4MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1806.08782"},{"id":"eyJwYXBlcklEIjoiMjExMC4xMjYxNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.12615"},{"id":"eyJwYXBlcklEIjoiMjQwMi4wODY4MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2402.08680"},{"id":"eyJwYXBlcklEIjoiMjAxMC4wMTI3OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.01279"},{"id":"eyJwYXBlcklEIjoiMTYxMC4wNTI3NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1610.05275"},{"id":"eyJwYXBlcklEIjoiMjAwNS4xNDQyNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2005.14426"},{"id":"eyJwYXBlcklEIjoiMjAwMi4wNDAyNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.04026"},{"id":"eyJwYXBlcklEIjoiMjEwNC4xMzYyOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2104.13628"},{"id":"eyJwYXBlcklEIjoiMjQwMi4wOTQwMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2402.09401"},{"id":"eyJwYXBlcklEIjoiMjEwNi4xMTkzNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.11935"},{"id":"eyJwYXBlcklEIjoiMjEwMi4xMzAyOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.13028"},{"id":"eyJwYXBlcklEIjoiMjEwMi4wNzQwNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.07404"},{"id":"eyJwYXBlcklEIjoiMjIwNS4wNjgxMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2205.06811"},{"id":"eyJwYXBlcklEIjoiMjIxMi4wNjEzMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv
","paperID":"2212.06132"},{"id":"eyJwYXBlcklEIjoiMjIwNS4xMTUwNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2205.11507"},{"id":"eyJwYXBlcklEIjoiMjAwMi4wOTE3NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.09174"},{"id":"eyJwYXBlcklEIjoiMjAwMy4wMTgwMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2003.01803"},{"id":"eyJwYXBlcklEIjoiMTcwMi4wODY1MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1702.08651"},{"id":"eyJwYXBlcklEIjoiMTUwNS4wNDc4MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1505.04780"},{"id":"eyJwYXBlcklEIjoiMjAwNi4xMjc5MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.12792"},{"id":"eyJwYXBlcklEIjoiMjEwMy4xMjY5MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2103.12692"},{"id":"eyJwYXBlcklEIjoiMjAwMy4wMTA5NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2003.01094"},{"id":"eyJwYXBlcklEIjoiMTkwMS4xMTUxOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1901.11518"},{"id":"eyJwYXBlcklEIjoiMTcwMS4wMjMwMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1701.02301"},{"id":"eyJwYXBlcklEIjoiMjQwMi4xMDIxMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2402.10210"},{"id":"eyJwYXBlcklEIjoiMjMwMy4wNDE0NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2303.04145"},{"id":"eyJwYXBlcklEIjoiMjAwNS4wMDIxOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2005.00218"},{"id":"eyJwYXBlcklEIjoiMjAxMC4wMDU4NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.00587"},{"id":"eyJwYXBlcklEIjoiMjMxMi4wOTE5MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2312.09193"},{"id":"eyJwYXBlcklEIjoiMjExMC4xMjcyNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.12727"},{"id":"eyJwYXBlcklEIjoiMjQwNC4wNjAxMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2404.06013"},{"id":"eyJwYXBlcklEIjoiMjMxMC4wMDkyNyIsInB1Ymxpc2hlci
I6ImFyeGl2In0=","publisher":"arxiv","paperID":"2310.00927"},{"id":"eyJwYXBlcklEIjoiMTcwMS4wMDQ4MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1701.00481"},{"id":"eyJwYXBlcklEIjoiMjMwMi4xMDM3MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2302.10371"},{"id":"eyJwYXBlcklEIjoiMjAxMS4wMjUzOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2011.02538"},{"id":"eyJwYXBlcklEIjoiMjMwMy4wODQzMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2303.08433"},{"id":"eyJwYXBlcklEIjoiMjEwMi4wNzMwMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.07301"},{"id":"eyJwYXBlcklEIjoiMjExMC4wNjE5OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.06198"},{"id":"eyJwYXBlcklEIjoiMjMxMC4wMDk2OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2310.00968"},{"id":"eyJwYXBlcklEIjoiMTkxMC4wMjkzNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1910.02934"},{"id":"eyJwYXBlcklEIjoiMjExMC4xMDEzMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.10133"},{"id":"eyJwYXBlcklEIjoiMTYxMC4wNDc5OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1610.04798"},{"id":"eyJwYXBlcklEIjoiMTkwNi4xMjA1NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1906.12056"},{"id":"eyJwYXBlcklEIjoiMjMxMS4xNTIzOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2311.15238"},{"id":"eyJwYXBlcklEIjoiMjAxMC4wMDUzOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.00539"},{"id":"eyJwYXBlcklEIjoiMTUxMi4wODg2MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1512.08861"},{"id":"eyJwYXBlcklEIjoiMTgxMS4xMTk4OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1811.11989"},{"id":"eyJwYXBlcklEIjoiMjAwMy4wMDM3OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2003.00378"},{"id":"eyJwYXBlcklEIjoiMjAxMC4wMTI3OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.01278"},{"id":"eyJwYXBlcklE
IjoiMjMwMy4wODgxNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2303.08816"},{"id":"eyJwYXBlcklEIjoiMjIwNy4wMzEwNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2207.03106"},{"id":"eyJwYXBlcklEIjoiMjIwMi4xMzYwMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2202.13603"},{"id":"eyJwYXBlcklEIjoiMjEwNC4wOTQzNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2104.09437"},{"id":"eyJwYXBlcklEIjoiMTgwMi4wNDc5MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1802.04791"},{"id":"eyJwYXBlcklEIjoiMjMxMC4xNDU1MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2310.14550"},{"id":"eyJwYXBlcklEIjoiMjEwNi4xMTk2MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.11960"},{"id":"eyJwYXBlcklEIjoiMTkxMS4wNTA1OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1911.05059"},{"id":"eyJwYXBlcklEIjoiMjAxMC4wOTU5NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.09597"},{"id":"eyJwYXBlcklEIjoiMjEwMi4wODk0MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.08940"},{"id":"eyJwYXBlcklEIjoiMjAxMS4xMDEzNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2011.10134"},{"id":"eyJwYXBlcklEIjoiMjQwNC4xMjM3NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2404.12376"},{"id":"eyJwYXBlcklEIjoiMjQwMi4wODk5MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2402.08991"},{"id":"eyJwYXBlcklEIjoiMjMwNi4wNDk0OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2306.04949"},{"id":"eyJwYXBlcklEIjoiMjEwMS4wMTE1MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2101.01152"},{"id":"eyJwYXBlcklEIjoiMjExMC4wNzQzNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.07435"},{"id":"eyJwYXBlcklEIjoiMTcxMi4wNjU4NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1712.06585"},{"id":"eyJwYXBlcklEIjoiMTcxMi4wMzk1MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID"
:"1712.03950"},{"id":"eyJwYXBlcklEIjoiMjMxMC4wMTM4MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2310.01380"},{"id":"eyJwYXBlcklEIjoiMjMxMC4wNzI2OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2310.07269"},{"id":"eyJwYXBlcklEIjoiMjExMC4wNjM5NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.06394"},{"id":"eyJwYXBlcklEIjoiMTkxMi4wMTIxMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1912.01211"},{"id":"eyJwYXBlcklEIjoiMjEwOC4wNDU1MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2108.04552"},{"id":"eyJwYXBlcklEIjoiMjExMi4xNTI1MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2112.15250"},{"id":"eyJwYXBlcklEIjoiMjIwMS4wOTkxMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2201.09910"},{"id":"eyJwYXBlcklEIjoiMTUwMy4wMTQ0MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1503.01442"},{"id":"eyJwYXBlcklEIjoiMjEwNi4xMTYxMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.11612"},{"id":"eyJwYXBlcklEIjoiMjEwNi4xMzgwNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.13805"},{"id":"eyJwYXBlcklEIjoiMjMwMy4wOTM5MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2303.09390"},{"id":"eyJwYXBlcklEIjoiMTcwMi4wNjUyNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1702.06525"},{"id":"eyJwYXBlcklEIjoiMjMxMC4xMTAxNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2310.11015"},{"id":"eyJwYXBlcklEIjoiMjIwMy4wMzE1OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2203.03159"},{"id":"eyJwYXBlcklEIjoiMjAwNi4wNjg3OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.06878"},{"id":"eyJwYXBlcklEIjoiMjQwMy4wMDE3OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2403.00178"},{"id":"eyJwYXBlcklEIjoiMjQwNC4xMjMxNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2404.12314"},{"id":"eyJwYXBlcklEIjoiMTkxMS4wMDc4MiIsInB1Ymxpc2hlciI6ImFyeGl2I
n0=","publisher":"arxiv","paperID":"1911.00782"},{"id":"eyJwYXBlcklEIjoiMTkwOS4wNjMyMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1909.06322"},{"id":"eyJwYXBlcklEIjoiMjMwMy4wMjI1NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2303.02255"},{"id":"eyJwYXBlcklEIjoiMTgwOS4wODIwNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1809.08204"},{"id":"eyJwYXBlcklEIjoiMjEwNi4xMjAzNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.12034"},{"id":"eyJwYXBlcklEIjoiMjQwNC4xMDc3NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2404.10776"},{"id":"eyJwYXBlcklEIjoiMjMwMy4xMDE2NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2303.10165"},{"id":"eyJwYXBlcklEIjoiMjMwNS4wNjQ0NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2305.06446"},{"id":"eyJwYXBlcklEIjoiMjIwMy4wODk2MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2203.08961"},{"id":"eyJwYXBlcklEIjoiMjMxMS4xNDIyMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2311.14222"},{"id":"eyJwYXBlcklEIjoiMjIwOC4wMTg1NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2208.01857"},{"id":"eyJwYXBlcklEIjoiMjQwNi4xNjI1NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2406.16255"},{"id":"eyJwYXBlcklEIjoiMjMwNi4xMTY4MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2306.11680"},{"id":"eyJwYXBlcklEIjoiMjEwNi4xMzc5MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.13792"},{"id":"eyJwYXBlcklEIjoiMjIxMi4wNTk0OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2212.05949"},{"id":"eyJwYXBlcklEIjoiMjMwNS4wODM1OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2305.08359"},{"id":"eyJwYXBlcklEIjoiMTYwNi4wMDgzMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1606.00832"},{"id":"eyJwYXBlcklEIjoiMjExMC4wMDEzNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.00137"},{"id":"eyJwYXBlcklEIjoiMjExMC4
xMzE0NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.13144"},{"id":"eyJwYXBlcklEIjoiMjExMC4wNDEzNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.04136"},{"id":"eyJwYXBlcklEIjoiMjMwNS4wODM1MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2305.08350"},{"id":"eyJwYXBlcklEIjoiNTI4OTQiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"52894"},{"id":"eyJwYXBlcklEIjoiNTMwMjUiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"53025"},{"id":"eyJwYXBlcklEIjoiNTM1MjYiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"53526"},{"id":"eyJwYXBlcklEIjoiNTM1NDMiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"53543"},{"id":"eyJwYXBlcklEIjoiNTM0MDEiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"53401"},{"id":"eyJwYXBlcklEIjoiNTM5OTQiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"53994"},{"id":"eyJwYXBlcklEIjoiNTM5NDQiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"53944"},{"id":"eyJwYXBlcklEIjoiNTM1OTkiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"53599"},{"id":"eyJwYXBlcklEIjoiNTQ0MTciLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"54417"},{"id":"eyJwYXBlcklEIjoiMjQwMi4wODk5OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2402.08998"},{"id":"eyJwYXBlcklEIjoiNzI5MDEiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"72901"},{"id":"eyJwYXBlcklEIjoiMjMxMC4xODkzNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2310.18935"},{"id":"eyJwYXBlcklEIjoiNzIyMzAiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"72230"},{"id":"eyJwYXBlcklEIjoiNzI1OTkiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"72599"},{"id":"eyJwYXBlcklEIjoiNzIwMjciLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"72027"},{"id":"eyJwYXBlcklEIjoiNzEyNjIiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips",
"paperID":"71262"},{"id":"eyJwYXBlcklEIjoiMjMxMi4xNjc5MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2312.16793"},{"id":"eyJwYXBlcklEIjoiMjQwNC4xMDc0NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2404.10745"}]}]}},{"author":"dongruo zhou","node":{"id":"eyJhZGRyZXNzIjoiZHJ6aG91QGNzLnVjbGEuZWR1In0=","address":"drzhou@cs.ucla.edu","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[],"scholar":[{"thirdPartyID":"1780wr0AAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiIzYmRjZDQ1YS1iMjM1LTQzODUtYWI0Ni1hNzM5NDUwMWQ4MGQifQ==","name":"dongruo 
zhou","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMTgxMS4wODg4OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1811.08888"},{"id":"eyJwYXBlcklEIjoiMjEwMS4wMjE5NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2101.02195"},{"id":"eyJwYXBlcklEIjoiMTgwOC4wNTY3MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1808.05671"},{"id":"eyJwYXBlcklEIjoiMTgwNi4wNjc2MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1806.06763"},{"id":"eyJwYXBlcklEIjoiMTgwNi4wNzgxMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1806.07811"},{"id":"eyJwYXBlcklEIjoiMjAwNi4xMzE2NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.13165"},{"id":"eyJwYXBlcklEIjoiMjAxMi4wODUwNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2012.08507"},{"id":"eyJwYXBlcklEIjoiMjAxMS4xMTU2NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2011.11566"},{"id":"eyJwYXBlcklEIjoiMTgwMi4wNDc5NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1802.04796"},{"id":"eyJwYXBlcklEIjoiMjAxMC4wMDgyNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.00827"},{"id":"eyJwYXBlcklEIjoiMTkwMS4xMTIyNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1901.11224"},{"id":"eyJwYXBlcklEIjoiMTkxMS4wNDQ2MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1911.04462"},{"id":"eyJwYXBlcklEIjoiMTgxMS4xMDgyOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1811.10828"},{"id":"eyJwYXBlcklEIjoiMTgwNi4wODc4MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1806.08782"},{"id":"eyJwYXBlcklEIjoiMjExMC4xMjYxNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.12615"},{"id":"eyJwYXBlcklEIjoiMjEwNi4xMTkzNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.11935"},{"id":"eyJwYXBlcklEIjoiMjEwMi4xMzAyOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.13028"},{"id":"eyJwYXBlcklEIjoiMjEwMi4wNzQwNCI
sInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.07404"},{"id":"eyJwYXBlcklEIjoiMjIwNS4wNjgxMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2205.06811"},{"id":"eyJwYXBlcklEIjoiMjIxMi4wNjEzMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2212.06132"},{"id":"eyJwYXBlcklEIjoiMjIwNS4xMTUwNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2205.11507"},{"id":"eyJwYXBlcklEIjoiMTkwMS4xMTUxOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1901.11518"},{"id":"eyJwYXBlcklEIjoiMjAxMC4wMDU4NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.00587"},{"id":"eyJwYXBlcklEIjoiMjMwMi4xMDM3MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2302.10371"},{"id":"eyJwYXBlcklEIjoiMjEwMi4wNzMwMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.07301"},{"id":"eyJwYXBlcklEIjoiMTgxMS4xMTk4OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1811.11989"},{"id":"eyJwYXBlcklEIjoiMjIwMi4xMzYwMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2202.13603"},{"id":"eyJwYXBlcklEIjoiMjEwNi4xMTk2MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.11960"},{"id":"eyJwYXBlcklEIjoiMjEwMi4wODk0MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.08940"},{"id":"eyJwYXBlcklEIjoiMjAxMS4xMDEzNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2011.10134"},{"id":"eyJwYXBlcklEIjoiMjExMC4wNjM5NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.06394"},{"id":"eyJwYXBlcklEIjoiMjIwMS4wOTkxMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2201.09910"},{"id":"eyJwYXBlcklEIjoiMjEwNi4xMTYxMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.11612"},{"id":"eyJwYXBlcklEIjoiMjEwNi4xMjAzNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.12034"},{"id":"eyJwYXBlcklEIjoiMjExMC4wMDEzNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.00137"},{"id
":"eyJwYXBlcklEIjoiMjExMC4xMzE0NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.13144"},{"id":"eyJwYXBlcklEIjoiMjExMi4wNDIxNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2112.04216"},{"id":"eyJwYXBlcklEIjoiNTM5OTQiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"53994"},{"id":"eyJwYXBlcklEIjoiNTM5NDQiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"53944"},{"id":"eyJwYXBlcklEIjoiNTM1OTkiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"53599"}]}]}}]},"__typename":"paper","authorArray":["Dongruo Zhou","Quanquan Gu"]}}],["$","$L18",null,{"container":true,"columns":100,"spacing":{"compact":0,"expanded":2,"large":3},"children":[["$","$L18",null,{"size":{"compact":100,"expanded":100,"large":68},"children":[["$","$7",null,{"children":["$","$L2c",null,{"publisher":"arxiv","paperID":"1901.11518","product":{"paper":"$1b:props:children:props:children:0:props:product","models":"$1b:props:children:props:children:0:props:product:models"},"isWhiteLabelled":false}]}],["$","$7",null,{"children":["$","$L2d",null,{"article":"$L2e","model":"$undefined"}]}]]}],["$","$L18",null,{"size":"grow","children":["$","$L2f",null,{}]}]]}],["$","$7",null,{"children":null}],[["$","audio",null,{"id":"tts"}],["$","$L30",null,{"paperID":"1901.11518","publisher":"arxiv","paperJSON":{"title":"Stochastic Recursive Variance-Reduced Cubic Regularization Methods","paperID":"1901.11518","avgLineHeight":14.24,"imgScale":4,"sections":[{"heading":"Abstract","paragraphs":[[{"text":"Stochastic Variance-Reduced Cubic regularization (SVRC) algorithms have received increasing attention due to its improved gradient/Hessian complexities (i.e., number of queries to stochastic gradient/Hessian oracles) to find local minima for nonconvex finite-sum optimization. However, it is unclear whether existing SVRC algorithms can be further improved. 
","element":"span"},{"text":"Moreover, the semi-stochastic Hessian estimator adopted in existing SVRC algorithms prevents the use of Hessian-vector product-based fast cubic subproblem solvers, which makes SVRC algorithms computationally intractable for high-dimensional problems. In this paper, we first present a Stochastic Recursive Variance-Reduced Cubic regularization method (SRVRC) using a recursively updated semi-stochastic gradient and Hessian estimators. It enjoys improved gradient and Hessian complexities to find an (","element":"span"},{"style":{"height":16},"width":87.1,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/0-0.png","element":"img","alt":"ϵ, √ϵ","inline":true},{"text":")-approximate local minimum, and outperforms the state-of-the-art SVRC algorithms. Built upon SRVRC, we further propose a Hessian-free SRVRC algorithm, namely SRVRC","element":"span"},{"style":{"height":7.6},"width":50.48,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/0-1.png","element":"img","alt":"free","inline":true},{"text":", which only needs ","element":"span"},{"style":{"height":17.39},"width":236.5,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/0-2.png","element":"img","alt":"�O(nϵ−2 ∧ ϵ−3","inline":true},{"text":") stochastic gradient and Hessian-vector product computations, where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n ","element":"span"},{"text":"is the number of component functions in the finite-sum objective and ","element":"span"},{"style":{"height":0},"width":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/0-3.png","element":"img","alt":" ϵ","inline":true,"padRight":true},{"text":"is the optimization precision. 
This outperforms the best-known result ","element":"span"},{"style":{"height":17.79},"width":308.37,"height":44.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/0-4.png","element":"img","alt":"�O(ϵ−3.5) achieved","inline":true,"padRight":true},{"text":"by stochastic cubic regularization algorithm proposed in ","element":"span"},{"href":"#id-0","referenceIndex":45,"text":"Tripuraneni et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-0","referenceIndex":45,"text":"2018","element":"a"},{"text":").","element":"span"}]]},{"heading":"1 Introduction","paragraphs":[[{"text":"Many machine learning problems can be formulated as empirical risk minimization, which is in the form of finite-sum optimization as follows:","element":"span"}],[{"id":"id-2","style":{"width":"64%"},"width":1213,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/0-5.png","element":"img"}],[{"text":"where each ","element":"span"},{"style":{"height":19.13},"width":222.29,"height":47.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/0-6.png","element":"img","alt":" fi : Rd → R","inline":true,"padRight":true},{"text":"can be a convex or nonconvex function. In this paper, we are particularly interested in nonconvex finite-sum optimization, where each ","element":"span"},{"style":{"height":16.4},"width":33.37,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/0-7.png","element":"img","alt":" fi","inline":true,"padRight":true},{"text":"is nonconvex. This is often the case for deep learning (","element":"span"},{"href":"#id-1","referenceIndex":34,"text":"LeCun et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-1","referenceIndex":34,"text":"2015","element":"a"},{"text":"). 
In principle, it is hard to find the global minimum of (","element":"span"},{"href":"#id-2","text":"1.1","element":"a"},{"text":") because of the NP-hardness of the problem (","element":"span"},{"href":"#id-3","referenceIndex":27,"text":"Hillar and Lim","element":"a"},{"text":", ","element":"span"},{"href":"#id-3","referenceIndex":27,"text":"2013","element":"a"},{"text":"), thus it is reasonable to resort to finding local minima (a.k.a., second-order stationary points). It has been shown that local minima can be the global minima in certain machine learning problems, such as low-rank matrix factorization (","element":"span"},{"href":"#id-4","referenceIndex":24,"text":"Ge et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-4","referenceIndex":24,"text":"2016","element":"a"},{"text":"; ","element":"span"},{"href":"#id-5","referenceIndex":6,"text":"Bhojanapalli et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-5","referenceIndex":6,"text":"2016","element":"a"},{"text":"; ","element":"span"},{"href":"#id-6","referenceIndex":55,"text":"Zhang et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-6","referenceIndex":55,"text":"2018b","element":"a"},{"text":") and training deep ","element":"span"},{"text":"linear neural networks (","element":"span"},{"href":"#id-7","referenceIndex":32,"text":"Kawaguchi","element":"a"},{"text":", ","element":"span"},{"href":"#id-7","referenceIndex":32,"text":"2016","element":"a"},{"text":"; ","element":"span"},{"href":"#id-8","referenceIndex":26,"text":"Hardt and Ma","element":"a"},{"text":", ","element":"span"},{"href":"#id-8","referenceIndex":26,"text":"2016","element":"a"},{"text":"). Therefore, developing algorithms to find local minima is important both in theory and in practice. 
More specifically, we define an (","element":"span"},{"style":{"height":12.62},"width":104.08,"height":31.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/1-0.png","element":"img","alt":"ϵg, ϵH","inline":true},{"text":")-approximate local minimum ","element":"span"},{"style":{"fontWeight":"bold"},"text":"x ","element":"span"},{"text":"of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F","element":"span"},{"text":"(","element":"span"},{"style":{"fontWeight":"bold"},"text":"x","element":"span"},{"text":") as follows","element":"span"}],[{"id":"id-59","style":{"width":"70%"},"width":1314,"height":53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/1-1.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":14.62},"width":160.02,"height":36.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/1-2.png","element":"img","alt":" ϵg, ϵH >","inline":true,"padRight":true},{"text":"0 are predefined precision parameters. The most classic algorithm to find the approximate local minimum is cubic-regularized (CR) Newton method, which was originally proposed in the seminal paper by ","element":"span"},{"href":"#id-9","referenceIndex":36,"text":"Nesterov and Polyak ","element":"a"},{"text":"(","element":"span"},{"href":"#id-9","referenceIndex":36,"text":"2006","element":"a"},{"text":"). Generally speaking, in the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k","element":"span"},{"text":"-th iteration, cubic regularization method solves a subproblem, which minimizes a cubic-regularized second-order Taylor expansion at the current iterate ","element":"span"},{"style":{"height":10.84},"width":44.48,"height":27.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/1-3.png","element":"img","alt":" xk","inline":true},{"text":". 
The update rule can be written as follows:","element":"span"}],[{"id":"id-22","style":{"width":"79%"},"width":1489,"height":155,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/1-4.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M > ","element":"span"},{"text":"0 is a penalty parameter. ","element":"span"},{"href":"#id-9","referenceIndex":36,"text":"Nesterov and Polyak ","element":"a"},{"text":"(","element":"span"},{"href":"#id-9","referenceIndex":36,"text":"2006","element":"a"},{"text":") proved that to find an (","element":"span"},{"style":{"height":17.6},"width":91.47,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/1-5.png","element":"img","alt":"ϵ, √ϵ","inline":true},{"text":")-approximate local minimum of a nonconvex function ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F","element":"span"},{"text":", cubic regularization requires at most ","element":"span"},{"style":{"height":20.33},"width":164.99,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/1-6.png","element":"img","alt":" O(ϵ−3/2)","inline":true,"padRight":true},{"text":"iterations. 
However, when applying cubic regularization to nonconvex finite-sum optimization in (","element":"span"},{"href":"#id-2","text":"1.1","element":"a"},{"text":"), a major bottleneck of cubic regularization is that it needs to compute ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n ","element":"span"},{"text":"individual gradients ","element":"span"},{"style":{"height":17.2},"width":132.37,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/1-7.png","element":"img","alt":"∇fi(xk","inline":true},{"text":") and Hessian matrices ","element":"span"},{"style":{"height":19.14},"width":151.3,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/1-8.png","element":"img","alt":" ∇2fi(xk","inline":true},{"text":") at each iteration, which leads to a total ","element":"span"},{"style":{"height":20.34},"width":358.81,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/1-9.png","element":"img","alt":" O(nϵ−3/2) gradient","inline":true,"padRight":true},{"text":"complexity (i.e., number of queries to the stochastic gradient oracle ","element":"span"},{"style":{"height":17.6},"width":565.36,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/1-10.png","element":"img","alt":" ∇fi(x) for some i and x) and","inline":true},{"style":{"height":20.33},"width":172.55,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/1-11.png","element":"img","alt":"O(nϵ−3/2","inline":true},{"text":") Hessian complexity (i.e., number of queries to the stochastic Hessian oracle ","element":"span"},{"style":{"height":19.13},"width":217.23,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/1-12.png","element":"img","alt":" ∇2fi(x) for","inline":true,"padRight":true},{"text":"some ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i ","element":"span"},{"text":"and 
","element":"span"},{"style":{"fontWeight":"bold"},"text":"x","element":"span"},{"text":"). Such computational overhead will be extremely expensive when ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n ","element":"span"},{"text":"is large as is in many large-scale machine learning applications.","element":"span"}],[{"text":"To overcome the aforementioned computational burden of cubic regularization, ","element":"span"},{"href":"#id-10","referenceIndex":33,"text":"Kohler and ","element":"a"},{"href":"#id-10","referenceIndex":33,"text":"Lucchi ","element":"a"},{"text":"(","element":"span"},{"href":"#id-10","referenceIndex":33,"text":"2017","element":"a"},{"text":"); ","element":"span"},{"href":"#id-11","referenceIndex":50,"text":"Xu et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-11","referenceIndex":50,"text":"2017","element":"a"},{"text":") used subsampled gradient and subsampled Hessian, which achieve ","element":"span"},{"style":{"height":20.33},"width":307.82,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/1-13.png","element":"img","alt":"�O(nϵ−3/2 ∧ϵ−7/2","inline":true},{"text":") gradient complexity and ","element":"span"},{"style":{"height":20.33},"width":307.82,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/1-14.png","element":"img","alt":" �O(nϵ−3/2 ∧ϵ−5/2","inline":true},{"text":") Hessian complexity. ","element":"span"},{"href":"#id-12","referenceIndex":59,"text":"Zhou et al. 
","element":"a"},{"text":"(","element":"span"},{"href":"#id-12","referenceIndex":59,"text":"2018d","element":"a"},{"text":") proposed a stochastic variance reduced cubic regularization method (SVRC), which uses novel semi-stochastic gradient and semi-stochastic Hessian estimators inspired by variance reduction for first-order finite-sum optimization (","element":"span"},{"href":"#id-13","referenceIndex":31,"text":"Johnson and Zhang","element":"a"},{"text":", ","element":"span"},{"href":"#id-13","referenceIndex":31,"text":"2013","element":"a"},{"text":"; ","element":"span"},{"href":"#id-14","referenceIndex":40,"text":"Reddi et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-14","referenceIndex":40,"text":"2016","element":"a"},{"text":"; ","element":"span"},{"href":"#id-15","referenceIndex":4,"text":"Allen-Zhu ","element":"a"},{"href":"#id-15","referenceIndex":4,"text":"and Hazan","element":"a"},{"text":", ","element":"span"},{"href":"#id-15","referenceIndex":4,"text":"2016","element":"a"},{"text":"), which attains ","element":"span"},{"style":{"height":20.33},"width":225.58,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/1-15.png","element":"img","alt":" O(n4/5ϵ−3/2","inline":true},{"text":") Second-order Oracle (SO) complexity","element":"span"},{"style":{"height":8.4},"width":17,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/1-16.png","element":"img","alt":"1","inline":true},{"text":". ","element":"span"},{"href":"#id-16","referenceIndex":57,"text":"Zhou et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-16","referenceIndex":57,"text":"2018b","element":"a"},{"text":"); ","element":"span"},{"href":"#id-17","referenceIndex":48,"text":"Wang et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-17","referenceIndex":48,"text":"2018b","element":"a"},{"text":"); ","element":"span"},{"href":"#id-18","referenceIndex":54,"text":"Zhang et al. 
","element":"a"},{"text":"(","element":"span"},{"href":"#id-18","referenceIndex":54,"text":"2018a","element":"a"},{"text":") used a simpler semi-stochastic gradient compared with (","element":"span"},{"href":"#id-12","referenceIndex":59,"text":"Zhou et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-12","referenceIndex":59,"text":"2018d","element":"a"},{"text":"), and semi-stochastic Hessian, which achieves a better Hessian complexity, i.e., ","element":"span"},{"style":{"height":20.33},"width":225.73,"height":50.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/1-17.png","element":"img","alt":" O(n2/3ϵ−3/2","inline":true},{"text":"). However, it is unclear whether the gradient and Hessian complexities of the aforementioned SVRC algorithms can be further improved. Furthermore, all these algorithms need to use the semi-stochastic Hessian estimator, which is not compatible with Hessian-vector product-based cubic subproblem solvers (","element":"span"},{"href":"#id-19","referenceIndex":1,"text":"Agarwal et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-19","referenceIndex":1,"text":"2017","element":"a"},{"text":"; ","element":"span"},{"href":"#id-20","referenceIndex":8,"text":"Carmon and Duchi","element":"a"},{"text":", ","element":"span"},{"href":"#id-20","referenceIndex":8,"text":"2016","element":"a"},{"text":", ","element":"span"},{"href":"#id-21","referenceIndex":9,"text":"2018","element":"a"},{"text":"). 
Therefore, the cubic subproblem (","element":"span"},{"href":"#id-22","text":"1.4","element":"a"},{"text":") in each iteration of existing SVRC algorithms has to be solved by computing the inverse of the Hessian matrix, whose computational complexity is at least ","element":"span"},{"style":{"height":19.13},"width":148.56,"height":47.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/1-18.png","element":"img","alt":" O(dw)2.","inline":true,"padRight":true},{"text":"This makes existing SVRC algorithms not very practical for high-dimensional problems.","element":"span"}],[{"text":"In this paper, we first show that the gradient and Hessian complexities of SVRC-type algorithms can be further improved. The core idea is to use novel recursively updated semi-stochastic gradient and Hessian estimators, which are inspired by the stochastic path-integrated differential estimator (SPIDER) (","element":"span"},{"href":"#id-23","referenceIndex":20,"text":"Fang et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-23","referenceIndex":20,"text":"2018","element":"a"},{"text":") and the StochAstic Recursive grAdient algoritHm (SARAH) (","element":"span"},{"href":"#id-24","referenceIndex":37,"text":"Nguyen ","element":"a"},{"href":"#id-24","referenceIndex":37,"text":"et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-24","referenceIndex":37,"text":"2017","element":"a"},{"text":") for first-order optimization. We show that such kind of estimators can be extended to second-order optimization to reduce the Hessian complexity. Nevertheless, our analysis is very different from that in ","element":"span"},{"href":"#id-23","referenceIndex":20,"text":"Fang et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-23","referenceIndex":20,"text":"2018","element":"a"},{"text":"); ","element":"span"},{"href":"#id-24","referenceIndex":37,"text":"Nguyen et al. 
","element":"a"},{"text":"(","element":"span"},{"href":"#id-24","referenceIndex":37,"text":"2017","element":"a"},{"text":"), because we study a fundamentally different optimization problem (i.e., finding local minima against finding first-order stationary points) and a completely different optimization algorithm (i.e., cubic regularization versus gradient method). In addition, in order to reduce the runtime complexity of existing SVRC algorithms, we further propose a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Hessian-free ","element":"span"},{"text":"SVRC method that can not only use the novel semi-stochastic gradient estimator, but also leverage the Hessian-vector product-based fast cubic subproblem solvers. Experiments on benchmark nonconvex finite-sum optimization problems illustrate the superiority of our newly proposed SVRC algorithms over the state-of-the-art (Due to space limit, we include the experiments in Appendix ","element":"span"},{"text":"6","element":"span"},{"text":").","element":"span"}],[{"text":"In detail, our contributions are summarized as follows:","element":"span"}],[{"text":"1. 
We propose a new SVRC algorithm, namely SRVRC, which can find an (","element":"span"},{"style":{"height":17.6},"width":91.47,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/2-0.png","element":"img","alt":"ϵ, √ϵ","inline":true},{"text":")-approximate local minimum with ","element":"span"},{"style":{"height":20.33},"width":284.56,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/2-1.png","element":"img","alt":"�O(nϵ−3/2 ∧ ϵ−3","inline":true},{"text":") gradient complexity and ","element":"span"},{"style":{"height":19.13},"width":187.7,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/2-2.png","element":"img","alt":"�O(n ∧ ϵ−1","inline":true,"padRight":true},{"text":"+ ","element":"span"},{"style":{"height":16.33},"width":285.56,"height":40.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/2-3.png","element":"img","alt":" n1/2ϵ−3/2 ∧ ϵ−2","inline":true},{"text":") Hessian complexity. Compared with previous work in cubic regularization, the gradient and Hessian complexity of SRVRC is strictly better than the algorithms in ","element":"span"},{"href":"#id-16","referenceIndex":57,"text":"Zhou et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-16","referenceIndex":57,"text":"2018b","element":"a"},{"text":"); ","element":"span"},{"href":"#id-17","referenceIndex":48,"text":"Wang et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-17","referenceIndex":48,"text":"2018b","element":"a"},{"text":"); ","element":"span"},{"href":"#id-18","referenceIndex":54,"text":"Zhang et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-18","referenceIndex":54,"text":"2018a","element":"a"},{"text":"), and better than that in ","element":"span"},{"href":"#id-12","referenceIndex":59,"text":"Zhou et al. 
","element":"a"},{"text":"(","element":"span"},{"href":"#id-12","referenceIndex":59,"text":"2018d","element":"a"},{"text":"); ","element":"span"},{"href":"#id-25","referenceIndex":44,"text":"Shen et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-25","referenceIndex":44,"text":"2019","element":"a"},{"text":") in a wide regime.","element":"span"}],[{"text":"2. We further propose a new algorithm SRVRC","element":"span"},{"style":{"height":8.8},"width":53.6,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/2-4.png","element":"img","alt":"free","inline":true},{"text":", which requires ","element":"span"},{"style":{"height":19.13},"width":251.86,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/2-5.png","element":"img","alt":"�O(ϵ−3 ∧ nϵ−2","inline":true},{"text":") stochastic gra-dient and Hessian-vector product computations to find an (","element":"span"},{"style":{"height":17.6},"width":91.47,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/2-6.png","element":"img","alt":"ϵ, √ϵ","inline":true},{"text":")-approximate local minimum. 
SRVRC","element":"span"},{"style":{"height":8.8},"width":53.6,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/2-7.png","element":"img","alt":"free","inline":true,"padRight":true},{"text":"is strictly better than the algorithms in (","element":"span"},{"href":"#id-19","referenceIndex":1,"text":"Agarwal et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-19","referenceIndex":1,"text":"2017","element":"a"},{"text":"; ","element":"span"},{"href":"#id-20","referenceIndex":8,"text":"Carmon and Duchi","element":"a"},{"text":", ","element":"span"},{"href":"#id-20","referenceIndex":8,"text":"2016","element":"a"},{"text":"; ","element":"span"},{"href":"#id-0","referenceIndex":45,"text":"Tripuraneni et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-0","referenceIndex":45,"text":"2018","element":"a"},{"text":") when ","element":"span"},{"style":{"height":11.2},"width":82.31,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/2-8.png","element":"img","alt":" n ≫","inline":true,"padRight":true},{"text":"1. The runtime complexity of SRVRC","element":"span"},{"style":{"height":8.8},"width":53.6,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/2-9.png","element":"img","alt":"free","inline":true,"padRight":true},{"text":"is also better than that of SRVRC when the problem dimension ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d ","element":"span"},{"text":"is large.","element":"span"}],[{"text":"In an independent and concurrent work (","element":"span"},{"href":"#id-25","referenceIndex":44,"text":"Shen et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-25","referenceIndex":44,"text":"2019","element":"a"},{"text":"), two stochastic trust region methods namely STR1 and STR2 were proposed, which are based on the same idea of variance reduction using SPIDER, and are related to our first algorithm SRVRC. 
Our SRVRC is better than STR1 because it enjoys the same Hessian complexity but a better gradient complexity than STR1. Compared with STR2, our SRVRC has a consistently lower Hessian complexity and lower gradient complexity in a wide regime (i.e., ","element":"span"},{"style":{"height":17.14},"width":202.1,"height":42.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/2-10.png","element":"img","alt":" ϵ ≫ n−1/2","inline":true},{"text":"). Since Hessian complexity is the dominating term in cubic regularization method (","element":"span"},{"href":"#id-16","referenceIndex":57,"text":"Zhou et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-16","referenceIndex":57,"text":"2018b","element":"a"},{"text":"; ","element":"span"},{"href":"#id-17","referenceIndex":48,"text":"Wang et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-17","referenceIndex":48,"text":"2018b","element":"a"},{"text":"), our SRVRC is arguably better than STR2, as verified by our experiments.","element":"span"}],[{"text":"For the ease of comparison, we summarize the comparison of methods which need to compute the Hessian explicitly in Table ","element":"span"},{"href":"#id-26","text":"1","element":"a"},{"text":", the Hessian-free or Hessian-vector product based methods in Table ","element":"span"},{"href":"#id-27","text":"2","element":"a"},{"text":".","element":"span"}],[{"id":"id-26","text":"Table 1: Comparisons of different methods to find an (","element":"figcaption","subtype":"caption"},{"style":{"height":17.6},"width":114.03,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/3-0.png","element":"img","alt":"ϵ, √ρϵ","inline":true},{"text":")-local minimum on gradient and Hessian complexity.","element":"figcaption","subtype":"caption"}],[{"style":{"width":"95%"},"width":1784,"height":971,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/3-1.png","element":"img"}]]},{"heading":"2 Additional Related 
Work","paragraphs":[[{"text":"In this section, we review additional related work that is not discussed in the introduction section. ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Cubic Regularization and Trust-Region Methods ","element":"span"},{"text":"Since cubic regularization was first proposed by ","element":"span"},{"href":"#id-9","referenceIndex":36,"text":"Nesterov and Polyak ","element":"a"},{"text":"(","element":"span"},{"href":"#id-9","referenceIndex":36,"text":"2006","element":"a"},{"text":"), there has been a line of followup research. It was extended to adaptive regularized cubic methods (ARC) by ","element":"span"},{"href":"#id-28","referenceIndex":12,"text":"Cartis et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-28","referenceIndex":12,"text":"2011a","element":"a"},{"text":",","element":"span"},{"href":"#id-29","referenceIndex":15,"text":"b","element":"a"},{"text":"), which enjoy the same iteration complexity as standard cubic regularization while having better empirical performance. The first attempt to make cubic regularization a Hessian-free method was done by ","element":"span"},{"href":"#id-20","referenceIndex":8,"text":"Carmon and ","element":"a"},{"href":"#id-20","referenceIndex":8,"text":"Duchi ","element":"a"},{"text":"(","element":"span"},{"href":"#id-20","referenceIndex":8,"text":"2016","element":"a"},{"text":"), which solves the cubic sub-problem by gradient descent, requiring in total ","element":"span"},{"style":{"height":19.13},"width":138.92,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/3-2.png","element":"img","alt":"�O(nϵ−2","inline":true},{"text":") stochastic gradient and Hessian-vector product computations. ","element":"span"},{"href":"#id-19","referenceIndex":1,"text":"Agarwal et al. 
","element":"a"},{"text":"(","element":"span"},{"href":"#id-19","referenceIndex":1,"text":"2017","element":"a"},{"text":") solved cubic sub-problem by fast matrix inversion based on accelerated gradient descent, which requires ","element":"span"},{"style":{"height":20.33},"width":172.86,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/3-3.png","element":"img","alt":"�O(nϵ−3/2","inline":true,"padRight":true},{"text":"+ ","element":"span"},{"style":{"height":16.33},"width":173.92,"height":40.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/3-4.png","element":"img","alt":" n3/4ϵ−7/4","inline":true},{"text":") stochastic gradient and Hessian-vector product computations. In the pure stochastic optimization setting, ","element":"span"},{"href":"#id-0","referenceIndex":45,"text":"Tripuraneni et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-0","referenceIndex":45,"text":"2018","element":"a"},{"text":") proposed stochastic cubic regularization method, which uses subsampled gradient and Hessian-vector product-based cubic subproblem solver, and requires ","element":"span"},{"style":{"height":19.14},"width":139.21,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/3-5.png","element":"img","alt":"�O(ϵ−3.5","inline":true},{"text":") stochastic gradient and Hessian-vector product computations. 
A closely related second-order method to cubic regularization methods are trust-region methods (","element":"span"},{"href":"#id-30","referenceIndex":17,"text":"Conn et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-30","referenceIndex":17,"text":"2000","element":"a"},{"text":"; ","element":"span"},{"href":"#id-31","referenceIndex":11,"text":"Cartis et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-31","referenceIndex":11,"text":"2009","element":"a"},{"text":", ","element":"span"},{"href":"#id-32","referenceIndex":13,"text":"2012","element":"a"},{"text":", ","element":"span"},{"href":"#id-33","referenceIndex":14,"text":"2013","element":"a"},{"text":"). Recent studies (","element":"span"},{"href":"#id-34","referenceIndex":7,"text":"Blanchet et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-34","referenceIndex":7,"text":"2016","element":"a"},{"text":"; ","element":"span"},{"href":"#id-35","referenceIndex":18,"text":"Curtis et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-35","referenceIndex":18,"text":"2017","element":"a"},{"text":"; ","element":"span"},{"href":"#id-36","referenceIndex":35,"text":"Mart´ınez and Raydan","element":"a"},{"text":", ","element":"span"},{"href":"#id-36","referenceIndex":35,"text":"2017","element":"a"},{"text":") proved that the trust-region method can achieve the same iteration complexity as the cubic regularization method. ","element":"span"},{"href":"#id-11","referenceIndex":50,"text":"Xu et al. 
","element":"a"},{"text":"(","element":"span"},{"href":"#id-11","referenceIndex":50,"text":"2017","element":"a"},{"text":") also extended trust-region method to subsampled trust-region method for nonconvex finite-sum optimization.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Local Minima Finding ","element":"span"},{"text":"Besides cubic regularization and trust-region type methods, there is","element":"span"}],[{"id":"id-27","text":"Table 2: Comparisons of different methods to find an (","element":"figcaption","subtype":"caption"},{"style":{"height":17.6},"width":114.03,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/4-0.png","element":"img","alt":"ϵ, √ρϵ","inline":true},{"text":")-local minimum both on stochastic gradient and Hessian-vector product computations.","element":"figcaption","subtype":"caption"}],[{"style":{"width":"65%"},"width":1236,"height":862,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/4-1.png","element":"img"}],[{"text":"another line of research for finding approximate local minima, which is based on first-order optimization. ","element":"span"},{"href":"#id-37","referenceIndex":23,"text":"Ge et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-37","referenceIndex":23,"text":"2015","element":"a"},{"text":"); ","element":"span"},{"href":"#id-38","referenceIndex":28,"text":"Jin et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-38","referenceIndex":28,"text":"2017a","element":"a"},{"text":") proved that (stochastic) gradient methods with additive noise are able to escape from nondegenerate saddle points and find approximate local minima. ","element":"span"},{"href":"#id-39","referenceIndex":10,"text":"Carmon et al. 
","element":"a"},{"text":"(","element":"span"},{"href":"#id-39","referenceIndex":10,"text":"2018","element":"a"},{"text":"); ","element":"span"},{"href":"#id-40","referenceIndex":42,"text":"Royer and Wright ","element":"a"},{"text":"(","element":"span"},{"href":"#id-40","referenceIndex":42,"text":"2017","element":"a"},{"text":"); ","element":"span"},{"href":"#id-41","referenceIndex":2,"text":"Allen-Zhu ","element":"a"},{"text":"(","element":"span"},{"href":"#id-41","referenceIndex":2,"text":"2017","element":"a"},{"text":"); ","element":"span"},{"href":"#id-42","referenceIndex":51,"text":"Xu et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-42","referenceIndex":51,"text":"2018","element":"a"},{"text":"); ","element":"span"},{"href":"#id-43","referenceIndex":5,"text":"Allen-Zhu and ","element":"a"},{"href":"#id-43","referenceIndex":5,"text":"Li ","element":"a"},{"text":"(","element":"span"},{"href":"#id-43","referenceIndex":5,"text":"2018","element":"a"},{"text":"); ","element":"span"},{"href":"#id-44","referenceIndex":30,"text":"Jin et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-44","referenceIndex":30,"text":"2017b","element":"a"},{"text":"); ","element":"span"},{"href":"#id-45","referenceIndex":53,"text":"Yu et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-45","referenceIndex":53,"text":"2017","element":"a"},{"text":", ","element":"span"},{"href":"#id-46","referenceIndex":52,"text":"2018","element":"a"},{"text":"); ","element":"span"},{"href":"#id-47","referenceIndex":56,"text":"Zhou et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-47","referenceIndex":56,"text":"2018a","element":"a"},{"text":"); ","element":"span"},{"href":"#id-23","referenceIndex":20,"text":"Fang et al. 
","element":"a"},{"text":"(","element":"span"},{"href":"#id-23","referenceIndex":20,"text":"2018","element":"a"},{"text":") showed that by alternating first-order optimization and Hessian-vector product based negative curvature descent, one can find approximate local minima even more efficiently. Very recently, ","element":"span"},{"href":"#id-48","referenceIndex":21,"text":"Fang et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-48","referenceIndex":21,"text":"2019","element":"a"},{"text":"); ","element":"span"},{"href":"#id-49","referenceIndex":29,"text":"Jin et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-49","referenceIndex":29,"text":"2019","element":"a"},{"text":") showed that stochastic gradient descent itself can escape from saddle points. ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Variance Reduction ","element":"span"},{"text":"Variance reduction techniques play an important role in our proposed algorithms. Variance reduction techniques were first proposed for convex finite-sum optimization, which use semi-stochastic gradient to reduce the variance of the stochastic gradient and improve the gradient complexity. 
Representative algorithms include Stochastic Average Gradient (SAG) (","element":"span"},{"href":"#id-50","referenceIndex":41,"text":"Roux et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-50","referenceIndex":41,"text":"2012","element":"a"},{"text":"), Stochastic Variance Reduced Gradient (SVRG) (","element":"span"},{"href":"#id-13","referenceIndex":31,"text":"Johnson and Zhang","element":"a"},{"text":", ","element":"span"},{"href":"#id-13","referenceIndex":31,"text":"2013","element":"a"},{"text":"; ","element":"span"},{"href":"#id-51","referenceIndex":49,"text":"Xiao ","element":"a"},{"href":"#id-51","referenceIndex":49,"text":"and Zhang","element":"a"},{"text":", ","element":"span"},{"href":"#id-51","referenceIndex":49,"text":"2014","element":"a"},{"text":"), SAGA (","element":"span"},{"href":"#id-52","referenceIndex":19,"text":"Defazio et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-52","referenceIndex":19,"text":"2014","element":"a"},{"text":") and SARAH (","element":"span"},{"href":"#id-24","referenceIndex":37,"text":"Nguyen et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-24","referenceIndex":37,"text":"2017","element":"a"},{"text":"), to mention a few. For nonconvex finite-sum optimization problems, ","element":"span"},{"href":"#id-53","referenceIndex":22,"text":"Garber and Hazan ","element":"a"},{"text":"(","element":"span"},{"href":"#id-53","referenceIndex":22,"text":"2015","element":"a"},{"text":"); ","element":"span"},{"href":"#id-54","referenceIndex":43,"text":"Shalev-Shwartz ","element":"a"},{"text":"(","element":"span"},{"href":"#id-54","referenceIndex":43,"text":"2016","element":"a"},{"text":") studied the case where each individual function is nonconvex, but their sum is still (strongly) convex. ","element":"span"},{"href":"#id-14","referenceIndex":40,"text":"Reddi et al. 
","element":"a"},{"text":"(","element":"span"},{"href":"#id-14","referenceIndex":40,"text":"2016","element":"a"},{"text":"); ","element":"span"},{"href":"#id-15","referenceIndex":4,"text":"Allen-Zhu and Hazan ","element":"a"},{"text":"(","element":"span"},{"href":"#id-15","referenceIndex":4,"text":"2016","element":"a"},{"text":") extended SVRG to nonconvex finite-sum optimization, which is able to converge to a first-order stationary point with better gradient complexity than vanilla gradient descent. ","element":"span"},{"href":"#id-23","referenceIndex":20,"text":"Fang et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-23","referenceIndex":20,"text":"2018","element":"a"},{"text":"); ","element":"span"},{"href":"#id-55","referenceIndex":58,"text":"Zhou et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-55","referenceIndex":58,"text":"2018c","element":"a"},{"text":"); ","element":"span"},{"href":"#id-56","referenceIndex":47,"text":"Wang et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-56","referenceIndex":47,"text":"2018a","element":"a"},{"text":"); ","element":"span"},{"href":"#id-57","referenceIndex":38,"text":"Nguyen ","element":"a"},{"href":"#id-57","referenceIndex":38,"text":"et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-57","referenceIndex":38,"text":"2019","element":"a"},{"text":") further improved the gradient complexity for nonconvex finite-sum optimization to be (near) optimal.","element":"span"}]]},{"heading":"3 Notation and Preliminaries","paragraphs":[[{"style":{"width":"100%"},"width":1879,"height":554,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/5-0.png","element":"img"}],[{"text":"and the minimal function value is bounded.","element":"span"}],[{"id":"id-61","style":{"fontWeight":"bold"},"text":"Assumption 3.1. 
","element":"span"},{"text":"For any function ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F","element":"span"},{"text":"(","element":"span"},{"style":{"fontWeight":"bold"},"text":"x","element":"span"},{"text":") and an initial point ","element":"span"},{"style":{"height":10.62},"width":43.48,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/5-1.png","element":"img","alt":" x0","inline":true},{"text":", there exists a constant 0 ","element":"span"},{"style":{"height":15.1},"width":155.95,"height":37.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/5-2.png","element":"img","alt":" < ∆F <","inline":true},{"style":{"height":18.26},"width":793.11,"height":45.65,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/5-3.png","element":"img","alt":"∞ such that F(x0) − infx∈Rd F(x) ≤ ∆F .","inline":true}],[{"text":"We also need the following ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L","element":"span"},{"text":"-gradient Lipschitz and ","element":"span"},{"style":{"height":12},"width":23,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/5-4.png","element":"img","alt":" ρ","inline":true},{"text":"-Hessian Lipschitz assumption.","element":"span"}],[{"id":"id-62","style":{"fontWeight":"bold"},"text":"Assumption 3.2. 
","element":"span"},{"text":"For each ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":", we assume that ","element":"span"},{"style":{"height":16.4},"width":123.08,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/5-5.png","element":"img","alt":" fi is L","inline":true},{"text":"-gradient Lipschitz continuous and ","element":"span"},{"style":{"height":15.6},"width":180.18,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/5-6.png","element":"img","alt":" ρ-Hessian","inline":true,"padRight":true},{"text":"Lipschitz continuous, where we have ","element":"span"},{"style":{"height":19.14},"width":1182.02,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/5-7.png","element":"img","alt":" ∥∇fi(x) − ∇fi(y)∥2 ≤ L∥x − y∥2 and ∥∇2fi(x) − ∇2fi(y)∥2 ≤","inline":true},{"style":{"height":19.53},"width":523.39,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/5-8.png","element":"img","alt":"ρ∥x − y∥2 for all x, y ∈ Rd.","inline":true}],[{"text":"Note that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L","element":"span"},{"text":"-gradient Lipschitz is not required in the original cubic regularization algorithm (","element":"span"},{"href":"#id-9","referenceIndex":36,"text":"Nesterov and Polyak","element":"a"},{"text":", ","element":"span"},{"href":"#id-9","referenceIndex":36,"text":"2006","element":"a"},{"text":") and the SVRC algorithm (","element":"span"},{"href":"#id-12","referenceIndex":59,"text":"Zhou et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-12","referenceIndex":59,"text":"2018d","element":"a"},{"text":"). 
However, for most other SVRC algorithms (","element":"span"},{"href":"#id-16","referenceIndex":57,"text":"Zhou et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-16","referenceIndex":57,"text":"2018b","element":"a"},{"text":"; ","element":"span"},{"href":"#id-17","referenceIndex":48,"text":"Wang et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-17","referenceIndex":48,"text":"2018b","element":"a"},{"text":"; ","element":"span"},{"href":"#id-18","referenceIndex":54,"text":"Zhang et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-18","referenceIndex":54,"text":"2018a","element":"a"},{"text":"), they need the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L","element":"span"},{"text":"-gradient Lipschitz assumption.","element":"span"}],[{"text":"In addition, we need the difference between the stochastic gradient and the full gradient to be bounded.","element":"span"}],[{"id":"id-58","style":{"fontWeight":"bold"},"text":"Assumption 3.3. 
","element":"span"},{"text":"We assume that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F ","element":"span"},{"text":"has ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M","element":"span"},{"text":"-bounded stochastic gradient, where we have ","element":"span"},{"style":{"height":17.6},"width":190.67,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/5-9.png","element":"img","alt":" ∥∇fi(x)−","inline":true},{"style":{"height":19.53},"width":623.34,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/5-10.png","element":"img","alt":"∇F(x)∥2 ≤ M, ∀x ∈ Rd, ∀i ∈ [n].","inline":true}],[{"text":"It is worth noting that Assumption ","element":"span"},{"href":"#id-58","text":"3.3 ","element":"a"},{"text":"is weaker than the assumption that each ","element":"span"},{"style":{"height":16.4},"width":33.36,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/5-11.png","element":"img","alt":" fi","inline":true,"padRight":true},{"text":"is Lipschitz continuous, which has been made in ","element":"span"},{"href":"#id-10","referenceIndex":33,"text":"Kohler and Lucchi ","element":"a"},{"text":"(","element":"span"},{"href":"#id-10","referenceIndex":33,"text":"2017","element":"a"},{"text":"); ","element":"span"},{"href":"#id-16","referenceIndex":57,"text":"Zhou et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-16","referenceIndex":57,"text":"2018b","element":"a"},{"text":"); ","element":"span"},{"href":"#id-17","referenceIndex":48,"text":"Wang et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-17","referenceIndex":48,"text":"2018b","element":"a"},{"text":"); ","element":"span"},{"href":"#id-18","referenceIndex":54,"text":"Zhang et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-18","referenceIndex":54,"text":"2018a","element":"a"},{"text":"). 
","element":"span"},{"text":"We would also like to point out that we can make additional assumptions on the variances of the stochastic gradient and Hessian, such as the ones made in ","element":"span"},{"href":"#id-0","referenceIndex":45,"text":"Tripuraneni et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-0","referenceIndex":45,"text":"2018","element":"a"},{"text":"). Nevertheless, making these additional assumptions does not improve the dependency of the gradient and Hessian complexities or the stochastic gradient and Hessian-vector product computations on ","element":"span"},{"style":{"height":12},"width":142.25,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/5-12.png","element":"img","alt":" ϵ and n","inline":true},{"text":". Therefore, we chose not to make these additional assumptions on the variances.","element":"span"}]]},{"heading":"4 The Proposed SRVRC Algorithm","paragraphs":[[{"text":"In this section, we present SRVRC, a novel algorithm which utilizes new semi-stochastic gradient and Hessian estimators compared with previous SVRC algorithms. 
We also provide a convergence analysis of the proposed algorithm.","element":"span"}],[{"id":"id-60","style":{"width":"100%"},"width":1876,"height":1141,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/6-0.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"4.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Algorithm Description","element":"span"}],[{"text":"In order to reduce the computational complexity for calculating full gradient and full Hessian in (","element":"span"},{"href":"#id-59","text":"1.3","element":"a"},{"text":"), several ideas such as subsampled/stochastic gradient and Hessian (","element":"span"},{"href":"#id-10","referenceIndex":33,"text":"Kohler and Lucchi","element":"a"},{"text":", ","element":"span"},{"href":"#id-10","referenceIndex":33,"text":"2017","element":"a"},{"text":"; ","element":"span"},{"href":"#id-11","referenceIndex":50,"text":"Xu et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-11","referenceIndex":50,"text":"2017","element":"a"},{"text":"; ","element":"span"},{"href":"#id-0","referenceIndex":45,"text":"Tripuraneni et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-0","referenceIndex":45,"text":"2018","element":"a"},{"text":") and variance-reduced semi-stochastic gradient and Hessian (","element":"span"},{"href":"#id-12","referenceIndex":59,"text":"Zhou ","element":"a"},{"href":"#id-12","referenceIndex":59,"text":"et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-12","referenceIndex":59,"text":"2018d","element":"a"},{"text":"; ","element":"span"},{"href":"#id-17","referenceIndex":48,"text":"Wang et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-17","referenceIndex":48,"text":"2018b","element":"a"},{"text":"; ","element":"span"},{"href":"#id-18","referenceIndex":54,"text":"Zhang et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-18","referenceIndex":54,"text":"2018a","element":"a"},{"text":") have been 
used in previous work. SRVRC follows this line of work. The key idea is to use a new construction of semi-stochastic gradient and Hessian estimators, which are recursively updated in each iteration, and reset periodically after certain number of iterations (i.e., an epoch). This is inspired by the first-order variance reduction algorithms SPIDER (","element":"span"},{"href":"#id-23","referenceIndex":20,"text":"Fang et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-23","referenceIndex":20,"text":"2018","element":"a"},{"text":") and SARAH (","element":"span"},{"href":"#id-24","referenceIndex":37,"text":"Nguyen et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-24","referenceIndex":37,"text":"2017","element":"a"},{"text":"). SRVRC constructs semi-stochastic gradient and Hessian as in (","element":"span"},{"href":"#id-60","text":"3.1","element":"a"},{"text":") and (","element":"span"},{"href":"#id-60","text":"3.2","element":"a"},{"text":") respectively. To be more specific, in the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":"-th iteration when mod(","element":"span"},{"style":{"height":19.14},"width":107.86,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/6-1.png","element":"img","alt":"t, S(g)","inline":true},{"text":") = 0 or mod(","element":"span"},{"style":{"height":19.14},"width":110.12,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/6-2.png","element":"img","alt":"t, S(h)","inline":true},{"text":") = 0, where ","element":"span"},{"style":{"height":19.13},"width":169.23,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/6-3.png","element":"img","alt":" S(g), S(h)","inline":true,"padRight":true},{"text":"are the epoch lengths of gradient and Hessian, SRVRC will set the semi-stochastic gradient 
","element":"span"},{"style":{"height":10.62},"width":38.48,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/6-4.png","element":"img","alt":" vt","inline":true,"padRight":true},{"text":"and Hessian ","element":"span"},{"style":{"height":14.62},"width":50.6,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/6-5.png","element":"img","alt":" Ut","inline":true,"padRight":true},{"text":"to be a subsampled gradient ","element":"span"},{"style":{"height":17.2},"width":151.18,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/6-6.png","element":"img","alt":" ∇fJt(xt","inline":true},{"text":") and Hessian ","element":"span"},{"style":{"height":19.13},"width":405.23,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/6-7.png","element":"img","alt":" ∇2fJt(xt) at point xt","inline":true},{"text":", respectively. In the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":"-th iteration when mod(","element":"span"},{"style":{"height":17.6},"width":93.82,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/6-8.png","element":"img","alt":"t, S) ̸","inline":true},{"text":"= 0 or mod(","element":"span"},{"style":{"height":20.33},"width":141.68,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/6-9.png","element":"img","alt":"t, S(h)) ̸","inline":true},{"text":"= 0, SRVRC constructs semi-stochastic gradient and Hessian ","element":"span"},{"style":{"height":10.62},"width":38.48,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/6-10.png","element":"img","alt":"vt","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":14.62},"width":50.6,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/6-11.png","element":"img","alt":" 
Ut","inline":true,"padRight":true},{"text":"based on previous estimators ","element":"span"},{"style":{"height":10.62},"width":82.06,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/6-12.png","element":"img","alt":" vt−1","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":14.62},"width":94.19,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/6-13.png","element":"img","alt":" Ut−1","inline":true,"padRight":true},{"text":"recursively. With semi-stochastic gradient ","element":"span"},{"style":{"height":10.62},"width":38.48,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/6-14.png","element":"img","alt":"vt","inline":true},{"text":", semi-stochastic Hessian ","element":"span"},{"style":{"height":14.62},"width":50.6,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/6-15.png","element":"img","alt":" Ut","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":"-th Cubic penalty parameter ","element":"span"},{"style":{"height":14.62},"width":54.34,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/6-16.png","element":"img","alt":" Mt","inline":true},{"text":", SRVRC constructs the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":"-th Cubic subproblem ","element":"span"},{"style":{"height":10.62},"width":50.31,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/6-17.png","element":"img","alt":" mt","inline":true,"padRight":true},{"text":"and solves for the solution to ","element":"span"},{"style":{"height":13.82},"width":135.66,"height":34.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/6-18.png","element":"img","alt":" mt as t","inline":true},{"text":"-th update direction as 
(","element":"span"},{"href":"#id-60","text":"3.3","element":"a"},{"text":"). If ","element":"span"},{"style":{"height":17.6},"width":148,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/6-19.png","element":"img","alt":" ∥ht∥2 is","inline":true,"padRight":true},{"text":"less than a given threshold, which we set as ","element":"span"},{"style":{"height":20.8},"width":106.18,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/6-20.png","element":"img","alt":"√(ϵ/ρ)","inline":true},{"text":", SRVRC returns ","element":"span"},{"style":{"height":16.22},"width":276.6,"height":40.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/6-21.png","element":"img","alt":" xt+1 = xt + ht","inline":true,"padRight":true},{"text":"as its output. Otherwise, SRVRC updates ","element":"span"},{"style":{"height":16.22},"width":276.1,"height":40.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/6-22.png","element":"img","alt":" xt+1 = xt + ht","inline":true,"padRight":true},{"text":"and continues the loop.","element":"span"}],[{"text":"The main difference between SRVRC and previous stochastic cubic regularization algorithms (","element":"span"},{"href":"#id-10","referenceIndex":33,"text":"Kohler and Lucchi","element":"a"},{"text":", ","element":"span"},{"href":"#id-10","referenceIndex":33,"text":"2017","element":"a"},{"text":"; ","element":"span"},{"href":"#id-11","referenceIndex":50,"text":"Xu et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-11","referenceIndex":50,"text":"2017","element":"a"},{"text":"; ","element":"span"},{"href":"#id-12","referenceIndex":59,"text":"Zhou et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-12","referenceIndex":59,"text":"2018d","element":"a"},{"text":",","element":"span"},{"href":"#id-16","referenceIndex":57,"text":"b","element":"a"},{"text":"; ","element":"span"},{"href":"#id-17","referenceIndex":48,"text":"Wang et 
al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-17","referenceIndex":48,"text":"2018b","element":"a"},{"text":"; ","element":"span"},{"href":"#id-18","referenceIndex":54,"text":"Zhang et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-18","referenceIndex":54,"text":"2018a","element":"a"},{"text":") is that SRVRC adapts new semi-stochastic gradient and semi-stochastic Hessian estimators, which are defined recursively and have smaller asymptotic variance. The use of such semi-stochastic gradient has been proved to help reduce the gradient complexity in first-order nonconvex finite-sum optimization for finding stationary points (","element":"span"},{"href":"#id-23","referenceIndex":20,"text":"Fang et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-23","referenceIndex":20,"text":"2018","element":"a"},{"text":"; ","element":"span"},{"href":"#id-56","referenceIndex":47,"text":"Wang et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-56","referenceIndex":47,"text":"2018a","element":"a"},{"text":"; ","element":"span"},{"href":"#id-57","referenceIndex":38,"text":"Nguyen et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-57","referenceIndex":38,"text":"2019","element":"a"},{"text":"). Our work takes one step further to apply it to Hessian, and we will later show that it helps reduce the gradient and Hessian complexities in second-order nonconvex finite-sum optimization for finding local minima.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"4.2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Convergence Analysis","element":"span"}],[{"text":"In this subsection, we present our theoretical results about SRVRC. 
While the idea of using variance reduction technique for cubic regularization is hardly new, the new semi-stochastic gradient and Hessian estimators in (","element":"span"},{"href":"#id-60","text":"3.1","element":"a"},{"text":") and (","element":"span"},{"href":"#id-60","text":"3.2","element":"a"},{"text":") bring new technical challenges in the convergence analysis.","element":"span"}],[{"text":"To describe whether a point ","element":"span"},{"style":{"fontWeight":"bold"},"text":"x ","element":"span"},{"text":"is a local minimum, we follow the original cubic regularization work (","element":"span"},{"href":"#id-9","referenceIndex":36,"text":"Nesterov and Polyak","element":"a"},{"text":", ","element":"span"},{"href":"#id-9","referenceIndex":36,"text":"2006","element":"a"},{"text":") to use the following criterion ","element":"span"},{"style":{"height":17.6},"width":98.72,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/7-0.png","element":"img","alt":" µ(x):","inline":true}],[{"style":{"height":24.66},"width":1762.34,"height":61.65,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/7-1.png","element":"img","alt":"Definition 4.1. For any x, define µ(x) as µ(x) = max{∥∇F(x)∥_2^{3/2}, −λ_min^3(∇^2F(x))/ρ^{3/2}}.","inline":true}],[{"text":"It is easy to note that ","element":"span"},{"style":{"height":20.34},"width":215.87,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/7-2.png","element":"img","alt":" µ(x) ≤ ϵ3/2","inline":true,"padRight":true},{"text":"if and only if ","element":"span"},{"style":{"fontWeight":"bold"},"text":"x ","element":"span"},{"text":"is an (","element":"span"},{"style":{"height":17.6},"width":114.04,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/7-3.png","element":"img","alt":"ϵ, √ρϵ","inline":true},{"text":")-approximate local minimum. 
Thus, in order to find an (","element":"span"},{"style":{"height":17.6},"width":114.02,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/7-4.png","element":"img","alt":"ϵ, √ρϵ","inline":true},{"text":")-approximate local minimum, it suffices to find a point ","element":"span"},{"style":{"fontWeight":"bold"},"text":"x ","element":"span"},{"text":"which satisfies ","element":"span"},{"style":{"height":20.33},"width":227.41,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/7-5.png","element":"img","alt":" µ(x) ≤ ϵ3/2.","inline":true}],[{"text":"The following theorem provides the convergence guarantee of SRVRC for finding an (","element":"span"},{"style":{"height":17.6},"width":114.02,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/7-6.png","element":"img","alt":"ϵ, √ρϵ","inline":true},{"text":")-approximate local minimum.","element":"span"}],[{"id":"id-63","style":{"fontWeight":"bold"},"text":"Theorem 4.2. 
","element":"span"},{"text":"Under Assumptions ","element":"span"},{"href":"#id-61","text":"3.1","element":"a"},{"text":", ","element":"span"},{"href":"#id-62","text":"3.2 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-58","text":"3.3","element":"a"},{"text":", set the cubic penalty parameter ","element":"span"},{"style":{"height":14.62},"width":54.34,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/7-7.png","element":"img","alt":" Mt","inline":true,"padRight":true},{"text":"= 4","element":"span"},{"style":{"height":12},"width":23,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/7-8.png","element":"img","alt":"ρ","inline":true,"padRight":true},{"text":"for any ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"and the total iteration number ","element":"span"},{"style":{"height":14.4},"width":78.18,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/7-9.png","element":"img","alt":" T ≥","inline":true,"padRight":true},{"text":"40∆","element":"span"},{"style":{"height":19.93},"width":198.43,"height":49.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/7-10.png","element":"img","alt":"F ρ1/2ϵ−3/2","inline":true},{"text":". 
For ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"such that mod(","element":"span"},{"style":{"height":20.33},"width":139.96,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/7-11.png","element":"img","alt":"t, S(g)) ̸","inline":true},{"text":"= 0 or mod(","element":"span"},{"style":{"height":20.33},"width":141.37,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/7-12.png","element":"img","alt":"t, S(h)) ̸","inline":true},{"text":"= 0, set the gradient sample size ","element":"span"},{"style":{"height":23.43},"width":78.72,"height":58.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/7-13.png","element":"img","alt":" B(g)t","inline":true,"padRight":true},{"text":"and Hessian sample size ","element":"span"},{"style":{"height":23.43},"width":136.51,"height":58.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/7-14.png","element":"img","alt":" B(h)t as","inline":true}],[{"id":"id-76","style":{"width":"71%"},"width":1335,"height":216,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/7-15.png","element":"img"}],[{"text":"For ","element":"span"},{"style":{"height":20.33},"width":807.04,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/7-16.png","element":"img","alt":" t such that mod(t, S(g)) = 0 or mod(t, S(h)","inline":true},{"text":") = 0, set the gradient sample size ","element":"span"},{"style":{"height":23.42},"width":78.72,"height":58.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/7-17.png","element":"img","alt":" B(g)t","inline":true,"padRight":true},{"text":"and Hessian sample size ","element":"span"},{"style":{"height":23.42},"width":136.51,"height":58.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/7-18.png","element":"img","alt":" B(h)t 
as","inline":true}],[{"id":"id-64","style":{"width":"65%"},"width":1232,"height":222,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/7-19.png","element":"img"}],[{"text":"Then with probability at least 1 ","element":"span"},{"style":{"height":16.4},"width":65.24,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/7-20.png","element":"img","alt":" − ξ","inline":true},{"text":", SRVRC outputs ","element":"span"},{"style":{"height":10.62},"width":75.22,"height":26.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/7-21.png","element":"img","alt":" xout","inline":true,"padRight":true},{"text":"satisfying ","element":"span"},{"style":{"height":17.6},"width":188.45,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/7-22.png","element":"img","alt":" µ(xout) ≤","inline":true,"padRight":true},{"text":"600","element":"span"},{"style":{"height":16.33},"width":68.58,"height":40.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/7-23.png","element":"img","alt":"ϵ3/2","inline":true},{"text":", i.e., an (","element":"span"},{"style":{"height":17.6},"width":114.03,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/7-24.png","element":"img","alt":"ϵ, √ρϵ","inline":true},{"text":")-approximate local minimum.","element":"span"}],[{"text":"The next corollary spells out the exact gradient complexity and Hessian complexity of SRVRC to find an (","element":"span"},{"style":{"height":17.6},"width":114.03,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/7-25.png","element":"img","alt":"ϵ, √ρϵ","inline":true},{"text":")-approximate local minimum.","element":"span"}],[{"id":"id-65","style":{"fontWeight":"bold"},"text":"Corollary 4.3. 
","element":"span"},{"text":"Under the same conditions as Theorem ","element":"span"},{"href":"#id-63","text":"4.2","element":"a"},{"text":", if we set ","element":"span"},{"style":{"height":19.13},"width":169.23,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/8-0.png","element":"img","alt":" S(g), S(h)","inline":true,"padRight":true},{"text":"as ","element":"span"},{"style":{"height":16.33},"width":72.71,"height":40.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/8-1.png","element":"img","alt":" S(g)","inline":true,"padRight":true},{"text":"= ","element":"span"},{"style":{"height":19.12},"width":149.83,"height":47.81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/8-2.png","element":"img","alt":"√ρϵ/L ·","inline":true},{"style":{"height":23.88},"width":1200.02,"height":59.69,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/8-3.png","element":"img","alt":"√(n ∧ M2/ϵ2) and S(h) = √(n ∧ L/(ρϵ)), and set T, {B(g)t }, {B(h)t }","inline":true,"padRight":true},{"text":"as their lower bounds in (","element":"span"},{"href":"#id-64","text":"4.1","element":"a"},{"text":")- (","element":"span"},{"href":"#id-64","text":"4.4","element":"a"},{"text":"), then with probability at least 1 ","element":"span"},{"style":{"height":16.4},"width":63.58,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/8-4.png","element":"img","alt":" − ξ","inline":true},{"text":", SRVRC will output an (","element":"span"},{"style":{"height":17.6},"width":114.03,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/8-5.png","element":"img","alt":"ϵ, √ρϵ","inline":true},{"text":")-approximate local minimum within","element":"span"}],[{"style":{"width":"31%"},"width":583,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/8-6.png","element":"img"}],[{"text":"stochastic Hessian evaluations 
and","element":"span"}],[{"style":{"width":"42%"},"width":789,"height":109,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/8-7.png","element":"img"}],[{"text":"stochastic gradient evaluations.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Remark 4.4. ","element":"span"},{"text":"For SRVRC, if we assume ","element":"span"},{"style":{"height":16.4},"width":217,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/8-8.png","element":"img","alt":" M, L, ρ, ∆F","inline":true,"padRight":true},{"text":"to be constants, then its gradient complexity is ","element":"span"},{"style":{"height":20.33},"width":448.85,"height":50.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/8-9.png","element":"img","alt":"�O(n/ϵ3/2 ∧ √n/ϵ2 ∧ ϵ−3","inline":true},{"text":"), and its Hessian complexity is ","element":"span"},{"style":{"height":20.33},"width":527.28,"height":50.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/8-10.png","element":"img","alt":"�O(n ∧ ϵ−1 + n1/2ϵ−3/2 ∧ ϵ−2","inline":true},{"text":"). Regarding Hessian complexity, suppose that ","element":"span"},{"style":{"height":11.2},"width":73.86,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/8-11.png","element":"img","alt":" ϵ ≪","inline":true,"padRight":true},{"text":"1, then the Hessian complexity of SRVRC can be simplified as ","element":"span"},{"style":{"height":20.34},"width":336.68,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/8-12.png","element":"img","alt":"�O(n1/2ϵ−3/2 ∧ ϵ−2","inline":true},{"text":"). 
Compared with existing SVRC algorithms (","element":"span"},{"href":"#id-16","referenceIndex":57,"text":"Zhou et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-16","referenceIndex":57,"text":"2018b","element":"a"},{"text":"; ","element":"span"},{"href":"#id-18","referenceIndex":54,"text":"Zhang et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-18","referenceIndex":54,"text":"2018a","element":"a"},{"text":"; ","element":"span"},{"href":"#id-17","referenceIndex":48,"text":"Wang et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-17","referenceIndex":48,"text":"2018b","element":"a"},{"text":"), SRVRC outperforms the best-known Hessian sample complexity by a factor of ","element":"span"},{"style":{"height":16.34},"width":275.09,"height":40.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/8-13.png","element":"img","alt":" n1/6 ∧ n2/3ϵ1/2","inline":true},{"text":". In terms of gradient complexity, SRVRC outperforms STR2 (","element":"span"},{"href":"#id-25","referenceIndex":44,"text":"Shen et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-25","referenceIndex":44,"text":"2019","element":"a"},{"text":") by a factor of ","element":"span"},{"style":{"height":17.13},"width":480.92,"height":42.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/8-14.png","element":"img","alt":" n3/4ϵ3/2 when ϵ ≫ n−1/2.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Remark 4.5. ","element":"span"},{"text":"Note that both Theorem ","element":"span"},{"href":"#id-63","text":"4.2 ","element":"a"},{"text":"and Corollary ","element":"span"},{"href":"#id-65","text":"4.3 ","element":"a"},{"text":"still hold when Assumption ","element":"span"},{"href":"#id-58","text":"3.3 ","element":"a"},{"text":"does not hold. 
In that case, ","element":"span"},{"style":{"height":12.4},"width":160.21,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/8-15.png","element":"img","alt":" M = ∞","inline":true,"padRight":true},{"text":"and SRVRC’s Hessian complexity remains the same, while its gradient complexity can be potentially worse, i.e., ","element":"span"},{"style":{"height":20.33},"width":337.29,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/8-16.png","element":"img","alt":"�O(n/ϵ3/2 ∧ √n/ϵ2","inline":true},{"text":"), which degenerates to that of STR1 (","element":"span"},{"href":"#id-25","referenceIndex":44,"text":"Shen et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-25","referenceIndex":44,"text":"2019","element":"a"},{"text":").","element":"span"}]]},{"heading":"5 Hessian-Free SRVRC","paragraphs":[[{"text":"While SRVRC adapts novel semi-stochastic gradient and Hessian estimators to reduce both the gradient and Hessian complexities, it has three limitations for high-dimensional problems with ","element":"span"},{"style":{"height":13.6},"width":123.97,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/8-17.png","element":"img","alt":" d ≫ 1:","inline":true,"padRight":true},{"text":"(1) it needs to compute and store the Hessian matrix, which needs ","element":"span"},{"style":{"height":19.14},"width":91.25,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/8-18.png","element":"img","alt":" O(d2","inline":true},{"text":") computational time and storage space; (2) it needs to solve cubic subproblem ","element":"span"},{"style":{"height":10.62},"width":50.31,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/8-19.png","element":"img","alt":" mt","inline":true,"padRight":true},{"text":"exactly, which requires 
","element":"span"},{"style":{"height":17.2},"width":97.84,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/8-20.png","element":"img","alt":" O(dw","inline":true},{"text":") computational time because it needs to compute the inverse of a Hessian matrix (","element":"span"},{"href":"#id-9","referenceIndex":36,"text":"Nesterov and Polyak","element":"a"},{"text":", ","element":"span"},{"href":"#id-9","referenceIndex":36,"text":"2006","element":"a"},{"text":"); and (3) it cannot leverage the Hessian-vector product-based cubic subproblem solvers (","element":"span"},{"href":"#id-19","referenceIndex":1,"text":"Agarwal et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-19","referenceIndex":1,"text":"2017","element":"a"},{"text":"; ","element":"span"},{"href":"#id-20","referenceIndex":8,"text":"Carmon and Duchi","element":"a"},{"text":", ","element":"span"},{"href":"#id-20","referenceIndex":8,"text":"2016","element":"a"},{"text":", ","element":"span"},{"href":"#id-21","referenceIndex":9,"text":"2018","element":"a"},{"text":") because of the use of the semi-stochastic Hessian estimator. It is interesting to ask whether we can modify SRVRC to overcome these shortcomings.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"5.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Algorithm Description","element":"span"}],[{"text":"We present a Hessian-free algorithm SRVRC","element":"span"},{"style":{"height":8.8},"width":53.6,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/8-21.png","element":"img","alt":"free","inline":true,"padRight":true},{"text":"to address above limitations of SRVRC for high-dimensional problems, which only requires stochastic gradient and Hessian-vector product computations. 
SRVRC","element":"span"},{"style":{"height":8.8},"width":53.61,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/8-22.png","element":"img","alt":"free","inline":true,"padRight":true},{"text":"uses the same semi-stochastic gradient ","element":"span"},{"style":{"height":10.62},"width":38.48,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/8-23.png","element":"img","alt":" vt","inline":true,"padRight":true},{"text":"as SRVRC. As opposed to SRVRC which has to construct semi-stochastic Hessian explicitly, SRVRC","element":"span"},{"style":{"height":8.8},"width":53.6,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/8-24.png","element":"img","alt":"free","inline":true,"padRight":true},{"text":"only accesses the stochastic","element":"span"}],[{"id":"id-75","style":{"width":"100%"},"width":1876,"height":1138,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/9-0.png","element":"img"}],[{"text":"Hessian-vector product. 
In detail, at each iteration ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":", SRVRC","element":"span"},{"style":{"height":8.8},"width":53.61,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/9-1.png","element":"img","alt":"free","inline":true,"padRight":true},{"text":"subsamples an index set ","element":"span"},{"style":{"height":14.62},"width":35.76,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/9-2.png","element":"img","alt":" It","inline":true,"padRight":true},{"text":"and defines a stochastic Hessian-vector product function ","element":"span"},{"style":{"height":19.53},"width":293.89,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/9-3.png","element":"img","alt":" Ut[·] : Rd → Rd ","inline":true,"padRight":true},{"text":"as follows:","element":"span"}],[{"style":{"width":"53%"},"width":1006,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/9-4.png","element":"img"}],[{"text":"Note that although the subproblem depends on ","element":"span"},{"style":{"height":15.6},"width":277.52,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/9-5.png","element":"img","alt":" Ut, SRVRCfree","inline":true,"padRight":true},{"text":"never explicitly computes this matrix. Instead, it only provides the subproblem solver access to ","element":"span"},{"style":{"height":14.62},"width":50.61,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/9-6.png","element":"img","alt":" Ut","inline":true,"padRight":true},{"text":"through stochastic Hessian-vector product function ","element":"span"},{"style":{"height":17.6},"width":77.08,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/9-7.png","element":"img","alt":" Ut[·","inline":true},{"text":"]. 
The subproblem solver performs gradient-based optimization to solve the subproblem ","element":"span"},{"style":{"height":17.6},"width":97.84,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/9-8.png","element":"img","alt":" mt(h","inline":true},{"text":") as ","element":"span"},{"style":{"height":17.6},"width":134.21,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/9-9.png","element":"img","alt":" ∇mt(h","inline":true},{"text":") depends on ","element":"span"},{"style":{"height":14.62},"width":50.61,"height":36.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/9-10.png","element":"img","alt":" Ut","inline":true,"padRight":true},{"text":"only via ","element":"span"},{"style":{"height":17.6},"width":93.2,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/9-11.png","element":"img","alt":" Ut[h","inline":true},{"text":"]. In detail, following ","element":"span"},{"href":"#id-0","referenceIndex":45,"text":"Tripuraneni ","element":"a"},{"href":"#id-0","referenceIndex":45,"text":"et al. 
","element":"a"},{"text":"(","element":"span"},{"href":"#id-0","referenceIndex":45,"text":"2018","element":"a"},{"text":"), SRVRC","element":"span"},{"style":{"height":8.8},"width":53.6,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/9-12.png","element":"img","alt":"free","inline":true,"padRight":true},{"text":"uses Cubic-Subsolver (See Algorithms ","element":"span"},{"href":"#id-66","text":"3 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-66","text":"4 ","element":"a"},{"text":"in Appendix ","element":"span"},{"text":"G","element":"span"},{"text":") and Cubic-Finalsolver from (","element":"span"},{"href":"#id-20","referenceIndex":8,"text":"Carmon and Duchi","element":"a"},{"text":", ","element":"span"},{"href":"#id-20","referenceIndex":8,"text":"2016","element":"a"},{"text":"), to find an approximate solution ","element":"span"},{"style":{"height":15.02},"width":39.88,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/9-13.png","element":"img","alt":" ht","inline":true,"padRight":true},{"text":"to the cubic subproblem in (","element":"span"},{"href":"#id-60","text":"3.3","element":"a"},{"text":"). Both Cubic-Subsolver and Cubic-Finalsolver only need to access gradient ","element":"span"},{"style":{"height":10.62},"width":38.48,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/9-14.png","element":"img","alt":" vt","inline":true,"padRight":true},{"text":"and Hessian-vector product function ","element":"span"},{"style":{"height":17.6},"width":77,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/9-15.png","element":"img","alt":" Ut[·","inline":true},{"text":"] along with other problem-dependent parameters. 
With the output ","element":"span"},{"style":{"height":15.02},"width":39.87,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/9-16.png","element":"img","alt":" ht","inline":true,"padRight":true},{"text":"from Cubic-Subsolver, SRVRC","element":"span"},{"style":{"height":8.8},"width":53.61,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/9-17.png","element":"img","alt":"free","inline":true,"padRight":true},{"text":"decides either to update ","element":"span"},{"style":{"height":10.62},"width":38.49,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/9-18.png","element":"img","alt":" xt","inline":true,"padRight":true},{"text":"as ","element":"span"},{"style":{"height":16.22},"width":287.04,"height":40.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/9-19.png","element":"img","alt":" xt+1 ← xt + ht","inline":true,"padRight":true},{"text":"or to exit the loop. For the latter case, SRVRC","element":"span"},{"style":{"height":8.8},"width":53.6,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/9-20.png","element":"img","alt":"free","inline":true,"padRight":true},{"text":"will call Cubic-Finalsolver to output ","element":"span"},{"style":{"height":15.02},"width":39.88,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/9-21.png","element":"img","alt":" ht","inline":true},{"text":", and take ","element":"span"},{"style":{"height":16.22},"width":276.09,"height":40.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/9-22.png","element":"img","alt":"xt+1 = xt + ht","inline":true,"padRight":true},{"text":"as its final output.","element":"span"}],[{"text":"The main differences between SRVRC and 
SRVRC","element":"span"},{"style":{"height":8.8},"width":53.61,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/9-23.png","element":"img","alt":"free","inline":true,"padRight":true},{"text":"are two-fold. First, SRVRC","element":"span"},{"style":{"height":8.8},"width":53.6,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/9-24.png","element":"img","alt":"free","inline":true,"padRight":true},{"text":"only needs to compute stochastic gradient and Hessian-vector product. Since both of these two computations only take ","element":"span"},{"style":{"fontStyle":"italic"},"text":"O","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"d","element":"span"},{"text":") time in many applications in machine learning, SRVRC","element":"span"},{"style":{"height":8.8},"width":53.6,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/9-25.png","element":"img","alt":"free","inline":true,"padRight":true},{"text":"is suitable for high-dimensional problems. ","element":"span"},{"text":"In the sequel, following ","element":"span"},{"href":"#id-19","referenceIndex":1,"text":"Agarwal et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-19","referenceIndex":1,"text":"2017","element":"a"},{"text":"); ","element":"span"},{"href":"#id-39","referenceIndex":10,"text":"Carmon et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-39","referenceIndex":10,"text":"2018","element":"a"},{"text":"); ","element":"span"},{"href":"#id-0","referenceIndex":45,"text":"Tripuraneni et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-0","referenceIndex":45,"text":"2018","element":"a"},{"text":"), we do not distinguish stochastic gradient and Hessian-vector product computations and consider them to have the same runtime complexity. 
Second, instead of solving ","element":"span"},{"text":"cubic subproblem ","element":"span"},{"style":{"height":10.62},"width":50.31,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-0.png","element":"img","alt":" mt","inline":true,"padRight":true},{"text":"exactly, SRVRC","element":"span"},{"style":{"height":8.8},"width":53.61,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-1.png","element":"img","alt":"free","inline":true,"padRight":true},{"text":"adopts approximate subproblem solvers Cubic-Subsolver and Cubic-Finalsolver, both of which only need to access gradient and Hessian-vector product function, and again only take ","element":"span"},{"style":{"fontStyle":"italic"},"text":"O","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"d","element":"span"},{"text":") time. Thus, SRVRC","element":"span"},{"style":{"height":8.8},"width":53.61,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-2.png","element":"img","alt":"free","inline":true,"padRight":true},{"text":"is computationally more efficient than SRVRC when ","element":"span"},{"style":{"height":13.6},"width":124.41,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-3.png","element":"img","alt":" d ≫ 1.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"5.2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Convergence Analysis","element":"span"}],[{"text":"We now provide the convergence guarantee of SRVRC","element":"span"},{"style":{"height":8.8},"width":53.6,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-4.png","element":"img","alt":"free","inline":true},{"text":", which ensures that 
SRVRC","element":"span"},{"style":{"height":8.8},"width":53.6,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-5.png","element":"img","alt":"free","inline":true,"padRight":true},{"text":"will output an (","element":"span"},{"style":{"height":17.6},"width":114.03,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-6.png","element":"img","alt":"ϵ, √ρϵ","inline":true},{"text":")-approximate local minimum.","element":"span"}],[{"id":"id-67","style":{"fontWeight":"bold"},"text":"Theorem 5.1. ","element":"span"},{"text":"Under Assumptions ","element":"span"},{"href":"#id-61","text":"3.1","element":"a"},{"text":", ","element":"span"},{"href":"#id-62","text":"3.2","element":"a"},{"text":", ","element":"span"},{"href":"#id-58","text":"3.3","element":"a"},{"text":", suppose ","element":"span"},{"style":{"height":17.6},"width":208.09,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-7.png","element":"img","alt":" ϵ < L/(4ρ","inline":true},{"text":"). 
","element":"span"},{"text":"Set the cubic penalty parameter ","element":"span"},{"style":{"height":14.62},"width":54.33,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-8.png","element":"img","alt":" Mt","inline":true,"padRight":true},{"text":"= 4","element":"span"},{"style":{"height":12},"width":23,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-9.png","element":"img","alt":"ρ","inline":true,"padRight":true},{"text":"for any ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"and the total iteration number ","element":"span"},{"style":{"height":14.4},"width":77.68,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-10.png","element":"img","alt":" T ≥","inline":true,"padRight":true},{"text":"25∆","element":"span"},{"style":{"height":19.93},"width":198.43,"height":49.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-11.png","element":"img","alt":"F ρ1/2ϵ−3/2","inline":true},{"text":". 
Set the Hessian-vector product sample size ","element":"span"},{"style":{"height":23.43},"width":136.51,"height":58.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-12.png","element":"img","alt":" B(h)t as","inline":true}],[{"id":"id-68","style":{"width":"65%"},"width":1235,"height":109,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-13.png","element":"img"}],[{"text":"For ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"such that mod(","element":"span"},{"style":{"height":20.33},"width":139.11,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-14.png","element":"img","alt":"t, S(g)) ̸","inline":true},{"text":"= 0, set the gradient sample size ","element":"span"},{"style":{"height":23.42},"width":134.26,"height":58.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-15.png","element":"img","alt":" B(g)t as","inline":true}],[{"style":{"width":"71%"},"width":1334,"height":100,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-16.png","element":"img"}],[{"text":"For ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"such that mod(","element":"span"},{"style":{"height":19.13},"width":107.86,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-17.png","element":"img","alt":"t, S(g)","inline":true},{"text":") = 0, set the gradient sample size ","element":"span"},{"style":{"height":23.43},"width":134.26,"height":58.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-18.png","element":"img","alt":" B(g)t as","inline":true}],[{"id":"id-69","style":{"width":"65%"},"width":1231,"height":99,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-19.png","element":"img"}],[{"text":"Then with probability at least 1 
","element":"span"},{"style":{"height":16.4},"width":541.85,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-20.png","element":"img","alt":" − ξ, SRVRCfree outputs xout","inline":true,"padRight":true},{"text":"satisfying ","element":"span"},{"style":{"height":20.33},"width":505.89,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-21.png","element":"img","alt":" µ(xout) ≤ 1300ϵ3/2, i.e., an","inline":true,"padRight":true},{"text":"(","element":"span"},{"style":{"height":17.6},"width":114.03,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-22.png","element":"img","alt":"ϵ, √ρϵ","inline":true},{"text":")-approximate local minimum.","element":"span"}],[{"text":"The following corollary calculates the total amount of stochastic gradient and Hessian-vector product computations of SRVRC","element":"span"},{"style":{"height":8.8},"width":53.6,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-23.png","element":"img","alt":"free","inline":true,"padRight":true},{"text":"to find an (","element":"span"},{"style":{"height":17.6},"width":114.03,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-24.png","element":"img","alt":"ϵ, √ρϵ","inline":true},{"text":")-approximate local minimum.","element":"span"}],[{"id":"id-71","style":{"fontWeight":"bold"},"text":"Corollary 5.2. 
","element":"span"},{"text":"Under the same conditions as Theorem ","element":"span"},{"href":"#id-67","text":"5.1","element":"a"},{"text":", if set ","element":"span"},{"style":{"height":16.33},"width":72.71,"height":40.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-25.png","element":"img","alt":" S(g)","inline":true,"padRight":true},{"text":"= ","element":"span"},{"style":{"height":21.73},"width":404.21,"height":54.33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-26.png","element":"img","alt":"√ρϵ/L ·�n ∧ M2/ϵ2","inline":true,"padRight":true},{"text":"and set ","element":"span"},{"style":{"height":23.42},"width":319.41,"height":58.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-27.png","element":"img","alt":" T, {B(g)t }, {B(h)t }","inline":true,"padRight":true},{"text":"as their lower bounds in (","element":"span"},{"href":"#id-68","text":"5.1","element":"a"},{"text":")-(","element":"span"},{"href":"#id-69","text":"5.3","element":"a"},{"text":"), then with probability at least 1 ","element":"span"},{"style":{"height":16.4},"width":63.66,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-28.png","element":"img","alt":" − ξ","inline":true},{"text":", SRVRC","element":"span"},{"style":{"height":8.8},"width":53.61,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-29.png","element":"img","alt":"free","inline":true,"padRight":true},{"text":"will output an (","element":"span"},{"style":{"height":17.6},"width":114.03,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-30.png","element":"img","alt":"ϵ, √ρϵ","inline":true},{"text":")-approximate local minimum within","element":"span"}],[{"style":{"width":"88%"},"width":1656,"height":112,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-31.png","element":"img"}],[{"text":"stochastic gradient and Hessian-vector product 
computations.","element":"span"}],[{"style":{"height":15.24},"width":567.94,"height":38.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-32.png","element":"img","alt":"Remark 5.3. For SRVRCfree","inline":true},{"text":", if we assume ","element":"span"},{"style":{"height":16.4},"width":216.81,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-33.png","element":"img","alt":" ρ, L, M, ∆F","inline":true,"padRight":true},{"text":"are constants, then (","element":"span"},{"href":"#id-69","text":"5.4","element":"a"},{"text":") is ","element":"span"},{"style":{"height":19.13},"width":280.43,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-34.png","element":"img","alt":"Õ(nϵ−2 ∧ ϵ−3).","inline":true,"padRight":true},{"text":"For stochastic algorithms, the regime ","element":"span"},{"style":{"height":9.6},"width":144.48,"height":24,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-35.png","element":"img","alt":" n → ∞","inline":true,"padRight":true},{"text":"is of most interest. In this regime, (","element":"span"},{"href":"#id-69","text":"5.4","element":"a"},{"text":") becomes ","element":"span"},{"style":{"height":19.13},"width":112.52,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-36.png","element":"img","alt":"Õ(ϵ−3","inline":true},{"text":"). Compared with other local minimum finding algorithms based on stochastic gradient and Hessian-vector product, SRVRC","element":"span"},{"style":{"height":8.8},"width":53.6,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-37.png","element":"img","alt":"free","inline":true,"padRight":true},{"text":"outperforms the results achieved by ","element":"span"},{"href":"#id-0","referenceIndex":45,"text":"Tripuraneni et al. 
","element":"a"},{"text":"(","element":"span"},{"href":"#id-0","referenceIndex":45,"text":"2018","element":"a"},{"text":") and ","element":"span"},{"href":"#id-70","referenceIndex":3,"text":"Allen-Zhu ","element":"a"},{"text":"(","element":"span"},{"href":"#id-70","referenceIndex":3,"text":"2018","element":"a"},{"text":") by a factor of ","element":"span"},{"style":{"height":18.78},"width":328.55,"height":46.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/10-38.png","element":"img","alt":" ϵ−1/2. SRVRCfree","inline":true,"padRight":true},{"text":"also matches the best-known result achieved ","element":"span"},{"text":"by a recent first-order algorithm proposed in (","element":"span"},{"href":"#id-23","referenceIndex":20,"text":"Fang et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-23","referenceIndex":20,"text":"2018","element":"a"},{"text":"). Note that the algorithm proposed by ","element":"span"},{"href":"#id-23","referenceIndex":20,"text":"Fang et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-23","referenceIndex":20,"text":"2018","element":"a"},{"text":") needs to alternate the first-order finite-sum optimization algorithm SPIDER and negative curvature descent. In sharp contrast, SRVRC","element":"span"},{"style":{"height":8.8},"width":53.61,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/11-0.png","element":"img","alt":"free","inline":true,"padRight":true},{"text":"is a pure cubic regularization type algorithm and does not need to calculate the negative curvature direction.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Remark 5.4. 
","element":"span"},{"text":"It is worth noting that both Theorem ","element":"span"},{"href":"#id-67","text":"5.1 ","element":"a"},{"text":"and Corollary ","element":"span"},{"href":"#id-71","text":"5.2 ","element":"a"},{"text":"still hold when Assumption ","element":"span"},{"href":"#id-58","text":"3.3 ","element":"a"},{"text":"does not hold, and SRVRC","element":"span"},{"style":{"height":8.8},"width":53.61,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/11-1.png","element":"img","alt":"free","inline":true},{"text":"’s runtime complexity remains the same. The only difference is: without Assumption ","element":"span"},{"href":"#id-58","text":"3.3","element":"a"},{"text":", we need to use full gradient (i.e., ","element":"span"},{"style":{"height":23.42},"width":164.39,"height":58.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/11-2.png","element":"img","alt":" B(g)t = n","inline":true},{"text":") instead of subsampled gradient at each iteration ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"5.3 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Discussions on runtime complexity","element":"span"}],[{"text":"We would like to further compare the runtime complexity between SRVRC and SRVRC","element":"span"},{"style":{"height":8.8},"width":53.61,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/11-3.png","element":"img","alt":"free","inline":true},{"text":". 
Specifically, SRVRC needs ","element":"span"},{"style":{"fontStyle":"italic"},"text":"O","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"d","element":"span"},{"text":") time to construct semi-stochastic gradient and ","element":"span"},{"style":{"height":19.13},"width":90.84,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/11-4.png","element":"img","alt":" O(d2","inline":true},{"text":") time to construct semi-stochastic Hessian. SRVRC also needs ","element":"span"},{"style":{"height":17.6},"width":98.52,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/11-5.png","element":"img","alt":" O(dw","inline":true},{"text":") time to solve cubic subproblem ","element":"span"},{"style":{"height":10.62},"width":50.31,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/11-6.png","element":"img","alt":" mt","inline":true,"padRight":true},{"text":"for each iteration. 
Thus, with the fact that the total number of iterations is ","element":"span"},{"style":{"height":20.33},"width":235.78,"height":50.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/11-7.png","element":"img","alt":" T = O(ϵ−3/2","inline":true},{"text":") by Corollary ","element":"span"},{"href":"#id-65","text":"4.3","element":"a"},{"text":", SRVRC needs","element":"span"}],[{"style":{"width":"49%"},"width":931,"height":105,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/11-8.png","element":"img"}],[{"text":"runtime to find an (","element":"span"},{"style":{"height":17.6},"width":91.47,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/11-9.png","element":"img","alt":"ϵ, √ϵ","inline":true},{"text":")-approximate local minimum if we regard ","element":"span"},{"style":{"height":16.4},"width":216.74,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/11-10.png","element":"img","alt":" M, L, ρ, ∆F","inline":true,"padRight":true},{"text":"as constants. 
As we mentioned before, for many machine learning problems, both stochastic gradient and Hessian-vector product computations only need ","element":"span"},{"style":{"fontStyle":"italic"},"text":"O","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"d","element":"span"},{"text":") time, therefore the runtime of SRVRC","element":"span"},{"style":{"height":19.13},"width":437.36,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/11-11.png","element":"img","alt":"free is Õ(dnϵ−2 ∧ dϵ−3).","inline":true,"padRight":true},{"text":"We conclude that SRVRC","element":"span"},{"style":{"height":8.8},"width":53.61,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/11-12.png","element":"img","alt":"free","inline":true,"padRight":true},{"text":"outperforms SRVRC when ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d ","element":"span"},{"text":"is large, which is in accordance with the fact that Hessian-free methods are superior for high dimension machine learning tasks. On the other hand, a careful calculation can show that the runtime of SRVRC can be less than that of SRVRC","element":"span"},{"style":{"height":15.24},"width":207.14,"height":38.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/11-13.png","element":"img","alt":"free when d","inline":true,"padRight":true},{"text":"is moderately small. 
This is also reflected in our experiments in Section ","element":"span"},{"text":"6","element":"span"},{"text":".","element":"span"}]]},{"heading":"6 Experiments","paragraphs":[[{"text":"In this section, we present numerical experiments on different nonconvex empirical risk minimization (ERM) problems and on different datasets to validate the advantage of our proposed SRVRC and SRVRC","element":"span"},{"style":{"height":8.8},"width":53.61,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/11-14.png","element":"img","alt":"free","inline":true,"padRight":true},{"text":"algorithms for finding approximate local minima.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Baselines: ","element":"span"},{"text":"We compare our algorithms with the following algorithms: SPIDER+ (","element":"span"},{"href":"#id-23","referenceIndex":20,"text":"Fang et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-23","referenceIndex":20,"text":"2018","element":"a"},{"text":"), which is the local minimum finding version of SPIDER, stochastic trust region (STR1, STR2) (","element":"span"},{"href":"#id-25","referenceIndex":44,"text":"Shen ","element":"a"},{"href":"#id-25","referenceIndex":44,"text":"et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-25","referenceIndex":44,"text":"2019","element":"a"},{"text":"), subsampled cubic regularization (SCR) (","element":"span"},{"href":"#id-10","referenceIndex":33,"text":"Kohler and Lucchi","element":"a"},{"text":", ","element":"span"},{"href":"#id-10","referenceIndex":33,"text":"2017","element":"a"},{"text":"), stochastic cubic regularization (STC) (","element":"span"},{"href":"#id-0","referenceIndex":45,"text":"Tripuraneni et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-0","referenceIndex":45,"text":"2018","element":"a"},{"text":"), stochastic variance-reduced cubic regularization (SVRC) (","element":"span"},{"href":"#id-12","referenceIndex":59,"text":"Zhou 
et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-12","referenceIndex":59,"text":"2018d","element":"a"},{"text":"), sample efficient SVRC (Lite-SVRC) (","element":"span"},{"href":"#id-16","referenceIndex":57,"text":"Zhou et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-16","referenceIndex":57,"text":"2018b","element":"a"},{"text":"; ","element":"span"},{"href":"#id-17","referenceIndex":48,"text":"Wang et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-17","referenceIndex":48,"text":"2018b","element":"a"},{"text":"; ","element":"span"},{"href":"#id-18","referenceIndex":54,"text":"Zhang et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-18","referenceIndex":54,"text":"2018a","element":"a"},{"text":").","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Parameter Settings and Subproblem Solver ","element":"span"},{"text":"For each algorithm, we set the cubic penalty parameter ","element":"span"},{"style":{"height":14.62},"width":54.33,"height":36.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/11-15.png","element":"img","alt":" Mt","inline":true,"padRight":true},{"text":"adaptively based on how well the model approximates the real objective as suggested in (","element":"span"},{"href":"#id-28","referenceIndex":12,"text":"Cartis et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-28","referenceIndex":12,"text":"2011a","element":"a"},{"text":",","element":"span"},{"href":"#id-29","referenceIndex":15,"text":"b","element":"a"},{"text":"; ","element":"span"},{"href":"#id-10","referenceIndex":33,"text":"Kohler and Lucchi","element":"a"},{"text":", ","element":"span"},{"href":"#id-10","referenceIndex":33,"text":"2017","element":"a"},{"text":"). 
For SRVRC, we set ","element":"span"},{"style":{"height":16.33},"width":72.71,"height":40.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/11-16.png","element":"img","alt":" S(g)","inline":true,"padRight":true},{"text":"= ","element":"span"},{"style":{"height":16.33},"width":74.96,"height":40.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/11-17.png","element":"img","alt":" S(h)","inline":true,"padRight":true},{"text":"= ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"text":"for the","element":"span"}],[{"id":"id-74","style":{"width":"98%"},"width":1854,"height":936,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/12-0.png","element":"img"}],[{"id":"id-73","text":"Figure 1: Plots of logarithmic function value gap with respect to CPU time (in seconds) for ","element":"figcaption","subtype":"caption"},{"text":"nonconvex regularized binary logistic regression on (a) ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"a9a ","element":"figcaption","subtype":"caption"},{"text":"(b) ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"covtype ","element":"figcaption","subtype":"caption"},{"text":"(c) ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"ijcnn1 ","element":"figcaption","subtype":"caption"},{"text":"and for nonconvex regularized multiclass logistic regression on (d) ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"mnist ","element":"figcaption","subtype":"caption"},{"text":"(e) ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"cifar10 ","element":"figcaption","subtype":"caption"},{"text":"(f) ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"SVHN","element":"figcaption","subtype":"caption"},{"text":". 
Best viewed in color.","element":"figcaption","subtype":"caption"}],[{"text":"simplicity and set gradient and Hessian batch sizes ","element":"span"},{"style":{"height":23.43},"width":261.26,"height":58.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/12-1.png","element":"img","alt":" B(g)t and B(h)t","inline":true,"padRight":true},{"text":"as follows:","element":"span"}],[{"style":{"width":"66%"},"width":1240,"height":139,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/12-2.png","element":"img"}],[{"text":"For SRVRC","element":"span"},{"style":{"height":8.8},"width":53.6,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/12-3.png","element":"img","alt":"free","inline":true},{"text":", we set gradient batch sizes ","element":"span"},{"style":{"height":23.43},"width":78.72,"height":58.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/12-4.png","element":"img","alt":" B(g)t","inline":true,"padRight":true},{"text":"the same as SRVRC and Hessian batch sizes ","element":"span"},{"style":{"height":23.42},"width":456.2,"height":58.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/12-5.png","element":"img","alt":"B(h)t = B(h). 
We tune S","inline":true,"padRight":true},{"text":"over the grid ","element":"span"},{"style":{"height":20.34},"width":357.13,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/12-6.png","element":"img","alt":" {5, 10, 20, 50}, B(g) ","inline":true,"padRight":true},{"text":"over the grid ","element":"span"},{"style":{"fontStyle":"italic"},"text":"{","element":"span"},{"style":{"fontStyle":"italic"},"text":"n, n/","element":"span"},{"text":"10","element":"span"},{"style":{"fontStyle":"italic"},"text":", n/","element":"span"},{"text":"20","element":"span"},{"style":{"fontStyle":"italic"},"text":", n/","element":"span"},{"text":"100","element":"span"},{"style":{"fontStyle":"italic"},"text":"}","element":"span"},{"text":", and ","element":"span"},{"style":{"height":15.93},"width":80.98,"height":39.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/12-7.png","element":"img","alt":"B(h) ","inline":true,"padRight":true},{"text":"over the grid ","element":"span"},{"style":{"fontStyle":"italic"},"text":"{","element":"span"},{"text":"50","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"100","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"500","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"1000","element":"span"},{"style":{"fontStyle":"italic"},"text":"} ","element":"span"},{"text":"for the best performance. For SCR, SVRC, Lite-SVRC, and SRVRC, we solve the cubic subproblem using the cubic subproblem solver discussed in (","element":"span"},{"href":"#id-9","referenceIndex":36,"text":"Nesterov ","element":"a"},{"href":"#id-9","referenceIndex":36,"text":"and Polyak","element":"a"},{"text":", ","element":"span"},{"href":"#id-9","referenceIndex":36,"text":"2006","element":"a"},{"text":"). 
For STR1 and STR2, we solve the trust-region subproblem using the exact trust-region subproblem solver discussed in (","element":"span"},{"href":"#id-30","referenceIndex":17,"text":"Conn et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-30","referenceIndex":17,"text":"2000","element":"a"},{"text":"). For STC and SRVRC","element":"span"},{"style":{"height":8.8},"width":53.6,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/12-8.png","element":"img","alt":"free","inline":true},{"text":", we use Cubic-Subsolver (Algorithm ","element":"span"},{"href":"#id-66","text":"3 ","element":"a"},{"text":"in Appendix ","element":"span"},{"text":"G","element":"span"},{"text":") to approximately solve the cubic subproblem. All algorithms are carefully tuned for a fair comparison.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Datasets and Optimization Problems ","element":"span"},{"text":"We use 6 datasets ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a9a","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"covtype","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"ijcnn1 ","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"mnist","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"cifar10 ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"SVHN ","element":"span"},{"text":"from ","element":"span"},{"href":"#id-72","referenceIndex":16,"text":"Chang and Lin ","element":"a"},{"text":"(","element":"span"},{"href":"#id-72","referenceIndex":16,"text":"2011","element":"a"},{"text":") . 
For binary logistic regression problem with a nonconvex regularizer on ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a9a","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"covtype","element":"span"},{"text":", and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"ijcnn1","element":"span"},{"text":", we are given training data ","element":"span"},{"style":{"height":18.09},"width":192.83,"height":45.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/12-9.png","element":"img","alt":" {xi, yi}ni=1","inline":true},{"text":", where ","element":"span"},{"style":{"height":17.75},"width":142.86,"height":44.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/12-10.png","element":"img","alt":" xi ∈ Rd","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":17.6},"width":200.9,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/12-11.png","element":"img","alt":"yi ∈ {0, 1}","inline":true,"padRight":true},{"text":"are feature vector and output label corresponding to the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":"-th training example. The","element":"span"}],[{"text":"nonconvex penalized binary logistic regression is formulated as follows","element":"span"}],[{"style":{"width":"68%"},"width":1276,"height":130,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/13-0.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":17.2},"width":67.82,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/13-1.png","element":"img","alt":" φ(x","inline":true},{"text":") is the sigmoid function and ","element":"span"},{"style":{"height":15.13},"width":170,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/13-2.png","element":"img","alt":" λ = 10−3","inline":true},{"text":". 
For multiclass logistic regression problem with a nonconvex regularizer on ","element":"span"},{"style":{"fontStyle":"italic"},"text":"mnist","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"cifar10 ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"SVHN","element":"span"},{"text":", we are given training data ","element":"span"},{"style":{"height":18.09},"width":197.92,"height":45.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/13-3.png","element":"img","alt":" {xi, yi}ni=1","inline":true},{"text":", where ","element":"span"},{"style":{"height":18.73},"width":396.07,"height":46.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/13-4.png","element":"img","alt":"xi ∈ Rd and yi ∈ Rm ","inline":true,"padRight":true},{"text":"are feature vectors and multilabels corresponding to the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":"-th data point. The nonconvex penalized multiclass logistic regression is formulated as follows","element":"span"}],[{"style":{"width":"60%"},"width":1143,"height":137,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/13-5.png","element":"img"}],[{"text":"where softmax(","element":"span"},{"style":{"height":22},"width":482.48,"height":55.01,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/13-6.png","element":"img","alt":"a) = exp(a)/ ∑di=1 exp(ai","inline":true},{"text":") is the softmax function and ","element":"span"},{"style":{"height":15.13},"width":184.55,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/13-7.png","element":"img","alt":" λ = 10−3.","inline":true,"padRight":true},{"text":"We plot the logarithmic function value gap with respect to CPU time in Figure ","element":"span"},{"href":"#id-73","text":"1","element":"a"},{"text":". 
From Figure","element":"span"}],[{"href":"#id-74","text":"1(a) ","element":"a"},{"text":"to ","element":"span"},{"href":"#id-74","text":"1(f)","element":"a"},{"text":", we can see that for the low dimension optimization task on ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a9a","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"covtype ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"ijcnn1","element":"span"},{"text":", our SRVRC outperforms all the other algorithms with respect to CPU time. We can also observe that the stochastic trust region method STR1 is better than STR2, which is well-aligned with our discussion before. The SPIDER+ does not perform as well as other second-order methods, even though its stochastic gradient and Hessian complexity is comparable to second-order methods in theory. Meanwhile, we also notice that SRVRC","element":"span"},{"style":{"height":8.8},"width":53.61,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/13-8.png","element":"img","alt":"free","inline":true,"padRight":true},{"text":"always outperforms STC, which suggests that the variance reduction technique is useful. 
For high dimension optimization task ","element":"span"},{"style":{"fontStyle":"italic"},"text":"mnist","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"cifar10 ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"SVHN","element":"span"},{"text":", only SPIDER+, STC and SRVRC","element":"span"},{"style":{"height":8.8},"width":53.61,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/13-9.png","element":"img","alt":"free","inline":true,"padRight":true},{"text":"are able to make notable progress and SRVRC","element":"span"},{"style":{"height":8.8},"width":53.6,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/13-10.png","element":"img","alt":"free","inline":true,"padRight":true},{"text":"outperforms the other two. This is again consistent with our theory and discussions in Section ","element":"span"},{"text":"5","element":"span"},{"text":". Overall, our experiments clearly validate the advantage of SRVRC and SRVRC","element":"span"},{"style":{"height":8.8},"width":53.6,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/13-11.png","element":"img","alt":"free","inline":true},{"text":", and corroborate the theory of both algorithms.","element":"span"}]]},{"heading":"7 Conclusions and Future Work","paragraphs":[[{"text":"In this work we present two faster SVRC algorithms namely SRVRC and SRVRC","element":"span"},{"style":{"height":8.8},"width":53.61,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/13-12.png","element":"img","alt":"free","inline":true,"padRight":true},{"text":"to find approximate local minima for nonconvex finite-sum optimization problems. 
SRVRC outperforms existing SVRC algorithms in terms of gradient and Hessian complexities, while SRVRC","element":"span"},{"style":{"height":15.24},"width":202.53,"height":38.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/13-13.png","element":"img","alt":"free further","inline":true,"padRight":true},{"text":"outperforms the best-known runtime complexity for existing CR based algorithms. Whether our algorithms have achieved the optimal complexity under the current assumptions is still an open problem, and we leave it as a future work.","element":"span"}]]},{"heading":"A Proofs in Section 4","paragraphs":[[{"text":"We define the filtration ","element":"span"},{"style":{"height":17.6},"width":319.42,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/13-14.png","element":"img","alt":" Ft = σ(x0, ..., xt","inline":true},{"text":") as the ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/13-15.png","element":"img","alt":" σ","inline":true},{"text":"-algebra of ","element":"span"},{"style":{"height":10.62},"width":43.48,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/13-16.png","element":"img","alt":" x0","inline":true,"padRight":true},{"text":"to ","element":"span"},{"style":{"height":10.62},"width":38.48,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/13-17.png","element":"img","alt":" xt","inline":true},{"text":". 
Recall that ","element":"span"},{"style":{"height":10.62},"width":38.48,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/13-18.png","element":"img","alt":" vt","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":14.62},"width":50.61,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/13-19.png","element":"img","alt":" Ut","inline":true,"padRight":true},{"text":"are the semi-stochastic gradient and Hessian respectively, ","element":"span"},{"style":{"height":15.02},"width":39.88,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/13-20.png","element":"img","alt":" ht","inline":true,"padRight":true},{"text":"is the update parameter, and ","element":"span"},{"style":{"height":14.62},"width":54.33,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/13-21.png","element":"img","alt":" Mt","inline":true,"padRight":true},{"text":"is the cubic penalty parameter appeared in Algorithm ","element":"span"},{"href":"#id-60","text":"1 ","element":"a"},{"text":"and Algorithm ","element":"span"},{"href":"#id-75","text":"2","element":"a"},{"text":". We denote ","element":"span"},{"style":{"height":17.6},"width":97.84,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/13-22.png","element":"img","alt":" mt(h","inline":true},{"text":") := ","element":"span"},{"style":{"height":19.8},"width":1078.24,"height":49.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/13-23.png","element":"img","alt":"v⊤h + h⊤Uth/2 + Mt∥h∥32/6 and h∗t = argminh∈Rd mt(h","inline":true},{"text":"). 
In this section, we define ","element":"span"},{"style":{"height":17.6},"width":272.92,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/13-24.png","element":"img","alt":" δ = ξ/(2T) for","inline":true,"padRight":true},{"text":"the simplicity.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"A.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Proof of Theorem ","element":"span"},{"href":"#id-63","style":{"fontWeight":"bold"},"text":"4.2","element":"a"}],[{"text":"To prove Theorem ","element":"span"},{"href":"#id-63","text":"4.2","element":"a"},{"text":", we need the following lemma adapted from ","element":"span"},{"href":"#id-12","referenceIndex":59,"text":"Zhou et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-12","referenceIndex":59,"text":"2018d","element":"a"},{"text":"), which characterizes that ","element":"span"},{"style":{"height":17.6},"width":169,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/14-0.png","element":"img","alt":" µ(xt + h","inline":true},{"text":") can be bounded by ","element":"span"},{"style":{"height":17.6},"width":88.52,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/14-1.png","element":"img","alt":" ∥h∥2","inline":true,"padRight":true},{"text":"and the norm of difference between semi-stochastic gradient and Hessian.","element":"span"}],[{"id":"id-82","style":{"fontWeight":"bold"},"text":"Lemma A.1. 
","element":"span"},{"text":"Suppose that ","element":"span"},{"style":{"height":17.6},"width":97.85,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/14-2.png","element":"img","alt":" mt(h","inline":true},{"text":") := ","element":"span"},{"style":{"height":17.6},"width":297.1,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/14-3.png","element":"img","alt":" v⊤t h + h⊤Uth/","inline":true},{"text":"2 + ","element":"span"},{"style":{"height":19.41},"width":169,"height":48.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/14-4.png","element":"img","alt":" Mt∥h∥32/","inline":true},{"text":"6 and ","element":"span"},{"style":{"height":16.72},"width":44.88,"height":41.79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/14-5.png","element":"img","alt":" h∗t","inline":true,"padRight":true},{"text":"= argmin","element":"span"},{"style":{"height":18.26},"width":191.77,"height":45.65,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/14-6.png","element":"img","alt":"h∈Rd mt(h","inline":true},{"text":"). 
","element":"span"},{"text":"If ","element":"span"},{"style":{"height":17.6},"width":147.06,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/14-7.png","element":"img","alt":" Mt/ρ ≥","inline":true,"padRight":true},{"text":"2, then for any ","element":"span"},{"style":{"height":18.33},"width":309.72,"height":45.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/14-8.png","element":"img","alt":" h ∈ Rd, we have","inline":true}],[{"style":{"width":"86%"},"width":1625,"height":174,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/14-9.png","element":"img"}],[{"text":"Next lemma gives bounds on the inner products ","element":"span"},{"style":{"height":20.8},"width":851.1,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/14-10.png","element":"img","alt":" ⟨∇F(xt) − vt, h⟩ and ⟨�∇2F(xt) − Ut�h, h⟩.","inline":true}],[{"id":"id-77","style":{"height":19.13},"width":761.11,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/14-11.png","element":"img","alt":"Lemma A.2. For any h ∈ Rd, we have","inline":true}],[{"style":{"width":"56%"},"width":1056,"height":235,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/14-12.png","element":"img"}],[{"text":"We also need the following two lemmas, which show that semi-stochastic gradient and Hessian ","element":"span"},{"style":{"height":15.02},"width":190.71,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/14-13.png","element":"img","alt":"vt and Ut","inline":true,"padRight":true},{"text":"estimators are good approximations to true gradient and Hessian.","element":"span"}],[{"id":"id-78","style":{"fontWeight":"bold"},"text":"Lemma A.3. 
","element":"span"},{"text":"Suppose that ","element":"span"},{"href":"#id-64","style":{"height":24.44},"width":572.38,"height":61.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/14-14.png","element":"img","alt":" {B(g)k } satisfies (4.1) and (4.3)","inline":true},{"text":", then conditioned on ","element":"span"},{"style":{"height":20.52},"width":337.06,"height":51.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/14-15.png","element":"img","alt":" F⌊t/S(g)⌋·S(g), with","inline":true,"padRight":true},{"text":"probability at least 1 ","element":"span"},{"style":{"height":20.34},"width":437.72,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/14-16.png","element":"img","alt":" − δ · (t − ⌊t/S(g)⌋ · S(g)","inline":true},{"text":"), we have that for all ","element":"span"},{"style":{"height":20.34},"width":425.84,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/14-17.png","element":"img","alt":" ⌊t/S(g)⌋ · S(g) ≤ k ≤ t,","inline":true}],[{"style":{"width":"61%"},"width":1146,"height":98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/14-18.png","element":"img"}],[{"id":"id-79","style":{"fontWeight":"bold"},"text":"Lemma A.4. 
","element":"span"},{"text":"Suppose that ","element":"span"},{"href":"#id-76","style":{"height":24.44},"width":572.53,"height":61.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/14-19.png","element":"img","alt":" {B(h)k } satisfies (4.2) and (4.4)","inline":true},{"text":", then conditioned on ","element":"span"},{"style":{"height":20.52},"width":340.56,"height":51.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/14-20.png","element":"img","alt":" F⌊t/S(h)⌋·S(h), with","inline":true,"padRight":true},{"text":"probability at least 1 ","element":"span"},{"style":{"height":20.34},"width":442.23,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/14-21.png","element":"img","alt":" − δ · (t − ⌊t/S(h)⌋ · S(h)","inline":true},{"text":"), we have that for all ","element":"span"},{"style":{"height":20.34},"width":430.35,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/14-22.png","element":"img","alt":" ⌊t/S(h)⌋ · S(h) ≤ k ≤ t,","inline":true}],[{"style":{"width":"61%"},"width":1161,"height":81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/14-23.png","element":"img"}],[{"text":"Given all the above lemmas, we are ready to prove Theorem ","element":"span"},{"href":"#id-63","text":"4.2","element":"a"},{"text":".","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"Proof of Theorem ","element":"span"},{"href":"#id-63","style":{"fontStyle":"italic"},"text":"4.2","element":"a"},{"style":{"fontStyle":"italic"},"text":". 
","element":"span"},{"text":"Suppose that SRVRC terminates at iteration ","element":"span"},{"style":{"height":20.8},"width":576.32,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/14-24.png","element":"img","alt":" T ∗ − 1, then ∥ht∥2 >�ϵ/ρ for","inline":true}],[{"text":"all 0 ","element":"span"},{"style":{"height":14.73},"width":214.18,"height":36.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/15-0.png","element":"img","alt":" ≤ t ≤ T ∗ −","inline":true,"padRight":true},{"text":"1. We have","element":"span"}],[{"style":{"height":35.78},"width":1187.42,"height":89.45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/15-1.png","element":"img","alt":"F(xt+1) ≤ F(xt) + ⟨∇F(xt), ht⟩ + 12⟨ht, ∇2F(xt)ht⟩ + ρ6∥ht∥32","inline":true}],[{"id":"id-81","style":{"width":"86%"},"width":1624,"height":361,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/15-2.png","element":"img"}],[{"text":"where the second inequality holds due to the fact that ","element":"span"},{"style":{"height":17.6},"width":282.02,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/15-3.png","element":"img","alt":" mt(ht) ≤ mt(0","inline":true},{"text":") = 0, ","element":"span"},{"style":{"height":14.62},"width":54.33,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/15-4.png","element":"img","alt":" Mt","inline":true,"padRight":true},{"text":"= 4","element":"span"},{"style":{"height":12},"width":23,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/15-5.png","element":"img","alt":"ρ","inline":true,"padRight":true},{"text":"and Lemma ","element":"span"},{"href":"#id-77","text":"A.2","element":"a"},{"text":". 
By Lemmas ","element":"span"},{"href":"#id-78","text":"A.3 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-79","text":"A.4","element":"a"},{"text":", with probability at least 1 ","element":"span"},{"style":{"height":12.8},"width":115.62,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/15-6.png","element":"img","alt":" − 2Tδ","inline":true},{"text":", for all 0 ","element":"span"},{"style":{"height":14.4},"width":194.3,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/15-7.png","element":"img","alt":" ≤ t ≤ T −","inline":true,"padRight":true},{"text":"1, we have that","element":"span"}],[{"style":{"width":"77%"},"width":1459,"height":101,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/15-8.png","element":"img"}],[{"text":"for all 0 ","element":"span"},{"style":{"height":14.4},"width":195.26,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/15-9.png","element":"img","alt":" ≤ t ≤ T −","inline":true,"padRight":true},{"text":"1. Substituting (","element":"span"},{"href":"#id-80","text":"A.4","element":"a"},{"text":") into (","element":"span"},{"href":"#id-81","text":"A.3","element":"a"},{"text":"), we have","element":"span"}],[{"id":"id-80","style":{"width":"70%"},"width":1320,"height":101,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/15-10.png","element":"img"}],[{"text":"Telescoping (","element":"span"},{"href":"#id-80","text":"A.5","element":"a"},{"text":") from ","element":"span"},{"style":{"height":15.53},"width":286.91,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/15-11.png","element":"img","alt":" t = 0, . . . 
, T ∗ −","inline":true,"padRight":true},{"text":"1, we have","element":"span"}],[{"style":{"width":"95%"},"width":1796,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/15-12.png","element":"img"}],[{"text":"Recall that we have ","element":"span"},{"style":{"height":21.86},"width":346.8,"height":54.65,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/15-13.png","element":"img","alt":" T ≥ 40∆F √ρ/ϵ3/2 ","inline":true,"padRight":true},{"text":"from the condition of Theorem ","element":"span"},{"href":"#id-63","text":"4.2","element":"a"},{"text":", then by (","element":"span"},{"href":"#id-80","text":"A.6","element":"a"},{"text":"), we have ","element":"span"},{"style":{"height":14.74},"width":139.67,"height":36.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/15-14.png","element":"img","alt":"T ∗ ≤ T","inline":true},{"text":". Thus, we have ","element":"span"},{"style":{"height":20.8},"width":706.46,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/15-15.png","element":"img","alt":" ∥hT ∗−1∥2 ≤�ϵ/ρ. Denote �T = T ∗ −","inline":true,"padRight":true},{"text":"1, then we have","element":"span"}],[{"style":{"width":"90%"},"width":1692,"height":214,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/15-16.png","element":"img"}],[{"text":"where the first inequality holds due to Lemma ","element":"span"},{"href":"#id-82","text":"A.1 ","element":"a"},{"text":"with ","element":"span"},{"style":{"height":19.62},"width":170.16,"height":49.05,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/15-17.png","element":"img","alt":" ∇m �T (h �T","inline":true,"padRight":true},{"text":") = 0 and ","element":"span"},{"style":{"height":22.09},"width":290.46,"height":55.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/15-18.png","element":"img","alt":" ∥h �T ∥2 = ∥h∗�T ∥2","inline":true},{"text":". 
This ","element":"span"},{"text":"completes our proof.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"A.2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Proof of Corollary ","element":"span"},{"href":"#id-65","style":{"fontWeight":"bold"},"text":"4.3","element":"a"}],[{"style":{"fontStyle":"italic"},"text":"Proof of Corollary ","element":"span"},{"href":"#id-65","style":{"fontStyle":"italic"},"text":"4.3","element":"a"},{"style":{"fontStyle":"italic"},"text":". ","element":"span"},{"text":"Suppose that SRVRC terminates at ","element":"span"},{"style":{"height":14.73},"width":259.2,"height":36.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/15-19.png","element":"img","alt":" T ∗ − 1 ≤ T −","inline":true,"padRight":true},{"text":"1 iteration. Telescoping (","element":"span"},{"href":"#id-80","text":"A.5","element":"a"},{"text":") from ","element":"span"},{"style":{"height":12.73},"width":257.83,"height":31.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/15-20.png","element":"img","alt":" t = 0 to T ∗ −","inline":true,"padRight":true},{"text":"1, we have","element":"span"}],[{"id":"id-83","style":{"width":"96%"},"width":1803,"height":130,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/15-21.png","element":"img"}],[{"text":"where the last inequality holds since ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T ","element":"span"},{"text":"is set to be 40∆","element":"span"},{"style":{"height":21.86},"width":177.46,"height":54.64,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/16-0.png","element":"img","alt":"F √ρ/ϵ3/2 ","inline":true,"padRight":true},{"text":"as the conditions of Corollary ","element":"span"},{"href":"#id-65","text":"4.3 ","element":"a"},{"text":"suggests. 
(","element":"span"},{"href":"#id-83","text":"A.7","element":"a"},{"text":") implies that ","element":"span"},{"style":{"height":22.1},"width":456.11,"height":55.26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/16-1.png","element":"img","alt":"∑T ∗−1t=0 ∥ht∥32 ≤ 40∆F /ρ","inline":true},{"text":". Thus, we have","element":"span"}],[{"id":"id-84","style":{"width":"94%"},"width":1769,"height":130,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/16-2.png","element":"img"}],[{"text":"where the first inequality holds due to Hölder’s inequality, and the second inequality is due to ","element":"span"},{"style":{"height":21.86},"width":453.6,"height":54.64,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/16-3.png","element":"img","alt":" T ∗ ≤ T = 40∆F √ρ/ϵ3/2","inline":true},{"text":". We first consider the total gradient sample complexity ","element":"span"},{"style":{"height":23.96},"width":233.19,"height":59.9,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/16-4.png","element":"img","alt":"∑T ∗−1t=0 B(g)t ,","inline":true,"padRight":true},{"text":"which can be bounded as","element":"span"}],[{"style":{"width":"97%"},"width":1828,"height":828,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/16-5.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":19.87},"width":356.9,"height":49.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/16-6.png","element":"img","alt":" C1 = 1440 log2(d/δ","inline":true},{"text":"), the second inequality holds due to (","element":"span"},{"href":"#id-84","text":"A.8","element":"a"},{"text":"), and the last equality holds due to the choice of ","element":"span"},{"style":{"height":21.86},"width":529.56,"height":54.65,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/16-7.png","element":"img","alt":" S(g) = √ρϵ/L ·√n ∧ M2/ϵ2","inline":true},{"text":". 
We then consider the total Hessian sample complexity","element":"span"}],[{"style":{"height":23.96},"width":221.28,"height":59.9,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/17-0.png","element":"img","alt":"�T ∗−1t=0 B(h)t","inline":true,"padRight":true},{"text":", which can be bounded as","element":"span"}],[{"style":{"width":"93%"},"width":1748,"height":622,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/17-1.png","element":"img"}],[{"style":{"height":15.02},"width":137.88,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/17-2.png","element":"img","alt":"≤ 40C2","inline":true}],[{"style":{"width":"53%"},"width":994,"height":158,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/17-3.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":19.87},"width":336.06,"height":49.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/17-4.png","element":"img","alt":" C2 = 800 log2(d/δ","inline":true},{"text":"), the second inequality holds due to (","element":"span"},{"href":"#id-84","text":"A.8","element":"a"},{"text":"), and the last equality holds due to the choice of ","element":"span"},{"style":{"height":21.01},"width":391.36,"height":52.53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/17-5.png","element":"img","alt":" S(h) =�n ∧ L/(ρϵ).","inline":true}]]},{"heading":"B Proofs in Section 5","paragraphs":[[{"text":"In this section, we denote ","element":"span"},{"style":{"height":17.6},"width":191.94,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/17-6.png","element":"img","alt":" δ = ξ/(3T","inline":true},{"text":") for simplicity.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"B.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Proof of Theorem ","element":"span"},{"href":"#id-67","style":{"fontWeight":"bold"},"text":"5.1","element":"a"}],[{"text":"We 
need the following two lemmas, which bound the variance of semi-stochastic gradient and Hessian estimators.","element":"span"}],[{"id":"id-85","style":{"fontWeight":"bold"},"text":"Lemma B.1. ","element":"span"},{"text":"Suppose that ","element":"span"},{"style":{"height":24.44},"width":124.71,"height":61.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/17-7.png","element":"img","alt":" {B(g)k }","inline":true,"padRight":true},{"text":"satisfies (","element":"span"},{"href":"#id-69","text":"5.2","element":"a"},{"text":") and (","element":"span"},{"href":"#id-69","text":"5.3","element":"a"},{"text":"), then conditioned on ","element":"span"},{"style":{"height":19.15},"width":144.43,"height":47.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/17-8.png","element":"img","alt":" F⌊t/S⌋·S","inline":true},{"text":", with ","element":"span"},{"text":"probability at least 1 ","element":"span"},{"style":{"height":17.6},"width":348.41,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/17-9.png","element":"img","alt":" − δ · (t − ⌊t/S⌋ · S","inline":true},{"text":"), we have that for all ","element":"span"},{"style":{"height":17.6},"width":334.64,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/17-10.png","element":"img","alt":" ⌊t/S⌋ · S ≤ k ≤ t,","inline":true}],[{"style":{"width":"22%"},"width":430,"height":99,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/17-11.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Proof of Lemma ","element":"span"},{"href":"#id-85","style":{"fontStyle":"italic"},"text":"B.1","element":"a"},{"style":{"fontStyle":"italic"},"text":". ","element":"span"},{"text":"The proof is very similar to that of Lemma ","element":"span"},{"href":"#id-78","text":"A.3","element":"a"},{"text":", hence we omit it.","element":"span"}],[{"id":"id-86","style":{"fontWeight":"bold"},"text":"Lemma B.2. 
","element":"span"},{"text":"Suppose that ","element":"span"},{"href":"#id-68","style":{"height":24.44},"width":386.86,"height":61.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/17-12.png","element":"img","alt":" {B(h)k } satisfies (5.1)","inline":true},{"text":", then conditioned on ","element":"span"},{"style":{"height":15.24},"width":49.36,"height":38.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/17-13.png","element":"img","alt":" Fk","inline":true},{"text":", with probability at least ","element":"span"},{"text":"1 ","element":"span"},{"style":{"height":12.8},"width":63.64,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/17-14.png","element":"img","alt":" − δ","inline":true},{"text":", we have that","element":"span"}],[{"style":{"width":"24%"},"width":460,"height":80,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/17-15.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Proof of Lemma ","element":"span"},{"href":"#id-86","style":{"fontStyle":"italic"},"text":"B.2","element":"a"},{"style":{"fontStyle":"italic"},"text":". 
","element":"span"},{"text":"The proof is very similar to that of Lemma ","element":"span"},{"href":"#id-79","text":"A.4","element":"a"},{"text":", hence we omit it.","element":"span"}],[{"text":"We have the following lemma to guarantee that by Algorithm ","element":"span"},{"href":"#id-66","text":"3 ","element":"a"},{"text":"Cubic-Subsolver, the output ","element":"span"},{"style":{"height":15.02},"width":39.88,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/18-0.png","element":"img","alt":" ht","inline":true,"padRight":true},{"text":"satisfies that sufficient decrease of function value will be made and the total number of iterations is bounded by ","element":"span"},{"style":{"height":12},"width":54.74,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/18-1.png","element":"img","alt":" T ′.","inline":true}],[{"id":"id-87","style":{"width":"115%"},"width":2159,"height":324,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/18-2.png","element":"img"}],[{"text":"iterations, where ","element":"span"},{"style":{"height":15.1},"width":101.68,"height":37.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/18-3.png","element":"img","alt":" CS >","inline":true,"padRight":true},{"text":"0 is a constant.","element":"span"}],[{"text":"We have the following lemma which provides the guarantee for the dynamic of gradient steps in Cubic-Finalsolver.","element":"span"}],[{"id":"id-91","style":{"fontWeight":"bold"},"text":"Lemma B.4. 
","element":"span"},{"text":"(","element":"span"},{"href":"#id-20","referenceIndex":8,"text":"Carmon and Duchi","element":"a"},{"text":", ","element":"span"},{"href":"#id-20","referenceIndex":8,"text":"2016","element":"a"},{"text":") For ","element":"span"},{"style":{"height":15.6},"width":127.63,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/18-4.png","element":"img","alt":" b, A, τ","inline":true},{"text":", suppose that ","element":"span"},{"style":{"height":17.6},"width":194.1,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/18-5.png","element":"img","alt":" ∥A∥2 ≤ L","inline":true},{"text":". We denote that","element":"span"}],[{"style":{"width":"73%"},"width":1371,"height":214,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/18-6.png","element":"img"}],[{"text":"Then for Cubic-Finalsolver, suppose that ","element":"span"},{"style":{"height":19.13},"width":335.07,"height":47.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/18-7.png","element":"img","alt":" η < (4(L+τR))−1","inline":true},{"text":", then each iterate ∆ in Cubic-Finalsolver satisfies that ","element":"span"},{"style":{"height":17.6},"width":856.06,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/18-8.png","element":"img","alt":" ∥∆∥2 ≤ ∥s∥2, and g(h) is (L + 2τR)-smooth.","inline":true}],[{"text":"With these lemmas, we begin our proof of Theorem ","element":"span"},{"href":"#id-67","text":"5.1","element":"a"},{"text":".","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"Proof of Theorem ","element":"span"},{"href":"#id-67","style":{"fontStyle":"italic"},"text":"5.1","element":"a"},{"style":{"fontStyle":"italic"},"text":". 
","element":"span"},{"text":"Suppose that SRVRC","element":"span"},{"style":{"height":8.8},"width":53.61,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/18-9.png","element":"img","alt":"free","inline":true,"padRight":true},{"text":"terminates at iteration ","element":"span"},{"style":{"height":14.8},"width":503.34,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/18-10.png","element":"img","alt":" T ∗ − 1. Then T ∗ ≤ T. We","inline":true,"padRight":true},{"text":"first claim that ","element":"span"},{"style":{"height":13.13},"width":139.69,"height":32.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/18-11.png","element":"img","alt":" T ∗ < T","inline":true},{"text":". Otherwise, suppose ","element":"span"},{"style":{"height":12.33},"width":139.67,"height":30.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/18-12.png","element":"img","alt":" T ∗ = T","inline":true},{"text":", then we have that for all 0 ","element":"span"},{"style":{"height":15.53},"width":182.49,"height":38.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/18-13.png","element":"img","alt":" ≤ t < T ∗,","inline":true}],[{"style":{"height":35.78},"width":1187.42,"height":89.45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/18-14.png","element":"img","alt":"F(xt+1) ≤ F(xt) + ⟨∇F(xt), ht⟩ + 12⟨ht, ∇2F(xt)ht⟩ + ρ6∥ht∥32","inline":true}],[{"id":"id-89","style":{"width":"86%"},"width":1624,"height":224,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/18-15.png","element":"img"}],[{"text":"where the second inequality holds due to ","element":"span"},{"style":{"height":14.62},"width":54.34,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/18-16.png","element":"img","alt":" Mt","inline":true,"padRight":true},{"text":"= 
4","element":"span"},{"style":{"height":12},"width":23,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/18-17.png","element":"img","alt":"ρ","inline":true,"padRight":true},{"text":"and Lemma ","element":"span"},{"href":"#id-77","text":"A.2","element":"a"},{"text":". By Lemma ","element":"span"},{"href":"#id-87","text":"B.3 ","element":"a"},{"text":"and union bound, we know that with probability at least 1 ","element":"span"},{"style":{"height":15.6},"width":273.81,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/18-18.png","element":"img","alt":" − Tδ, we have","inline":true}],[{"id":"id-88","style":{"width":"71%"},"width":1349,"height":53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/18-19.png","element":"img"}],[{"text":"where we use the fact that ","element":"span"},{"style":{"height":14.62},"width":54.34,"height":36.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/18-20.png","element":"img","alt":" Mt","inline":true,"padRight":true},{"text":"= 4","element":"span"},{"style":{"height":12},"width":23,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/18-21.png","element":"img","alt":"ρ","inline":true},{"text":". 
By Lemmas ","element":"span"},{"href":"#id-85","text":"B.1 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-86","text":"B.2","element":"a"},{"text":", we know that with probability at least 1 ","element":"span"},{"style":{"height":12.8},"width":117.01,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/18-22.png","element":"img","alt":" − 2Tδ","inline":true},{"text":", for all 0 ","element":"span"},{"style":{"height":14.74},"width":214.19,"height":36.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/18-23.png","element":"img","alt":" ≤ t ≤ T ∗ −","inline":true,"padRight":true},{"text":"1, we have","element":"span"}],[{"style":{"width":"81%"},"width":1532,"height":60,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/18-24.png","element":"img"}],[{"text":"Substituting (","element":"span"},{"href":"#id-88","text":"B.2","element":"a"},{"text":") and (","element":"span"},{"href":"#id-88","text":"B.3","element":"a"},{"text":") into (","element":"span"},{"href":"#id-89","text":"B.1","element":"a"},{"text":"), we have","element":"span"}],[{"style":{"width":"97%"},"width":1821,"height":53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/19-0.png","element":"img"}],[{"text":"Telescoping (","element":"span"},{"href":"#id-90","text":"B.4","element":"a"},{"text":") from ","element":"span"},{"style":{"height":12.73},"width":257.82,"height":31.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/19-1.png","element":"img","alt":" t = 0 to T ∗ −","inline":true,"padRight":true},{"text":"1, we have","element":"span"}],[{"id":"id-90","style":{"width":"94%"},"width":1774,"height":131,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/19-2.png","element":"img"}],[{"text":"where the last inequality holds since we assume 
","element":"span"},{"style":{"height":12.33},"width":48.56,"height":30.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/19-3.png","element":"img","alt":" T ∗","inline":true,"padRight":true},{"text":"= ","element":"span"},{"style":{"height":14.4},"width":80.1,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/19-4.png","element":"img","alt":" T ≥","inline":true,"padRight":true},{"text":"25∆","element":"span"},{"style":{"height":19.93},"width":198.43,"height":49.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/19-5.png","element":"img","alt":"F ρ1/2ϵ−3/2","inline":true,"padRight":true},{"text":"from the condition of Theorem ","element":"span"},{"href":"#id-67","text":"5.1","element":"a"},{"text":". (","element":"span"},{"href":"#id-90","text":"B.5","element":"a"},{"text":") leads to a contradiction, thus we have ","element":"span"},{"style":{"height":13.13},"width":139.69,"height":32.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/19-6.png","element":"img","alt":" T ∗ < T","inline":true},{"text":". Therefore, by union bound, with probability at least 1 ","element":"span"},{"style":{"height":12.8},"width":117.39,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/19-7.png","element":"img","alt":" − 3Tδ","inline":true},{"text":", Cubic-Finalsolver is executed by SRVRC","element":"span"},{"style":{"height":8.8},"width":53.6,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/19-8.png","element":"img","alt":"free","inline":true,"padRight":true},{"text":"at ","element":"span"},{"style":{"height":12.33},"width":94.14,"height":30.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/19-9.png","element":"img","alt":" T ∗ −","inline":true,"padRight":true},{"text":"1 iteration. 
We have that ","element":"span"},{"style":{"height":21.11},"width":1677.36,"height":52.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/19-10.png","element":"img","alt":" ∥vT ∗−1∥2 < max{MT ∗−1ϵ/(2ρ),�LMT ∗−1/2(ϵ/ρ)3/4} and ∥h∗T ∗−1∥2 <�ϵ/ρ by Lemma","inline":true}],[{"style":{"width":"3%"},"width":73,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/19-11.png","element":"img"}],[{"text":"The only thing left is to check that we indeed find a second-order stationary point, ","element":"span"},{"style":{"height":10.7},"width":65.91,"height":26.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/19-12.png","element":"img","alt":" xT ∗","inline":true},{"text":", by Cubic-Finalsolver. We first need to check that the choice of ","element":"span"},{"style":{"height":12},"width":22,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/19-13.png","element":"img","alt":" η","inline":true,"padRight":true},{"text":"= 1","element":"span"},{"style":{"fontStyle":"italic"},"text":"/","element":"span"},{"text":"(16","element":"span"},{"style":{"fontStyle":"italic"},"text":"L","element":"span"},{"text":") satisfies that 1","element":"span"},{"style":{"height":17.6},"width":98.1,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/19-14.png","element":"img","alt":"/η >","inline":true,"padRight":true},{"text":"4(","element":"span"},{"href":"#id-91","style":{"height":17.6},"width":624.71,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/19-15.png","element":"img","alt":"L + MtR) by Lemma B.4, where","inline":true}],[{"style":{"width":"100%"},"width":1877,"height":829,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/19-16.png","element":"img"}],[{"text":"where the first inequality holds due to Lemma ","element":"span"},{"href":"#id-82","text":"A.1","element":"a"},{"text":", the second inequality holds due to 
the fact that ","element":"span"},{"style":{"height":22.09},"width":289.98,"height":55.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/19-17.png","element":"img","alt":"∥h �T ∥2 ≤ ∥h∗�T ∥2","inline":true,"padRight":true},{"text":"from Lemma ","element":"span"},{"href":"#id-91","text":"B.4","element":"a"},{"text":", the last inequality holds due to the facts that ","element":"span"},{"style":{"height":19.62},"width":120.49,"height":49.05,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/19-18.png","element":"img","alt":" ∥∇m �T","inline":true,"padRight":true},{"text":"(","element":"span"},{"style":{"height":18.82},"width":51.88,"height":47.05,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/19-19.png","element":"img","alt":"h �T","inline":true,"padRight":true},{"text":")","element":"span"},{"style":{"height":17.6},"width":116.93,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/19-20.png","element":"img","alt":"∥2 ≤ ϵ","inline":true,"padRight":true},{"text":"from Cubic-Finalsolver and ","element":"span"},{"href":"#id-87","style":{"height":24.62},"width":588.51,"height":61.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/19-21.png","element":"img","alt":" ∥h∗�T ∥2 ≤�ϵ/ρ by Lemma B.3.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"B.2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Proof of Corollary ","element":"span"},{"href":"#id-71","style":{"fontWeight":"bold"},"text":"5.2","element":"a"}],[{"text":"We have the following lemma to bound the total number of iterations ","element":"span"},{"style":{"height":12},"width":56.74,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/19-22.png","element":"img","alt":" T ′′ ","inline":true,"padRight":true},{"text":"of Algorithm ","element":"span"},{"href":"#id-66","text":"4 
","element":"a"},{"text":"Cubic-Finalsolver.","element":"span"}],[{"id":"id-93","style":{"fontWeight":"bold"},"text":"Lemma B.5. ","element":"span"},{"text":"If ","element":"span"},{"style":{"height":19.14},"width":271.61,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/19-23.png","element":"img","alt":" ϵ < 4L2ρ/M 2t","inline":true,"padRight":true},{"text":", then Cubic-Finalsolver will terminate within ","element":"span"},{"style":{"height":19.12},"width":317.34,"height":47.81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/19-24.png","element":"img","alt":" T ′′ = CF L/√ρϵ","inline":true,"padRight":true},{"text":"iterations, where ","element":"span"},{"style":{"height":15.1},"width":105.44,"height":37.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/19-25.png","element":"img","alt":" CF >","inline":true,"padRight":true},{"text":"0 is a constant.","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"Proof of Corollary ","element":"span"},{"href":"#id-71","style":{"fontStyle":"italic"},"text":"5.2","element":"a"},{"style":{"fontStyle":"italic"},"text":". 
","element":"span"},{"text":"We have that","element":"span"}],[{"id":"id-92","style":{"width":"94%"},"width":1768,"height":131,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/20-0.png","element":"img"}],[{"text":"where the first inequality holds due to H¨older’s inequality, the second inequality holds due to the facts that ","element":"span"},{"style":{"height":14.73},"width":145.88,"height":36.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/20-1.png","element":"img","alt":" T ∗ ≤ T","inline":true,"padRight":true},{"text":"= 25∆","element":"span"},{"style":{"height":20.33},"width":193.9,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/20-2.png","element":"img","alt":"F ρ1/2/ϵ3/2","inline":true,"padRight":true},{"text":"and ∆","element":"span"},{"style":{"height":22.1},"width":389.32,"height":55.26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/20-3.png","element":"img","alt":"F ≥ ρ �T ∗−1t=0 ∥ht∥32/","inline":true},{"text":"4 by (","element":"span"},{"href":"#id-90","text":"B.5","element":"a"},{"text":"). 
We first consider the ","element":"span"},{"text":"total stochastic gradient computations, ","element":"span"},{"style":{"height":23.96},"width":219.02,"height":59.9,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/20-4.png","element":"img","alt":"∑T ∗−1t=0 B(g)t","inline":true,"padRight":true},{"text":", which can be bounded as","element":"span"}],[{"id":"id-94","style":{"width":"98%"},"width":1845,"height":960,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/20-5.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":15.02},"width":48.19,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/20-6.png","element":"img","alt":" C1","inline":true,"padRight":true},{"text":"= 2640 log","element":"span"},{"style":{"height":19.87},"width":100.79,"height":49.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/20-7.png","element":"img","alt":"2(d/δ","inline":true},{"text":"), the second inequality holds due to (","element":"span"},{"href":"#id-92","text":"B.6","element":"a"},{"text":"), the last equality holds due to the fact ","element":"span"},{"style":{"height":16.33},"width":72.71,"height":40.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/20-8.png","element":"img","alt":" S(g)","inline":true,"padRight":true},{"text":"= ","element":"span"},{"style":{"height":21.73},"width":405.74,"height":54.33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/20-9.png","element":"img","alt":"√ρϵ/L ·√n ∧ M2/ϵ2","inline":true},{"text":". 
We now consider the total amount of Hessian-vector product computations ","element":"span"},{"style":{"fontStyle":"italic"},"text":"T ","element":"span"},{"text":", which includes ","element":"span"},{"style":{"height":15.42},"width":40.76,"height":38.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/20-10.png","element":"img","alt":" T1","inline":true,"padRight":true},{"text":"from Cubic-Subsolver and ","element":"span"},{"style":{"height":15.42},"width":40.76,"height":38.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/20-11.png","element":"img","alt":" T2","inline":true,"padRight":true},{"text":"from Cubic-Finalsolver. By Lemma ","element":"span"},{"href":"#id-87","text":"B.3","element":"a"},{"text":", we know that at ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k","element":"span"},{"text":"-th iteration of SRVRC","element":"span"},{"style":{"height":8.8},"width":53.61,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/20-12.png","element":"img","alt":"free","inline":true},{"text":", Cubic-Subsolver has ","element":"span"},{"style":{"height":12},"width":47.56,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/20-13.png","element":"img","alt":" T ′","inline":true,"padRight":true},{"text":"iterations, which needs ","element":"span"},{"style":{"height":24.44},"width":80.98,"height":61.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/20-14.png","element":"img","alt":" B(h)k","inline":true,"padRight":true},{"text":"Hessian-vector product computations each time. 
Thus, we have","element":"span"}],[{"id":"id-95","style":{"width":"66%"},"width":1246,"height":508,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/20-15.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":19.88},"width":356.9,"height":49.69,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/20-16.png","element":"img","alt":" C2 = 1200 log2(d/δ","inline":true},{"text":"), the first inequality holds due to the fact that ","element":"span"},{"style":{"height":24.44},"width":493.71,"height":61.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/20-17.png","element":"img","alt":" B(h)k = C2n ∧ (L2/ρϵ), the","inline":true,"padRight":true},{"text":"second inequality holds due to the fact that ","element":"span"},{"style":{"height":20.33},"width":361.36,"height":50.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/21-0.png","element":"img","alt":" T = 25∆F ρ1/2/ϵ3/2","inline":true},{"text":", the last inequality holds due to the fact that ","element":"span"},{"style":{"height":21.65},"width":1027.11,"height":54.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/21-1.png","element":"img","alt":" T ′ = CSL/Mt ·�ρ/ϵ = CSL/(4√ρϵ). 
For T2, we have","inline":true}],[{"id":"id-96","style":{"width":"81%"},"width":1523,"height":112,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/21-2.png","element":"img"}],[{"text":"where the first inequality holds due to the fact that ","element":"span"},{"style":{"height":24.29},"width":430.92,"height":60.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/21-3.png","element":"img","alt":" B(h)T ∗−1 = C2n ∧ (L2/ρϵ","inline":true},{"text":"), the second inequality ","element":"span"},{"text":"holds due to the fact that ","element":"span"},{"style":{"height":12},"width":56.75,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/21-4.png","element":"img","alt":" T ′′","inline":true,"padRight":true},{"text":"= ","element":"span"},{"style":{"height":19.12},"width":187.76,"height":47.81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/21-5.png","element":"img","alt":" CF L/√ρϵ","inline":true,"padRight":true},{"text":"by Lemma ","element":"span"},{"href":"#id-93","text":"B.5","element":"a"},{"text":". 
Since at each iteration we need ","element":"span"},{"style":{"height":24.29},"width":118.14,"height":60.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/21-6.png","element":"img","alt":" B(h)T ∗−1","inline":true,"padRight":true},{"text":"Hessian-vector computations.","element":"span"}],[{"text":"Combining (","element":"span"},{"href":"#id-94","text":"B.7","element":"a"},{"text":"), (","element":"span"},{"href":"#id-95","text":"B.8","element":"a"},{"text":") and (","element":"span"},{"href":"#id-96","text":"B.9","element":"a"},{"text":"), we know that the total stochastic gradient and Hessian-vector product computations are bounded as","element":"span"}],[{"style":{"width":"93%"},"width":1756,"height":343,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/21-7.png","element":"img"}]]},{"heading":"C Proofs of Technical Lemmas in Appendix A","paragraphs":[[{"style":{"fontWeight":"bold"},"text":"C.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Proof of Lemma ","element":"span"},{"href":"#id-82","style":{"fontWeight":"bold"},"text":"A.1","element":"a"}],[{"text":"We have the following lemmas from ","element":"span"},{"href":"#id-12","referenceIndex":59,"text":"Zhou et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-12","referenceIndex":59,"text":"2018d","element":"a"},{"text":")","element":"span"}],[{"id":"id-97","style":{"fontWeight":"bold"},"text":"Lemma C.1. 
","element":"span"},{"text":"(","element":"span"},{"href":"#id-12","referenceIndex":59,"text":"Zhou et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-12","referenceIndex":59,"text":"2018d","element":"a"},{"text":") If ","element":"span"},{"style":{"height":16},"width":159.56,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/21-8.png","element":"img","alt":" Mt ≥ 2ρ","inline":true},{"text":", then we have","element":"span"}],[{"style":{"width":"84%"},"width":1582,"height":96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/21-9.png","element":"img"}],[{"id":"id-98","style":{"fontWeight":"bold"},"text":"Lemma C.2. ","element":"span"},{"text":"(","element":"span"},{"href":"#id-12","referenceIndex":59,"text":"Zhou et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-12","referenceIndex":59,"text":"2018d","element":"a"},{"text":") If ","element":"span"},{"style":{"height":16},"width":159.56,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/21-10.png","element":"img","alt":" Mt ≥ 2ρ","inline":true},{"text":", then we have","element":"span"}],[{"style":{"width":"74%"},"width":1390,"height":57,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/21-11.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Proof of Lemma ","element":"span"},{"href":"#id-82","style":{"fontStyle":"italic"},"text":"A.1","element":"a"},{"style":{"fontStyle":"italic"},"text":". 
","element":"span"},{"text":"By Lemma ","element":"span"},{"href":"#id-97","text":"C.1","element":"a"},{"text":", we have","element":"span"}],[{"style":{"height":38},"width":1020.95,"height":95,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/21-12.png","element":"img","alt":"∥∇F(xt + h)∥3/22 ≤�Mt∥h∥22 +��∇F(xt) − vt��2 + 1Mt","inline":true}],[{"id":"id-99","style":{"width":"81%"},"width":1523,"height":134,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/21-13.png","element":"img"}],[{"text":"where the second inequality holds due to the fact that for any ","element":"span"},{"style":{"height":15.6},"width":145.61,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/21-14.png","element":"img","alt":" a, b, c ≥","inline":true,"padRight":true},{"text":"0, we have (","element":"span"},{"style":{"height":20.34},"width":283.47,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/21-15.png","element":"img","alt":"a + b + c)3/2 ≤","inline":true}],[{"text":"2(","element":"span"},{"style":{"height":17.53},"width":323.82,"height":43.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/22-0.png","element":"img","alt":"a3/2 + b3/2 + c3/2","inline":true},{"text":"). 
By Lemma ","element":"span"},{"href":"#id-98","text":"C.2","element":"a"},{"text":", we have","element":"span"}],[{"id":"id-100","style":{"width":"98%"},"width":1839,"height":182,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/22-1.png","element":"img"}],[{"text":"where the second inequality holds due to the fact that for any ","element":"span"},{"style":{"height":15.6},"width":146.37,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/22-2.png","element":"img","alt":" a, b, c ≥","inline":true,"padRight":true},{"text":"0, we have (","element":"span"},{"style":{"height":19.13},"width":253.09,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/22-3.png","element":"img","alt":"a + b + c)3 ≤","inline":true,"padRight":true},{"text":"9(","element":"span"},{"style":{"height":16.33},"width":222.2,"height":40.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/22-4.png","element":"img","alt":"a3 + b3 + c3","inline":true},{"text":"). 
Thus we have","element":"span"}],[{"style":{"width":"86%"},"width":1625,"height":253,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/22-5.png","element":"img"}],[{"text":"where the inequality holds due to (","element":"span"},{"href":"#id-99","text":"C.1","element":"a"},{"text":"), (","element":"span"},{"href":"#id-100","text":"C.2","element":"a"},{"text":") and the fact that ","element":"span"},{"style":{"height":16},"width":171.12,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/22-6.png","element":"img","alt":" Mt ≥ 4ρ.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"C.2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Proof of Lemma ","element":"span"},{"href":"#id-77","style":{"fontWeight":"bold"},"text":"A.2","element":"a"}],[{"style":{"fontStyle":"italic"},"text":"Proof of Lemma ","element":"span"},{"href":"#id-77","style":{"fontStyle":"italic"},"text":"A.2","element":"a"},{"style":{"fontStyle":"italic"},"text":". ","element":"span"},{"text":"We have","element":"span"}],[{"style":{"width":"73%"},"width":1382,"height":121,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/22-7.png","element":"img"}],[{"text":"where the first inequality holds due to CauchySchwarz inequality, the second inequality holds due to Young’s inequality. 
We also have","element":"span"}],[{"style":{"width":"82%"},"width":1543,"height":99,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/22-8.png","element":"img"}],[{"text":"where the first inequality holds due to CauchySchwarz inequality, the second inequality holds due to Young’s inequality.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"C.3 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Proof of Lemma ","element":"span"},{"href":"#id-78","style":{"fontWeight":"bold"},"text":"A.3","element":"a"}],[{"text":"We need the following lemma:","element":"span"}],[{"id":"id-101","style":{"fontWeight":"bold"},"text":"Lemma C.3. ","element":"span"},{"text":"Conditioned on ","element":"span"},{"style":{"height":15.24},"width":49.36,"height":38.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/22-9.png","element":"img","alt":" Fk","inline":true},{"text":", with probability at least 1 ","element":"span"},{"style":{"height":15.6},"width":256.8,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/22-10.png","element":"img","alt":" − δ , we have","inline":true}],[{"style":{"width":"93%"},"width":1758,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/22-11.png","element":"img"}],[{"text":"We also have","element":"span"}],[{"style":{"width":"70%"},"width":1325,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/22-12.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Proof of Lemma ","element":"span"},{"href":"#id-78","style":{"fontStyle":"italic"},"text":"A.3","element":"a"},{"style":{"fontStyle":"italic"},"text":". 
","element":"span"},{"text":"First, we have ","element":"span"},{"style":{"height":24.93},"width":779.05,"height":62.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/23-0.png","element":"img","alt":" vt − ∇F(xt) = �tk=⌊t/S(g)⌋·S(g) uk, where","inline":true}],[{"style":{"width":"82%"},"width":1542,"height":130,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/23-1.png","element":"img"}],[{"text":"Meanwhile, we have ","element":"span"},{"style":{"height":17.6},"width":194.75,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/23-2.png","element":"img","alt":" E[uk|Fk−1","inline":true},{"text":"] = 0. Conditioned on ","element":"span"},{"style":{"height":20.33},"width":435.57,"height":50.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/23-3.png","element":"img","alt":" Fk−1, for mod(k, S(g)) ̸","inline":true},{"text":"= 0, from Lemma ","element":"span"},{"href":"#id-101","text":"C.3","element":"a"},{"text":", we have that with probability at least 1 ","element":"span"},{"style":{"height":12.8},"width":63.64,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/23-4.png","element":"img","alt":" − δ","inline":true,"padRight":true},{"text":"the following inequality holds :","element":"span"}],[{"id":"id-102","style":{"width":"79%"},"width":1481,"height":139,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/23-5.png","element":"img"}],[{"text":"where the second inequality holds due to (","element":"span"},{"href":"#id-64","text":"4.1","element":"a"},{"text":"). 
For mod(","element":"span"},{"style":{"height":19.13},"width":116.19,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/23-6.png","element":"img","alt":"k, S(g)","inline":true},{"text":") = 0, with probability at least 1 ","element":"span"},{"style":{"height":15.6},"width":74.74,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/23-7.png","element":"img","alt":" − δ,","inline":true,"padRight":true},{"text":"we have","element":"span"}],[{"id":"id-103","style":{"width":"70%"},"width":1325,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/23-8.png","element":"img"}],[{"text":"where the second inequality holds due to (","element":"span"},{"href":"#id-64","text":"4.3","element":"a"},{"text":"). Conditioned on ","element":"span"},{"style":{"height":20.52},"width":224.12,"height":51.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/23-9.png","element":"img","alt":" F⌊t/S(g)⌋·S(g)","inline":true},{"text":", by union bound, with probability at least 1 ","element":"span"},{"style":{"height":20.34},"width":445.52,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/23-10.png","element":"img","alt":" − δ · (t − ⌊t/S(g)⌋ · S(g)","inline":true},{"text":") (","element":"span"},{"href":"#id-102","text":"C.5","element":"a"},{"text":") or (","element":"span"},{"href":"#id-103","text":"C.6","element":"a"},{"text":") holds for all ","element":"span"},{"style":{"height":20.34},"width":426.88,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/23-11.png","element":"img","alt":" ⌊t/S(g)⌋ · S(g) ≤ k ≤ t","inline":true},{"text":". 
Then for given ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k","element":"span"},{"text":", by vector Azuma-Hoeffding inequality in Lemma ","element":"span"},{"href":"#id-104","text":"F.1","element":"a"},{"text":", conditioned on","element":"span"},{"style":{"height":15.24},"width":49.36,"height":38.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/23-12.png","element":"img","alt":"Fk","inline":true},{"text":", with probability at least 1 ","element":"span"},{"style":{"height":12.8},"width":230.14,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/23-13.png","element":"img","alt":" − δ we have","inline":true}],[{"id":"id-105","style":{"width":"91%"},"width":1724,"height":460,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/23-14.png","element":"img"}],[{"text":"Finally, by union bound, we have that with probability at least 1 ","element":"span"},{"style":{"height":20.33},"width":617.86,"height":50.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/23-15.png","element":"img","alt":" − 2δ · (t − ⌊t/S(g)⌋ · S(g)), for all","inline":true},{"style":{"height":20.33},"width":414.08,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/23-16.png","element":"img","alt":"⌊t/S(g)⌋ · S(g) ≤ k ≤ t","inline":true},{"text":", we have (","element":"span"},{"href":"#id-105","text":"C.7","element":"a"},{"text":") holds.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"C.4 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Proof of Lemma ","element":"span"},{"href":"#id-79","style":{"fontWeight":"bold"},"text":"A.4","element":"a"}],[{"text":"We need the following lemma:","element":"span"}],[{"id":"id-106","style":{"fontWeight":"bold"},"text":"Lemma C.4. 
","element":"span"},{"text":"Conditioned on ","element":"span"},{"style":{"height":15.24},"width":49.37,"height":38.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/23-17.png","element":"img","alt":" Fk","inline":true},{"text":", with probability at least 1","element":"span"},{"style":{"height":12.8},"width":55.34,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/23-18.png","element":"img","alt":"−δ","inline":true,"padRight":true},{"text":", we have the following concentration inequality","element":"span"}],[{"style":{"width":"95%"},"width":1788,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/23-19.png","element":"img"}],[{"text":"We also have","element":"span"}],[{"style":{"width":"71%"},"width":1334,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/24-0.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Proof of Lemma ","element":"span"},{"href":"#id-79","style":{"fontStyle":"italic"},"text":"A.4","element":"a"},{"style":{"fontStyle":"italic"},"text":". ","element":"span"},{"text":"First, we have ","element":"span"},{"style":{"height":24.93},"width":824.08,"height":62.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/24-1.png","element":"img","alt":" Ut − ∇2F(xt) = �tk=⌊t/S(h)⌋·S(h) Vk, where","inline":true}],[{"style":{"width":"85%"},"width":1598,"height":130,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/24-2.png","element":"img"}],[{"text":"Meanwhile, we have ","element":"span"},{"style":{"height":17.6},"width":387.5,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/24-3.png","element":"img","alt":" E[Vk|σ(Vk−1, ..., V0","inline":true},{"text":")] = 0. 
Conditioned on ","element":"span"},{"style":{"height":15.24},"width":93.2,"height":38.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/24-4.png","element":"img","alt":" Fk−1","inline":true},{"text":", for mod(","element":"span"},{"style":{"height":20.33},"width":150,"height":50.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/24-5.png","element":"img","alt":"k, S(h)) ̸","inline":true},{"text":"= 0, from Lemma ","element":"span"},{"href":"#id-106","text":"C.4","element":"a"},{"text":", we have that with probability at least 1 ","element":"span"},{"style":{"height":12.8},"width":63.64,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/24-6.png","element":"img","alt":" − δ","inline":true},{"text":", the following inequality holds :","element":"span"}],[{"id":"id-107","style":{"width":"79%"},"width":1484,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/24-7.png","element":"img"}],[{"text":"where the second inequality holds due to (","element":"span"},{"href":"#id-64","text":"4.1","element":"a"},{"text":"). For mod(","element":"span"},{"style":{"height":19.14},"width":118.45,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/24-8.png","element":"img","alt":"k, S(h)","inline":true},{"text":") = 0, with probability at least 1 ","element":"span"},{"style":{"height":15.6},"width":74.47,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/24-9.png","element":"img","alt":" − δ,","inline":true,"padRight":true},{"text":"we have","element":"span"}],[{"style":{"width":"70%"},"width":1323,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/24-10.png","element":"img"}],[{"text":"where the second inequality holds due to (","element":"span"},{"href":"#id-64","text":"4.3","element":"a"},{"text":"). 
Conditioned on ","element":"span"},{"style":{"height":20.52},"width":228.06,"height":51.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/24-11.png","element":"img","alt":" F⌊t/S(h)⌋·S(h)","inline":true},{"text":", by union bound, with probability at least 1 ","element":"span"},{"href":"#id-107","style":{"height":20.34},"width":786.09,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/24-12.png","element":"img","alt":" − δ · (t − ⌊t/S(h)⌋ · S(h)) (C.10) or (C.11)","inline":true,"padRight":true},{"text":"holds for all ","element":"span"},{"style":{"height":20.34},"width":430.37,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/24-13.png","element":"img","alt":" ⌊t/S(h)⌋ · S(h) ≤ k ≤ t.","inline":true,"padRight":true},{"text":"Then for given ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k","element":"span"},{"text":", by Matrix Azuma inequality Lemma ","element":"span"},{"href":"#id-108","text":"F.2","element":"a"},{"text":", conditioned on","element":"span"},{"style":{"height":15.24},"width":49.36,"height":38.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/24-14.png","element":"img","alt":"Fk","inline":true},{"text":", with probability at least 1 ","element":"span"},{"style":{"height":12.8},"width":230.14,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/24-15.png","element":"img","alt":" − δ we have","inline":true}],[{"id":"id-109","style":{"width":"92%"},"width":1743,"height":426,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/24-16.png","element":"img"}],[{"text":"Finally, by union bound, we have that with probability at least 1 ","element":"span"},{"style":{"height":20.33},"width":622.01,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/24-17.png","element":"img","alt":" − 2δ · (t − ⌊t/S(h)⌋ · S(h)), for 
all","inline":true},{"style":{"height":20.33},"width":418.6,"height":50.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/24-18.png","element":"img","alt":"⌊t/S(h)⌋ · S(h) ≤ k ≤ t","inline":true},{"text":", we have (","element":"span"},{"href":"#id-109","text":"C.12","element":"a"},{"text":") holds.","element":"span"}],[{"style":{"width":"1%"},"width":30,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/24-19.png","element":"img"}]]},{"heading":"D Proofs of Technical Lemmas in Appendix B","paragraphs":[[{"style":{"fontWeight":"bold"},"text":"D.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Proof of Lemma ","element":"span"},{"href":"#id-87","style":{"fontWeight":"bold"},"text":"B.3","element":"a"}],[{"text":"We have the following lemma which guarantees the effectiveness of Cubic-Subsolver in Algorithm ","element":"span"},{"href":"#id-66","text":"3","element":"a"},{"text":". ","element":"span"},{"id":"id-111","style":{"fontWeight":"bold"},"text":"Lemma D.1. 
","element":"span"},{"text":"(","element":"span"},{"href":"#id-20","referenceIndex":8,"text":"Carmon and Duchi","element":"a"},{"text":", ","element":"span"},{"href":"#id-20","referenceIndex":8,"text":"2016","element":"a"},{"text":") Let ","element":"span"},{"style":{"height":15.93},"width":184.57,"height":39.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-0.png","element":"img","alt":" A ∈ Rd×d","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":19.53},"width":686.71,"height":48.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-1.png","element":"img","alt":" ∥A∥2 ≤ β, b ∈ Rd, τ > 0, ζ > 0, ϵ′ ∈","inline":true,"padRight":true},{"text":"(0","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"1)","element":"span"},{"style":{"height":15.6},"width":93.88,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-2.png","element":"img","alt":", δ′ ∈","inline":true,"padRight":true},{"text":"(0","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"1) and ","element":"span"},{"style":{"height":17.6},"width":193.34,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-3.png","element":"img","alt":" η < 1/(8β","inline":true,"padRight":true},{"text":"+ 2","element":"span"},{"style":{"height":16.4},"width":45,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-4.png","element":"img","alt":"τζ","inline":true},{"text":"). 
We denote that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"g","element":"span"},{"text":"(","element":"span"},{"style":{"fontWeight":"bold"},"text":"h","element":"span"},{"text":") = ","element":"span"},{"style":{"height":17.6},"width":283.05,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-5.png","element":"img","alt":" b⊤h + h⊤Ah/","inline":true},{"text":"2 + ","element":"span"},{"style":{"height":19.41},"width":189.03,"height":48.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-6.png","element":"img","alt":" τ/6 · ∥h∥32","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":18.26},"width":372.7,"height":45.65,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-7.png","element":"img","alt":"s = argminh∈Rd g(h","inline":true},{"text":"). Then with probability at least 1 ","element":"span"},{"style":{"height":15.6},"width":130.66,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-8.png","element":"img","alt":" − δ′, if","inline":true}],[{"id":"id-110","style":{"width":"72%"},"width":1356,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-9.png","element":"img"}],[{"text":"then ","element":"span"},{"style":{"fontWeight":"bold"},"text":"x ","element":"span"},{"text":"= Cubic-Subsolver(","element":"span"},{"style":{"height":16.4},"width":315.25,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-10.png","element":"img","alt":"A, b, τ, η, ζ, ϵ′, δ′","inline":true},{"text":") satisfies that ","element":"span"},{"style":{"height":19.13},"width":455.62,"height":47.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-11.png","element":"img","alt":" g(x) ≤ −(1 − ϵ′)τζ3/12.","inline":true}],[{"style":{"fontStyle":"italic"},"text":"Proof of Lemma 
","element":"span"},{"href":"#id-87","style":{"fontStyle":"italic"},"text":"B.3","element":"a"},{"style":{"fontStyle":"italic"},"text":". ","element":"span"},{"text":"We simply set ","element":"span"},{"style":{"height":16.4},"width":539.1,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-12.png","element":"img","alt":" A = Ut, b = vt, τ = Mt, η","inline":true,"padRight":true},{"text":"= (16","element":"span"},{"style":{"height":20.8},"width":375.14,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-13.png","element":"img","alt":"L)−1, ζ =�ϵ/ρ, ϵ′","inline":true,"padRight":true},{"text":"= 0","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"},{"text":"5 and ","element":"span"},{"style":{"height":17.6},"width":518.2,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-14.png","element":"img","alt":" δ′ = δ. We have ∥Ut∥2 ≤ L","inline":true},{"text":", then we set ","element":"span"},{"style":{"height":16.4},"width":114.49,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-15.png","element":"img","alt":" β = L","inline":true},{"text":". With the choice of ","element":"span"},{"style":{"height":16},"width":510.28,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-16.png","element":"img","alt":" Mt where Mt = 4ρ and the","inline":true,"padRight":true},{"text":"assumption that ","element":"span"},{"style":{"height":19.13},"width":252.04,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-17.png","element":"img","alt":" ϵ < 4L2ρ/M 2t ","inline":true,"padRight":true},{"text":", we can check that ","element":"span"},{"style":{"height":17.6},"width":309.13,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-18.png","element":"img","alt":" η < 1/(8β + 2τζ","inline":true},{"text":"). 
We also have that ","element":"span"},{"style":{"height":16.72},"width":208.14,"height":41.79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-19.png","element":"img","alt":" s = h∗t and","inline":true,"padRight":true},{"text":"(","element":"span"},{"href":"#id-110","text":"D.1","element":"a"},{"text":") holds. Thus, by Lemma ","element":"span"},{"href":"#id-111","text":"D.1","element":"a"},{"text":", we have","element":"span"}],[{"style":{"width":"47%"},"width":884,"height":53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-20.png","element":"img"}],[{"text":"By the choice of ","element":"span"},{"style":{"height":12},"width":47.56,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-21.png","element":"img","alt":" T ′ ","inline":true,"padRight":true},{"text":"in Cubic-Subsolver, we have","element":"span"}],[{"style":{"width":"84%"},"width":1590,"height":193,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-22.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"D.2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Proof of Lemma ","element":"span"},{"href":"#id-93","style":{"fontWeight":"bold"},"text":"B.5","element":"a"}],[{"text":"We have the following lemma which provides the guarantee for the function value in Cubic-Finalsolver.","element":"span"}],[{"id":"id-112","style":{"fontWeight":"bold"},"text":"Lemma D.2. 
","element":"span"},{"text":"(","element":"span"},{"href":"#id-20","referenceIndex":8,"text":"Carmon and Duchi","element":"a"},{"text":", ","element":"span"},{"href":"#id-20","referenceIndex":8,"text":"2016","element":"a"},{"text":") We denote that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"g","element":"span"},{"text":"(","element":"span"},{"style":{"fontWeight":"bold"},"text":"h","element":"span"},{"text":") = ","element":"span"},{"style":{"height":17.6},"width":284.78,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-23.png","element":"img","alt":" b⊤h + h⊤Ah/","inline":true},{"text":"2 + ","element":"span"},{"style":{"height":19.41},"width":190.75,"height":48.53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-24.png","element":"img","alt":" τ/6 · ∥h∥32","inline":true},{"text":", ","element":"span"},{"style":{"height":19.8},"width":1081.69,"height":49.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-25.png","element":"img","alt":"s = argminh∈Rd g(h), then g(s) ≥ ∥b∥2∥s∥2/2 − τ∥s∥32/6.","inline":true}],[{"style":{"fontStyle":"italic"},"text":"Proof of Lemma ","element":"span"},{"href":"#id-93","style":{"fontStyle":"italic"},"text":"B.5","element":"a"},{"style":{"fontStyle":"italic"},"text":". ","element":"span"},{"text":"In Cubic-Finalsolver we are focusing on minimizing ","element":"span"},{"style":{"height":17.6},"width":170.6,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-26.png","element":"img","alt":" mT ∗−1(h","inline":true},{"text":"). 
We have that ","element":"span"},{"style":{"height":21.01},"width":815.18,"height":52.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-27.png","element":"img","alt":" ∥vt∥2 < max{Mtϵ/(2ρ),�LMt/2(ϵ/ρ)3/4}","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":20.9},"width":352.5,"height":52.24,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-28.png","element":"img","alt":" ∥h∗T ∗−1∥2 ≤�ϵ/ρ","inline":true,"padRight":true},{"text":"by Lemma ","element":"span"},{"href":"#id-87","text":"B.3","element":"a"},{"text":". We can check that ","element":"span"},{"style":{"height":12},"width":22,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-29.png","element":"img","alt":" η","inline":true,"padRight":true},{"text":"= (16","element":"span"},{"style":{"height":19.14},"width":90.1,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-30.png","element":"img","alt":"L)−1","inline":true,"padRight":true},{"text":"satisfies that ","element":"span"},{"style":{"height":13.6},"width":69.39,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-31.png","element":"img","alt":" η <","inline":true,"padRight":true},{"text":"(4(","element":"span"},{"style":{"height":19.14},"width":218.14,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-32.png","element":"img","alt":"L + τR))−1","inline":true},{"text":", where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"R ","element":"span"},{"text":"is defined in Lemma ","element":"span"},{"href":"#id-91","text":"B.4","element":"a"},{"text":", when ","element":"span"},{"style":{"height":19.14},"width":251.94,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-33.png","element":"img","alt":"ϵ < 4L2ρ/M 2t ","inline":true,"padRight":true},{"text":". 
From Lemma ","element":"span"},{"href":"#id-91","text":"B.4 ","element":"a"},{"text":"we also know that ","element":"span"},{"style":{"height":17.2},"width":452.02,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-34.png","element":"img","alt":" mT ∗−1 is (L+2MT ∗−1R","inline":true},{"text":")-smooth, which satisfies ","element":"span"},{"text":"that 1","element":"span"},{"style":{"height":17.6},"width":409.6,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-35.png","element":"img","alt":"/η > 2(L + 2MT ∗−1R","inline":true},{"text":"). Thus, by standard gradient descent analysis, to get a point ∆ where ","element":"span"},{"style":{"height":17.6},"width":370.69,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-36.png","element":"img","alt":"∥∇mT ∗−1(∆)∥2 ≤ ϵ","inline":true},{"text":", Cubic-Finalsolver needs to run","element":"span"}],[{"style":{"width":"92%"},"width":1731,"height":108,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-37.png","element":"img"}],[{"text":"iterations, where we denote by ∆","element":"span"},{"style":{"height":8.4},"width":17,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-38.png","element":"img","alt":"0","inline":true,"padRight":true},{"text":"the starting point of Cubic-Finalsolver. By directly computing, we have ","element":"span"},{"style":{"height":17.6},"width":260.63,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-39.png","element":"img","alt":" mT ∗−1(∆0) ≤","inline":true,"padRight":true},{"text":"0. 
By Lemma ","element":"span"},{"href":"#id-112","text":"D.2","element":"a"},{"text":", we have","element":"span"}],[{"id":"id-113","style":{"width":"58%"},"width":1090,"height":271,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/25-40.png","element":"img"}],[{"text":"Thus, (","element":"span"},{"href":"#id-113","text":"D.2","element":"a"},{"text":") can be further bounded as ","element":"span"},{"style":{"height":19.12},"width":318.7,"height":47.81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/26-0.png","element":"img","alt":" T ′′ = O(L/√ρϵ).","inline":true}]]},{"heading":"E Proofs of Additional Lemmas in Appendix C","paragraphs":[[{"style":{"fontWeight":"bold"},"text":"E.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Proof of Lemma ","element":"span"},{"href":"#id-101","style":{"fontWeight":"bold"},"text":"C.3","element":"a"}],[{"style":{"fontStyle":"italic"},"text":"Proof of Lemma ","element":"span"},{"href":"#id-101","style":{"fontStyle":"italic"},"text":"C.3","element":"a"},{"style":{"fontStyle":"italic"},"text":". ","element":"span"},{"text":"We only need to consider the case where ","element":"span"},{"style":{"height":24.44},"width":300.89,"height":61.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/26-1.png","element":"img","alt":" B(g)k = |Jk| < n","inline":true},{"text":". 
For each ","element":"span"},{"style":{"height":14.84},"width":117.27,"height":37.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/26-2.png","element":"img","alt":" i ∈ Jk","inline":true},{"text":", let","element":"span"}],[{"style":{"width":"51%"},"width":958,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/26-3.png","element":"img"}],[{"text":"then we have ","element":"span"},{"style":{"height":15.2},"width":223.59,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/26-4.png","element":"img","alt":" Eiai = 0, ai","inline":true,"padRight":true},{"text":"i.i.d., and","element":"span"}],[{"style":{"width":"80%"},"width":1504,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/26-5.png","element":"img"}],[{"text":"where the second inequality holds due to the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L","element":"span"},{"text":"-smoothness of ","element":"span"},{"style":{"height":16.4},"width":33.36,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/26-6.png","element":"img","alt":" fi","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F","element":"span"},{"text":". 
Thus by vector Azuma-Hoeffding inequality in Lemma ","element":"span"},{"href":"#id-104","text":"F.1","element":"a"},{"text":", we have that with probability at least 1 ","element":"span"},{"style":{"height":15.6},"width":76.68,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/26-7.png","element":"img","alt":" − δ,","inline":true}],[{"style":{"width":"63%"},"width":1192,"height":347,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/26-8.png","element":"img"}],[{"text":"For each ","element":"span"},{"style":{"height":15.6},"width":193.59,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/26-9.png","element":"img","alt":" i ∈ Jk, let","inline":true}],[{"style":{"width":"24%"},"width":464,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/26-10.png","element":"img"}],[{"text":"then we have ","element":"span"},{"style":{"height":17.6},"width":469.46,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/26-11.png","element":"img","alt":" Eibi = 0 and ∥bi∥2 ≤ M","inline":true},{"text":". 
Thus by vector Azuma-Hoeffding inequality in Lemma ","element":"span"},{"href":"#id-104","text":"F.1","element":"a"},{"text":", we have that with probability at least 1 ","element":"span"},{"style":{"height":15.6},"width":76.68,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/26-12.png","element":"img","alt":" − δ,","inline":true}],[{"style":{"width":"89%"},"width":1680,"height":216,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/26-13.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"E.2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Proof of Lemma ","element":"span"},{"href":"#id-106","style":{"fontWeight":"bold"},"text":"C.4","element":"a"}],[{"style":{"fontStyle":"italic"},"text":"Proof of Lemma ","element":"span"},{"href":"#id-106","style":{"fontStyle":"italic"},"text":"C.4","element":"a"},{"style":{"fontStyle":"italic"},"text":". ","element":"span"},{"text":"We only need to consider the case where ","element":"span"},{"style":{"height":24.44},"width":681.48,"height":61.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/26-14.png","element":"img","alt":" B(h)k = |Ik| < n. For each i ∈ Ik, let","inline":true}],[{"style":{"width":"55%"},"width":1047,"height":51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/26-15.png","element":"img"}],[{"text":"then we have ","element":"span"},{"style":{"height":17.29},"width":446,"height":43.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/26-16.png","element":"img","alt":" EiAi = 0, A⊤i = Ai, Ai","inline":true,"padRight":true},{"text":"i.i.d. 
and","element":"span"}],[{"style":{"width":"85%"},"width":1596,"height":57,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/26-17.png","element":"img"}],[{"text":"where the second inequality holds due to ","element":"span"},{"style":{"height":12},"width":23,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/27-0.png","element":"img","alt":" ρ","inline":true},{"text":"-Hessian Lipschitz continuous of ","element":"span"},{"style":{"height":16.4},"width":33.36,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/27-1.png","element":"img","alt":" fi","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F","element":"span"},{"text":". Then by Matrix Azuma inequality Lemma ","element":"span"},{"href":"#id-108","text":"F.2","element":"a"},{"text":", we have that with probability at least 1 ","element":"span"},{"style":{"height":15.6},"width":76.68,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/27-2.png","element":"img","alt":" − δ,","inline":true}],[{"style":{"width":"67%"},"width":1266,"height":348,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/27-3.png","element":"img"}],[{"text":"For each ","element":"span"},{"style":{"height":15.6},"width":187.77,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/27-4.png","element":"img","alt":" i ∈ Ik, let","inline":true}],[{"style":{"width":"27%"},"width":510,"height":51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/27-5.png","element":"img"}],[{"text":"then we have ","element":"span"},{"style":{"height":14.62},"width":90.31,"height":36.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/27-6.png","element":"img","alt":" EiBi","inline":true,"padRight":true},{"text":"= 0, 
","element":"span"},{"style":{"height":16.89},"width":51.7,"height":42.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/27-7.png","element":"img","alt":" B⊤i","inline":true,"padRight":true},{"text":"= ","element":"span"},{"style":{"height":14.62},"width":47.69,"height":36.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/27-8.png","element":"img","alt":" Bi","inline":true},{"text":", and ","element":"span"},{"style":{"height":17.6},"width":221.96,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/27-9.png","element":"img","alt":" ∥Bi∥2 ≤ 2L","inline":true},{"text":". Then by Matrix Azuma inequality in Lemma ","element":"span"},{"href":"#id-108","text":"F.2","element":"a"},{"text":", we have that with probability at least 1 ","element":"span"},{"style":{"height":15.6},"width":76.68,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/27-10.png","element":"img","alt":" − δ,","inline":true}],[{"style":{"width":"82%"},"width":1550,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/27-11.png","element":"img"}],[{"text":"which completes the proof.","element":"span"}]]},{"heading":"F Auxiliary Lemmas","paragraphs":[[{"text":"We have the following vector Azuma-Hoeffding inequality:","element":"span"}],[{"id":"id-104","style":{"fontWeight":"bold"},"text":"Lemma F.1. 
","element":"span"},{"text":"(","element":"span"},{"href":"#id-114","referenceIndex":39,"text":"Pinelis","element":"a"},{"text":", ","element":"span"},{"href":"#id-114","referenceIndex":39,"text":"1994","element":"a"},{"text":") Let ","element":"span"},{"style":{"height":17.6},"width":90.78,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/27-12.png","element":"img","alt":" {vk}","inline":true,"padRight":true},{"text":"be a vector-valued martingale difference, where ","element":"span"},{"style":{"height":17.6},"width":781.5,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/27-13.png","element":"img","alt":"E[vk|σ(v1, ..., vk−1)] = 0 and ∥vk∥2 ≤ Ak","inline":true},{"text":", then we have that with probability at least 1 ","element":"span"},{"style":{"height":15.6},"width":76.68,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/27-14.png","element":"img","alt":" − δ,","inline":true}],[{"style":{"width":"32%"},"width":604,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/27-15.png","element":"img"}],[{"text":"We have the following Matrix Azuma inequality:","element":"span"}],[{"id":"id-108","style":{"fontWeight":"bold"},"text":"Lemma F.2. 
","element":"span"},{"text":"(","element":"span"},{"href":"#id-115","referenceIndex":46,"text":"Tropp","element":"a"},{"text":", ","element":"span"},{"href":"#id-115","referenceIndex":46,"text":"2012","element":"a"},{"text":") Consider a finite adapted sequence ","element":"span"},{"style":{"height":17.6},"width":102.24,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/27-16.png","element":"img","alt":" {Xk}","inline":true,"padRight":true},{"text":"of self-adjoint matrices in dimension ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d","element":"span"},{"text":", and a fixed sequence ","element":"span"},{"style":{"height":17.6},"width":102.24,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/27-17.png","element":"img","alt":" {Ak}","inline":true,"padRight":true},{"text":"of self-adjoint matrices that satisfy","element":"span"}],[{"style":{"width":"77%"},"width":1459,"height":334,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/27-18.png","element":"img"}]]},{"heading":"G Additional Algorithms and Functions","paragraphs":[[{"text":"Due to space limit, we include the approximate solvers (","element":"span"},{"href":"#id-20","referenceIndex":8,"text":"Carmon and Duchi","element":"a"},{"text":", ","element":"span"},{"href":"#id-20","referenceIndex":8,"text":"2016","element":"a"},{"text":") for the cubic subproblem in this section for the purpose of self-containedness.","element":"span"}],[{"id":"id-66","style":{"width":"99%"},"width":1873,"height":1477,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/28-0.png","element":"img"}],[{"style":{"width":"99%"},"width":1872,"height":641,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/29-0.png","element":"img"}]]},{"heading":"References","paragraphs":[[{"id":"id-19","text":"Agarwal, N.","element":"span"},{"text":", ","element":"span"},{"text":"Allen-Zhu, Z.","element":"span"},{"text":", 
","element":"span"},{"text":"Bullins, B.","element":"span"},{"text":", ","element":"span"},{"text":"Hazan, E. ","element":"span"},{"text":"and ","element":"span"},{"text":"Ma, T. ","element":"span"},{"text":"(2017). Finding approximate local minima faster than gradient descent. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 49th Annual ACM SIGACT Symposium on Theory of Computing","element":"span"},{"text":".","element":"span"}],[{"id":"id-41","text":"Allen-Zhu, Z. ","element":"span"},{"text":"(2017). Natasha 2: Faster non-convex optimization than sgd. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1708.08694 ","element":"span"},{"text":".","element":"span"}],[{"id":"id-70","text":"Allen-Zhu, Z. ","element":"span"},{"text":"(2018). Natasha 2: Faster non-convex optimization than sgd. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":".","element":"span"}],[{"id":"id-15","text":"Allen-Zhu, Z. ","element":"span"},{"text":"and ","element":"span"},{"text":"Hazan, E. ","element":"span"},{"text":"(2016). Variance reduction for faster non-convex optimization. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Machine Learning","element":"span"},{"text":".","element":"span"}],[{"id":"id-43","text":"Allen-Zhu, Z. ","element":"span"},{"text":"and ","element":"span"},{"text":"Li, Y. ","element":"span"},{"text":"(2018). Neon2: Finding local minima via first-order oracles. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":".","element":"span"}],[{"id":"id-5","text":"Bhojanapalli, S.","element":"span"},{"text":", ","element":"span"},{"text":"Neyshabur, B. ","element":"span"},{"text":"and ","element":"span"},{"text":"Srebro, N. ","element":"span"},{"text":"(2016). 
Global optimality of local search for low rank matrix recovery. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":".","element":"span"}],[{"id":"id-34","text":"Blanchet, J.","element":"span"},{"text":", ","element":"span"},{"text":"Cartis, C.","element":"span"},{"text":", ","element":"span"},{"text":"Menickelly, M. ","element":"span"},{"text":"and ","element":"span"},{"text":"Scheinberg, K. ","element":"span"},{"text":"(2016). Convergence rate analysis of a stochastic trust region method for nonconvex optimization. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1609.07428 ","element":"span"},{"text":".","element":"span"}],[{"id":"id-20","text":"Carmon, Y. ","element":"span"},{"text":"and ","element":"span"},{"text":"Duchi, J. C. ","element":"span"},{"text":"(2016). Gradient descent efficiently finds the cubic-regularized non-convex newton step. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1612.00547 ","element":"span"},{"text":".","element":"span"}],[{"id":"id-21","text":"Carmon, Y. ","element":"span"},{"text":"and ","element":"span"},{"text":"Duchi, J. C. ","element":"span"},{"text":"(2018). Analysis of krylov subspace solutions of regularized non-convex quadratic problems. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":".","element":"span"}],[{"id":"id-39","text":"Carmon, Y.","element":"span"},{"text":", ","element":"span"},{"text":"Duchi, J. C.","element":"span"},{"text":", ","element":"span"},{"text":"Hinder, O. ","element":"span"},{"text":"and ","element":"span"},{"text":"Sidford, A. ","element":"span"},{"text":"(2018). Accelerated methods for nonconvex optimization. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"SIAM Journal on Optimization ","element":"span"},{"style":{"fontWeight":"bold"},"text":"28 ","element":"span"},{"text":"1751–1772.","element":"span"}],[{"id":"id-31","text":"Cartis, C.","element":"span"},{"text":", ","element":"span"},{"text":"Gould, N. I. ","element":"span"},{"text":"and ","element":"span"},{"text":"Toint, P. L. ","element":"span"},{"text":"(2009). Trust-region and other regularisations of linear least-squares problems. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"BIT Numerical Mathematics ","element":"span"},{"style":{"fontWeight":"bold"},"text":"49 ","element":"span"},{"text":"21–53.","element":"span"}],[{"id":"id-28","text":"Cartis, C.","element":"span"},{"text":", ","element":"span"},{"text":"Gould, N. I. ","element":"span"},{"text":"and ","element":"span"},{"text":"Toint, P. L. ","element":"span"},{"text":"(2011a). Adaptive cubic regularisation methods for unconstrained optimization. part i: motivation, convergence and numerical results. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Mathematical Programming ","element":"span"},{"style":{"fontWeight":"bold"},"text":"127 ","element":"span"},{"text":"245–295.","element":"span"}],[{"id":"id-32","text":"Cartis, C.","element":"span"},{"text":", ","element":"span"},{"text":"Gould, N. I. ","element":"span"},{"text":"and ","element":"span"},{"text":"Toint, P. L. ","element":"span"},{"text":"(2012). Complexity bounds for second-order optimality in unconstrained optimization. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Complexity ","element":"span"},{"style":{"fontWeight":"bold"},"text":"28 ","element":"span"},{"text":"93–108.","element":"span"}],[{"id":"id-33","text":"Cartis, C.","element":"span"},{"text":", ","element":"span"},{"text":"Gould, N. I. ","element":"span"},{"text":"and ","element":"span"},{"text":"Toint, P. L. ","element":"span"},{"text":"(2013). 
On the evaluation complexity of cubic regularization methods for potentially rank-deficient nonlinear least-squares problems and its relevance to constrained nonlinear optimization. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"SIAM Journal on Optimization ","element":"span"},{"style":{"fontWeight":"bold"},"text":"23 ","element":"span"},{"text":"1553–1574.","element":"span"}],[{"id":"id-29","text":"Cartis, C.","element":"span"},{"text":", ","element":"span"},{"text":"Gould, N. I. M. ","element":"span"},{"text":"and ","element":"span"},{"text":"Toint, P. L. ","element":"span"},{"text":"(2011b). ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Adaptive cubic regularisation methods for unconstrained optimization. Part II: worst-case function- and derivative-evaluation complexity","element":"span"},{"text":". Springer-Verlag New York, Inc.","element":"span"}],[{"id":"id-72","text":"Chang, C.-C. ","element":"span"},{"text":"and ","element":"span"},{"text":"Lin, C.-J. ","element":"span"},{"text":"(2011). Libsvm: a library for support vector machines. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"ACM transactions on intelligent systems and technology (TIST) ","element":"span"},{"style":{"fontWeight":"bold"},"text":"2 ","element":"span"},{"text":"27.","element":"span"}],[{"id":"id-30","text":"Conn, A. R.","element":"span"},{"text":", ","element":"span"},{"text":"Gould, N. I. ","element":"span"},{"text":"and ","element":"span"},{"text":"Toint, P. L. ","element":"span"},{"text":"(2000). ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Trust region methods","element":"span"},{"text":". SIAM.","element":"span"}],[{"id":"id-35","text":"Curtis, F. E.","element":"span"},{"text":", ","element":"span"},{"text":"Robinson, D. P. ","element":"span"},{"text":"and ","element":"span"},{"text":"Samadi, M. ","element":"span"},{"text":"(2017). 
A trust region algorithm with a worst-case iteration complexity of ","element":"span"},{"style":{"height":20.33},"width":133.39,"height":50.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/30-0.png","element":"img","alt":" o(ϵ−3/2","inline":true},{"text":") for nonconvex optimization. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Mathematical Programming ","element":"span"},{"style":{"fontWeight":"bold"},"text":"162 ","element":"span"},{"text":"1–32.","element":"span"}],[{"id":"id-52","text":"Defazio, A.","element":"span"},{"text":", ","element":"span"},{"text":"Bach, F. ","element":"span"},{"text":"and ","element":"span"},{"text":"Lacoste-Julien, S. ","element":"span"},{"text":"(2014). Saga: A fast incremental gradient method with support for non-strongly convex composite objectives. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":".","element":"span"}],[{"id":"id-23","text":"Fang, C.","element":"span"},{"text":", ","element":"span"},{"text":"Li, C. J.","element":"span"},{"text":", ","element":"span"},{"text":"Lin, Z. ","element":"span"},{"text":"and ","element":"span"},{"text":"Zhang, T. ","element":"span"},{"text":"(2018). Spider: Near-optimal non-convex optimization via stochastic path integrated differential estimator. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1807.01695 ","element":"span"},{"text":".","element":"span"}],[{"id":"id-48","text":"Fang, C.","element":"span"},{"text":", ","element":"span"},{"text":"Lin, Z. ","element":"span"},{"text":"and ","element":"span"},{"text":"Zhang, T. ","element":"span"},{"text":"(2019). Sharp analysis for nonconvex sgd escaping from saddle points. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1902.00247 ","element":"span"},{"text":".","element":"span"}],[{"id":"id-53","text":"Garber, D. 
","element":"span"},{"text":"and ","element":"span"},{"text":"Hazan, E. ","element":"span"},{"text":"(2015). Fast and simple pca via convex optimization. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1509.05647 ","element":"span"},{"text":".","element":"span"}],[{"id":"id-37","text":"Ge, R.","element":"span"},{"text":", ","element":"span"},{"text":"Huang, F.","element":"span"},{"text":", ","element":"span"},{"text":"Jin, C. ","element":"span"},{"text":"and ","element":"span"},{"text":"Yuan, Y. ","element":"span"},{"text":"(2015). Escaping from saddle points—online stochastic gradient for tensor decomposition. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Conference on Learning Theory","element":"span"},{"text":".","element":"span"}],[{"id":"id-4","text":"Ge, R.","element":"span"},{"text":", ","element":"span"},{"text":"Lee, J. D. ","element":"span"},{"text":"and ","element":"span"},{"text":"Ma, T. ","element":"span"},{"text":"(2016). Matrix completion has no spurious local minimum. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":".","element":"span"}],[{"text":"Golub, G. H. ","element":"span"},{"text":"and ","element":"span"},{"text":"Van Loan, C. F. ","element":"span"},{"text":"(1996). ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Matrix Computations (3rd Ed.)","element":"span"},{"text":". Johns Hopkins University Press, Baltimore, MD, USA.","element":"span"}],[{"id":"id-8","text":"Hardt, M. ","element":"span"},{"text":"and ","element":"span"},{"text":"Ma, T. ","element":"span"},{"text":"(2016). Identity matters in deep learning. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1611.04231 ","element":"span"},{"text":".","element":"span"}],[{"id":"id-3","text":"Hillar, C. J. ","element":"span"},{"text":"and ","element":"span"},{"text":"Lim, L.-H. 
","element":"span"},{"text":"(2013). Most tensor problems are np-hard. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of the ACM (JACM) ","element":"span"},{"style":{"fontWeight":"bold"},"text":"60 ","element":"span"},{"text":"45.","element":"span"}],[{"id":"id-38","text":"Jin, C.","element":"span"},{"text":", ","element":"span"},{"text":"Ge, R.","element":"span"},{"text":", ","element":"span"},{"text":"Netrapalli, P.","element":"span"},{"text":", ","element":"span"},{"text":"Kakade, S. M. ","element":"span"},{"text":"and ","element":"span"},{"text":"Jordan, M. I. ","element":"span"},{"text":"(2017a). How to escape saddle points efficiently. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1703.00887 ","element":"span"},{"text":".","element":"span"}],[{"id":"id-49","text":"Jin, C.","element":"span"},{"text":", ","element":"span"},{"text":"Netrapalli, P.","element":"span"},{"text":", ","element":"span"},{"text":"Ge, R.","element":"span"},{"text":", ","element":"span"},{"text":"Kakade, S. M. ","element":"span"},{"text":"and ","element":"span"},{"text":"Jordan, M. I. ","element":"span"},{"text":"(2019). Stochastic gradient descent escapes saddle points efficiently. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1902.04811 ","element":"span"},{"text":".","element":"span"}],[{"id":"id-44","text":"Jin, C.","element":"span"},{"text":", ","element":"span"},{"text":"Netrapalli, P. ","element":"span"},{"text":"and ","element":"span"},{"text":"Jordan, M. I. ","element":"span"},{"text":"(2017b). Accelerated gradient descent escapes saddle points faster than gradient descent. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1711.10456 ","element":"span"},{"text":".","element":"span"}],[{"id":"id-13","text":"Johnson, R. ","element":"span"},{"text":"and ","element":"span"},{"text":"Zhang, T. ","element":"span"},{"text":"(2013). 
Accelerating stochastic gradient descent using predictive variance reduction. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in neural information processing systems","element":"span"},{"text":".","element":"span"}],[{"id":"id-7","text":"Kawaguchi, K. ","element":"span"},{"text":"(2016). ","element":"span"},{"text":"Deep learning without poor local minima. ","element":"span"},{"text":"In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":".","element":"span"}],[{"id":"id-10","text":"Kohler, J. M. ","element":"span"},{"text":"and ","element":"span"},{"text":"Lucchi, A. ","element":"span"},{"text":"(2017). Sub-sampled cubic regularization for non-convex optimization. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1705.05933 ","element":"span"},{"text":".","element":"span"}],[{"id":"id-1","text":"LeCun, Y.","element":"span"},{"text":", ","element":"span"},{"text":"Bengio, Y. ","element":"span"},{"text":"and ","element":"span"},{"text":"Hinton, G. ","element":"span"},{"text":"(2015). Deep learning. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Nature ","element":"span"},{"style":{"fontWeight":"bold"},"text":"521 ","element":"span"},{"text":"436–444.","element":"span"}],[{"id":"id-36","style":{"height":17.45},"width":350.84,"height":43.64,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/31-0.png","element":"img","alt":"Mart´ınez, J. M.","inline":true,"padRight":true},{"text":"and ","element":"span"},{"text":"Raydan, M. ","element":"span"},{"text":"(2017). Cubic-regularization counterpart of a variable-norm trust-region method for unconstrained minimization. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Global Optimization ","element":"span"},{"style":{"fontWeight":"bold"},"text":"68 ","element":"span"},{"text":"367–385.","element":"span"}],[{"id":"id-9","text":"Nesterov, Y. ","element":"span"},{"text":"and ","element":"span"},{"text":"Polyak, B. T. ","element":"span"},{"text":"(2006). Cubic regularization of newton method and its global performance. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Mathematical Programming ","element":"span"},{"style":{"fontWeight":"bold"},"text":"108 ","element":"span"},{"text":"177–205.","element":"span"}],[{"id":"id-24","text":"Nguyen, L. M.","element":"span"},{"text":", ","element":"span"},{"text":"Liu, J.","element":"span"},{"text":", ","element":"span"},{"text":"Scheinberg, K. ","element":"span"},{"text":"and ","element":"span"},{"style":{"height":17.45},"width":229.18,"height":43.64,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1901.11518/images/31-1.png","element":"img","alt":" Tak´aˇc, M.","inline":true,"padRight":true},{"text":"(2017). Sarah: A novel method for machine learning problems using stochastic recursive gradient. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Machine Learning","element":"span"},{"text":".","element":"span"}],[{"id":"id-57","text":"Nguyen, L. M.","element":"span"},{"text":", ","element":"span"},{"text":"van Dijk, M.","element":"span"},{"text":", ","element":"span"},{"text":"Phan, D. T.","element":"span"},{"text":", ","element":"span"},{"text":"Nguyen, P. H.","element":"span"},{"text":", ","element":"span"},{"text":"Weng, T.-W. ","element":"span"},{"text":"and ","element":"span"},{"text":"Kalagnanam, J. R. ","element":"span"},{"text":"(2019). Optimal finite-sum smooth non-convex optimization with sarah. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1901.07648 ","element":"span"},{"text":".","element":"span"}],[{"id":"id-114","text":"Pinelis, I. ","element":"span"},{"text":"(1994). Optimum bounds for the distributions of martingales in banach spaces. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"The Annals of Probability ","element":"span"},{"text":"1679–1706.","element":"span"}],[{"id":"id-14","text":"Reddi, S. J.","element":"span"},{"text":", ","element":"span"},{"text":"Hefny, A.","element":"span"},{"text":", ","element":"span"},{"text":"Sra, S.","element":"span"},{"text":", ","element":"span"},{"text":"Poczos, B. ","element":"span"},{"text":"and ","element":"span"},{"text":"Smola, A. ","element":"span"},{"text":"(2016). Stochastic variance reduction for nonconvex optimization 314–323.","element":"span"}],[{"id":"id-50","text":"Roux, N. L.","element":"span"},{"text":", ","element":"span"},{"text":"Schmidt, M. ","element":"span"},{"text":"and ","element":"span"},{"text":"Bach, F. R. ","element":"span"},{"text":"(2012). A stochastic gradient method with an exponential convergence rate for finite training sets. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":".","element":"span"}],[{"id":"id-40","text":"Royer, C. W. ","element":"span"},{"text":"and ","element":"span"},{"text":"Wright, S. J. ","element":"span"},{"text":"(2017). ","element":"span"},{"text":"Complexity analysis of second-order line-search algorithms for smooth nonconvex optimization. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1706.03131 ","element":"span"},{"text":".","element":"span"}],[{"id":"id-54","text":"Shalev-Shwartz, S. ","element":"span"},{"text":"(2016). Sdca without duality, regularization, and individual convexity. 
In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Machine Learning","element":"span"},{"text":".","element":"span"}],[{"id":"id-25","text":"Shen, Z.","element":"span"},{"text":", ","element":"span"},{"text":"Zhou, P.","element":"span"},{"text":", ","element":"span"},{"text":"Fang, C. ","element":"span"},{"text":"and ","element":"span"},{"text":"Ribeiro, A. ","element":"span"},{"text":"(2019). A stochastic trust region method for non-convex minimization. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1903.01540 ","element":"span"},{"text":".","element":"span"}],[{"id":"id-0","text":"Tripuraneni, N.","element":"span"},{"text":", ","element":"span"},{"text":"Stern, M.","element":"span"},{"text":", ","element":"span"},{"text":"Jin, C.","element":"span"},{"text":", ","element":"span"},{"text":"Regier, J. ","element":"span"},{"text":"and ","element":"span"},{"text":"Jordan, M. I. ","element":"span"},{"text":"(2018). Stochastic cubic regularization for fast nonconvex optimization. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":".","element":"span"}],[{"id":"id-115","text":"Tropp, J. A. ","element":"span"},{"text":"(2012). User-friendly tail bounds for sums of random matrices. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Foundations of computational mathematics ","element":"span"},{"style":{"fontWeight":"bold"},"text":"12 ","element":"span"},{"text":"389–434.","element":"span"}],[{"id":"id-56","text":"Wang, Z.","element":"span"},{"text":", ","element":"span"},{"text":"Ji, K.","element":"span"},{"text":", ","element":"span"},{"text":"Zhou, Y.","element":"span"},{"text":", ","element":"span"},{"text":"Liang, Y. ","element":"span"},{"text":"and ","element":"span"},{"text":"Tarokh, V. ","element":"span"},{"text":"(2018a). 
Spiderboost: A class of faster variance-reduced algorithms for nonconvex optimization. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1810.10690 ","element":"span"},{"text":".","element":"span"}],[{"id":"id-17","text":"Wang, Z.","element":"span"},{"text":", ","element":"span"},{"text":"Zhou, Y.","element":"span"},{"text":", ","element":"span"},{"text":"Liang, Y. ","element":"span"},{"text":"and ","element":"span"},{"text":"Lan, G. ","element":"span"},{"text":"(2018b). Sample complexity of stochastic variance-reduced cubic regularization for nonconvex optimization. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1802.07372 ","element":"span"},{"text":".","element":"span"}],[{"id":"id-51","text":"Xiao, L. ","element":"span"},{"text":"and ","element":"span"},{"text":"Zhang, T. ","element":"span"},{"text":"(2014). A proximal stochastic gradient method with progressive variance reduction. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"SIAM Journal on Optimization ","element":"span"},{"style":{"fontWeight":"bold"},"text":"24 ","element":"span"},{"text":"2057–2075.","element":"span"}],[{"id":"id-11","text":"Xu, P.","element":"span"},{"text":", ","element":"span"},{"text":"Roosta-Khorasani, F. ","element":"span"},{"text":"and ","element":"span"},{"text":"Mahoney, M. W. ","element":"span"},{"text":"(2017). Newton-type methods for non-convex optimization under inexact hessian information. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1708.07164 ","element":"span"},{"text":".","element":"span"}],[{"id":"id-42","text":"Xu, Y.","element":"span"},{"text":", ","element":"span"},{"text":"Rong, J. ","element":"span"},{"text":"and ","element":"span"},{"text":"Yang, T. ","element":"span"},{"text":"(2018). First-order stochastic algorithms for escaping from saddle points in almost linear time. 
In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":".","element":"span"}],[{"id":"id-46","text":"Yu, Y.","element":"span"},{"text":", ","element":"span"},{"text":"Xu, P. ","element":"span"},{"text":"and ","element":"span"},{"text":"Gu, Q. ","element":"span"},{"text":"(2018). Third-order smoothness helps: faster stochastic optimization algorithms for finding local minima. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":".","element":"span"}],[{"id":"id-45","text":"Yu, Y.","element":"span"},{"text":", ","element":"span"},{"text":"Zou, D. ","element":"span"},{"text":"and ","element":"span"},{"text":"Gu, Q. ","element":"span"},{"text":"(2017). Saving gradient and negative curvature computations: Finding local minima more efficiently. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1712.03950 ","element":"span"},{"text":".","element":"span"}],[{"id":"id-18","text":"Zhang, J.","element":"span"},{"text":", ","element":"span"},{"text":"Xiao, L. ","element":"span"},{"text":"and ","element":"span"},{"text":"Zhang, S. ","element":"span"},{"text":"(2018a). Adaptive stochastic variance reduction for subsampled newton method with cubic regularization. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1811.11637 ","element":"span"},{"text":".","element":"span"}],[{"id":"id-6","text":"Zhang, X.","element":"span"},{"text":", ","element":"span"},{"text":"Wang, L.","element":"span"},{"text":", ","element":"span"},{"text":"Yu, Y. ","element":"span"},{"text":"and ","element":"span"},{"text":"Gu, Q. ","element":"span"},{"text":"(2018b). A primal-dual analysis of global optimality in nonconvex low-rank matrix recovery. 
In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International conference on machine learning","element":"span"},{"text":".","element":"span"}],[{"id":"id-47","text":"Zhou, D.","element":"span"},{"text":", ","element":"span"},{"text":"Xu, P. ","element":"span"},{"text":"and ","element":"span"},{"text":"Gu, Q. ","element":"span"},{"text":"(2018a). Finding local minima via stochastic nested variance reduction. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1806.08782 ","element":"span"},{"text":".","element":"span"}],[{"id":"id-16","text":"Zhou, D.","element":"span"},{"text":", ","element":"span"},{"text":"Xu, P. ","element":"span"},{"text":"and ","element":"span"},{"text":"Gu, Q. ","element":"span"},{"text":"(2018b). ","element":"span"},{"text":"Sample efficient stochastic variance-reduced cubic regularization method. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1811.11989 ","element":"span"},{"text":".","element":"span"}],[{"id":"id-55","text":"Zhou, D.","element":"span"},{"text":", ","element":"span"},{"text":"Xu, P. ","element":"span"},{"text":"and ","element":"span"},{"text":"Gu, Q. ","element":"span"},{"text":"(2018c). Stochastic nested variance reduced gradient descent for nonconvex optimization. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":".","element":"span"}],[{"id":"id-12","text":"Zhou, D.","element":"span"},{"text":", ","element":"span"},{"text":"Xu, P. ","element":"span"},{"text":"and ","element":"span"},{"text":"Gu, Q. ","element":"span"},{"text":"(2018d). Stochastic variance-reduced cubic regularized Newton methods. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 35th International Conference on Machine Learning","element":"span"},{"text":". 
PMLR, 5990–5999.","element":"span"}],[{"text":"Zhou, D.","element":"span"},{"text":", ","element":"span"},{"text":"Xu, P. ","element":"span"},{"text":"and ","element":"span"},{"text":"Gu, Q. ","element":"span"},{"text":"(2019). Stochastic variance-reduced cubic regularization methods. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Machine Learning Research ","element":"span"},{"style":{"fontWeight":"bold"},"text":"20 ","element":"span"},{"text":"1–47.","element":"span"}]]}],"_version":"3.3.2"},"paperNode":"$1b:props:children:props:children:0:props:product"}]]]}]}]