1b:["$","$L29",null,{"isWhiteLabelled":false,"children":["$","$Lb",null,{"pt":{"compact":0,"expanded":3},"children":[["$","$L2a",null,{"noStar":true,"publisher":true,"task":true,"params":true,"size":"xl","product":{"id":"eyJwYXBlcklEIjoiMTgwNi4wMDQ2OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","updated":"2019-01-11T02:51:38.000Z","paperID":"1806.00468","published":"2018-06-01T17:58:58.000Z","authors":"[\"Suriya Gunasekar\",\"Jason Lee\",\"Daniel Soudry\",\"Nathan Srebro\"]","title":"Implicit Bias of Gradient Descent on Linear Convolutional Networks","scoreTrending":null,"summary":"We show that gradient descent on full-width linear convolutional networks of\ndepth $L$ converges to a linear predictor related to the $\\ell_{2/L}$ bridge\npenalty in the frequency domain. This is in contrast to linearly fully\nconnected networks, where gradient descent converges to the hard margin linear\nsupport vector machine solution, regardless of depth.","lastCheckedForCode":"2022-09-03T03:44:27.148Z","links":[{"id":"eyJ1cmwiOiJodHRwczovL3BhcGVyc3dpdGhjb2RlLmNvbS9wYXBlci9pbXBsaWNpdC1iaWFzLW9mLWdyYWRpZW50LWRlc2NlbnQtb24tbGluZWFyIn0=","type":"pwc","url":"https://paperswithcode.com/paper/implicit-bias-of-gradient-descent-on-linear","data":null}],"reposConnection":{"edges":[]},"models":[],"tags":[],"summaries":[],"emailsConnection":{"edges":[{"author":"daniel soudry","node":{"id":"eyJhZGRyZXNzIjoiZGFuaWVsLnNvdWRyeUBnbWFpbC5jb20ifQ==","address":"daniel.soudry@gmail.com","name":"Daniel Soudry","avatar":"https://img.fullcontact.com/static/1b58560871fe0e7896b61b89d544afcd_c304da3010a86a8506b475dd9cdf9e33fe1ef8306a0769043044c9f08e3b4169","linkedin":"https://www.linkedin.com/in/daniel-soudry-2aa3a88","bio":null,"site":null,"override":null,"membership":[{"name":"Columbia University in the City of New 
York"}],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[{"avatar":"https://avatars.githubusercontent.com/u/4681380?v=4","username":"danielso"},{"avatar":"https://avatars.githubusercontent.com/u/162600181?v=4","username":"danielso"}],"scholar":[{"thirdPartyID":"AEBWEm8AAAAJ"}],"twitter":[{"avatar":"https://pbs.twimg.com/profile_images/1339292379375931393/TEzYE9ai_400x400.jpg","username":"daniel_soudry"}],"location":[{"formatted":"Haifa, Israel"}],"owner":[{"id":"eyJ1aWQiOiIzMDc3NGJkYS1lYzcwLTRlYWYtYjAzMi0wZWE1MTRjNDU0MzcifQ==","name":"daniel 
soudry","github":[],"email":[{"avatar":"https://img.fullcontact.com/static/1b58560871fe0e7896b61b89d544afcd_c304da3010a86a8506b475dd9cdf9e33fe1ef8306a0769043044c9f08e3b4169"}],"authored":[{"id":"eyJwYXBlcklEIjoiMTYwOS4wNzA2MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1609.07061"},{"id":"eyJwYXBlcklEIjoiMTYwNS4wODM2MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1605.08361"},{"id":"eyJwYXBlcklEIjoiMTcwNS4wODc0MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1705.08741"},{"id":"eyJwYXBlcklEIjoiMTUwMy4wMzU2MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1503.03562"},{"id":"eyJwYXBlcklEIjoiMTgwNi4wMDQ2OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1806.00468"},{"id":"eyJwYXBlcklEIjoiMTgwMy4xMDEyMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1803.10123"},{"id":"eyJwYXBlcklEIjoiMTkwMi4wNTA0MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1902.05040"},{"id":"eyJwYXBlcklEIjoiMTgwMy4wMTgxNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1803.01814"},{"id":"eyJwYXBlcklEIjoiMTgxMC4wNTcyMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1810.05723"},{"id":"eyJwYXBlcklEIjoiMjAwNi4xMDUxOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.10518"},{"id":"eyJwYXBlcklEIjoiMTkxMC4wMTYzNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1910.01635"},{"id":"eyJwYXBlcklEIjoiMTgwMS4wNDU0MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1801.04540"},{"id":"eyJwYXBlcklEIjoiMTgwNS4xMTA0NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1805.11046"},{"id":"eyJwYXBlcklEIjoiMTkxMi4wMTI3NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1912.01274"},{"id":"eyJwYXBlcklEIjoiMTMxMC4xODY3IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1310.1867"},{"id":"eyJwYXBlcklEIjoiMjQwOS4xMjUxNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2409.12517"},{"id"
:"eyJwYXBlcklEIjoiMTYwNi4wNTMxNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1606.05316"},{"id":"eyJwYXBlcklEIjoiMTgwNS4wNDkzOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1805.04938"},{"id":"eyJwYXBlcklEIjoiMjEwMi4xMjk2NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.12967"},{"id":"eyJwYXBlcklEIjoiMjExMi4xMDc2OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2112.10769"},{"id":"eyJwYXBlcklEIjoiMTkwOC4wODk4NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1908.08986"},{"id":"eyJwYXBlcklEIjoiMjAwNy4wNjczOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2007.06738"},{"id":"eyJwYXBlcklEIjoiMTkwOS4xMjM0MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1909.12340"},{"id":"eyJwYXBlcklEIjoiMjEwMi4wODEyNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.08124"},{"id":"eyJwYXBlcklEIjoiMTkwNi4wMDc3MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1906.00771"},{"id":"eyJwYXBlcklEIjoiMjIwMy4xMDk5MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2203.10991"},{"id":"eyJwYXBlcklEIjoiMjAwNi4wODE3MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.08173"},{"id":"eyJwYXBlcklEIjoiMTgwMi4wNTE4NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1802.05187"},{"id":"eyJwYXBlcklEIjoiMjQwNi4wNjgzOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2406.06838"},{"id":"eyJwYXBlcklEIjoiMjQwMS4xNDExMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2401.14110"},{"id":"eyJwYXBlcklEIjoiMjMwNi4xMDU5OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2306.10598"},{"id":"eyJwYXBlcklEIjoiMTkxMi4wNTEzNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1912.05137"},{"id":"eyJwYXBlcklEIjoiMjEwNi4wNzIxOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.07218"},{"id":"eyJwYXBlcklEIjoiMjMwMy4wODA4NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"ar
xiv","paperID":"2303.08085"},{"id":"eyJwYXBlcklEIjoiMjEyODEiLCJwdWJsaXNoZXIiOiJjdnByIn0=","publisher":"cvpr","paperID":"21281"},{"id":"eyJwYXBlcklEIjoiNzA4MzYiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"70836"},{"id":"eyJwYXBlcklEIjoiNzIwMjQiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"72024"},{"id":"eyJwYXBlcklEIjoiMjMxMS4wNjc0OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2311.06748"}]}]}},{"author":"jason lee","node":{"id":"eyJhZGRyZXNzIjoiamFzb25sZWVAbWFyc2hhbGwudXNjLmVkdSJ9","address":"jasonlee@marshall.usc.edu","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[{"avatar":"https://avatars.githubusercontent.com/u/9759378?v=4","username":"icemansina"},{"avatar":"https://avatars.githubusercontent.com/u/6448285?v=4","username":"jasondlee88"}],"scholar":[{"thirdPartyID":"GR_DsT0AAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiIzNmM1YzU3Yy03MGVmLTQ5MDYtYWRmOS01ZjUzZTQyMmY1ZGMifQ==","name":"jason d 
lee","github":[],"email":[{"avatar":"https://img.fullcontact.com/static/84430ed03784afcf8cfb02da7ed772cf_61c8da342e847f32f9f0fb1b110d6fd57e9b47ff87ddd55ce610045561188cb5"}],"authored":[{"id":"eyJwYXBlcklEIjoiMTYwNS4wNzI3MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1605.07272"},{"id":"eyJwYXBlcklEIjoiMTIwNi4xNjIzIiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1206.1623"},{"id":"eyJwYXBlcklEIjoiMTkwOC4wMDI2MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1908.00261"},{"id":"eyJwYXBlcklEIjoiMTcwNS4xMDQxMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1705.10412"},{"id":"eyJwYXBlcklEIjoiMTgxMC4wNTM2OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1810.05369"},{"id":"eyJwYXBlcklEIjoiMTgwNi4wMDkwMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1806.00900"},{"id":"eyJwYXBlcklEIjoiMTcwOS4wNjEyOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1709.06129"},{"id":"eyJwYXBlcklEIjoiMTgwNi4wMDQ2OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1806.00468"},{"id":"eyJwYXBlcklEIjoiMjAwMi4wOTQzNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.09434"},{"id":"eyJwYXBlcklEIjoiMTcxMC4wNzQwNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1710.07406"},{"id":"eyJwYXBlcklEIjoiMTMwNS43NDc3IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1305.7477"},{"id":"eyJwYXBlcklEIjoiMTUxMS4wNzk0OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1511.07948"},{"id":"eyJwYXBlcklEIjoiMTUwNy4wNzU5NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1507.07595"},{"id":"eyJwYXBlcklEIjoiMTkwMi4wODI5NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1902.08297"},{"id":"eyJwYXBlcklEIjoiMTgwMi4wODI0OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1802.08249"},{"id":"eyJwYXBlcklEIjoiMTUxMC4wMzUyOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1510.03528"},{"id":"ey
JwYXBlcklEIjoiMjAwMi4wOTI3NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.09277"},{"id":"eyJwYXBlcklEIjoiMjAwOC4wMTA2NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2008.01064"},{"id":"eyJwYXBlcklEIjoiMjMwNy4wMzM4MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2307.03381"},{"id":"eyJwYXBlcklEIjoiMjMxMS4wODI1MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2311.08252"},{"id":"eyJwYXBlcklEIjoiMTQwMi42OTY0IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1402.6964"},{"id":"eyJwYXBlcklEIjoiMjEwMy4xMDg5NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2103.10897"},{"id":"eyJwYXBlcklEIjoiMTkxMC4wMTYxOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1910.01619"},{"id":"eyJwYXBlcklEIjoiMjAwMi4wNzEyNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.07125"},{"id":"eyJwYXBlcklEIjoiMTcwNC4wNzk3MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1704.07971"},{"id":"eyJwYXBlcklEIjoiMTgwOS4wODUzMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1809.08530"},{"id":"eyJwYXBlcklEIjoiMjMwNS4xNzMzMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2305.17333"},{"id":"eyJwYXBlcklEIjoiMTkwOC4xMDk2MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1908.10962"},{"id":"eyJwYXBlcklEIjoiMjAwNi4wODY4MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.08680"},{"id":"eyJwYXBlcklEIjoiMjMwNS4xNDgxNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2305.14816"},{"id":"eyJwYXBlcklEIjoiMjIwOS4xNTU5NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2209.15594"},{"id":"eyJwYXBlcklEIjoiMjEwNy4xNDcwMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2107.14702"},{"id":"eyJwYXBlcklEIjoiMjQwNC4xNjc2NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2404.16767"},{"id":"eyJwYXBlcklEIjoiMjEwMi4wODkwMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv",
"paperID":"2102.08903"},{"id":"eyJwYXBlcklEIjoiMjEwNi4wNjUzMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.06530"},{"id":"eyJwYXBlcklEIjoiMjIwNy4wNTczOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2207.05738"},{"id":"eyJwYXBlcklEIjoiMTgwNS4wODY3MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1805.08671"},{"id":"eyJwYXBlcklEIjoiMjAxMC4wNjUzMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.06531"},{"id":"eyJwYXBlcklEIjoiMjQwNC4wODQ5NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2404.08495"},{"id":"eyJwYXBlcklEIjoiMjEwMi4xMTIwMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.11203"},{"id":"eyJwYXBlcklEIjoiMTkxMC4wNzAzMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1910.07030"},{"id":"eyJwYXBlcklEIjoiMjIwNi4xMjAyMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2206.12020"},{"id":"eyJwYXBlcklEIjoiMjMwNS4xODUwNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2305.18505"},{"id":"eyJwYXBlcklEIjoiMjAxMC4wNTg0MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.05843"},{"id":"eyJwYXBlcklEIjoiMjAwNi4wOTQ4NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.09486"},{"id":"eyJwYXBlcklEIjoiMjQwMy4wMzE4MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2403.03183"},{"id":"eyJwYXBlcklEIjoiMjQwNy4xMzM5OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2407.13399"},{"id":"eyJwYXBlcklEIjoiMjAwNy4wNjczOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2007.06738"},{"id":"eyJwYXBlcklEIjoiMjQwNi4wMTU4MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2406.01581"},{"id":"eyJwYXBlcklEIjoiMjAwMy4xMDM5MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2003.10392"},{"id":"eyJwYXBlcklEIjoiMjAwNC4wMjMzNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2004.02336"},{"id":"eyJwYXBlcklEIjoiMjQwNi4wODQ2NiIsInB1Ymxpc2hlciI6
ImFyeGl2In0=","publisher":"arxiv","paperID":"2406.08466"},{"id":"eyJwYXBlcklEIjoiMjMwNy4xMzU4NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2307.13586"},{"id":"eyJwYXBlcklEIjoiMjAwNi4xMzQzNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.13436"},{"id":"eyJwYXBlcklEIjoiMjMxMS4xODgxNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2311.18817"},{"id":"eyJwYXBlcklEIjoiMjMwMS4xMTUwMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2301.11500"},{"id":"eyJwYXBlcklEIjoiMjMwNS4xMDYzMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2305.10633"},{"id":"eyJwYXBlcklEIjoiMjIwMy4xNTY2NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2203.15664"},{"id":"eyJwYXBlcklEIjoiMjAwOS4wOTgyOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2009.09829"},{"id":"eyJwYXBlcklEIjoiMjExMC4wNzgwNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.07807"},{"id":"eyJwYXBlcklEIjoiMjMxMi4wNzkzMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2312.07930"},{"id":"eyJwYXBlcklEIjoiMjIwNi4xMjA4MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2206.12081"},{"id":"eyJwYXBlcklEIjoiMjEwNi4xMjEwOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.12108"},{"id":"eyJwYXBlcklEIjoiMjIwNy4wNDAzNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2207.04036"},{"id":"eyJwYXBlcklEIjoiMTcwOC4wODU1MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1708.08552"},{"id":"eyJwYXBlcklEIjoiMjMwNS4xMTc4OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2305.11788"},{"id":"eyJwYXBlcklEIjoiMjAxMC4xMTM1NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.11356"},{"id":"eyJwYXBlcklEIjoiMTkxMS4wOTgxNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1911.09815"},{"id":"eyJwYXBlcklEIjoiMjEwNy4wMjM3NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2107.02377"},{"id":"eyJwYXBlcklEIj
oiMjIxMi4wMzcxNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2212.03714"},{"id":"eyJwYXBlcklEIjoiMjIwNi4wMTU4OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2206.01588"},{"id":"eyJwYXBlcklEIjoiMjEwNy4wNDUxOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2107.04518"},{"id":"eyJwYXBlcklEIjoiMjEwMi4wMDc1MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.00751"},{"id":"eyJwYXBlcklEIjoiMjMwMi4wMjM5MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2302.02392"},{"id":"eyJwYXBlcklEIjoiMjMwNS4wNDgxOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2305.04819"},{"id":"eyJwYXBlcklEIjoiMjQwMy4xMDczOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2403.10738"},{"id":"eyJwYXBlcklEIjoiMjMxMS4xMzc3NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2311.13774"},{"id":"eyJwYXBlcklEIjoiMjMxMS4xMTk2NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2311.11965"},{"id":"eyJwYXBlcklEIjoiMjExMC4wOTUwNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.09507"},{"id":"eyJwYXBlcklEIjoiMjMwMy4wMzA5NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2303.03095"},{"id":"eyJwYXBlcklEIjoiMjMwNS4xOTAzNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2305.19035"},{"id":"eyJwYXBlcklEIjoiMjIxMC4wNjcwNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2210.06705"},{"id":"eyJwYXBlcklEIjoiMTMxMS43MTg0IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1311.7184"},{"id":"eyJwYXBlcklEIjoiMTgxMS4xMjA2OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1811.12069"},{"id":"eyJwYXBlcklEIjoiMjEwNy4wNjQ2NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2107.06466"},{"id":"eyJwYXBlcklEIjoiMjMwMi4xMTYzNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2302.11634"},{"id":"eyJwYXBlcklEIjoiMjIwNi4xNTE0NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2
206.15144"},{"id":"eyJwYXBlcklEIjoiMjMwNi4xMjM4MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2306.12383"},{"id":"eyJwYXBlcklEIjoiNTQwMjYiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"54026"},{"id":"eyJwYXBlcklEIjoiNTM3MDgiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"53708"},{"id":"eyJwYXBlcklEIjoiNTM3MTYiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"53716"},{"id":"eyJwYXBlcklEIjoiNTI3OTAiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"52790"},{"id":"eyJwYXBlcklEIjoiNzIxMTgiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"72118"},{"id":"eyJwYXBlcklEIjoiNzE0MzciLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"71437"},{"id":"eyJwYXBlcklEIjoiNzI3MjYiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"72726"},{"id":"eyJwYXBlcklEIjoiNzA1NzYiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"70576"},{"id":"eyJwYXBlcklEIjoiNzA5MDgiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"70908"},{"id":"eyJwYXBlcklEIjoiMjQwNi4xOTYxNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2406.19617"}]}]}},{"author":"nathan srebro","node":{"id":"eyJhZGRyZXNzIjoibmF0aUB0dGljLmVkdSJ9","address":"nati@ttic.edu","name":"Nati 
Srebro","avatar":null,"linkedin":"https://www.linkedin.com/in/nati-srebro-82b48b1","bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0
}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[],"scholar":[{"thirdPartyID":"ZnT-QpMAAAAJ"}],"twitter":[],"location":[{"formatted":"Chicago, IL, USA"}],"owner":[{"id":"eyJ1aWQiOiIyZTcwMTU2Zi05ZTA1LTQ1MDUtODllYi01MmI4MDZkZmFlOWYifQ==","name":"nathan 
srebro","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMTQxMi42NjE0IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1412.6614"},{"id":"eyJwYXBlcklEIjoiMTMxMC41NzE1IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1310.5715"},{"id":"eyJwYXBlcklEIjoiMTcwNS4wODI5MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1705.08292"},{"id":"eyJwYXBlcklEIjoiMTYwNS4wNzIyMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1605.07221"},{"id":"eyJwYXBlcklEIjoiMTcwNy4wOTU2NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1707.09564"},{"id":"eyJwYXBlcklEIjoiMTcwNi4wODk0NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1706.08947"},{"id":"eyJwYXBlcklEIjoiMTYwNS4wODAwMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1605.08003"},{"id":"eyJwYXBlcklEIjoiMTgwNS4xMjA3NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1805.12076"},{"id":"eyJwYXBlcklEIjoiMTkxMi4wMjM2NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1912.02365"},{"id":"eyJwYXBlcklEIjoiMTgwNi4wMDQ2OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1806.00468"},{"id":"eyJwYXBlcklEIjoiMTcwMi4wNjA4MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1702.06081"},{"id":"eyJwYXBlcklEIjoiMTMwNy4xNjc0IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1307.1674"},{"id":"eyJwYXBlcklEIjoiMjAwNi4wNDczNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.04735"},{"id":"eyJwYXBlcklEIjoiMTkwMi4wNTA0MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1902.05040"},{"id":"eyJwYXBlcklEIjoiMjAwMi4wNzgzOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.07839"},{"id":"eyJwYXBlcklEIjoiMTUxMC4wMjA1NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1510.02054"},{"id":"eyJwYXBlcklEIjoiMjAwMi4wOTI3NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.09277"},{"id":"eyJwYXBlcklEIjoiMTUwNy4wODMyMiIs
InB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1507.08322"},{"id":"eyJwYXBlcklEIjoiMTkxMC4wMTYzNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1910.01635"},{"id":"eyJwYXBlcklEIjoiMTMxMS43NjYyIiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1311.7662"},{"id":"eyJwYXBlcklEIjoiMTkwNC4xMDEyMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1904.10120"},{"id":"eyJwYXBlcklEIjoiMTgwNS4xMDIyMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1805.10222"},{"id":"eyJwYXBlcklEIjoiMTkwMi4wNDY4NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1902.04686"},{"id":"eyJwYXBlcklEIjoiMTgxMC4xMTgyOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1810.11829"},{"id":"eyJwYXBlcklEIjoiMjEwMS4wMTEzNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2101.01134"},{"id":"eyJwYXBlcklEIjoiMjEwMi4wMTU4MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.01583"},{"id":"eyJwYXBlcklEIjoiMTUxMC4wMDYzMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1510.00633"},{"id":"eyJwYXBlcklEIjoiMTcwMi4wNjUzMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1702.06533"},{"id":"eyJwYXBlcklEIjoiMTgxMi4wMjk1MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1812.02952"},{"id":"eyJwYXBlcklEIjoiMjEwNC4wNjk3MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2104.06970"},{"id":"eyJwYXBlcklEIjoiMjAwMi4xMTY1MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.11651"},{"id":"eyJwYXBlcklEIjoiMjAwNS4wNzY1MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2005.07652"},{"id":"eyJwYXBlcklEIjoiMjExMC4wMjczMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.02732"},{"id":"eyJwYXBlcklEIjoiMTUxMS4wNjc0NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1511.06747"},{"id":"eyJwYXBlcklEIjoiMjIxMC4wNzA4MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2210.07082"},{"id":
"eyJwYXBlcklEIjoiMjEwMy4wMTIxMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2103.01210"},{"id":"eyJwYXBlcklEIjoiMTYwNC4wMTg3MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1604.01870"},{"id":"eyJwYXBlcklEIjoiMjAwNy4wNjczOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2007.06738"},{"id":"eyJwYXBlcklEIjoiMjEwNi4wOTI3NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.09276"},{"id":"eyJwYXBlcklEIjoiMTgwNi4xMDE4OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1806.10188"},{"id":"eyJwYXBlcklEIjoiMjAwNC4wMTAyNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2004.01025"},{"id":"eyJwYXBlcklEIjoiMjAxMC4xMjAzOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.12039"},{"id":"eyJwYXBlcklEIjoiMTcxMS4wNTMwNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1711.05305"},{"id":"eyJwYXBlcklEIjoiMjAwMy4wMzM5NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2003.03397"},{"id":"eyJwYXBlcklEIjoiMTcwMi4wODE2OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1702.08169"},{"id":"eyJwYXBlcklEIjoiMjExMi4wNDQ3MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2112.04470"},{"id":"eyJwYXBlcklEIjoiMjEwMi4wMjE0NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.02145"},{"id":"eyJwYXBlcklEIjoiMjAwNi4wNTk0MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.05942"},{"id":"eyJwYXBlcklEIjoiMjMwMy4wMTQ1NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2303.01456"},{"id":"eyJwYXBlcklEIjoiMjMwMy4wMTQ2MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2303.01462"},{"id":"eyJwYXBlcklEIjoiMjIwMi4wNjIzMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2202.06233"},{"id":"eyJwYXBlcklEIjoiMjIwNS4xMDY3MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2205.10671"},{"id":"eyJwYXBlcklEIjoiMjIxMC4xMjA4MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arx
iv","paperID":"2210.12082"},{"id":"eyJwYXBlcklEIjoiMjExMC4xMDYwMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.10602"},{"id":"eyJwYXBlcklEIjoiMTMwNi4yMzQ3IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1306.2347"},{"id":"eyJwYXBlcklEIjoiMjMwNy4xNTM5NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2307.15396"},{"id":"eyJwYXBlcklEIjoiMjQwNS4xMTY2NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2405.11667"},{"id":"eyJwYXBlcklEIjoiMTcwMi4wNjgxOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1702.06818"},{"id":"eyJwYXBlcklEIjoiMjMwMi4wNzQyNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2302.07426"},{"id":"eyJwYXBlcklEIjoiMjMwNi4xMzE4NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2306.13185"},{"id":"eyJwYXBlcklEIjoiMTkwNi4wOTIzMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1906.09231"},{"id":"eyJwYXBlcklEIjoiMjEwOC4wNDE5MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2108.04190"},{"id":"eyJwYXBlcklEIjoiMjExMi4xNDE5NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2112.14195"},{"id":"eyJwYXBlcklEIjoiMjIwOS4wNzM2OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2209.07369"},{"id":"eyJwYXBlcklEIjoiMjMwNi4xMzE4OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2306.13188"},{"id":"eyJwYXBlcklEIjoiMjIwMi4xMzMyOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2202.13328"},{"id":"eyJwYXBlcklEIjoiMjEwNi4wMjcyMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.02720"},{"id":"eyJwYXBlcklEIjoiMjMxMS4xNTQwNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2311.15404"},{"id":"eyJwYXBlcklEIjoiMjMwNS4xNjUwOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2305.16508"},{"id":"eyJwYXBlcklEIjoiMTQwNS4zMTY3IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1405.3167"},{"id":"eyJwYXBlcklEIjoiMTIxMi4zMjc2IiwicHVibGlzaGVyIj
oiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1212.3276"},{"id":"eyJwYXBlcklEIjoiMTcwMi4wNzgzNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1702.07834"},{"id":"eyJwYXBlcklEIjoiMjAwNy4wNTA3MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2007.05073"},{"id":"eyJwYXBlcklEIjoiMjExMC4wMjk1NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.02954"},{"id":"eyJwYXBlcklEIjoiNTQyOTgiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"54298"},{"id":"eyJwYXBlcklEIjoiNTQ4OTAiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"54890"},{"id":"eyJwYXBlcklEIjoiNTQ4OTEiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"54891"},{"id":"eyJwYXBlcklEIjoiNTM5MjUiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"53925"},{"id":"eyJwYXBlcklEIjoiNTM4MDQiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"53804"},{"id":"eyJwYXBlcklEIjoiNTQzOTciLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"54397"},{"id":"eyJwYXBlcklEIjoiNTM4NzAiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"53870"},{"id":"eyJwYXBlcklEIjoiNTQzMDIiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"54302"},{"id":"eyJwYXBlcklEIjoiNTQ4ODgiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"54888"},{"id":"eyJwYXBlcklEIjoiNzI3NzUiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"72775"},{"id":"eyJwYXBlcklEIjoiNzAzODkiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"70389"},{"id":"eyJwYXBlcklEIjoiNzE1NjgiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"71568"},{"id":"eyJwYXBlcklEIjoiNzAzNTgiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"70358"},{"id":"eyJwYXBlcklEIjoiMjMxMS4xNzU4NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2311.17586"}]}]}},{"author":"suriya 
gunasekar","node":{"id":"eyJhZGRyZXNzIjoic3VyaXlhQHR0aWMuZWR1In0=","address":"suriya@ttic.edu","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[{"avatar":"https://avatars.githubusercontent.com/u/8418631?v=4","username":"sgunasekar"}],"scholar":[{"thirdPartyID":"EkREu_QAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiIzMWNkZGVmOS00MDhlLTRmMDYtYmU3Yy1iZTc5YWUzNzlmZDYifQ==","name":"suriya gunasekar","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMTgwNi4wMDQ2OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1806.00468"},{"id":"eyJwYXBlcklEIjoiMTcwMi4wNjA4MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1702.06081"},{"id":"eyJwYXBlcklEIjoiMjAwMi4wOTI3NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.09277"},{"id":"eyJwYXBlcklEIjoiMjIwNi4wNDMwMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2206.04301"},{"id":"eyJwYXBlcklEIjoiMTYwOC4wMDcwNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1608.00704"},{"id":"eyJwYXBlcklEIjoiMTgxMC4xMTgyOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1810.11829"},{"id":"eyJwYXBlcklEIjoiMTYxMS4wNDIxOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1611.04218"},{"id":"eyJwYXBlcklEIjoiMjEwMi4xMjIzOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.12238"},{"id":"eyJwYXBlcklEIjoiMjAwNy4wNjczOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2007.06738"},{"id":"eyJwYXBlcklEIjoiMjIxMS4wOTM1OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2211.09359"},{"id":"eyJwYXBlcklEIjoiMjAwNC4wMTAyNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2004.01025"},{"id":"eyJwYXBlcklEIjoiMjIwMy4wMTU3MiIsInB1Ymxpc2
hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2203.01572"},{"id":"eyJwYXBlcklEIjoiMTYwMy4wODcwOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1603.08708"},{"id":"eyJwYXBlcklEIjoiMjIwNy4wMjM0OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2207.02349"}]}]}}]},"__typename":"paper","authorArray":["Suriya Gunasekar","Jason Lee","Daniel Soudry","Nathan Srebro"]}}],["$","$L18",null,{"container":true,"columns":100,"spacing":{"compact":0,"expanded":2,"large":3},"children":[["$","$L18",null,{"size":{"compact":100,"expanded":100,"large":68},"children":[["$","$7",null,{"children":["$","$L2b",null,{"publisher":"arxiv","paperID":"1806.00468","product":{"paper":"$1b:props:children:props:children:0:props:product","models":"$1b:props:children:props:children:0:props:product:models"},"isWhiteLabelled":false}]}],["$","$7",null,{"children":["$","$L2c",null,{"article":"$L2d","model":"$undefined"}]}]]}],["$","$L18",null,{"size":"grow","children":["$","$L2e",null,{}]}]]}],["$","$7",null,{"children":null}],[["$","audio",null,{"id":"tts"}],["$","$L2f",null,{"paperID":"1806.00468","publisher":"arxiv","paperJSON":{"title":"Implicit Bias of Gradient Descent on Linear Convolutional Networks","paperID":"1806.00468","avgLineHeight":10.91,"imgScale":4,"sections":[{"heading":"Abstract","paragraphs":[[{"text":"We show that gradient descent on full width linear convolutional networks of depth ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L ","element":"span"},{"text":"converges to a linear predictor related to the ","element":"span"},{"style":{"height":11.2},"width":70.81,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/0-0.png","element":"img","alt":" ℓ2/L","inline":true,"padRight":true},{"text":"bridge penalty in the frequency domain. 
This is in contrast to fully connected linear networks, where regardless of depth, gradient descent converges to the ","element":"span"},{"style":{"height":7.6},"width":32.6,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/0-1.png","element":"img","alt":" ℓ2","inline":true,"padRight":true},{"text":"maximum margin solution.","element":"span"}]]},{"heading":"1 Introduction","paragraphs":[[{"text":"Implicit biases introduced by optimization algorithms play an crucial role in learning deep neural networks ","element":"span"},{"href":"#id-0","referenceIndex":21,"text":"[Neyshabur et al., ","element":"a"},{"href":"#id-0","referenceIndex":21,"text":"2015b,","element":"a"},{"href":"#id-1","referenceIndex":20,"text":"a, ","element":"a"},{"href":"#id-2","referenceIndex":9,"text":"Hochreiter and Schmidhuber, ","element":"a"},{"href":"#id-2","referenceIndex":9,"text":"1997, ","element":"a"},{"href":"#id-3","referenceIndex":15,"text":"Keskar et al., ","element":"a"},{"href":"#id-3","referenceIndex":15,"text":"2016, ","element":"a"},{"href":"#id-4","referenceIndex":4,"text":"Chaudhari ","element":"a"},{"href":"#id-4","referenceIndex":4,"text":"et al., ","element":"a"},{"href":"#id-4","referenceIndex":4,"text":"2016, ","element":"a"},{"href":"#id-5","referenceIndex":5,"text":"Dinh et al., ","element":"a"},{"href":"#id-5","referenceIndex":5,"text":"2017, ","element":"a"},{"href":"#id-6","referenceIndex":1,"text":"Andrychowicz et al., ","element":"a"},{"href":"#id-6","referenceIndex":1,"text":"2016, ","element":"a"},{"href":"#id-7","referenceIndex":22,"text":"Neyshabur et al., ","element":"a"},{"href":"#id-7","referenceIndex":22,"text":"2017, ","element":"a"},{"href":"#id-8","referenceIndex":29,"text":"Zhang et al., ","element":"a"},{"href":"#id-8","referenceIndex":29,"text":"2017, ","element":"a"},{"href":"#id-9","referenceIndex":28,"text":"Wilson et al., ","element":"a"},{"href":"#id-9","referenceIndex":28,"text":"2017, 
","element":"a"},{"href":"#id-10","referenceIndex":10,"text":"Hoffer et al., ","element":"a"},{"href":"#id-10","referenceIndex":10,"text":"2017, ","element":"a"},{"href":"#id-11","referenceIndex":25,"text":"Smith, ","element":"a"},{"href":"#id-11","referenceIndex":25,"text":"2018]","element":"a"},{"text":". Large scale neural networks used in practice are highly over-parameterized with far more trainable model parameters compared to the number of training examples. Consequently, optimization objectives for learning such high capacity models have many global minima that fit training data perfectly. However, minimizing the training loss using specific optimization algorithms take us to not just any global minima, but some special global minima, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"e.g., ","element":"span"},{"text":"global minima minimizing some regularizer ","element":"span"},{"style":{"height":16},"width":92.92,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/0-2.png","element":"img","alt":" R(β)","inline":true},{"text":". In over-parameterized models, specially deep neural networks, much, if not most, of the inductive bias of the learned model comes from this implicit regularization from the optimization algorithm. 
Understanding the implicit bias, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"e.g., ","element":"span"},{"text":"via characterizing ","element":"span"},{"style":{"height":16},"width":92.91,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/0-3.png","element":"img","alt":" R(β)","inline":true},{"text":", is thus essential for understanding how and what the model learns.","element":"span"}],[{"text":"For example, in linear regression we understand how minimizing an under-determined model (with more parameters than samples) using gradient descent yields the minimum ","element":"span"},{"style":{"height":7.6},"width":32.6,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/0-4.png","element":"img","alt":" ℓ2","inline":true,"padRight":true},{"text":"norm solution, and for linear logistic regression trained on linearly separable data, ","element":"span"},{"href":"#id-12","referenceIndex":26,"text":"Soudry et al. ","element":"a"},{"href":"#id-12","referenceIndex":26,"text":"[2017] ","element":"a"},{"text":"recently showed that gradient descent converges in the direction of the hard margin support vector machine solution, even though the norm or margin is not explicitly specified in the optimization problem. Such minimum norm or maximum margin solutions are of course very special among all solutions or separators that fit the training data, and in particular can ensure generalization ","element":"span"},{"href":"#id-13","referenceIndex":2,"text":"Bartlett and Mendelson ","element":"a"},{"href":"#id-13","referenceIndex":2,"text":"[2003]","element":"a"},{"text":", ","element":"span"},{"href":"#id-14","referenceIndex":13,"text":"Kakade et al. 
","element":"a"},{"href":"#id-14","referenceIndex":13,"text":"[2009]","element":"a"},{"text":".","element":"span"}],[{"text":"Changing the optimization algorithm, even without changing the model, changes this implicit bias, and consequently also changes generalization properties of the learned models ","element":"span"},{"href":"#id-1","referenceIndex":20,"text":"[Neyshabur et al., ","element":"a"},{"href":"#id-1","referenceIndex":20,"text":"2015a, ","element":"a"},{"href":"#id-3","referenceIndex":15,"text":"Keskar et al., ","element":"a"},{"href":"#id-3","referenceIndex":15,"text":"2016, ","element":"a"},{"href":"#id-9","referenceIndex":28,"text":"Wilson et al., ","element":"a"},{"href":"#id-9","referenceIndex":28,"text":"2017, ","element":"a"},{"href":"#id-15","referenceIndex":7,"text":"Gunasekar et al., ","element":"a"},{"href":"#id-15","referenceIndex":7,"text":"2017, ","element":"a"},{"href":"#id-16","referenceIndex":8,"text":"2018]","element":"a"},{"text":". For example, for linear logistic regression, using coordinate descent instead of gradient descent return a maximum ","element":"span"},{"style":{"height":14},"width":154.96,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/0-5.png","element":"img","alt":" ℓ1 margin","inline":true,"padRight":true},{"text":"solution instead of the hard margin support vector solution solution—an entirely different inductive bias ","element":"span"},{"href":"#id-17","referenceIndex":27,"text":"Telgarsky ","element":"a"},{"href":"#id-17","referenceIndex":27,"text":"[2013]","element":"a"},{"text":", ","element":"span"},{"href":"#id-16","referenceIndex":8,"text":"Gunasekar et al. 
","element":"a"},{"href":"#id-16","referenceIndex":8,"text":"[2018]","element":"a"},{"text":".","element":"span"}],[{"text":"Similarly, and as we shall see in this paper, changing to a different parameterization of the same model class can also dramatically change the implicit bias ","element":"span"},{"href":"#id-15","referenceIndex":7,"text":"Gunasekar et al. ","element":"a"},{"href":"#id-15","referenceIndex":7,"text":"[2017]","element":"a"},{"text":". In particular, we study the implicit bias of optimizing multi-layer fully connected linear networks, and linear convolutional networks (multiple full width convolutional layers followed by a single fully connected layer) using gradient descent. Both of these types of models ultimately implement linear transformations, and can implement any linear transformation. The model class defined by these networks is thus simply the class of all linear predictors, and these models can be seen as mere (over) parameterizations of the class of linear predictors. Minimizing the training loss on these models is therefore entirely equivalent to minimizing the training loss for linear classification. Nevertheless, as we shall see, optimizing these networks with gradient descent leads to very different solutions.","element":"span"}],[{"text":"In particular, we show that for fully connected networks with single output, optimizing the exponential loss over linearly separable data using gradient loss again converges to the homogeneous hard margin support vector machine solution. This holds regardless of the depth of the network, and hence, at least with a single output, gradient descent on fully connected networks has the same implicit bias as direct gradient descent on the parameters of the linear predictor. In contrast, training a linear convolutional network with gradient descent biases us toward linear separators that are sparse in the frequency domain. 
Furthermore, this bias changes with the depth of the network, and a network of depth ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L ","element":"span"},{"text":"(with ","element":"span"},{"style":{"height":10.8},"width":95.86,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/1-0.png","element":"img","alt":" L − 1","inline":true,"padRight":true},{"text":"convolutional layers), implicitly biases towards minimizing the ","element":"span"},{"style":{"height":17.68},"width":110.94,"height":44.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/1-1.png","element":"img","alt":" ∥�β∥2/L","inline":true,"padRight":true},{"text":"bridge penalty with ","element":"span"},{"style":{"height":16},"width":124.17,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/1-2.png","element":"img","alt":"2/L ≤ 1","inline":true,"padRight":true},{"text":"of the Fourier transform ","element":"span"},{"style":{"height":14},"width":27,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/1-3.png","element":"img","alt":"�β","inline":true,"padRight":true},{"text":"of the learned linear predictor ","element":"span"},{"style":{"height":14},"width":27,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/1-4.png","element":"img","alt":" β","inline":true,"padRight":true},{"text":"subject to margin constraints (the gradient descent predictor reaches a stationary point of the ","element":"span"},{"style":{"height":17.68},"width":110.95,"height":44.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/1-5.png","element":"img","alt":" ∥�β∥2/L","inline":true,"padRight":true},{"text":"minimization problem). 
This is a sparsity inducing regularizer, which induces sparsity more aggressively as the depth increases.","element":"span"}],[{"text":"Finally, in this paper we focus on characterizing ","element":"span"},{"style":{"fontStyle":"italic"},"text":"which ","element":"span"},{"text":"global minimum does gradient descent on over-parameterized linear models converge to, while assuming that for appropriate choice of step sizes gradient descent iterates asymptotically minimize the optimization objective. A related challenge in neural networks, not addressed in this paper, is an answer to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"when ","element":"span"},{"text":"does gradient descent minimize the non-convex empirical loss objective to reach ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a ","element":"span"},{"text":"global minimum. This problem while hard in worst case, has been studied for linear networks. Recent work have concluded that with sufficient over-parameterization (as is the case with our settings), loss landscape of linear models are well behaved and all local minima are global minima making the problem tractable ","element":"span"},{"href":"#id-18","referenceIndex":3,"text":"Burer and Monteiro ","element":"a"},{"href":"#id-18","referenceIndex":3,"text":"[2003]","element":"a"},{"text":", ","element":"span"},{"href":"#id-19","referenceIndex":12,"text":"Journée et al. ","element":"a"},{"href":"#id-19","referenceIndex":12,"text":"[2010]","element":"a"},{"text":", ","element":"span"},{"href":"#id-20","referenceIndex":14,"text":"Kawaguchi ","element":"a"},{"href":"#id-20","referenceIndex":14,"text":"[2016]","element":"a"},{"text":", ","element":"span"},{"href":"#id-21","referenceIndex":23,"text":"Nguyen and Hein ","element":"a"},{"href":"#id-21","referenceIndex":23,"text":"[2017]","element":"a"},{"text":", ","element":"span"},{"href":"#id-22","referenceIndex":16,"text":"Lee et al. 
","element":"a"},{"href":"#id-22","referenceIndex":16,"text":"[2016]","element":"a"},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Notation ","element":"span"},{"text":"We typeface vectors with bold characters ","element":"span"},{"style":{"fontStyle":"italic"},"text":"e.g., ","element":"span"},{"style":{"height":14},"width":120.8,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/1-6.png","element":"img","alt":" w, β, x","inline":true},{"text":". Individual entries of a vector ","element":"span"},{"style":{"height":14.18},"width":131.96,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/1-7.png","element":"img","alt":"z ∈ RD","inline":true,"padRight":true},{"text":"are indexed using ","element":"span"},{"text":"0 ","element":"span"},{"text":"based indexing as ","element":"span"},{"style":{"fontWeight":"bold"},"text":"z","element":"span"},{"text":"[","element":"span"},{"style":{"fontStyle":"italic"},"text":"d","element":"span"},{"text":"] ","element":"span"},{"text":"for ","element":"span"},{"style":{"height":14},"width":334.15,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/1-8.png","element":"img","alt":" d = 0, 1, . . . , D − 1","inline":true},{"text":". 
Complex numbers are represented in the polar form as ","element":"span"},{"style":{"height":17.38},"width":183.12,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/1-9.png","element":"img","alt":" z = |z|eiφz","inline":true,"padRight":true},{"text":"with ","element":"span"},{"style":{"height":16},"width":152.3,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/1-10.png","element":"img","alt":" |z| ∈ R+","inline":true,"padRight":true},{"text":"denoting the magnitude of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"z ","element":"span"},{"text":"and ","element":"span"},{"style":{"height":16},"width":200.46,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/1-11.png","element":"img","alt":"φz ∈ [0, 2π)","inline":true,"padRight":true},{"text":"denoting the phase. ","element":"span"},{"style":{"height":17.39},"width":219.84,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/1-12.png","element":"img","alt":" z∗ = |z|e−iφz","inline":true,"padRight":true},{"text":"denotes the complex conjugate of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"z","element":"span"},{"text":". The complex inner product between ","element":"span"},{"style":{"height":16.58},"width":169.23,"height":41.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/1-13.png","element":"img","alt":" z, β ∈ CD ","inline":true,"padRight":true},{"text":"is given by ","element":"span"},{"style":{"height":20.4},"width":854.38,"height":50.99,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/1-14.png","element":"img","alt":" ⟨z, β⟩ = �Dd=1 z[d]β∗[d] = z⊤β∗. 
The Dth complex","inline":true,"padRight":true},{"text":"root of ","element":"span"},{"text":"1 ","element":"span"},{"text":"is denoted by ","element":"span"},{"style":{"height":18.06},"width":207.45,"height":45.15,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/1-15.png","element":"img","alt":" ωD = e− 2πiD","inline":true,"padRight":true},{"text":". For ","element":"span"},{"style":{"height":14.19},"width":138.22,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/1-16.png","element":"img","alt":" z ∈ RD","inline":true,"padRight":true},{"text":"we use the notation ","element":"span"},{"style":{"height":14.19},"width":139.1,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/1-17.png","element":"img","alt":" �z ∈ CD","inline":true,"padRight":true},{"text":"to denote the representation of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"z ","element":"span"},{"text":"in the discrete Fourier basis given by, ","element":"span"},{"style":{"height":24.47},"width":442.57,"height":61.17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/1-18.png","element":"img","alt":" �z[d] = 1√D�D−1p=0 z[p]ωpdD .","inline":true,"padRight":true},{"text":"For integers ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a","element":"span"},{"text":", we denote the modulo operator as ","element":"span"},{"style":{"height":19.21},"width":383.74,"height":48.02,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/1-19.png","element":"img","alt":" a mod D = a − D� aD�","inline":true},{"text":". 
Finally, for multi-layer linear networks (formally defined in Section ","element":"span"},{"text":"2)","element":"span"},{"text":", we will use ","element":"span"},{"style":{"height":11.6},"width":124.46,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/1-20.png","element":"img","alt":" w ∈ W","inline":true,"padRight":true},{"text":"to denote parameters of the model in general domain ","element":"span"},{"style":{"height":15.1},"width":384.78,"height":37.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/1-21.png","element":"img","alt":" W, and βw or simply β","inline":true,"padRight":true},{"text":"to denote the equivalent linear predictor.","element":"span"}]]},{"heading":"2 Multi-layer Linear Networks","paragraphs":[[{"text":"We consider feed forward linear networks that map input features ","element":"span"},{"style":{"height":14.18},"width":131.09,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/1-22.png","element":"img","alt":" x ∈ RD","inline":true,"padRight":true},{"text":"to a single real valued output ","element":"span"},{"style":{"height":16},"width":181.36,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/1-23.png","element":"img","alt":" fw(x) ∈ R","inline":true},{"text":", where ","element":"span"},{"style":{"fontWeight":"bold"},"text":"w ","element":"span"},{"text":"denote the parameters of the network. Such networks can be thought of as directed acyclic graphs where each edge is associated with a weight, and the value at each node/unit is the weighted sum of values from the parent nodes. The input features form source nodes with no incoming edges and the output is a sink node with no outgoing edge. 
Every such network realizes a linear function ","element":"span"},{"style":{"height":17.39},"width":502.03,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/1-24.png","element":"img","alt":" x → ⟨x, βw⟩, where βw ∈ RD ","inline":true,"padRight":true},{"text":"denotes the effective linear predictor.","element":"span"}],[{"text":"In multi-layer networks, the nodes are arranged in layers, so an ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L","element":"span"},{"text":"–layer network represents a composition of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L ","element":"span"},{"text":"linear maps. We use the convention that, the input ","element":"span"},{"style":{"height":14.19},"width":127.73,"height":35.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/1-25.png","element":"img","alt":" x ∈ RD ","inline":true,"padRight":true},{"text":"is indexed as the zeroth layer ","element":"span"},{"style":{"fontStyle":"italic"},"text":"l ","element":"span"},{"text":"= 0","element":"span"},{"text":", while the output forms the final layer with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"l ","element":"span"},{"text":"= ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L","element":"span"},{"text":". 
The outputs of nodes in layer ","element":"span"},{"style":{"fontStyle":"italic"},"text":"l ","element":"span"},{"text":"are denoted by ","element":"span"},{"style":{"height":15.77},"width":157.22,"height":39.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/1-26.png","element":"img","alt":" hl ∈ RDl","inline":true},{"text":", where ","element":"span"},{"style":{"height":13.19},"width":42.99,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/1-27.png","element":"img","alt":" Dl","inline":true,"padRight":true},{"text":"is the number of nodes in layer ","element":"span"},{"style":{"fontStyle":"italic"},"text":"l","element":"span"},{"text":". We also use ","element":"span"},{"style":{"height":9.59},"width":43.1,"height":23.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/1-28.png","element":"img","alt":" wl","inline":true,"padRight":true},{"text":"to denote the ","element":"span"},{"text":"parameters of the linear map between ","element":"span"},{"style":{"height":13.19},"width":76.6,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-0.png","element":"img","alt":" hl−1","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":13.19},"width":35.46,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-1.png","element":"img","alt":" hl","inline":true},{"text":", and ","element":"span"},{"style":{"height":18.3},"width":205.04,"height":45.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-2.png","element":"img","alt":" w = [wl]Ll=1","inline":true,"padRight":true},{"text":"to denote the collective set of ","element":"span"},{"text":"all parameters of the linear network.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Linear fully connected network ","element":"span"},{"text":"In a fully connected linear network, the nodes between successive layers 
","element":"span"},{"style":{"height":11.6},"width":170.66,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-3.png","element":"img","alt":" l − 1 and l","inline":true,"padRight":true},{"text":"are densely connected with edge weights ","element":"span"},{"style":{"height":15.78},"width":256.6,"height":39.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-4.png","element":"img","alt":" wl ∈ RDl−1×Dl","inline":true},{"text":", and all the weights are independent parameters. This model class is parameterized by ","element":"span"},{"style":{"height":20.8},"width":586.03,"height":51.99,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-5.png","element":"img","alt":" w = [wl]Ll=1 ∈ �Ll=1 RDl−1×Dl and","inline":true,"padRight":true},{"text":"the computation for intermediate nodes ","element":"span"},{"style":{"height":13.19},"width":35.46,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-6.png","element":"img","alt":" hl","inline":true,"padRight":true},{"text":"and the composite linear map ","element":"span"},{"style":{"height":16},"width":103.66,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-7.png","element":"img","alt":" fw(x)","inline":true,"padRight":true},{"text":"is given by,","element":"span"}],[{"style":{"width":"80%"},"width":1269,"height":47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-8.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Linear convolutional network ","element":"span"},{"text":"We consider one-dimensional convolutional network architectures where each non-output layer has exactly ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D ","element":"span"},{"text":"units (same as the input dimensionality) and the linear transformations from layer 
","element":"span"},{"style":{"height":14.4},"width":227.24,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-9.png","element":"img","alt":" l − 1 to layer l","inline":true,"padRight":true},{"text":"are given by the following circular convolutional operation","element":"span"},{"text":"1 ","element":"span"},{"text":"parameterized by full width filters with weights ","element":"span"},{"style":{"height":19.63},"width":643.22,"height":49.07,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-10.png","element":"img","alt":" [wl ∈ RD]L−1l=1 . For l = 1, 2, . . . , L − 1,","inline":true}],[{"style":{"width":"81%"},"width":1299,"height":119,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-11.png","element":"img"}],[{"text":"The output layer is fully connected and parameterized by weights ","element":"span"},{"style":{"height":15.77},"width":160.44,"height":39.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-12.png","element":"img","alt":" wL ∈ RD","inline":true},{"text":". 
The parameters of the model class therefor consists of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L ","element":"span"},{"text":"vectors of size ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D ","element":"span"},{"text":"collectively denoted by ","element":"span"},{"style":{"height":20.8},"width":420.17,"height":51.99,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-13.png","element":"img","alt":" w = [wl]Ll=1 ∈ �Ll=1 RD,","inline":true,"padRight":true},{"text":"and the composite linear map ","element":"span"},{"style":{"height":16},"width":103.66,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-14.png","element":"img","alt":" fw(x)","inline":true,"padRight":true},{"text":"is given by:","element":"span"}],[{"id":"id-52","style":{"width":"74%"},"width":1174,"height":50,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-15.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Remark: ","element":"span"},{"text":"We use circular convolution with a scaling of ","element":"span"},{"style":{"height":16},"width":81.35,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-16.png","element":"img","alt":"1/√D","inline":true,"padRight":true},{"text":"to make the analysis cleaner. For convolutions with zero-padding, we expect a similar behavior. Secondly, since our goal here to study implicit bias in sufficiently over-parameterized models, we only study full dimensional convolutional filters. 
In practice it is common to have filters of width ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K ","element":"span"},{"text":"smaller than the number of input features, which can change the implicit bias.","element":"span"}],[{"text":"The fully connected and convolutional linear networks described above can both be represented in terms of a mapping ","element":"span"},{"style":{"height":14.19},"width":223.62,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-17.png","element":"img","alt":" P : W → RD ","inline":true,"padRight":true},{"text":"that maps the input parameters ","element":"span"},{"style":{"height":11.6},"width":124.46,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-18.png","element":"img","alt":" w ∈ W","inline":true,"padRight":true},{"text":"to a linear predictor in ","element":"span"},{"style":{"height":15.79},"width":67.68,"height":39.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-19.png","element":"img","alt":" RD,","inline":true,"padRight":true},{"text":"such that the output of the network is given by ","element":"span"},{"style":{"height":16},"width":325.42,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-20.png","element":"img","alt":" fw(x) = ⟨x, P(w)⟩","inline":true},{"text":". For fully connected networks, the mapping is given by ","element":"span"},{"style":{"height":16.79},"width":425.24,"height":41.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-21.png","element":"img","alt":" Pfull(w) = w1w2 . . . 
wL","inline":true},{"text":", and for convolutional networks, ","element":"span"},{"style":{"height":16},"width":204.57,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-22.png","element":"img","alt":" Pconv(w) =","inline":true},{"style":{"height":32.3},"width":600.58,"height":80.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-23.png","element":"img","alt":"(((w↓L ⋆ wL−1) ⋆ wL−2) . . . ⋆ w1)↓","inline":true},{"text":", where ","element":"span"},{"style":{"height":13.38},"width":49.74,"height":33.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-24.png","element":"img","alt":" w↓","inline":true,"padRight":true},{"text":"denotes the flipped vector corresponding to ","element":"span"},{"style":{"fontWeight":"bold"},"text":"w","element":"span"},{"text":",","element":"span"}],[{"text":"given by ","element":"span"},{"style":{"height":17.39},"width":577.8,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-25.png","element":"img","alt":" w↓[k] = w[D − k − 1] for k = 0, 1","inline":true},{"style":{"fontStyle":"italic"},"text":", . . . , D ","element":"span"},{"style":{"height":11.2},"width":69.78,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-26.png","element":"img","alt":" − 1.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Separable linear classification ","element":"span"},{"text":"Consider a binary classification dataset ","element":"span"},{"style":{"height":16},"width":316.26,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-27.png","element":"img","alt":" {(xn, yn) : n =","inline":true,"padRight":true},{"text":"1","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"2","element":"span"},{"style":{"fontStyle":"italic"},"text":", . . . 
N","element":"span"},{"style":{"fontStyle":"italic"},"text":"} ","element":"span"},{"text":"with ","element":"span"},{"style":{"height":15.77},"width":168.24,"height":39.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-28.png","element":"img","alt":" xn ∈ RD","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":16},"width":237.29,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-29.png","element":"img","alt":" yn ∈ {−1, 1}","inline":true},{"text":". The empirical risk minimization objective for training a linear network parameterized as ","element":"span"},{"style":{"fontStyle":"italic"},"text":"P","element":"span"},{"text":"(","element":"span"},{"style":{"fontWeight":"bold"},"text":"w","element":"span"},{"text":") ","element":"span"},{"text":"is given as follows,","element":"span"}],[{"id":"id-23","style":{"width":"70%"},"width":1119,"height":116,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-30.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":16},"width":377.9,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-31.png","element":"img","alt":" ℓ : R × {−1, 1} → R+","inline":true,"padRight":true},{"text":"is some surrogate loss for classification accuracy, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"e.g., ","element":"span"},{"text":"logistic loss ","element":"span"},{"style":{"height":16},"width":476.79,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-32.png","element":"img","alt":"ℓ(ŷ, y) = log(1 + exp(−ŷy))","inline":true,"padRight":true},{"text":"and exponential loss ","element":"span"},{"style":{"height":16},"width":335.19,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-33.png","element":"img","alt":" ℓ(ŷ, y) = exp(−ŷy).","inline":true}],[{"text":"It is easy to see that both fully connected and 
convolutional networks of any depth ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L ","element":"span"},{"text":"can realize any linear predictor ","element":"span"},{"style":{"height":16.59},"width":143.48,"height":41.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-34.png","element":"img","alt":" β ∈ RD","inline":true},{"text":". The model class expressed by both networks is therefore simply the unconstrained class of linear predictors, and the two architectures are merely different (over) parameterizations of this class","element":"span"}],[{"id":"id-24","style":{"width":"86%"},"width":1365,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-35.png","element":"img"}],[{"text":"Thus, the empirical risk minimization problem in ","element":"span"},{"href":"#id-23","text":"(4) ","element":"a"},{"text":"is equivalent to the following optimization over the linear predictors ","element":"span"},{"style":{"height":16},"width":187.5,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-36.png","element":"img","alt":" β = P(w):","inline":true}],[{"style":{"width":"99%"},"width":1582,"height":143,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/2-37.png","element":"img"}],[{"text":"Although the optimization problems ","element":"span"},{"href":"#id-23","text":"(4) ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-24","text":"(5) ","element":"a"},{"text":"are exactly equivalent in terms of the set of global minima, in this paper, we show that optimizing ","element":"span"},{"href":"#id-23","text":"(4) ","element":"a"},{"text":"with different parameterizations leads to very different classifiers compared to optimizing ","element":"span"},{"href":"#id-24","text":"(5) ","element":"a"},{"text":"directly.","element":"span"}],[{"text":"In particular, consider problems 
","element":"span"},{"href":"#id-23","text":"(4)","element":"a"},{"text":"/","element":"span"},{"href":"#id-24","text":"(5) ","element":"a"},{"text":"on a linearly separable dataset ","element":"span"},{"style":{"height":17.38},"width":204.83,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/3-0.png","element":"img","alt":" {xn, yn}Nn=1","inline":true,"padRight":true},{"text":"and using the ","element":"span"},{"text":"logistic loss (the two class version of the cross entropy loss typically used in deep learning). The global infimum of ","element":"span"},{"style":{"height":16},"width":86.63,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/3-1.png","element":"img","alt":" L(β)","inline":true,"padRight":true},{"text":"is ","element":"span"},{"text":"0","element":"span"},{"text":", but this is not attainable by any finite ","element":"span"},{"style":{"height":14},"width":27,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/3-2.png","element":"img","alt":" β","inline":true},{"text":". Instead, the loss can be minimized by scaling the norm of any linear predictor that separates the data to infinity. Thus, any sequence of predictors ","element":"span"},{"style":{"height":18.73},"width":64.14,"height":46.82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/3-3.png","element":"img","alt":" β(t) ","inline":true,"padRight":true},{"text":"(say, from an optimization algorithm) that asymptotically minimizes the loss in eq. ","element":"span"},{"href":"#id-24","text":"(5) ","element":"a"},{"text":"necessarily separates the data and diverges in norm, ","element":"span"},{"style":{"height":19.53},"width":211.35,"height":48.82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/3-4.png","element":"img","alt":" ∥β(t)∥ → ∞","inline":true},{"text":". 
In general there are many linear separators that correctly label the training data, each corresponding to a direction in which we can minimize ","element":"span"},{"href":"#id-24","text":"(5)","element":"a"},{"text":". Which of these separators will we converge to when optimizing ","element":"span"},{"href":"#id-23","text":"(4)","element":"a"},{"text":"/","element":"span"},{"href":"#id-24","text":"(5)","element":"a"},{"text":"? In other words, what is the direction ","element":"span"},{"style":{"height":18.72},"width":182.81,"height":46.79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/3-5.png","element":"img","alt":" β∞ = lim t→∞ β(t)/∥β(t)∥","inline":true}],[{"text":"will diverge in? If this limit exists, we say that ","element":"span"},{"style":{"height":18.73},"width":64.14,"height":46.82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/3-6.png","element":"img","alt":" β(t) ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"converges in direction ","element":"span"},{"text":"to the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"limit direction ","element":"span"},{"style":{"height":18.72},"width":71.4,"height":46.79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/3-7.png","element":"img","alt":" β∞.","inline":true}],[{"href":"#id-12","referenceIndex":26,"text":"Soudry et al. ","element":"a"},{"href":"#id-12","referenceIndex":26,"text":"[2017] ","element":"a"},{"text":"studied this implicit bias of gradient descent on ","element":"span"},{"href":"#id-24","text":"(5) ","element":"a"},{"text":"over the direct parameterization of ","element":"span"},{"style":{"height":14},"width":27,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/3-8.png","element":"img","alt":" β","inline":true},{"text":". They showed that for any linearly separable dataset and any initialization, gradient descent w.r.t. 
","element":"span"},{"style":{"height":14},"width":27,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/3-9.png","element":"img","alt":" β","inline":true,"padRight":true},{"text":"converges in direction to hard margin support vector machine solution:","element":"span"}],[{"id":"id-25","style":{"width":"83%"},"width":1326,"height":102,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/3-10.png","element":"img"}],[{"text":"In this paper we study the behavior of gradient descent on the problem ","element":"span"},{"href":"#id-23","text":"(4) ","element":"a"},{"text":"w.r.t different parameterizations of the model class of linear predictors. For initialization ","element":"span"},{"style":{"height":14.18},"width":74.08,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/3-11.png","element":"img","alt":" w(0) ","inline":true,"padRight":true},{"text":"and sequence of step sizes ","element":"span"},{"style":{"height":16},"width":83.67,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/3-12.png","element":"img","alt":" {ηt},","inline":true,"padRight":true},{"text":"gradient descent updates for ","element":"span"},{"href":"#id-23","text":"(4) ","element":"a"},{"text":"are given by,","element":"span"}],[{"style":{"width":"88%"},"width":1396,"height":50,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/3-13.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":16},"width":135.24,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/3-14.png","element":"img","alt":" ∇wP(.)","inline":true,"padRight":true},{"text":"denotes the Jacobian of ","element":"span"},{"style":{"height":14.18},"width":223.7,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/3-15.png","element":"img","alt":" P : W → RD ","inline":true,"padRight":true},{"text":"with respect to the parameters 
","element":"span"},{"style":{"height":16.79},"width":246.74,"height":41.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/3-16.png","element":"img","alt":" w, and ∇βL(.)","inline":true,"padRight":true},{"text":"is the gradient of the loss function in ","element":"span"},{"href":"#id-24","text":"(5)","element":"a"},{"text":".","element":"span"}],[{"text":"For separable datasets, if ","element":"span"},{"style":{"height":14.18},"width":70.23,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/3-17.png","element":"img","alt":" w(t)","inline":true,"padRight":true},{"text":"minimizes ","element":"span"},{"href":"#id-23","text":"(4) ","element":"a"},{"text":"for linear fully connected or convolutional networks, then we will again have ","element":"span"},{"style":{"height":18.18},"width":230.52,"height":45.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/3-18.png","element":"img","alt":" ∥w(t)∥ → ∞","inline":true},{"text":", and the question we ask is: what is the limit direction","element":"span"}],[{"style":{"width":"77%"},"width":1230,"height":56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/3-19.png","element":"img"}],[{"text":"The result in ","element":"span"},{"href":"#id-12","referenceIndex":26,"text":"Soudry et al. ","element":"a"},{"href":"#id-12","referenceIndex":26,"text":"[2017] ","element":"a"},{"text":"holds for any loss function ","element":"span"},{"style":{"height":16},"width":109.59,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/3-20.png","element":"img","alt":" ℓ(u, y)","inline":true,"padRight":true},{"text":"that is strictly monotone in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"uy ","element":"span"},{"text":"with specific tail behavior, name the tightly exponential tail, which is satisfied by popular classification losses like logistic and exponential loss. 
In the rest of the paper, for simplicity we exclusively focus on the exponential loss function ","element":"span"},{"style":{"height":16},"width":329.38,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/3-21.png","element":"img","alt":" ℓ(u, y) = exp(−uy)","inline":true},{"text":", which has the same tail behavior as that of the logistic loss. Along the lines of ","element":"span"},{"href":"#id-12","referenceIndex":26,"text":"Soudry et al. ","element":"a"},{"href":"#id-12","referenceIndex":26,"text":"[2017]","element":"a"},{"text":", our results should also extend for any strictly monotonic loss function with a tight exponential tail, including logistic loss.","element":"span"}]]},{"heading":"3 Main Results","paragraphs":[[{"text":"Our main results characterize the implicit bias of gradient descent for multi-layer fully connected and convolutional networks with linear activations. For the gradient descent iterates ","element":"span"},{"style":{"height":14.19},"width":70.23,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/3-22.png","element":"img","alt":" w(t)","inline":true,"padRight":true},{"text":"in eq. ","element":"span"},{"href":"#id-25","text":"(7)","element":"a"},{"text":", we henceforth denote the induced linear predictor as ","element":"span"},{"style":{"height":19.53},"width":264.38,"height":48.82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/3-23.png","element":"img","alt":" β(t) = P(w(t)).","inline":true,"padRight":true},{"style":{"fontWeight":"bold"},"text":"Assumptions. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"In the following theorems, we characterize the limiting predictor ","element":"span"},{"style":{"height":18.72},"width":179.84,"height":46.79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/3-24.png","element":"img","alt":" β∞ = lim","inline":true}],[{"style":{"fontStyle":"italic"},"text":"under the following assumptions:","element":"span"}],[{"style":{"width":"93%"},"width":1489,"height":229,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/3-25.png","element":"img"}],[{"text":"These assumptions allow us to focus on the question of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"which specific linear predictor do gradient descent iterates converge to ","element":"span"},{"text":"by separating it from the related optimization questions of when gradient descent iterates minimize the non-convex objective in eq. ","element":"span"},{"href":"#id-24","text":"(5) ","element":"a"},{"text":"and nicely converge in direction.","element":"span"}],[{"id":"id-31","style":{"width":"60%"},"width":952,"height":1493,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/4-0.png","element":"img"}],[{"text":"Figure 1: Implicit bias of gradient descent for different linear network architectures.","element":"figcaption","subtype":"caption"}],[{"id":"id-26","style":{"fontWeight":"bold"},"text":"Theorem 1 ","element":"span"},{"text":"(Linear fully connected networks)","element":"span"},{"style":{"fontWeight":"bold"},"text":". 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"For any depth ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L","element":"span"},{"style":{"fontStyle":"italic"},"text":", almost all linearly separable datasets ","element":"span"},{"style":{"height":17.39},"width":204.84,"height":43.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/4-1.png","element":"img","alt":" {xn, yn}Nn=1","inline":true},{"style":{"fontStyle":"italic"},"text":", almost all initializations ","element":"span"},{"style":{"height":14.19},"width":74.08,"height":35.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/4-2.png","element":"img","alt":" w(0)","inline":true},{"style":{"fontStyle":"italic"},"text":", and any bounded sequence of step sizes ","element":"span"},{"style":{"height":16},"width":98.7,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/4-3.png","element":"img","alt":" {ηt}t,","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"consider the sequence gradient descent iterates ","element":"span"},{"style":{"height":14.18},"width":70.23,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/4-4.png","element":"img","alt":" w(t)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"in eq. ","element":"span"},{"href":"#id-25","text":"(7) ","element":"a"},{"style":{"fontStyle":"italic"},"text":"for minimizing ","element":"span"},{"style":{"height":17.68},"width":171.26,"height":44.21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/4-5.png","element":"img","alt":" LPfull(w)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"in eq. 
","element":"span"},{"href":"#id-23","text":"(4) ","element":"a"},{"style":{"fontStyle":"italic"},"text":"with exponential loss ","element":"span"},{"style":{"height":16},"width":442.53,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/4-6.png","element":"img","alt":" ℓ(�y, y) = exp(−�yy) over L","inline":true},{"style":{"fontStyle":"italic"},"text":"–layer fully connected linear networks.","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"If (a) the iterates ","element":"span"},{"style":{"height":14.19},"width":70.23,"height":35.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/4-7.png","element":"img","alt":" w(t)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"minimize the objective, i.e., ","element":"span"},{"style":{"height":19.87},"width":296.26,"height":49.67,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/4-8.png","element":"img","alt":" LPfull(w(t)) → 0","inline":true},{"style":{"fontStyle":"italic"},"text":", (b) ","element":"span"},{"style":{"height":14.19},"width":70.23,"height":35.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/4-9.png","element":"img","alt":" w(t)","inline":true},{"style":{"fontStyle":"italic"},"text":", and consequently ","element":"span"},{"style":{"height":20.32},"width":311.58,"height":50.8,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/4-10.png","element":"img","alt":"β(t) = Pfull(w(t))","inline":true},{"style":{"fontStyle":"italic"},"text":", converge in direction to yield a separator with positive margin, and (c) gradients with respect to linear predictors ","element":"span"},{"style":{"height":20.32},"width":182.27,"height":50.8,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/4-11.png","element":"img","alt":" ∇βL(β(t))","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"converge in direction, then the limit direction is given 
by,","element":"span"}],[{"id":"id-48","style":{"width":"97%"},"width":1542,"height":104,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/4-12.png","element":"img"}],[{"text":"For fully connected networks with single output, Theorem ","element":"span"},{"href":"#id-26","text":"1 ","element":"a"},{"text":"shows that there is no effect of depth on the implicit bias of gradient descent. Regardless of the depth of the network, the asymptotic classifier is always the hard margin support vector machine classifier, which is also the limit direction of gradient descent for linear logistic regression with the direct parameterization of ","element":"span"},{"style":{"height":14},"width":124.51,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/4-13.png","element":"img","alt":" β = w.","inline":true}],[{"text":"In contrast, next we show that for convolutional networks we get very different biases. Let us first look at a ","element":"span"},{"text":"2","element":"span"},{"text":"–layer linear convolutional network, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i.e., ","element":"span"},{"text":"a network with single convolutional layer followed by a fully connected final layer.","element":"span"}],[{"style":{"width":"3%"},"width":55,"height":5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-0.png","element":"img"}],[{"text":"Recall that ","element":"span"},{"style":{"height":16.58},"width":137.78,"height":41.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-1.png","element":"img","alt":"�β ∈ CD","inline":true,"padRight":true},{"text":"denote the Fourier coefficients of ","element":"span"},{"style":{"height":14},"width":27,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-2.png","element":"img","alt":" β","inline":true},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i.e., 
","element":"span"},{"style":{"height":22.73},"width":182.95,"height":56.82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-3.png","element":"img","alt":"�β[d] = 1√D","inline":true}],[{"text":"and that any non-zero ","element":"span"},{"style":{"height":11.6},"width":114.52,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-4.png","element":"img","alt":" z ∈ C","inline":true,"padRight":true},{"text":"is denoted in polar form as ","element":"span"},{"style":{"height":17.39},"width":188.28,"height":43.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-5.png","element":"img","alt":" z = |z|eiφz","inline":true,"padRight":true},{"text":"for ","element":"span"},{"style":{"height":16},"width":212.12,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-6.png","element":"img","alt":" φz ∈ [0, 2π)","inline":true},{"text":". Linear predictors induced by gradient descent iterates ","element":"span"},{"style":{"height":14.18},"width":70.23,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-7.png","element":"img","alt":" w(t) ","inline":true,"padRight":true},{"text":"for convolutional networks are denoted by ","element":"span"},{"style":{"height":18.73},"width":108.65,"height":46.82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-8.png","element":"img","alt":" β(t) =","inline":true},{"style":{"height":18.18},"width":200.49,"height":45.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-9.png","element":"img","alt":"Pconv(w(t))","inline":true},{"text":". 
It is evident that if ","element":"span"},{"style":{"height":18.73},"width":64.14,"height":46.82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-10.png","element":"img","alt":" β(t)","inline":true,"padRight":true},{"text":"converges in direction to ","element":"span"},{"style":{"height":18.72},"width":59.65,"height":46.79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-11.png","element":"img","alt":" β∞","inline":true},{"text":", then its Fourier transformation ","element":"span"},{"style":{"height":23.38},"width":64.14,"height":58.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-12.png","element":"img","alt":"�β(t)","inline":true,"padRight":true},{"text":"converges in direction to ","element":"span"},{"style":{"height":23.37},"width":59.65,"height":58.42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-13.png","element":"img","alt":" �β∞","inline":true},{"text":". In the following theorems, in addition to the earlier assumptions, we further assume a technical condition that the phase of the Fourier coefficients ","element":"span"},{"style":{"height":16.92},"width":96.92,"height":42.29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-14.png","element":"img","alt":" eiφ�β(t)","inline":true,"padRight":true},{"text":"converge","element":"span"}],[{"text":"coordinate-wise. 
For coordinates ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d ","element":"span"},{"text":"with ","element":"span"},{"style":{"height":14},"width":27,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-15.png","element":"img","alt":"�β","inline":true}],[{"style":{"height":14.19},"width":70.22,"height":35.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-16.png","element":"img","alt":"w(t)","inline":true},{"text":", in which case ","element":"span"},{"style":{"height":17.76},"width":319.25,"height":44.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-17.png","element":"img","alt":" eiφ�β(t)[d] → eiφ�β∞[d]","inline":true},{"text":". We assume such a ","element":"span"},{"style":{"height":21.82},"width":108.98,"height":54.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-18.png","element":"img","alt":" φ�β∞[d] ","inline":true,"padRight":true},{"text":"also exists when ","element":"span"},{"style":{"height":24.17},"width":187.35,"height":60.42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-19.png","element":"img","alt":" �β∞[d] = 0.","inline":true}],[{"id":"id-27","style":{"fontWeight":"bold"},"text":"Theorem 2 ","element":"span"},{"text":"(Linear convolutional networks of depth two)","element":"span"},{"style":{"fontWeight":"bold"},"text":". 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"For almost all linearly separable datasets ","element":"span"},{"style":{"height":17.38},"width":204.84,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-20.png","element":"img","alt":"{xn, yn}Nn=1","inline":true},{"style":{"fontStyle":"italic"},"text":", almost all initializations ","element":"span"},{"style":{"height":14.18},"width":74.08,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-21.png","element":"img","alt":" w(0)","inline":true},{"style":{"fontStyle":"italic"},"text":", and any sequence of step sizes ","element":"span"},{"style":{"height":16},"width":85.66,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-22.png","element":"img","alt":" {ηt}t","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"with ","element":"span"},{"style":{"height":10.4},"width":31.78,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-23.png","element":"img","alt":" ηt","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"smaller ","element":"span"},{"style":{"fontStyle":"italic"},"text":"than the local Lipschitz at ","element":"span"},{"style":{"height":14.19},"width":70.23,"height":35.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-24.png","element":"img","alt":" w(t)","inline":true},{"style":{"fontStyle":"italic"},"text":", consider the sequence gradient descent iterates ","element":"span"},{"style":{"height":14.19},"width":70.23,"height":35.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-25.png","element":"img","alt":" w(t)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"in eq. 
","element":"span"},{"href":"#id-25","text":"(7) ","element":"a"},{"style":{"fontStyle":"italic"},"text":"for minimizing ","element":"span"},{"href":"#id-23","style":{"height":16},"width":335.43,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-26.png","element":"img","alt":" LPconv(w) in eq. (4)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"with exponential loss over ","element":"span"},{"text":"2","element":"span"},{"style":{"fontStyle":"italic"},"text":"–layer linear convolutional networks.","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"If (a) the iterates ","element":"span"},{"style":{"height":14.18},"width":70.22,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-27.png","element":"img","alt":" w(t) ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"minimize the objective, i.e., ","element":"span"},{"style":{"height":18.18},"width":445.01,"height":45.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-28.png","element":"img","alt":" LPconv(w(t)) → 0, (b) w(t) ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"converge in direction to yield a separator ","element":"span"},{"style":{"height":18.72},"width":59.64,"height":46.79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-29.png","element":"img","alt":" β∞","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"with positive margin, (c) the phase of the Fourier coefficients ","element":"span"},{"style":{"height":23.38},"width":64.14,"height":58.45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-30.png","element":"img","alt":" �β(t)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"of the linear predictors 
","element":"span"},{"style":{"height":18.73},"width":64.14,"height":46.82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-31.png","element":"img","alt":" β(t) ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"converge coordinate-wise, i.e., ","element":"span"},{"style":{"height":20.05},"width":382.16,"height":50.13,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-32.png","element":"img","alt":" ∀d, eiφ�β(t)[d] → eiφ��β∞[d]","inline":true},{"style":{"fontStyle":"italic"},"text":", and (d) the gradients ","element":"span"},{"style":{"height":20.32},"width":182.27,"height":50.8,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-33.png","element":"img","alt":"∇βL(β(t))","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"converge in direction, then the limit direction ","element":"span"},{"style":{"height":18.72},"width":59.65,"height":46.79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-34.png","element":"img","alt":" β∞","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is given by,","element":"span"}],[{"style":{"width":"85%"},"width":1356,"height":104,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-35.png","element":"img"}],[{"text":"We already see how introducing a single convolutional layer changes the implicit bias of gradient descent—even without any explicit regularization, gradient descent on the parameters of convolutional network architecture returns solutions that are biased to have sparsity in the frequency domain.","element":"span"}],[{"text":"Furthermore, unlike fully connected networks, for convolutional networks we also see that the implicit bias changes with the depth of the network as shown by the following theorem. 
","element":"span"},{"id":"id-32","style":{"fontWeight":"bold"},"text":"Theorem 2a ","element":"span"},{"text":"(Linear Convolutional Networks of any Depth)","element":"span"},{"style":{"fontWeight":"bold"},"text":". ","element":"span"},{"style":{"fontStyle":"italic"},"text":"For any depth ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L","element":"span"},{"style":{"fontStyle":"italic"},"text":", under the conditions of Theorem ","element":"span"},{"href":"#id-27","style":{"fontStyle":"italic"},"text":"2, ","element":"a"},{"style":{"fontStyle":"italic"},"text":"the limit direction ","element":"span"},{"style":{"height":18.72},"width":185.43,"height":46.79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-36.png","element":"img","alt":" β∞ = lim","inline":true}],[{"style":{"fontStyle":"italic"},"text":"point of the ","element":"span"},{"href":"#id-27","style":{"fontStyle":"italic"},"text":"fo","element":"a"},{"style":{"fontStyle":"italic"},"text":"llowing optimization problem,","element":"span"}],[{"id":"id-28","style":{"width":"68%"},"width":1078,"height":72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-37.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where the ","element":"span"},{"style":{"height":7.2},"width":33.6,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-38.png","element":"img","alt":" ℓp","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"penalty given by ","element":"span"},{"style":{"height":33.1},"width":417.77,"height":82.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-39.png","element":"img","alt":" ∥z∥p =��Di=1 |z[i]|p�1/p","inline":true},{"style":{"fontStyle":"italic"},"text":"(also called the bridge penalty) is a norm for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p ","element":"span"},{"text":"= 1 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and a quasi-norm 
for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p < ","element":"span"},{"text":"1","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"}],[{"text":"When ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L > ","element":"span"},{"text":"2","element":"span"},{"text":", and thus ","element":"span"},{"style":{"height":16},"width":205.71,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-40.png","element":"img","alt":" p = 2/L < 1","inline":true},{"text":", problem ","element":"span"},{"href":"#id-28","text":"(10) ","element":"a"},{"text":"is non-convex and intractable ","element":"span"},{"href":"#id-29","referenceIndex":6,"text":"Ge et al. ","element":"a"},{"href":"#id-29","referenceIndex":6,"text":"[2011]","element":"a"},{"text":". Hence, we cannot expect to ensure convergence to a global minimum. Instead we show convergence to a first order stationary point of ","element":"span"},{"href":"#id-28","text":"(10) ","element":"a"},{"text":"in the sense of sub-stationary points of ","element":"span"},{"href":"#id-30","referenceIndex":24,"text":"Rockafellar ","element":"a"},{"href":"#id-30","referenceIndex":24,"text":"[1979] ","element":"a"},{"text":"for optimization problems with non-smooth and non-convex objectives. 
These are solutions where the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"local directional derivative ","element":"span"},{"text":"along the directions in the tangent cone of the constraints are all zero.","element":"span"}],[{"text":"The first order stationary points, or sub-stationary points, of ","element":"span"},{"href":"#id-28","text":"(10) ","element":"a"},{"text":"are the set of feasible predictors ","element":"span"},{"style":{"height":14},"width":27,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-41.png","element":"img","alt":" β","inline":true,"padRight":true},{"text":"such that ","element":"span"},{"style":{"height":17.39},"width":242.4,"height":43.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-42.png","element":"img","alt":" ∃{αn ≥ 0}Nn=1 ","inline":true,"padRight":true},{"text":"satisfying the following: ","element":"span"},{"style":{"height":16},"width":608.78,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-43.png","element":"img","alt":" ∀n, yn⟨xn, β⟩ > 1 =⇒ αn = 0, and","inline":true}],[{"id":"id-62","style":{"width":"61%"},"width":981,"height":87,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-44.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":9.59},"width":44.19,"height":23.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-45.png","element":"img","alt":" �xn","inline":true,"padRight":true},{"text":"is the Fourier transformation of ","element":"span"},{"style":{"height":9.59},"width":44.19,"height":23.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-46.png","element":"img","alt":" xn","inline":true},{"text":", and ","element":"span"},{"style":{"height":11.2},"width":39.37,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-47.png","element":"img","alt":" ∂◦","inline":true,"padRight":true},{"text":"denotes 
the local sub-differential (or Clarke’s sub-differential) operator defined as ","element":"span"},{"style":{"height":16},"width":965.43,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-48.png","element":"img","alt":" ∂◦f(β) = conv{v : ∃(zk)k s.t. zk → β and ∇f(zk) → v}.","inline":true}],[{"text":"For ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p ","element":"span"},{"text":"= 1 ","element":"span"},{"text":"and ","element":"span"},{"style":{"height":14},"width":27,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-49.png","element":"img","alt":"�β","inline":true,"padRight":true},{"text":"represented in polar form as ","element":"span"},{"style":{"height":19.67},"width":429.25,"height":49.18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-50.png","element":"img","alt":"�β = |�β|eiφ�β ∈ CD, ∥�β∥p","inline":true,"padRight":true},{"text":"is convex and the local sub-differential is indeed the global sub-differential given by,","element":"span"}],[{"style":{"width":"83%"},"width":1322,"height":51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/5-51.png","element":"img"}],[{"id":"id-63","style":{"width":"0%"},"width":9,"height":3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-0.png","element":"img"}],[{"text":"For ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p < ","element":"span"},{"text":"1","element":"span"},{"text":", the local sub-differential of ","element":"span"},{"style":{"height":16.79},"width":84.5,"height":41.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-1.png","element":"img","alt":" ∥�β∥p","inline":true,"padRight":true},{"text":"is given by,","element":"span"}],[{"style":{"width":"85%"},"width":1353,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-2.png","element":"img"}],[{"text":"Figures 
","element":"span"},{"href":"#id-31","text":"1a–","element":"a"},{"href":"#id-31","text":"1b ","element":"a"},{"text":"summarize the implications of the main results in the paper. The proof of this Theorem, exploits the following representation of ","element":"span"},{"style":{"height":16},"width":155.47,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-3.png","element":"img","alt":" Pconv(β)","inline":true,"padRight":true},{"text":"in the Fourier domain. ","element":"span"},{"id":"id-36","style":{"fontWeight":"bold"},"text":"Lemma 3. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"For full-dimensional convolutions, ","element":"span"},{"style":{"height":16},"width":242.34,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-4.png","element":"img","alt":" β = Pconv(w)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is equivalent to","element":"span"}],[{"style":{"width":"35%"},"width":558,"height":46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-5.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where for ","element":"span"},{"style":{"height":16.58},"width":417.75,"height":41.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-6.png","element":"img","alt":" l = 1, 2, . . . 
, L, �w1 ∈ CD ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"are the Fourier coefficients of the parameters ","element":"span"},{"style":{"height":15.78},"width":161.72,"height":39.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-7.png","element":"img","alt":" wl ∈ RD.","inline":true}],[{"text":"From above lemma (proved in Appendix ","element":"span"},{"style":{"fontStyle":"italic"},"text":"C","element":"span"},{"text":"), we can see a connection of convolutional networks to a special network where the linear transformation between layers is restricted to diagonal entries (see depiction in Figure ","element":"span"},{"href":"#id-31","text":"1c)","element":"a"},{"text":", we refer to such networks as ","element":"span"},{"style":{"fontStyle":"italic"},"text":"linear diagonal network","element":"span"},{"text":".","element":"span"}],[{"text":"The proof of Theorem ","element":"span"},{"href":"#id-26","text":"1 ","element":"a"},{"text":"and Theorem ","element":"span"},{"href":"#id-27","text":"2-","element":"a"},{"href":"#id-32","text":"2a ","element":"a"},{"text":"are provided in Appendix ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"C","element":"span"},{"text":", respectively.","element":"span"}]]},{"heading":"4 Understanding Gradient Descent in the Parameter Space","paragraphs":[[{"text":"We can decompose the characterization of implicit bias of gradient descent on a parameterization ","element":"span"},{"style":{"fontStyle":"italic"},"text":"P","element":"span"},{"text":"(","element":"span"},{"style":{"fontWeight":"bold"},"text":"w","element":"span"},{"text":") ","element":"span"},{"text":"into two parts: (a) what is the implicit bias of gradient descent in the space of parameters ","element":"span"},{"style":{"fontWeight":"bold"},"text":"w","element":"span"},{"text":"?, and (b) what does this imply in term 
of the linear predictor ","element":"span"},{"style":{"height":16},"width":181.4,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-8.png","element":"img","alt":" β = P(w)","inline":true},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i.e., ","element":"span"},{"text":"how does the bias in parameter space translate to the linear predictor learned from the model class?","element":"span"}],[{"text":"We look at the first question for a broad class of linear models, where the linear predictor is given by a homogeneous polynomial mapping of the parameters: ","element":"span"},{"style":{"height":17.38},"width":435.39,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-9.png","element":"img","alt":" β = P(w), where w ∈ RP ","inline":true,"padRight":true},{"text":"are the parameters of the model and ","element":"span"},{"style":{"height":14.18},"width":236.33,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-10.png","element":"img","alt":" P : RP → RD ","inline":true,"padRight":true},{"text":"satisfies definition below. This class covers the linear convolutional, fully connected networks, and diagonal networks discussed in Section ","element":"span"},{"text":"3.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Definition ","element":"span"},{"text":"(Homogeneous Polynomial)","element":"span"},{"style":{"fontWeight":"bold"},"text":". 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"A multivariate polynomial function ","element":"span"},{"style":{"height":14.18},"width":236.33,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-11.png","element":"img","alt":" P : RP → RD","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is said to be homogeneous, if for some finite integer ","element":"span"},{"style":{"height":17.38},"width":734.78,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-12.png","element":"img","alt":" ν < ∞, ∀α ∈ R, v ∈ RP , P(αv) = ανP(v).","inline":true}],[{"id":"id-33","style":{"fontWeight":"bold"},"text":"Theorem 4 ","element":"span"},{"text":"(Homogeneous Polynomial Parameterization)","element":"span"},{"style":{"fontWeight":"bold"},"text":". ","element":"span"},{"style":{"fontStyle":"italic"},"text":"For any homogeneous polynomial map ","element":"span"},{"style":{"height":14.18},"width":253.83,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-13.png","element":"img","alt":"P : RP → RD","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"from parameters ","element":"span"},{"style":{"height":14.18},"width":145.97,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-14.png","element":"img","alt":" w ∈ RD","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"to linear predictors, almost all datasets ","element":"span"},{"style":{"height":17.38},"width":204.84,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-15.png","element":"img","alt":" {xn, yn}Nn=1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"separable by ","element":"span"},{"style":{"height":17.38},"width":398.24,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-16.png","element":"img","alt":" B := {P(w) : w ∈ RP 
}","inline":true},{"style":{"fontStyle":"italic"},"text":", almost all initializations ","element":"span"},{"style":{"height":14.18},"width":74.08,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-17.png","element":"img","alt":" w(0)","inline":true},{"style":{"fontStyle":"italic"},"text":", and any bounded sequence of step sizes ","element":"span"},{"style":{"height":16},"width":85.67,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-18.png","element":"img","alt":" {ηt}t","inline":true},{"style":{"fontStyle":"italic"},"text":", consider the sequence of gradient descent updates ","element":"span"},{"href":"#id-25","style":{"height":17.78},"width":269.1,"height":44.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-19.png","element":"img","alt":" w(t) from eq. (7)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"for minimizing the empirical risk objective ","element":"span"},{"href":"#id-23","style":{"height":16},"width":216.24,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-20.png","element":"img","alt":" LP(w) in (4)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"with exponential loss ","element":"span"},{"style":{"height":16},"width":338.88,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-21.png","element":"img","alt":" ℓ(u, y) = exp(−uy).","inline":true}],[{"style":{"fontStyle":"italic"},"text":"If (a) the iterates ","element":"span"},{"style":{"height":14.18},"width":70.23,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-22.png","element":"img","alt":" w(t)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"asymptotically minimize the objective, i.e., 
","element":"span"},{"style":{"height":18.18},"width":498.91,"height":45.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-23.png","element":"img","alt":" LP(w(t)) = L(P(w(t))) → 0","inline":true},{"style":{"fontStyle":"italic"},"text":", (b) ","element":"span"},{"style":{"height":14.18},"width":70.22,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-24.png","element":"img","alt":" w(t)","inline":true},{"style":{"fontStyle":"italic"},"text":", and consequently ","element":"span"},{"style":{"height":19.53},"width":254.88,"height":48.82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-25.png","element":"img","alt":" β(t) = P(w(t))","inline":true},{"style":{"fontStyle":"italic"},"text":", converge in direction to yield a separator with positive margin, and (c) the gradients w.r.t. to the linear predictors, ","element":"span"},{"style":{"height":20.32},"width":182.27,"height":50.8,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-26.png","element":"img","alt":" ∇βL(β(t))","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"converge in direction, then the limit direction of the parameters ","element":"span"},{"style":{"height":11.53},"width":185.93,"height":28.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-27.png","element":"img","alt":" w∞ = lim","inline":true}],[{"style":{"fontStyle":"italic"},"text":"point of the following optimization problem,","element":"span"}],[{"id":"id-34","style":{"width":"72%"},"width":1148,"height":66,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-28.png","element":"img"}],[{"text":"Theorem ","element":"span"},{"href":"#id-33","text":"4 ","element":"a"},{"text":"is proved in Appendix ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A","element":"span"},{"text":". 
The proof of Theorem ","element":"span"},{"href":"#id-33","text":"4 ","element":"a"},{"text":"involves showing that the asymptotic direction of gradient descent iterates satisfies the KKT conditions for first order stationary points of ","element":"span"},{"href":"#id-34","text":"(14)","element":"a"},{"text":". This crucially relies on two properties. First, the sequence of gradients ","element":"span"},{"href":"#id-16","referenceIndex":8,"style":{"height":20.32},"width":182.27,"height":50.8,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-29.png","element":"img","alt":" ∇βL(β(t))","inline":true,"padRight":true},{"text":"converge in direction to a positive span of support vectors of ","element":"span"},{"style":{"height":18.72},"width":182.18,"height":46.79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-30.png","element":"img","alt":" β∞ = lim","inline":true}],[{"href":"#id-16","referenceIndex":8,"text":"[2018]","element":"a"},{"text":"), and this result relies on the loss function ","element":"span"},{"style":{"height":0},"width":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-31.png","element":"img","alt":" ℓ","inline":true,"padRight":true},{"text":"being exponential tailed. Sec","element":"span"},{"href":"#id-16","referenceIndex":8,"text":"ondly, if ","element":"a"},{"style":{"fontStyle":"italic"},"text":"P ","element":"span"},{"text":"is not homogeneous, then the optimization problems ","element":"span"},{"style":{"height":17.38},"width":530.03,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-32.png","element":"img","alt":" minw∥w∥22 s.t. 
∀n, ⟨xn, yn⟩ ≥ γ","inline":true,"padRight":true},{"text":"for different values ","element":"span"},{"text":"of unnormalized margin ","element":"span"},{"style":{"height":10.4},"width":22,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-33.png","element":"img","alt":" γ","inline":true,"padRight":true},{"text":"are not equivalent and lead to different separators. Thus, for general non-homogeneous ","element":"span"},{"style":{"fontStyle":"italic"},"text":"P","element":"span"},{"text":", the unnormalized margin of one does not have a significance and the necessary conditions for the first order stationarity of ","element":"span"},{"href":"#id-34","text":"(14) ","element":"a"},{"text":"are not satisfied.","element":"span"}],[{"text":"Finally, we also note that in many cases (including linear convolutional networks) the optimization problem ","element":"span"},{"href":"#id-34","text":"(14) ","element":"a"},{"text":"is non-convex and intractable (see ","element":"span"},{"style":{"fontStyle":"italic"},"text":"e.g., ","element":"span"},{"href":"#id-29","referenceIndex":6,"text":"Ge et al. ","element":"a"},{"href":"#id-29","referenceIndex":6,"text":"[2011]","element":"a"},{"text":"). So we cannot expect ","element":"span"},{"style":{"height":11.53},"width":65.73,"height":28.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-34.png","element":"img","alt":" w∞","inline":true,"padRight":true},{"text":"to be always be a global minimizer of eq. ","element":"span"},{"href":"#id-34","text":"(14)","element":"a"},{"text":". We however suspect that it is possible to obtain a stronger result that ","element":"span"},{"style":{"height":11.53},"width":65.74,"height":28.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/6-35.png","element":"img","alt":" w∞","inline":true,"padRight":true},{"text":"reaches a higher order stationary point or even a local minimum of the explicitly regularized estimator in eq. 
","element":"span"},{"href":"#id-34","text":"(14)","element":"a"},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Implications of the implicit bias in predictor space ","element":"span"},{"text":"While eq. ","element":"span"},{"href":"#id-34","text":"(14) ","element":"a"},{"text":"characterizes the bias of gradient descent in the parameter space, what we really care about is the effective bias introduced in the space of functions learned by the network. In our case, this class of functions is the set of linear predictors ","element":"span"},{"style":{"height":17.38},"width":300.74,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/7-0.png","element":"img","alt":" {β ∈ RD}. The ℓ2","inline":true,"padRight":true},{"text":"norm penalized solution in eq. ","element":"span"},{"href":"#id-34","text":"(14)","element":"a"},{"text":", is equivalently given by,","element":"span"}],[{"id":"id-35","style":{"width":"93%"},"width":1485,"height":75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/7-1.png","element":"img"}],[{"text":"The problems in eq. ","element":"span"},{"href":"#id-34","text":"(14) ","element":"a"},{"text":"and eq. ","element":"span"},{"href":"#id-35","text":"(15) ","element":"a"},{"text":"have the same global minimizers, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i.e., ","element":"span"},{"style":{"height":10.99},"width":49.73,"height":27.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/7-2.png","element":"img","alt":" w∗ ","inline":true,"padRight":true},{"text":"is global minimizer of eq. ","element":"span"},{"href":"#id-34","text":"(14) ","element":"a"},{"text":"if and only if ","element":"span"},{"style":{"height":16.33},"width":213.69,"height":40.82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/7-3.png","element":"img","alt":" β∗ = P(w∗)","inline":true,"padRight":true},{"text":"minimizes eq. 
","element":"span"},{"href":"#id-35","text":"(15)","element":"a"},{"text":". However, such an equivalence does not extend to the stationary points of the two problems. Specifically, it is possible that a stationary point of eq. ","element":"span"},{"href":"#id-34","text":"(14) ","element":"a"},{"text":"is merely a feasible point for eq. ","element":"span"},{"href":"#id-35","text":"(15) ","element":"a"},{"text":"with no special significance. So instead of using Theorem ","element":"span"},{"href":"#id-33","text":"4, ","element":"a"},{"text":"for the specific networks in Section ","element":"span"},{"text":"3, ","element":"span"},{"text":"we directly show (in Appendix) that gradient descent updates converge in direction to a first order stationary point of the problem in eq. ","element":"span"},{"href":"#id-35","text":"(15)","element":"a"},{"text":".","element":"span"}]]},{"heading":"5 Understanding Gradient Descent in Predictor Space","paragraphs":[[{"text":"In the previous section, we saw that the implicit bias of gradient descent on a parameterization ","element":"span"},{"style":{"fontStyle":"italic"},"text":"P","element":"span"},{"text":"(","element":"span"},{"style":{"fontWeight":"bold"},"text":"w","element":"span"},{"text":") ","element":"span"},{"text":"can be described in terms of the optimization problem ","element":"span"},{"href":"#id-34","text":"(14) ","element":"a"},{"text":"and the implied penalty function ","element":"span"},{"style":{"height":19.06},"width":488.2,"height":47.66,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/7-4.png","element":"img","alt":"RP(β) = minw:P(w)=β∥w∥22","inline":true},{"text":". 
We now turn to studying this implied penalty ","element":"span"},{"style":{"height":16},"width":119.82,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/7-5.png","element":"img","alt":" RP(β)","inline":true,"padRight":true},{"text":"and obtaining ","element":"span"},{"text":"explicit forms for it, which will reveal the precise form of the implicit bias in terms of the learned linear predictor. The proofs of the lemmas in this section are provided in the Appendix ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D","element":"span"},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Lemma 5. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"For fully connected networks of any depth ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L > ","element":"span"},{"text":"0","element":"span"},{"style":{"fontStyle":"italic"},"text":",","element":"span"}],[{"style":{"width":"66%"},"width":1053,"height":78,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/7-6.png","element":"img"}],[{"text":"We see that ","element":"span"},{"style":{"height":22.26},"width":885.6,"height":55.66,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/7-7.png","element":"img","alt":" β∗RPfull = argminβ RPfull(β) s.t. ∀n, yn⟨xn, β⟩ ≥ 1","inline":true,"padRight":true},{"text":"in eq. ","element":"span"},{"href":"#id-35","text":"(15) ","element":"a"},{"text":"for fully connected ","element":"span"},{"text":"networks is independent of the depth of the network ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L","element":"span"},{"text":". 
In Theorem ","element":"span"},{"href":"#id-26","text":"1, ","element":"a"},{"text":"we indeed show that gradient descent for this class of networks converges in the direction of ","element":"span"},{"style":{"height":22.26},"width":142.85,"height":55.66,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/7-8.png","element":"img","alt":" β∗RPfull .","inline":true}],[{"text":"Next, we motivate the characterization of ","element":"span"},{"style":{"height":16},"width":119.82,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/7-9.png","element":"img","alt":" RP(β)","inline":true,"padRight":true},{"text":"for linear convolutional networks by first looking at the special ","element":"span"},{"style":{"fontStyle":"italic"},"text":"linear diagonal network ","element":"span"},{"text":"depicted in Figure ","element":"span"},{"href":"#id-31","text":"1c. ","element":"a"},{"text":"The depth–","element":"span"},{"style":{"fontStyle":"italic"},"text":"L ","element":"span"},{"text":"diagonal network is parameterized by ","element":"span"},{"style":{"height":18.3},"width":349.94,"height":45.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/7-10.png","element":"img","alt":" w = [wl ∈ RD]Ll=1","inline":true,"padRight":true},{"text":"and the mapping to a linear predictor is given ","element":"span"},{"text":"by ","element":"span"},{"style":{"height":16.79},"width":834.54,"height":41.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/7-11.png","element":"img","alt":" Pdiag(w) = diag(w1)diag(w2) . . . diag(wL−1)wL.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Lemma 6. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"For a depth–","element":"span"},{"style":{"fontStyle":"italic"},"text":"L ","element":"span"},{"style":{"fontStyle":"italic"},"text":"diagonal network with parameters ","element":"span"},{"style":{"height":18.3},"width":463.46,"height":45.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/7-12.png","element":"img","alt":" w = [wl ∈ RD]Ll−1, we have","inline":true}],[{"style":{"width":"68%"},"width":1087,"height":78,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/7-13.png","element":"img"}],[{"text":"Finally, for full width linear convolutional networks parameterized by ","element":"span"},{"style":{"height":18.3},"width":311.44,"height":45.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/7-14.png","element":"img","alt":" w = [wl ∈ RD]Ll=1","inline":true},{"text":", recall the ","element":"span"},{"text":"following representation of ","element":"span"},{"style":{"height":16},"width":242.33,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/7-15.png","element":"img","alt":" β = Pconv(w)","inline":true,"padRight":true},{"text":"in Fourier from Lemma ","element":"span"},{"href":"#id-36","text":"3.","element":"a"}],[{"style":{"width":"34%"},"width":554,"height":51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/7-16.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":16.59},"width":206.44,"height":41.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/7-17.png","element":"img","alt":"�β, �wl ∈ CD","inline":true,"padRight":true},{"text":"are Fourier basis representation of ","element":"span"},{"style":{"height":16.58},"width":202.13,"height":41.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/7-18.png","element":"img","alt":" β, wl ∈ RD","inline":true},{"text":", respectively. 
Extending the result of diagonal networks for the complex vector spaces, we get the following characterization of ","element":"span"},{"style":{"height":16},"width":179.07,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/7-19.png","element":"img","alt":"RPconv(β)","inline":true,"padRight":true},{"text":"for linear convolutional networks.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Lemma 7. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"For a depth–","element":"span"},{"style":{"fontStyle":"italic"},"text":"L ","element":"span"},{"style":{"fontStyle":"italic"},"text":"convolutional network with parameters ","element":"span"},{"style":{"height":18.3},"width":463.46,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/7-20.png","element":"img","alt":" w = [wl ∈ RD]Ll−1, we have","inline":true}],[{"style":{"width":"69%"},"width":1096,"height":77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/7-21.png","element":"img"}]]},{"heading":"6 Discussion","paragraphs":[[{"text":"In this paper, we characterized the implicit bias of gradient descent on linear convolutional networks. We showed that even in the case of linear activations and a full width convolution, wherein the convolutional network defines the exact same model class as fully connected networks, merely changing to a convolutional parameterization introduces radically different, and very interesting, bias ","element":"span"},{"text":"when training with gradient descent. Namely, training a convolutional representation with gradient descent implicitly biases towards sparsity in the frequency domain representation of linear predictor.","element":"span"}],[{"text":"For convenience and simplicity of presentation, we studied one dimensional circular convolutions. 
Our results can be directly extended to higher dimensional input signals and convolutions, including the two-dimensional convolutions common in image processing and computer vision. We also expect similar results for convolutions with zero padding instead of circular convolutions, although this requires more care with analysis of the edge effects.","element":"span"}],[{"text":"A more significant way in which our setup differs from usual convolutional networks is that we use full width convolutions, while in practice it is common to use convolutions with bounded width, much smaller then the input dimensionality. This setting is within the scope of Theorem ","element":"span"},{"href":"#id-33","text":"4, ","element":"a"},{"text":"as the linear transformation is still homogeneous. However, understanding the implied bias in the predictor space, i.e. understanding ","element":"span"},{"style":{"height":16},"width":119.81,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/8-0.png","element":"img","alt":" RP(β)","inline":true,"padRight":true},{"text":"requires additional work. It will be very interesting to see if restricting the width of the convolutional network gives rise to further interesting behaviors.","element":"span"}],[{"text":"Another important direction for future study is understanding the implicit bias for networks with multiple outputs. For both fully connected and convolutional networks, we looked at networks with a single output. 
With ","element":"span"},{"style":{"fontStyle":"italic"},"text":"C > ","element":"span"},{"text":"1 ","element":"span"},{"text":"outputs, the network implements a linear transformation ","element":"span"},{"style":{"height":14.4},"width":245.75,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/8-1.png","element":"img","alt":" x �→ βx where","inline":true},{"style":{"height":16.58},"width":185.56,"height":41.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/8-2.png","element":"img","alt":"β ∈ RC×D","inline":true,"padRight":true},{"text":"is now a matrix. Results for matrix sensing in ","element":"span"},{"href":"#id-16","referenceIndex":8,"text":"Gunasekar et al. ","element":"a"},{"href":"#id-16","referenceIndex":8,"text":"[2018] ","element":"a"},{"text":"imply that for two layer fully connected networks with multiple outputs, the implicit bias is to a maximum margin solution with respect to the nuclear norm ","element":"span"},{"style":{"height":16},"width":81.5,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/8-3.png","element":"img","alt":" ∥β∥⋆","inline":true},{"text":". This is already different from the implicit bias of a one-layer “network” (i.e. optimizing ","element":"span"},{"style":{"height":14},"width":27,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/8-4.png","element":"img","alt":" β","inline":true,"padRight":true},{"text":"directly), which would be in terms of the Frobenius norm ","element":"span"},{"style":{"height":16},"width":91.5,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/8-5.png","element":"img","alt":"∥β∥F","inline":true,"padRight":true},{"text":"(from the result of ","element":"span"},{"href":"#id-12","referenceIndex":26,"text":"Soudry et al. ","element":"a"},{"href":"#id-12","referenceIndex":26,"text":"[2017]","element":"a"},{"text":"). 
We suspect that with multiple outputs, as more layers are added, even fully connected networks exhibit a shrinking sparsity penalty on the singular values of the effective linear matrix predictor ","element":"span"},{"style":{"height":16.58},"width":184.52,"height":41.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/8-6.png","element":"img","alt":" β ∈ RC×D","inline":true},{"text":". Precisely characterizing these biases requires further study.","element":"span"}],[{"text":"When using convolutions as part of a larger network, with multiple parallel filters, max pooling, and non-linear activations, the situation is of course more complex, and we do not expect to get the exact same bias. However, we do expect the bias to be at the very least related to the sparsity-in-frequency-domain bias that we uncover here, and we hope our work can serve as a basis for further such study. There are of course many other implicit and explicit sources of inductive bias—here we show that merely parameterizing transformations via convolutions and using gradient descent for training already induces sparsity in the frequency domain.","element":"span"}],[{"text":"On a technical level, we provided a generic characterization for the bias of gradient descent on linear models parameterized as ","element":"span"},{"style":{"height":16},"width":178.26,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/8-7.png","element":"img","alt":" β = P(w)","inline":true,"padRight":true},{"text":"for a homogeneous polynomial ","element":"span"},{"style":{"fontStyle":"italic"},"text":"P","element":"span"},{"text":". 
The ","element":"span"},{"style":{"height":7.6},"width":32.6,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/8-8.png","element":"img","alt":" ℓ2","inline":true,"padRight":true},{"text":"bias (in parameter space) we obtained is not surprising, but also should not be taken for granted – ","element":"span"},{"style":{"fontStyle":"italic"},"text":"e.g., ","element":"span"},{"text":"the result does not hold in general for non-homogeneous ","element":"span"},{"style":{"fontStyle":"italic"},"text":"P","element":"span"},{"text":", and even with homogeneous polynomials, the characterization is not as crisp when other loss functions are used, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"e.g., ","element":"span"},{"text":"with a squared loss and matrix factorization (a homogeneous degree two polynomial representation), the implicit bias is much more fragile ","element":"span"},{"href":"#id-15","referenceIndex":7,"text":"Gunasekar et al. ","element":"a"},{"href":"#id-15","referenceIndex":7,"text":"[2017]","element":"a"},{"text":", ","element":"span"},{"href":"#id-37","referenceIndex":17,"text":"Li et al. ","element":"a"},{"href":"#id-37","referenceIndex":17,"text":"[2017]","element":"a"},{"text":". Moreover, Theorem ","element":"span"},{"href":"#id-33","text":"4 ","element":"a"},{"text":"only ensures convergence to first order stationary point in the parameter space, which is not sufficient for convergence to stationary points of the implied bias in the model space (eq. ","element":"span"},{"href":"#id-35","text":"(15)","element":"a"},{"text":"). 
It is of interest for future work to strengthen this result to show either convergence to higher order stationary points or local minima in parameter space, or to directly show the convergence to stationary points of ","element":"span"},{"href":"#id-35","text":"(15)","element":"a"},{"text":".","element":"span"}],[{"text":"It would also be of interest to strengthen other technical aspects of our results: extend the results to loss functions with tight exponential tails (including logistic loss) and handle all datasets including the set of measure zero degenerate datasets—these should be possible following the techniques of ","element":"span"},{"href":"#id-12","referenceIndex":26,"text":"Soudry et al. ","element":"a"},{"href":"#id-12","referenceIndex":26,"text":"[2017]","element":"a"},{"text":", ","element":"span"},{"href":"#id-17","referenceIndex":27,"text":"Telgarsky ","element":"a"},{"href":"#id-17","referenceIndex":27,"text":"[2013]","element":"a"},{"text":", ","element":"span"},{"href":"#id-38","referenceIndex":11,"text":"Ji and Telgarsky ","element":"a"},{"href":"#id-38","referenceIndex":11,"text":"[2018]","element":"a"},{"text":". We can also calculate exact rates of convergence to the asymptotic separator along the lines of ","element":"span"},{"href":"#id-12","referenceIndex":26,"text":"Soudry et al. ","element":"a"},{"href":"#id-12","referenceIndex":26,"text":"[2017]","element":"a"},{"text":", ","element":"span"},{"href":"#id-39","referenceIndex":19,"text":"Nacson et al. 
","element":"a"},{"href":"#id-39","referenceIndex":19,"text":"[2018]","element":"a"},{"text":", ","element":"span"},{"href":"#id-38","referenceIndex":11,"text":"Ji and Telgarsky ","element":"a"},{"href":"#id-38","referenceIndex":11,"text":"[2018] ","element":"a"},{"text":"showing how fast the inductive bias from optimization kicks in and why it might be beneficial to continue optimizing even after the loss value ","element":"span"},{"style":{"height":19.53},"width":125.57,"height":48.82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/8-9.png","element":"img","alt":" L(β(t))","inline":true,"padRight":true},{"text":"itself is negligible. Finally, for logistic regression, ","element":"span"},{"href":"#id-38","referenceIndex":11,"text":"Ji and Telgarsky ","element":"a"},{"href":"#id-38","referenceIndex":11,"text":"[2018] ","element":"a"},{"text":"extend the results of asymptotic convergence of gradient descent classifier to the cases where the data is not strictly linearly separable. This is an important relaxation of our assumption on strict linear separability. 
More generally, for non-separable data, we would like a more fine grained analysis connecting the iterates ","element":"span"},{"style":{"height":18.73},"width":64.14,"height":46.82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/8-10.png","element":"img","alt":" β(t) ","inline":true,"padRight":true},{"text":"along the optimization path to the estimates along regularization path, ","element":"span"},{"style":{"height":18.7},"width":491.75,"height":46.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/8-11.png","element":"img","alt":"�β(c) = argminRP(β)≤c L(β)","inline":true},{"text":", where an explicit regularization is added to the optimization objective.","element":"span"}]]},{"heading":"References","paragraphs":[[{"id":"id-6","text":"Marcin Andrychowicz, Misha Denil, Sergio Gomez, Matthew W Hoffman, David Pfau, Tom Schaul, and Nando ","element":"span"},{"text":"de Freitas. Learning to learn by gradient descent by gradient descent. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":", 2016.","element":"span"}],[{"id":"id-13","text":"P. L. Bartlett and S. Mendelson. Rademacher and Gaussian complexities: Risk bounds and structural results. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Machine Learning Research","element":"span"},{"text":", 2003.","element":"span"}],[{"id":"id-18","text":"Samuel Burer and Renato DC Monteiro. A nonlinear programming algorithm for solving semidefinite programs ","element":"span"},{"text":"via low-rank factorization. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Mathematical Programming","element":"span"},{"text":", 95(2):329–357, 2003.","element":"span"}],[{"id":"id-4","text":"Pratik Chaudhari, Anna Choromanska, Stefano Soatto, Yann LeCun, Carlo Baldassi, Christian Borgs, Jennifer ","element":"span"},{"text":"Chayes, Levent Sagun, and Riccardo Zecchina. 
Entropy-sgd: Biasing gradient descent into wide valleys. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1611.01838","element":"span"},{"text":", 2016.","element":"span"}],[{"id":"id-5","text":"Laurent Dinh, Razvan Pascanu, Samy Bengio, and Yoshua Bengio. Sharp minima can generalize for deep nets. ","element":"span"},{"text":"In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Machine Learning","element":"span"},{"text":", 2017.","element":"span"}],[{"id":"id-29","text":"Dongdong Ge, Xiaoye Jiang, and Yinyu Ye. A note on the complexity of lp minimization. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Mathematical programming","element":"span"},{"text":", 2011.","element":"span"}],[{"id":"id-15","text":"Suriya Gunasekar, Blake E Woodworth, Srinadh Bhojanapalli, Behnam Neyshabur, and Nati Srebro. Implicit ","element":"span"},{"text":"regularization in matrix factorization. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"NIPS","element":"span"},{"text":", 2017.","element":"span"}],[{"id":"id-16","text":"Suriya Gunasekar, Jason D. Lee, Daniel Soudry, and Nathan Srebro. Characterizing implicit bias in terms of ","element":"span"},{"text":"optimization geometry. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint","element":"span"},{"text":", 2018.","element":"span"}],[{"id":"id-2","text":"Sepp Hochreiter and Jürgen Schmidhuber. Flat minima. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Neural Computation","element":"span"},{"text":", 1997.","element":"span"}],[{"id":"id-10","text":"Elad Hoffer, Itay Hubara, and Daniel Soudry. Train longer, generalize better: closing the generalization gap in ","element":"span"},{"text":"large batch training of neural networks. 
In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":", 2017.","element":"span"}],[{"id":"id-38","text":"Ziwei Ji and Matus Telgarsky. ","element":"span"},{"text":"Risk and parameter convergence of logistic regression. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1803.07300","element":"span"},{"text":", 2018.","element":"span"}],[{"id":"id-19","text":"Michel Journée, Francis Bach, P-A Absil, and Rodolphe Sepulchre. Low-rank optimization on the cone of ","element":"span"},{"text":"positive semidefinite matrices. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"SIAM Journal on Optimization","element":"span"},{"text":", 20(5):2327–2351, 2010.","element":"span"}],[{"id":"id-14","text":"Sham M Kakade, Karthik Sridharan, and Ambuj Tewari. On the complexity of linear prediction: Risk bounds, ","element":"span"},{"text":"margin bounds, and regularization. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in neural information processing systems","element":"span"},{"text":", 2009.","element":"span"}],[{"id":"id-20","text":"Kenji Kawaguchi. Deep learning without poor local minima. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":", 2016.","element":"span"}],[{"id":"id-3","text":"Nitish Shirish Keskar, Dheevatsa Mudigere, Jorge Nocedal, Mikhail Smelyanskiy, and Ping Tak Peter Tang. On ","element":"span"},{"text":"large-batch training for deep learning: Generalization gap and sharp minima. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Learning Representations","element":"span"},{"text":", 2016.","element":"span"}],[{"id":"id-22","text":"Jason D. Lee, Max Simchowitz, Michael I. Jordan, and Benjamin Recht. Gradient descent only converges to ","element":"span"},{"text":"minimizers. 
In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"29th Annual Conference on Learning Theory","element":"span"},{"text":", 2016.","element":"span"}],[{"id":"id-37","text":"Yuanzhi Li, Tengyu Ma, and Hongyang Zhang. Algorithmic regularization in over-parameterized matrix recovery. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1712.09203","element":"span"},{"text":", 2017.","element":"span"}],[{"id":"id-82","text":"Marian Muresan. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A concrete approach to classical analysis","element":"span"},{"text":", volume 14. Springer, 2009.","element":"span"}],[{"id":"id-39","text":"Mor Shpigel Nacson, Jason Lee, Suriya Gunasekar, Nathan Srebro, and Daniel Soudry. Convergence of gradient ","element":"span"},{"text":"descent on separable data. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1803.01905","element":"span"},{"text":", 2018.","element":"span"}],[{"id":"id-1","text":"Behnam Neyshabur, Ruslan R Salakhutdinov, and Nati Srebro. Path-sgd: Path-normalized optimization in deep ","element":"span"},{"text":"neural networks. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":", pages 2422–2430, 2015a.","element":"span"}],[{"id":"id-0","text":"Behnam Neyshabur, Ryota Tomioka, and Nathan Srebro. In search of the real inductive bias: On the role of ","element":"span"},{"text":"implicit regularization in deep learning. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Learning Representations","element":"span"},{"text":", 2015b.","element":"span"}],[{"id":"id-7","text":"Behnam Neyshabur, Ryota Tomioka, Ruslan Salakhutdinov, and Nathan Srebro. Geometry of optimization and ","element":"span"},{"text":"implicit regularization in deep learning. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint","element":"span"},{"text":", 2017.","element":"span"}],[{"id":"id-21","text":"Quynh Nguyen and Matthias Hein. The loss surface of deep and wide neural networks. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1704.08045","element":"span"},{"text":", 2017.","element":"span"}],[{"id":"id-30","text":"R Tyrrell Rockafellar. Directionally lipschitzian functions and subdifferential calculus. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the London Mathematical Society","element":"span"},{"text":", 1979.","element":"span"}],[{"id":"id-11","text":"Le Smith, Kindermans. Don’t Decay the Learning Rate, Increase the Batch Size. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"ICLR","element":"span"},{"text":", 2018.","element":"span"}],[{"id":"id-12","text":"Daniel Soudry, Elad Hoffer, and Nathan Srebro. The implicit bias of gradient descent on separable data. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1710.10345","element":"span"},{"text":", 2017.","element":"span"}],[{"id":"id-17","text":"Matus Telgarsky. Margins, shrinkage and boosting. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 30th International Conference on International Conference on Machine Learning-Volume 28","element":"span"},{"text":", pages II–307. JMLR. org, 2013.","element":"span"}],[{"id":"id-9","text":"Ashia C Wilson, Rebecca Roelofs, Mitchell Stern, Nati Srebro, and Benjamin Recht. The marginal value of ","element":"span"},{"text":"adaptive gradient methods in machine learning. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":", 2017.","element":"span"}],[{"id":"id-8","text":"Chiyuan Zhang, Samy Bengio, Moritz Hardt, Benjamin Recht, and Oriol Vinyals. 
Understanding deep learning ","element":"span"},{"text":"requires rethinking generalization. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Learning Representations","element":"span"},{"text":", 2017.","element":"span"}]]},{"heading":"Appendix","paragraphs":[[{"text":"The proofs of the theorems in the paper are organized as follows: In Appendix ","element":"span"},{"text":"A ","element":"span"},{"text":"we first give the proof for Theorem ","element":"span"},{"href":"#id-33","text":"4, ","element":"a"},{"text":"which includes linear fully connected and full width convolutional networks as special cases. This gives us some general results that can be special-cased to prove the stronger results for these networks in Section ","element":"span"},{"text":"3. ","element":"span"},{"text":"In Appendix ","element":"span"},{"text":"B, ","element":"span"},{"text":"we prove Theorem ","element":"span"},{"href":"#id-26","text":"1 ","element":"a"},{"text":"on the implicit bias of fully connected linear networks. In Appendix ","element":"span"},{"text":"C, ","element":"span"},{"text":"we prove Theorem ","element":"span"},{"href":"#id-27","text":"2–","element":"a"},{"href":"#id-32","text":"2a ","element":"a"},{"text":"on the implicit bias of linear convolutional networks. Finally, in Appendix ","element":"span"},{"text":"D ","element":"span"},{"text":"we prove the lemmas in Section ","element":"span"},{"text":"5 ","element":"span"},{"text":"on computing the form of implicit bias of linear networks learned using gradient descent.","element":"span"}],[{"text":"Unless specified otherwise, ","element":"span"},{"style":{"height":16},"width":50.99,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-0.png","element":"img","alt":" ∥.∥","inline":true,"padRight":true},{"text":"denotes the Euclidean norm. 
We additionally use the notation ","element":"span"},{"style":{"height":7.2},"width":116.78,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-1.png","element":"img","alt":" v ∝ v′","inline":true,"padRight":true},{"text":"to denote equality up to strictly positive scalar multipliers, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i.e., ","element":"span"},{"text":"when ","element":"span"},{"style":{"height":14.4},"width":403.18,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-2.png","element":"img","alt":" v = γv′ for some γ > 0.","inline":true}],[{"text":"The following is a paraphrasing of Lemma ","element":"span"},{"text":"8 ","element":"span"},{"text":"in ","element":"span"},{"href":"#id-16","referenceIndex":8,"text":"Gunasekar et al. ","element":"a"},{"href":"#id-16","referenceIndex":8,"text":"[2018] ","element":"a"},{"text":"and is used in multiple proofs.","element":"span"}],[{"id":"id-41","style":{"fontWeight":"bold"},"text":"Lemma 8. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"[Lemma ","element":"span"},{"text":"8 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"in ","element":"span"},{"href":"#id-16","referenceIndex":8,"style":{"fontStyle":"italic"},"text":"Gunasekar et al. 
","element":"a"},{"href":"#id-16","referenceIndex":8,"style":{"fontStyle":"italic"},"text":"[2018]","element":"a"},{"style":{"fontStyle":"italic"},"text":"] For almost all linearly separable dataset ","element":"span"},{"style":{"height":16},"width":177.36,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-3.png","element":"img","alt":" {xn, yn}n,","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"consider any sequence ","element":"span"},{"style":{"height":18.73},"width":64.14,"height":46.82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-4.png","element":"img","alt":" β(t)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"that minimizes the empirical objective in eq. ","element":"span"},{"href":"#id-24","text":"(5)","element":"a"},{"style":{"fontStyle":"italic"},"text":", i.e., ","element":"span"},{"style":{"height":19.53},"width":208.63,"height":48.82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-5.png","element":"img","alt":" L(β(t)) → 0","inline":true},{"style":{"fontStyle":"italic"},"text":". If (a) ","element":"span"},{"style":{"height":18.72},"width":190.88,"height":46.79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-6.png","element":"img","alt":" β∞ := lim","inline":true}],[{"style":{"height":21.1},"width":1584.32,"height":52.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-7.png","element":"img","alt":"∃{αn ≥ 0}n∈S s.t. 
z∞ = �n∈S αn ynxn, where S = {n : yn⟨β∞, xn⟩ = minn yn⟨β∞, xn⟩} are","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"the indices of the data points with smallest margin to the limit direction ","element":"span"},{"style":{"height":18.72},"width":71.41,"height":46.79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-8.png","element":"img","alt":" β∞.","inline":true}]]},{"heading":"A Homogeneous Polynomial Parameterization: Proof of Theorem 4","paragraphs":[[{"style":{"fontWeight":"bold"},"text":"Theorem 4 ","element":"span"},{"text":"(Homogeneous Polynomial Parameterization)","element":"span"},{"style":{"fontWeight":"bold"},"text":". ","element":"span"},{"style":{"fontStyle":"italic"},"text":"For any homogeneous polynomial map ","element":"span"},{"style":{"height":14.18},"width":253.83,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-9.png","element":"img","alt":"P : RP → RD","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"from parameters ","element":"span"},{"style":{"height":14.18},"width":145.97,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-10.png","element":"img","alt":" w ∈ RD","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"to linear predictors, almost all datasets ","element":"span"},{"style":{"height":17.38},"width":204.84,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-11.png","element":"img","alt":" {xn, yn}Nn=1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"separable by ","element":"span"},{"style":{"height":17.38},"width":398.24,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-12.png","element":"img","alt":" B := {P(w) : w ∈ RP }","inline":true},{"style":{"fontStyle":"italic"},"text":", almost all initializations 
","element":"span"},{"style":{"height":14.18},"width":74.08,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-13.png","element":"img","alt":" w(0)","inline":true},{"style":{"fontStyle":"italic"},"text":", and any bounded sequence of step sizes ","element":"span"},{"style":{"height":16},"width":85.67,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-14.png","element":"img","alt":" {ηt}t","inline":true},{"style":{"fontStyle":"italic"},"text":", consider the sequence of gradient descent updates ","element":"span"},{"href":"#id-25","style":{"height":17.79},"width":269.1,"height":44.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-15.png","element":"img","alt":" w(t) from eq. (7)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"for minimizing the empirical risk objective ","element":"span"},{"href":"#id-23","style":{"height":16},"width":216.24,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-16.png","element":"img","alt":" LP(w) in (4)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"with exponential loss ","element":"span"},{"style":{"height":16},"width":338.88,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-17.png","element":"img","alt":" ℓ(u, y) = exp(−uy).","inline":true}],[{"style":{"fontStyle":"italic"},"text":"If (a) the iterates ","element":"span"},{"style":{"height":14.18},"width":70.23,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-18.png","element":"img","alt":" w(t)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"asymptotically minimize the objective, i.e., ","element":"span"},{"style":{"height":18.18},"width":498.91,"height":45.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-19.png","element":"img","alt":" LP(w(t)) = L(P(w(t))) → 
0","inline":true},{"style":{"fontStyle":"italic"},"text":", (b) ","element":"span"},{"style":{"height":14.19},"width":70.22,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-20.png","element":"img","alt":" w(t)","inline":true},{"style":{"fontStyle":"italic"},"text":", and consequently ","element":"span"},{"style":{"height":19.53},"width":254.88,"height":48.82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-21.png","element":"img","alt":" β(t) = P(w(t))","inline":true},{"style":{"fontStyle":"italic"},"text":", converge in direction to yield a separator with positive margin, and (c) the gradients w.r.t. to the linear predictors, ","element":"span"},{"style":{"height":20.32},"width":182.27,"height":50.8,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-22.png","element":"img","alt":" ∇βL(β(t))","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"converge in direction, then the limit direction of the parameters ","element":"span"},{"style":{"height":11.53},"width":185.93,"height":28.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-23.png","element":"img","alt":" w∞ = lim","inline":true}],[{"style":{"fontStyle":"italic"},"text":"point of the following optimization problem,","element":"span"}],[{"style":{"width":"72%"},"width":1148,"height":66,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-24.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"style":{"height":14.18},"width":70.23,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-25.png","element":"img","alt":" w(t) ","inline":true,"padRight":true},{"text":"are the sequence gradient descent iterates from eq. 
","element":"span"},{"href":"#id-25","text":"(7) ","element":"a"},{"text":"for minimizing ","element":"span"},{"href":"#id-23","style":{"height":16},"width":263.7,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-26.png","element":"img","alt":" LP(w) in eq (4)","inline":true,"padRight":true},{"text":"with exponential loss over the model class of ","element":"span"},{"style":{"height":17.39},"width":387.17,"height":43.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-27.png","element":"img","alt":" B = {P(w) : w ∈ RP }","inline":true},{"text":", where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"P ","element":"span"},{"text":"is a homogeneous polynomial function. We first introduce some notation.","element":"span"}],[{"text":"1. From the assumption in theorem, we have that ","element":"span"},{"style":{"height":11.53},"width":185.93,"height":28.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-28.png","element":"img","alt":" w∞ = lim","inline":true}],[{"id":"id-40","style":{"width":"99%"},"width":1582,"height":136,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-29.png","element":"img"}],[{"text":"2. Let ","element":"span"},{"style":{"height":19.53},"width":264.24,"height":48.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-30.png","element":"img","alt":" β(t) = P(w(t))","inline":true,"padRight":true},{"text":"denote the sequence of linear predictors for this network induced by the gradient descent iterates. 
We can see that ","element":"span"},{"style":{"height":18.73},"width":64.14,"height":46.82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-31.png","element":"img","alt":" β(t)","inline":true,"padRight":true},{"text":"converges in direction too using the following arguments: homogeneity of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"P ","element":"span"},{"text":"implies that ","element":"span"},{"style":{"height":18.08},"width":526.55,"height":45.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-32.png","element":"img","alt":" P(w(t)/∥w(t)∥) = P(w(t))/∥w(t)∥ν","inline":true,"padRight":true},{"text":"for some ","element":"span"},{"style":{"height":6.8},"width":21,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-33.png","element":"img","alt":" ν","inline":true},{"text":". Hence,","element":"span"}],[{"style":{"width":"48%"},"width":772,"height":60,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-34.png","element":"img"}],[{"text":"3. ","element":"span"},{"style":{"height":28.8},"width":877.4,"height":72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-35.png","element":"img","alt":" z(t) = −∇βL(β(t)) = �n exp�−⟨β(t), ynxn⟩�ynxn","inline":true},{"text":". Since we assume that ","element":"span"},{"style":{"height":14.18},"width":56.86,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-36.png","element":"img","alt":" z(t)","inline":true,"padRight":true},{"text":"converges in","element":"span"}],[{"id":"id-43","style":{"width":"99%"},"width":1581,"height":133,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/11-37.png","element":"img"}],[{"text":"4. 
Let ","element":"span"},{"style":{"height":19.67},"width":361.49,"height":49.18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-0.png","element":"img","alt":" ∇wP�w(t)�∈ RP ×D ","inline":true,"padRight":true},{"text":"denote the Jacobian of ","element":"span"},{"style":{"height":29.23},"width":745.82,"height":73.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-1.png","element":"img","alt":" P(w), i.e., ∇wP�w(t)�[p, d] =∂(P(w(t))[d])∂w[p] .","inline":true}],[{"text":"If ","element":"span"},{"style":{"height":14.19},"width":243.41,"height":35.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-2.png","element":"img","alt":" P : RP → RD","inline":true,"padRight":true},{"text":"is a homogeneous polynomial of degree ","element":"span"},{"style":{"height":11.6},"width":98.9,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-3.png","element":"img","alt":" ν > 0","inline":true},{"text":", then ","element":"span"},{"style":{"height":15.78},"width":354.57,"height":39.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-4.png","element":"img","alt":" ∇wP : RP → RP ×D","inline":true,"padRight":true},{"text":"is a homogeneous polynomial of degree ","element":"span"},{"style":{"height":10.8},"width":90.93,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-5.png","element":"img","alt":" ν − 1","inline":true},{"text":". Using eq. 
","element":"span"},{"href":"#id-40","text":"(16)","element":"a"},{"text":", we have","element":"span"}],[{"style":{"width":"56%"},"width":903,"height":94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-6.png","element":"img"}],[{"text":"Thus, ","element":"span"},{"style":{"height":20.93},"width":165.4,"height":52.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-7.png","element":"img","alt":" ∃δ(t)1 → 0","inline":true},{"text":", such that","element":"span"}],[{"style":{"width":"77%"},"width":1221,"height":68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-8.png","element":"img"}],[{"text":"5. Finally, from the definition of ","element":"span"},{"style":{"height":21.02},"width":1061.24,"height":52.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-9.png","element":"img","alt":" ∇wP(w), we have ∇wLP(w(t)) = ∇wP�w(t)�∇βL(β(t)), and","inline":true,"padRight":true},{"text":"hence from eq. ","element":"span"},{"href":"#id-25","text":"(7)","element":"a"},{"text":",","element":"span"}],[{"id":"id-44","style":{"width":"74%"},"width":1177,"height":68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-10.png","element":"img"}],[{"text":"Using the assumptions in the theorem along with our argument above for convergence of ","element":"span"},{"style":{"height":18.73},"width":64.14,"height":46.82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-11.png","element":"img","alt":" β(t)","inline":true,"padRight":true},{"text":"in direction, we satisfy the conditions of Lemma ","element":"span"},{"href":"#id-41","text":"8, ","element":"a"},{"text":"which will be crucially used in our proof.","element":"span"}],[{"id":"id-42","style":{"width":"99%"},"width":1585,"height":426,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-12.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Primal feasibility. 
","element":"span"},{"text":"We showed earlier that if ","element":"span"},{"style":{"height":14.19},"width":70.23,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-13.png","element":"img","alt":" w(t)","inline":true,"padRight":true},{"text":"converges in direction, then ","element":"span"},{"style":{"height":19.53},"width":268.28,"height":48.82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-14.png","element":"img","alt":" β(t) = P(w(t))","inline":true,"padRight":true},{"text":"converges in direction to ","element":"span"},{"style":{"height":18.72},"width":179.84,"height":46.79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-15.png","element":"img","alt":" β∞ = lim","inline":true}],[{"text":"we have that ","element":"span"},{"style":{"height":19.52},"width":538.96,"height":48.79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-16.png","element":"img","alt":" β∞ satisfies ∀n, yn⟨xn, β∞⟩ > 0","inline":true},{"text":", which also implies ","element":"span"},{"style":{"height":16},"width":522.2,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-17.png","element":"img","alt":" minn yn⟨xn, P(w∞)⟩ > 0 since","inline":true},{"style":{"height":19.52},"width":417.39,"height":48.79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-18.png","element":"img","alt":"β∞ ∝ P(w∞). 
Now, if P","inline":true,"padRight":true},{"text":"is homogeneous of of degree ","element":"span"},{"style":{"height":20.41},"width":712.54,"height":51.04,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-19.png","element":"img","alt":" ν, then for γ = (minn yn⟨xn, P(w∞)⟩)−1/ν,","inline":true},{"style":{"height":16},"width":800.21,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-20.png","element":"img","alt":"�w∞ = γw∞ satisfies minn yn⟨xn, P(�w∞)⟩ = 1.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Showing other KKT conditions for ","element":"span"},{"style":{"height":11.38},"width":77.5,"height":28.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-21.png","element":"img","alt":" �w∞.","inline":true,"padRight":true},{"text":"The crux of the proof of Theorem ","element":"span"},{"href":"#id-33","text":"4 ","element":"a"},{"text":"involves showing the existence of ","element":"span"},{"style":{"height":16},"width":183.48,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-22.png","element":"img","alt":" {αn ≥ 0}n","inline":true,"padRight":true},{"text":"such that the stationarity and complementary slackness conditions in eq. ","element":"span"},{"href":"#id-42","text":"(20) ","element":"a"},{"text":"are satisfied. 
This crucially relies on a key lemma (Lemma ","element":"span"},{"href":"#id-41","text":"8) ","element":"a"},{"text":"showing that the gradient in the space of linear predictors ","element":"span"},{"style":{"height":20.32},"width":182.27,"height":50.8,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-23.png","element":"img","alt":" ∇βL(β(t))","inline":true,"padRight":true},{"text":"are dominated by positive linear combinations of support vectors of the asymptotic predictor ","element":"span"},{"style":{"height":18.72},"width":71.41,"height":46.79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-24.png","element":"img","alt":" β∞.","inline":true}],[{"text":"Let ","element":"span"},{"style":{"height":16},"width":546.75,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-25.png","element":"img","alt":" S∞ = {n : yn⟨P(�w∞), xn⟩ = 1}","inline":true,"padRight":true},{"text":"denote the indices of support vectors for ","element":"span"},{"style":{"height":16},"width":129.99,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-26.png","element":"img","alt":" P(�w∞)","inline":true},{"text":", which are also the support vectors of ","element":"span"},{"style":{"height":18.72},"width":59.65,"height":46.79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-27.png","element":"img","alt":" β∞","inline":true},{"text":", since by homogeneity of ","element":"span"},{"style":{"height":19.52},"width":478.14,"height":48.79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-28.png","element":"img","alt":" P, β∞ ∝ P(w∞) ∝ P(�w∞)","inline":true},{"text":". 
Thus, from Lemma ","element":"span"},{"href":"#id-41","text":"8, ","element":"a"},{"text":"we have ","element":"span"},{"style":{"height":11.53},"width":176.95,"height":28.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-29.png","element":"img","alt":" z∞ = lim","inline":true}],[{"text":"We propose a positive scaling of this ","element":"span"},{"style":{"height":17.38},"width":147.2,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-30.png","element":"img","alt":" {αn}Nn=1 ","inline":true,"padRight":true},{"text":"as our candidate dual certificate, which satisfies both ","element":"span"},{"text":"dual feasibility and complementary slackness.","element":"span"}],[{"text":"To prove the theorem, the remaining step is to show that ","element":"span"},{"style":{"height":16},"width":694.17,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-31.png","element":"img","alt":" �w∞ ∝ ∇wP(�w∞)z∞. Since �w∞ = γw∞","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"P ","element":"span"},{"text":"is homogeneous, this condition is equivalent to showing that ","element":"span"},{"style":{"height":16},"width":375.92,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-32.png","element":"img","alt":" w∞ ∝ ∇wP(w∞)z∞.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Showing that ","element":"span"},{"style":{"height":16},"width":381.85,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-33.png","element":"img","alt":" w∞ ∝ ∇wP(w∞)z∞.","inline":true,"padRight":true},{"text":"Substituting for ","element":"span"},{"style":{"height":14.19},"width":56.86,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-34.png","element":"img","alt":" z(t)","inline":true,"padRight":true},{"text":"and 
","element":"span"},{"style":{"height":18.19},"width":196.84,"height":45.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-35.png","element":"img","alt":" ∇wP(w(t))","inline":true,"padRight":true},{"text":"from eqs. ","element":"span"},{"href":"#id-43","text":"(17) ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-44","text":"(18)","element":"a"},{"text":", respectively, in the gradient descent updates (eq. ","element":"span"},{"href":"#id-44","text":"(19)","element":"a"},{"text":"), we have the following:","element":"span"}],[{"style":{"width":"91%"},"width":1445,"height":241,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/12-36.png","element":"img"}],[{"style":{"width":"64%"},"width":1015,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/13-0.png","element":"img"}],[{"text":"Summing over ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":", we have","element":"span"}],[{"id":"id-45","style":{"width":"89%"},"width":1425,"height":88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/13-1.png","element":"img"}],[{"text":"We want to argue that the first term, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i.e., ","element":"span"},{"style":{"height":16},"width":256.82,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/13-2.png","element":"img","alt":" ∇wP (w∞) z∞","inline":true},{"text":", is the dominant term. Towards this we state and prove the following intermediate claim","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Claim 1. ","element":"span"},{"style":{"height":19.37},"width":918.86,"height":48.42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/13-3.png","element":"img","alt":" ∥∇wP (w∞) z∞∥ > 0 and �u 0","inline":true},{"text":". 
Now from Lemma ","element":"span"},{"href":"#id-41","text":"8, ","element":"a"},{"text":"using that ","element":"span"},{"style":{"height":18.33},"width":382.21,"height":45.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/13-14.png","element":"img","alt":" z∞ = �n∈S∞ αnynxn","inline":true,"padRight":true},{"text":"for ","element":"span"},{"style":{"height":13.2},"width":120.32,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/13-15.png","element":"img","alt":" αn ≥ 0","inline":true,"padRight":true},{"text":"(and not all zero since ","element":"span"},{"style":{"height":11.53},"width":52.37,"height":28.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/13-16.png","element":"img","alt":" z∞ ","inline":true,"padRight":true},{"text":"is unit norm), we immediately get the following","element":"span"}],[{"style":{"width":"100%"},"width":1589,"height":90,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/13-17.png","element":"img"}],[{"text":"To prove the second part, we note the following","element":"span"}],[{"style":{"width":"93%"},"width":1479,"height":199,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/13-18.png","element":"img"}],[{"text":"Thus, if ","element":"span"},{"style":{"height":15.5},"width":682.7,"height":38.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/13-19.png","element":"img","alt":" lim supt→∞ bt = ∞ then limt→∞ bt = ∞","inline":true},{"text":". On contrary, if ","element":"span"},{"style":{"height":15.5},"width":503.41,"height":38.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/13-20.png","element":"img","alt":" lim supt→∞ bt = C < ∞, then","inline":true,"padRight":true},{"text":"from eq. 
","element":"span"},{"href":"#id-45","text":"(22)","element":"a"},{"text":", for large ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"we get, ","element":"span"},{"style":{"height":28.8},"width":1034.76,"height":72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/13-21.png","element":"img","alt":" ∥w(t)∥ ≤ ∥w(0)∥ + ∥∇P(w∞)z∞∥C +�supt∥δ(t)∥�C < ∞","inline":true}],[{"text":"which contradicts ","element":"span"},{"style":{"height":18.18},"width":224.36,"height":45.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/13-22.png","element":"img","alt":" ∥w(t)∥ → ∞.","inline":true}],[{"text":"From above claim, the sequence ","element":"span"},{"style":{"height":19.37},"width":438.22,"height":48.42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/13-23.png","element":"img","alt":" bt = �u ","element":"span"},{"text":"0 ","element":"span"},{"text":"with parameters ","element":"span"},{"style":{"height":18.3},"width":420.26,"height":45.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/14-12.png","element":"img","alt":"w = [wl ∈ RDl−1×Dl]Ll−1","inline":true},{"text":", the equivalent linear predictor given by ","element":"span"},{"style":{"height":16.79},"width":433.74,"height":41.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/14-13.png","element":"img","alt":" Pfull(w) = w1w2 . . . 
wL","inline":true,"padRight":true},{"text":"is ","element":"span"},{"text":"a homogeneous polynomial of degree ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L","element":"span"},{"text":".","element":"span"}],[{"text":"Let ","element":"span"},{"style":{"height":21.49},"width":487.26,"height":53.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/14-14.png","element":"img","alt":" w(t) = [w(t)l ∈ RDl−1×Dl]Ll=1","inline":true,"padRight":true},{"text":"denote the iterates of individual matrices ","element":"span"},{"style":{"height":9.59},"width":43.09,"height":23.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/14-15.png","element":"img","alt":" wl","inline":true,"padRight":true},{"text":"along the gradient ","element":"span"},{"text":"descent path, and ","element":"span"},{"style":{"height":20.32},"width":311.58,"height":50.8,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/14-16.png","element":"img","alt":" β(t) = Pfull(w(t))","inline":true,"padRight":true},{"text":"denote the corresponding sequence of linear predictors.","element":"span"}],[{"text":"We first introduce the following notation.","element":"span"}],[{"text":"1. 
Let ","element":"span"},{"style":{"height":11.53},"width":206.11,"height":28.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/14-17.png","element":"img","alt":" w∞ = lim","inline":true},{"style":{"height":7.6},"width":75.81,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/14-18.png","element":"img","alt":"t→∞","inline":true},{"style":{"height":26.03},"width":93.61,"height":65.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/14-19.png","element":"img","alt":"w(t)∥w(t)∥","inline":true,"padRight":true},{"text":"denote the limit direction of the parameters, with component ","element":"span"},{"text":"matrices in each layer denoted as ","element":"span"},{"style":{"height":16},"width":224.44,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/14-20.png","element":"img","alt":" w∞ = [w∞l ]","inline":true},{"text":". Specializing ","element":"span"},{"href":"#id-40","text":"(16) ","element":"a"},{"text":"for fully connected ","element":"span"},{"text":"networks, we have:","element":"span"}],[{"style":{"width":"94%"},"width":1495,"height":431,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/14-21.png","element":"img"}],[{"text":"3. Let ","element":"span"},{"style":{"height":20.32},"width":325.87,"height":50.8,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/14-22.png","element":"img","alt":" z(t) = −∇βL(β(t))","inline":true},{"text":". Again repeating eq. 
","element":"span"},{"href":"#id-43","text":"(17) ","element":"a"},{"text":"for fully connected networks, we have for some ","element":"span"},{"style":{"height":19.53},"width":448.38,"height":48.82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/14-23.png","element":"img","alt":" δ(t)z → 0 and p(t) = ∥z(t)∥,","inline":true}],[{"style":{"width":"93%"},"width":1489,"height":191,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/14-24.png","element":"img"}],[{"text":"The proof of Theorem ","element":"span"},{"href":"#id-26","text":"1 ","element":"a"},{"text":"is fairly straight forward from using Lemma ","element":"span"},{"href":"#id-41","text":"8 ","element":"a"},{"text":"and the intermediate results in the proof of Theorem ","element":"span"},{"href":"#id-33","text":"4.","element":"a"}],[{"style":{"fontWeight":"bold"},"text":"Showing KKT conditions for ","element":"span"},{"style":{"height":21.77},"width":312.21,"height":54.42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/14-25.png","element":"img","alt":"�β∞ ∝ Pfull(w∞).","inline":true,"padRight":true},{"text":"Using our notation described above, we have ","element":"span"},{"style":{"height":16.79},"width":332.74,"height":41.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/14-26.png","element":"img","alt":"w∞1:L = Pfull(w∞)","inline":true},{"text":". 
In the following arguments we show that a positive scaling ","element":"span"},{"style":{"height":20.92},"width":228.45,"height":52.29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/14-27.png","element":"img","alt":" �β∞ = γw∞1:L","inline":true,"padRight":true},{"text":"satisfies the following KKT conditions for the optimality of ","element":"span"},{"style":{"height":7.6},"width":32.6,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/14-28.png","element":"img","alt":" ℓ2","inline":true,"padRight":true},{"text":"maximum margin problem in eq. ","element":"span"},{"href":"#id-48","text":"(8)","element":"a"},{"text":":","element":"span"}],[{"style":{"width":"93%"},"width":1490,"height":144,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/14-29.png","element":"img"}],[{"text":"As we saw in proof of Theorem ","element":"span"},{"href":"#id-33","text":"4, ","element":"a"},{"text":"since ","element":"span"},{"style":{"height":16.79},"width":341.21,"height":41.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/14-30.png","element":"img","alt":" w∞1:L = Pfull(w∞)","inline":true,"padRight":true},{"text":"has strictly positive margin, us- ","element":"span"},{"text":"ing homogeneity of ","element":"span"},{"style":{"height":15.59},"width":85.44,"height":38.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/14-31.png","element":"img","alt":" Pfull","inline":true},{"text":", we can scale ","element":"span"},{"style":{"height":15.47},"width":80.64,"height":38.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/14-32.png","element":"img","alt":" w∞1:L","inline":true,"padRight":true},{"text":"to get ","element":"span"},{"style":{"height":20.92},"width":252.1,"height":52.3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/14-33.png","element":"img","alt":" �β∞ = γw∞1:L","inline":true,"padRight":true},{"text":"with unit margin, 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"i.e., ","element":"span"},{"style":{"height":20.98},"width":340.77,"height":52.45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-0.png","element":"img","alt":"∀n, yn⟨xn, �β∞⟩ ≥ 1","inline":true},{"text":". For dual variables, we again use a positive scaling of ","element":"span"},{"style":{"height":9.19},"width":45.49,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-1.png","element":"img","alt":" αn","inline":true,"padRight":true},{"text":"from Lemma ","element":"span"},{"href":"#id-41","text":"8, ","element":"a"},{"text":"such that ","element":"span"},{"style":{"height":18.33},"width":393.82,"height":45.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-2.png","element":"img","alt":" z∞ = �n∈S∞ αn ynxn","inline":true},{"text":". In order to prove the theorem, we need to show that ","element":"span"},{"style":{"height":20.18},"width":173.68,"height":50.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-3.png","element":"img","alt":" �β∞ ∝ z∞","inline":true,"padRight":true},{"text":"or equivalently ","element":"span"},{"style":{"height":15.47},"width":199.76,"height":38.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-4.png","element":"img","alt":" w∞1:L ∝ z∞.","inline":true}],[{"text":"Recall that in the proof of Theorem ","element":"span"},{"href":"#id-33","text":"4, ","element":"a"},{"text":"we showed a version of stationarity in the parameter space in eq. 
","element":"span"},{"href":"#id-49","text":"(26)","element":"a"},{"text":", repeated below.","element":"span"}],[{"id":"id-50","style":{"width":"61%"},"width":982,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-5.png","element":"img"}],[{"text":"This case in particular includes ","element":"span"},{"style":{"height":15.59},"width":85.44,"height":38.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-6.png","element":"img","alt":" Pfull","inline":true,"padRight":true},{"text":"which is homogeneous with ","element":"span"},{"style":{"height":10.8},"width":102.36,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-7.png","element":"img","alt":" ν = L","inline":true},{"text":". We special case the result fully connected network. In particular, for the parameters of the first layer ","element":"span"},{"style":{"height":9.59},"width":49.09,"height":23.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-8.png","element":"img","alt":" w1","inline":true},{"text":", we have ","element":"span"},{"style":{"fontStyle":"italic"},"text":"P","element":"span"},{"text":"(","element":"span"},{"style":{"fontWeight":"bold"},"text":"w","element":"span"},{"text":") = ","element":"span"},{"style":{"height":9.59},"width":130.98,"height":23.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-9.png","element":"img","alt":"w1w2:L","inline":true},{"text":", where ","element":"span"},{"style":{"height":15.78},"width":207.47,"height":39.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-10.png","element":"img","alt":" w1 ∈ Rd×d1","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":15.78},"width":239.32,"height":39.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-11.png","element":"img","alt":" w2:L ∈ Rd1×1","inline":true},{"text":". 
This implies, for any ","element":"span"},{"style":{"height":16.39},"width":396.08,"height":40.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-12.png","element":"img","alt":" z, ∇w1P(w)z = zw⊤2:L","inline":true},{"text":". ","element":"span"},{"text":"Using this along with eq. ","element":"span"},{"href":"#id-50","text":"(31)","element":"a"},{"text":", we get the following expression for some positive scalar ","element":"span"},{"style":{"height":10.4},"width":22,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-13.png","element":"img","alt":" γ","inline":true}],[{"style":{"width":"97%"},"width":1552,"height":53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-14.png","element":"img"}],[{"text":"Since ","element":"span"},{"style":{"height":20.92},"width":195.28,"height":52.3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-15.png","element":"img","alt":" w∞1:L ∝ �β∞","inline":true},{"text":", we have shown that ","element":"span"},{"style":{"height":20.18},"width":166.92,"height":50.45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-16.png","element":"img","alt":" �β∞ ∝ z∞","inline":true},{"text":", which completes our proof of Theorem ","element":"span"},{"href":"#id-26","text":"1.","element":"a"}]]},{"heading":"C Linear Convolutional Networks: Proof of Theorem 2–2a","paragraphs":[[{"text":"Recall that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L","element":"span"},{"text":"–layer linear convolutional networks have parameters ","element":"span"},{"style":{"height":18.3},"width":324.12,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-17.png","element":"img","alt":" w = [wl ∈ RD]Ll−1","inline":true},{"text":". We first ","element":"span"},{"text":"recall some complex numbers terminology and properties","element":"span"}],[{"text":"1. 
Complex vectors ","element":"span"},{"style":{"height":14.19},"width":129.06,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-18.png","element":"img","alt":" �z ∈ CD ","inline":true,"padRight":true},{"text":"are represented in polar form as ","element":"span"},{"style":{"height":18.93},"width":523.13,"height":47.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-19.png","element":"img","alt":" �z = |�z|eiφ�z, where |�z| ∈ RD+ and","inline":true},{"style":{"height":17.39},"width":229.44,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-20.png","element":"img","alt":"φ�z ∈ [0, 2π)D ","inline":true,"padRight":true},{"text":"are the vectors with magnitudes and phases, respectively, of components ","element":"span"},{"style":{"height":30.31},"width":1486.44,"height":75.78,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-21.png","element":"img","alt":" �z.2. For �z = |�z|eiφ�z ∈ CD","inline":true},{"text":", the complex conjugate vector is denoted by ","element":"span"},{"style":{"height":17.38},"width":236.63,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-22.png","element":"img","alt":" �z∗ = |�z|e−iφ�z.","inline":true,"padRight":true},{"text":"3. The complex inner product for ","element":"span"},{"style":{"height":16.58},"width":177.35,"height":41.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-23.png","element":"img","alt":" �x, �β ∈ CD ","inline":true,"padRight":true},{"text":"is given by ","element":"span"},{"style":{"height":21.76},"width":537.06,"height":54.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-24.png","element":"img","alt":" ⟨�x, �β⟩ = �d �x[d]�β∗[d] = �x⊤�β∗.","inline":true}],[{"text":"4. 
Let ","element":"span"},{"style":{"height":14.19},"width":187.91,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-25.png","element":"img","alt":" F ∈ CD×D ","inline":true,"padRight":true},{"text":"denote the discrete Fourier transform matrix with ","element":"span"},{"style":{"height":24.12},"width":395.44,"height":60.29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-26.png","element":"img","alt":" F[d, p] = 1√DωdpD where","inline":true}],[{"text":"recall that ","element":"span"},{"style":{"height":18.06},"width":208.5,"height":45.14,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-27.png","element":"img","alt":" ωD = e− 2πiD","inline":true,"padRight":true},{"text":"is the ","element":"span"},{"style":{"height":13.78},"width":55.86,"height":34.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-28.png","element":"img","alt":" Dth","inline":true,"padRight":true},{"text":"complex root of unity. Thus, for any ","element":"span"},{"style":{"height":14.18},"width":139.28,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-29.png","element":"img","alt":" z ∈ RD","inline":true},{"text":", the representation in Fourier basis is given by ","element":"span"},{"style":{"height":12},"width":183.64,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-30.png","element":"img","alt":" �z = Fz. 
F","inline":true,"padRight":true},{"text":"and its complex conjugate matrix ","element":"span"},{"style":{"height":12},"width":48.6,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-31.png","element":"img","alt":" F∗","inline":true,"padRight":true},{"text":"also satisfy: ","element":"span"},{"style":{"height":13.6},"width":728.45,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-32.png","element":"img","alt":" FF∗ = F∗F = I, F = F⊤ and F∗ = F∗⊤.","inline":true}],[{"text":"Before getting into full proofs of Theorem ","element":"span"},{"href":"#id-32","text":"2a–","element":"a"},{"href":"#id-27","text":"2, ","element":"a"},{"text":"we also prove the two lemmas (Lemma ","element":"span"},{"href":"#id-36","text":"3 ","element":"a"},{"text":"and Lemma ","element":"span"},{"href":"#id-51","text":"9) ","element":"a"},{"text":"that establish equivalence of dynamics of gradient descent on full dimensional convolutional networks to those on linear diagonal networks (Figure ","element":"span"},{"href":"#id-31","text":"1c)","element":"a"},{"text":", albeit with complex valued parameters. This makes the analysis of the of convolutional networks simpler and more intuitive.","element":"span"}],[{"text":"We begin by proving Lemma ","element":"span"},{"href":"#id-36","text":"3 ","element":"a"},{"text":"which shows the equivalence of representation between convolutional networks and diagonal networks.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Lemma 3. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"For full-dimensional convolutions, ","element":"span"},{"style":{"height":16},"width":242.33,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-33.png","element":"img","alt":" β = Pconv(w)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is equivalent to","element":"span"}],[{"style":{"width":"93%"},"width":1479,"height":124,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-34.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"First, we state the following properties which follow immediately from definitions:","element":"span"}],[{"text":"1. For ","element":"span"},{"style":{"height":16.58},"width":185.94,"height":41.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-35.png","element":"img","alt":" x, β ∈ RD,","inline":true}],[{"id":"id-55","style":{"width":"73%"},"width":1168,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-36.png","element":"img"}],[{"text":"where recall that the complex inner product is given by ","element":"span"},{"style":{"height":20.98},"width":260.72,"height":52.45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-37.png","element":"img","alt":" ⟨�x, �β⟩ = �x⊤�β∗.","inline":true}],[{"text":"2. 
We next show the following property","element":"span"}],[{"id":"id-53","style":{"width":"70%"},"width":1118,"height":51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-38.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":10.4},"width":31,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-39.png","element":"img","alt":" ⊙","inline":true,"padRight":true},{"text":"denotes the Hadamard product (elementwise product), ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i.e., ","element":"span"},{"style":{"height":16},"width":347.81,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-40.png","element":"img","alt":" (a ⊙ b)[d] = a[d]b[d].","inline":true}],[{"text":"The above equation follows from simple manipulations of definitions: recall that ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"F","element":"span"},{"style":{"fontWeight":"bold"},"text":"z","element":"span"},{"text":")[","element":"span"},{"style":{"fontStyle":"italic"},"text":"d","element":"span"},{"text":"] =","element":"span"}],[{"style":{"width":"99%"},"width":1585,"height":46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/15-41.png","element":"img"}],[{"style":{"width":"101%"},"width":1613,"height":263,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/16-0.png","element":"img"}],[{"text":"where ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"a","element":"span"},{"text":") ","element":"span"},{"text":"follows as ","element":"span"},{"style":{"height":17.77},"width":292.95,"height":44.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/16-1.png","element":"img","alt":" ωDD = 1 and in (b)","inline":true,"padRight":true},{"text":"we used the change of variables 
","element":"span"},{"style":{"height":16},"width":441.34,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/16-2.png","element":"img","alt":" p = (k′ − k) mod D (recall","inline":true,"padRight":true},{"text":"our use of modulo operator as ","element":"span"},{"style":{"height":19.21},"width":406.98,"height":48.02,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/16-3.png","element":"img","alt":" a mod D = a − D� aD�).","inline":true}],[{"text":"Recall from eq. ","element":"span"},{"href":"#id-52","text":"(3) ","element":"a"},{"text":"the output of an ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L","element":"span"},{"text":"-layer convolutional network is given by","element":"span"}],[{"style":{"width":"56%"},"width":894,"height":50,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/16-4.png","element":"img"}],[{"text":"Denote ","element":"span"},{"style":{"height":16},"width":679.45,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/16-5.png","element":"img","alt":" hL−1(x) = (((x ⋆ w1) ⋆ w2) . . .) ⋆ wL−1","inline":true},{"text":". By iteratively using eq. 
","element":"span"},{"href":"#id-53","text":"(34)","element":"a"},{"text":", we have","element":"span"}],[{"id":"id-54","style":{"width":"76%"},"width":1215,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/16-6.png","element":"img"}],[{"text":"Thus, on one hand using the above equation we have,","element":"span"}],[{"style":{"width":"94%"},"width":1492,"height":130,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/16-7.png","element":"img"}],[{"text":"where ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"a","element":"span"},{"text":") ","element":"span"},{"text":"follows from substituting for ","element":"span"},{"href":"#id-54","style":{"height":16},"width":398.78,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/16-8.png","element":"img","alt":" FhL−1(x) from eq. (36)","inline":true,"padRight":true},{"text":"and noting that for any ","element":"span"},{"style":{"height":17.38},"width":188.84,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/16-9.png","element":"img","alt":" {zl ∈ RD},","inline":true},{"style":{"height":16},"width":840.8,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/16-10.png","element":"img","alt":"(z1 ⊙ z2 ⊙ . . . zL−1)⊤zL = z⊤1 (z2 ⊙ z3 ⊙ . . . zL)","inline":true},{"text":", and ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"b","element":"span"},{"text":") ","element":"span"},{"text":"uses the definition of complex inner ","element":"span"},{"text":"product ","element":"span"},{"style":{"height":20.98},"width":260.73,"height":52.45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/16-11.png","element":"img","alt":" ⟨�x, �β⟩ = �x⊤�β∗.","inline":true}],[{"text":"Now further using eq. 
","element":"span"},{"href":"#id-55","text":"(33) ","element":"a"},{"text":"in above equation, we have","element":"span"}],[{"style":{"width":"77%"},"width":1226,"height":130,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/16-12.png","element":"img"}],[{"text":"Thus, for ","element":"span"},{"style":{"height":16},"width":267.11,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/16-13.png","element":"img","alt":" β = Pconv(w)","inline":true},{"text":", we have shown that ","element":"span"},{"style":{"height":16},"width":760.85,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/16-14.png","element":"img","alt":"�β = FPconv(w) = �w1 ⊙ �w2 . . . ⊙ �wL =","inline":true,"padRight":true},{"text":"diag","element":"span"},{"style":{"height":16},"width":556.66,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/16-15.png","element":"img","alt":"(�w1)diag(�w2) . . . diag(�wL−1)�wL.","inline":true}],[{"text":"For ","element":"span"},{"style":{"height":18.3},"width":318.85,"height":45.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/16-16.png","element":"img","alt":" �w = [�wl ∈ CD]Ll=1","inline":true},{"text":", let ","element":"span"},{"style":{"height":16.79},"width":1125.35,"height":41.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/16-17.png","element":"img","alt":" Pdiag(�w) = diag(�w1)diag(�w2) . . . diag(�wL−1)�wL = �w1 ⊙ �w2 . . . 
⊙","inline":true},{"style":{"height":9.59},"width":55.1,"height":23.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/16-18.png","element":"img","alt":"�wL","inline":true,"padRight":true},{"text":"denote the equivalent parameterization of convolutional network in Fourier domain.","element":"span"}],[{"text":"The above lemma shows that optimizing ","element":"span"},{"style":{"height":16},"width":178.88,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/16-19.png","element":"img","alt":" LPconv(w)","inline":true,"padRight":true},{"text":"in eq. ","element":"span"},{"href":"#id-23","text":"(4) ","element":"a"},{"text":"is equivalent to the following minimization problem in terms of representation,","element":"span"}],[{"id":"id-56","style":{"width":"72%"},"width":1157,"height":117,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/16-20.png","element":"img"}],[{"text":"The following lemma further shown that not only the representations of ","element":"span"},{"style":{"height":16.79},"width":452.2,"height":41.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/16-21.png","element":"img","alt":" Pconv(w) and Pdiag(�w) are","inline":true,"padRight":true},{"text":"equivalent, but there corresponding gradient descent updates for problems in eq. ","element":"span"},{"href":"#id-23","text":"(4) ","element":"a"},{"text":"and eq. ","element":"span"},{"href":"#id-56","text":"(39) ","element":"a"},{"text":"are also equivalent up to Fourier transformations.","element":"span"}],[{"id":"id-51","style":{"fontWeight":"bold"},"text":"Lemma 9. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Consider the gradient descent iterates ","element":"span"},{"style":{"height":21.49},"width":280.1,"height":53.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/16-22.png","element":"img","alt":" w(t) = [w(t)l ]Ll=1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"from eq. ","element":"span"},{"href":"#id-25","text":"(7) ","element":"a"},{"style":{"fontStyle":"italic"},"text":"for minimizing ","element":"span"},{"style":{"height":14.78},"width":108.55,"height":36.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/16-23.png","element":"img","alt":"LPconv","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"in eq. ","element":"span"},{"href":"#id-23","text":"(4) ","element":"a"},{"style":{"fontStyle":"italic"},"text":"over full dimensional linear convolutional networks. For all ","element":"span"},{"style":{"fontStyle":"italic"},"text":"l","element":"span"},{"style":{"fontStyle":"italic"},"text":", the incremental update directions, ","element":"span"},{"style":{"height":21.49},"width":812.99,"height":53.73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/16-24.png","element":"img","alt":" ∆w(t)l := w(t+1)l − w(t)l = −ηt∇wlLPconv(w(t))","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"satisfy the following,","element":"span"}],[{"style":{"width":"76%"},"width":1210,"height":57,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/16-25.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where ","element":"span"},{"style":{"height":33.55},"width":286.82,"height":83.87,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/16-26.png","element":"img","alt":" �w(t) =��w(t)l �Ll=1 ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"are the Fourier transformations of 
","element":"span"},{"style":{"height":21.49},"width":271.32,"height":53.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/16-27.png","element":"img","alt":" w(t) = [w(t)l ]Ll=1","inline":true},{"style":{"fontStyle":"italic"},"text":", respectively.","element":"span"}],[{"style":{"width":"100%"},"width":1586,"height":153,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/16-28.png","element":"img"}],[{"style":{"width":"99%"},"width":1584,"height":278,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-0.png","element":"img"}],[{"text":"Using the above equation we have,","element":"span"}],[{"style":{"width":"95%"},"width":1508,"height":322,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-1.png","element":"img"}],[{"text":"where in ","element":"span"},{"style":{"height":24.03},"width":455.52,"height":60.07,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-2.png","element":"img","alt":" (a) we use ℓ′(�y, y) = ∂ℓ(�y,y)∂�y","inline":true,"padRight":true},{"text":"and the remaining equalities simply follow from manipulation of derivatives. From above equation, we have the following:","element":"span"}],[{"style":{"width":"90%"},"width":1438,"height":256,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-3.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"C.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Proof of Theorem ","element":"span"},{"href":"#id-27","style":{"fontWeight":"bold"},"text":"2–","element":"a"},{"href":"#id-32","style":{"fontWeight":"bold"},"text":"2a","element":"a"}],[{"style":{"fontWeight":"bold"},"text":"Theorem 2 ","element":"span"},{"text":"(Linear convolutional networks of depth two)","element":"span"},{"style":{"fontWeight":"bold"},"text":". 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"For almost all linearly separable datasets ","element":"span"},{"style":{"height":17.38},"width":204.84,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-4.png","element":"img","alt":"{xn, yn}Nn=1","inline":true},{"style":{"fontStyle":"italic"},"text":", almost all initializations ","element":"span"},{"style":{"height":14.18},"width":74.08,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-5.png","element":"img","alt":" w(0)","inline":true},{"style":{"fontStyle":"italic"},"text":", and any sequence of step sizes ","element":"span"},{"style":{"height":16},"width":85.66,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-6.png","element":"img","alt":" {ηt}t","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"with ","element":"span"},{"style":{"height":10.4},"width":31.78,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-7.png","element":"img","alt":" ηt","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"smaller ","element":"span"},{"style":{"fontStyle":"italic"},"text":"than the local Lipschitz at ","element":"span"},{"style":{"height":14.18},"width":70.23,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-8.png","element":"img","alt":" w(t)","inline":true},{"style":{"fontStyle":"italic"},"text":", consider the sequence gradient descent iterates ","element":"span"},{"style":{"height":14.18},"width":70.23,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-9.png","element":"img","alt":" w(t)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"in eq. 
","element":"span"},{"href":"#id-25","text":"(7) ","element":"a"},{"style":{"fontStyle":"italic"},"text":"for minimizing ","element":"span"},{"href":"#id-23","style":{"height":16},"width":335.43,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-10.png","element":"img","alt":" LPconv(w) in eq. (4)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"with exponential loss over ","element":"span"},{"text":"2","element":"span"},{"style":{"fontStyle":"italic"},"text":"–layer linear convolutional networks.","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"If (a) the iterates ","element":"span"},{"style":{"height":14.18},"width":70.22,"height":35.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-11.png","element":"img","alt":" w(t) ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"minimize the objective, i.e., ","element":"span"},{"style":{"height":18.18},"width":445.01,"height":45.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-12.png","element":"img","alt":" LPconv(w(t)) → 0, (b) w(t) ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"converge in direction to yield a separator ","element":"span"},{"style":{"height":18.72},"width":59.64,"height":46.79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-13.png","element":"img","alt":" β∞","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"with positive margin, (c) the phase of the Fourier coefficients ","element":"span"},{"style":{"height":23.38},"width":64.14,"height":58.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-14.png","element":"img","alt":" �β(t)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"of the linear predictors 
","element":"span"},{"style":{"height":18.73},"width":64.14,"height":46.82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-15.png","element":"img","alt":" β(t) ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"converge coordinate-wise, i.e., ","element":"span"},{"style":{"height":20.05},"width":382.16,"height":50.13,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-16.png","element":"img","alt":" ∀d, eiφ�β(t)[d] → eiφ��β∞[d]","inline":true},{"style":{"fontStyle":"italic"},"text":", and (d) the gradients ","element":"span"},{"style":{"height":20.32},"width":182.27,"height":50.8,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-17.png","element":"img","alt":"∇βL(β(t))","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"converge in direction, then the limit direction ","element":"span"},{"style":{"height":18.71},"width":59.65,"height":46.78,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-18.png","element":"img","alt":" β∞","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is given by,","element":"span"}],[{"style":{"width":"85%"},"width":1356,"height":103,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-19.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Theorem 2a ","element":"span"},{"text":"(Linear Convolutional Networks of any Depth)","element":"span"},{"style":{"fontWeight":"bold"},"text":". 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"For any depth ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L","element":"span"},{"style":{"fontStyle":"italic"},"text":", under the conditions of Theorem ","element":"span"},{"href":"#id-27","style":{"fontStyle":"italic"},"text":"2, ","element":"a"},{"style":{"fontStyle":"italic"},"text":"the limit direction ","element":"span"},{"style":{"height":18.72},"width":185.43,"height":46.79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-20.png","element":"img","alt":" β∞ = lim","inline":true}],[{"style":{"fontStyle":"italic"},"text":"point of the ","element":"span"},{"href":"#id-27","style":{"fontStyle":"italic"},"text":"fo","element":"a"},{"style":{"fontStyle":"italic"},"text":"llowing optimization problem,","element":"span"}],[{"style":{"width":"68%"},"width":1078,"height":73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-21.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where the ","element":"span"},{"style":{"height":7.2},"width":33.6,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-22.png","element":"img","alt":" ℓp","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"penalty given by ","element":"span"},{"style":{"height":33.1},"width":417.77,"height":82.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-23.png","element":"img","alt":" ∥z∥p =��Di=1 |z[i]|p�1/p","inline":true},{"style":{"fontStyle":"italic"},"text":"(also called the bridge penalty) is a norm for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p ","element":"span"},{"text":"= 1 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and a quasi-norm for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p < ","element":"span"},{"text":"1","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"}],[{"text":"For the gradient descent iterates 
","element":"span"},{"href":"#id-25","style":{"height":21.49},"width":474.16,"height":53.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-24.png","element":"img","alt":" w(t) = [w(t)l ]Ll=1 from eq. (7)","inline":true,"padRight":true},{"text":"denote the sequence of corresponding ","element":"span"},{"text":"linear predictors as ","element":"span"},{"style":{"height":19.53},"width":324.03,"height":48.82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-25.png","element":"img","alt":" β(t) = Pconv(w(t))","inline":true},{"text":". Let ","element":"span"},{"style":{"height":23.38},"width":220.28,"height":58.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-26.png","element":"img","alt":"�β(t) = Fβ(t)","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":21.49},"width":232.46,"height":53.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-27.png","element":"img","alt":" �w(t)l = Fw(t)l","inline":true,"padRight":true},{"text":"denote the Fourier transforms of ","element":"span"},{"style":{"height":21.49},"width":214.28,"height":53.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-28.png","element":"img","alt":" β(t) and w(t)l ","inline":true,"padRight":true},{"text":", respectively, and let ","element":"span"},{"style":{"height":33.55},"width":298.7,"height":83.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-29.png","element":"img","alt":" �w(t) =��w(t)l �Ll=1.","inline":true}],[{"style":{"width":"100%"},"width":1586,"height":183,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/17-30.png","element":"img"}],[{"id":"id-75","style":{"width":"94%"},"width":1495,"height":866,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/18-0.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"KKT conditions for optimality ","element":"span"},{"text":"We want to 
show that a positive scaling of ","element":"span"},{"style":{"height":19.52},"width":314.17,"height":48.79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/18-1.png","element":"img","alt":" β∞ ∝ Pconv(w∞)","inline":true},{"text":", denoted by ","element":"span"},{"style":{"height":20.98},"width":332.7,"height":52.45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/18-2.png","element":"img","alt":"�β∞ = γPconv(w∞)","inline":true,"padRight":true},{"text":"is a first order stationary point of eq. ","element":"span"},{"href":"#id-28","text":"(10)","element":"a"},{"text":", repeated below,","element":"span"}],[{"style":{"width":"36%"},"width":574,"height":70,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/18-3.png","element":"img"}],[{"text":"Recall the KKT conditions discussed in Section ","element":"span"},{"text":"3. ","element":"span"},{"text":"The first order stationary points, or sub-stationary points, of ","element":"span"},{"href":"#id-28","text":"(10) ","element":"a"},{"text":"are the set of feasible predictors ","element":"span"},{"style":{"height":17.38},"width":433.04,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/18-4.png","element":"img","alt":" β such that ∃{αn ≥ 0}Nn=1 ","inline":true,"padRight":true},{"text":"satisfying the following:","element":"span"}],[{"style":{"height":16},"width":608.77,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/18-5.png","element":"img","alt":"∀n, yn⟨xn, β⟩ > 1 =⇒ αn = 0, and","inline":true}],[{"id":"id-57","style":{"width":"61%"},"width":981,"height":88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/18-6.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":11.2},"width":39.38,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/18-7.png","element":"img","alt":" ∂◦ ","inline":true,"padRight":true},{"text":"denotes the 
local sub-differential (or Clarke’s sub-differential) operator defined as ","element":"span"},{"style":{"height":16},"width":166.22,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/18-8.png","element":"img","alt":" ∂◦f(β) =","inline":true,"padRight":true},{"text":"conv","element":"span"},{"style":{"height":16},"width":712.29,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/18-9.png","element":"img","alt":"{v : ∃(zk)k s.t. zk → β and ∇f(zk) → v}.","inline":true}],[{"text":"For ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p ","element":"span"},{"text":"= 1 ","element":"span"},{"text":"and ","element":"span"},{"style":{"height":14},"width":27,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/18-10.png","element":"img","alt":"�β","inline":true,"padRight":true},{"text":"represented in polar form as ","element":"span"},{"style":{"height":19.67},"width":429.25,"height":49.18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/18-11.png","element":"img","alt":"�β = |�β|eiφ�β ∈ CD, ∥�β∥p","inline":true,"padRight":true},{"text":"is convex and the local sub-differential is indeed the global sub-differential given by,","element":"span"}],[{"id":"id-58","style":{"width":"83%"},"width":1322,"height":102,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/18-12.png","element":"img"}],[{"text":"For ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p < ","element":"span"},{"text":"1","element":"span"},{"text":", the local sub-differential of ","element":"span"},{"style":{"height":16.79},"width":84.5,"height":41.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/18-13.png","element":"img","alt":" ∥�β∥p","inline":true,"padRight":true},{"text":"is given 
by,","element":"span"}],[{"style":{"width":"85%"},"width":1353,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/18-14.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Showing KKT conditions for ","element":"span"},{"style":{"height":20.98},"width":333.64,"height":52.45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/18-15.png","element":"img","alt":"�β∞ ∝ Pconv(w∞).","inline":true,"padRight":true},{"text":"As we showed proof of Theorem ","element":"span"},{"href":"#id-33","text":"4, ","element":"a"},{"text":"since ","element":"span"},{"style":{"height":16},"width":195.32,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/18-16.png","element":"img","alt":"Pconv(w∞)","inline":true,"padRight":true},{"text":"has strictly positive margin, using homogeneity of ","element":"span"},{"style":{"height":13.19},"width":93.4,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/18-17.png","element":"img","alt":" Pconv","inline":true},{"text":", we can scale ","element":"span"},{"style":{"height":16},"width":195.31,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/18-18.png","element":"img","alt":" Pconv(w∞)","inline":true,"padRight":true},{"text":"to get ","element":"span"},{"style":{"height":20.98},"width":333.22,"height":52.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/18-19.png","element":"img","alt":"�β∞ = γPconv(w∞)","inline":true,"padRight":true},{"text":"with unit margin, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i.e., ","element":"span"},{"style":{"height":20.98},"width":341.28,"height":52.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/18-20.png","element":"img","alt":" ∀n, yn⟨xn, �β∞⟩ ≥ 1","inline":true},{"text":". 
For dual variables, we again use a positive scaling of ","element":"span"},{"style":{"height":9.19},"width":45.49,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/18-21.png","element":"img","alt":" αn","inline":true,"padRight":true},{"text":"from Lemma ","element":"span"},{"href":"#id-41","text":"8, ","element":"a"},{"text":"such that ","element":"span"},{"style":{"height":18.33},"width":398.75,"height":45.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/18-22.png","element":"img","alt":" z∞ = �n∈S∞ αn ynxn.","inline":true}],[{"text":"In order to prove the theorem, we need to show that for some positive scalar ","element":"span"},{"style":{"height":21.86},"width":324.66,"height":54.65,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/18-23.png","element":"img","alt":" γ, γ�z∞ ∈ ∂◦∥�β∥2/L","inline":true},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i.e., ","element":"span"},{"text":"satisfies the conditions in eq. ","element":"span"},{"href":"#id-57","text":"(47) ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-58","text":"(48)","element":"a"},{"text":", for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L ","element":"span"},{"text":"= 2 ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L > ","element":"span"},{"text":"2","element":"span"},{"text":", respectively.","element":"span"}],[{"text":"We start from the stationarity condition in the parameter space in eq. ","element":"span"},{"href":"#id-49","text":"(26) ","element":"a"},{"text":"of Theorem ","element":"span"},{"href":"#id-33","text":"4. 
","element":"a"},{"text":"For some positive scalar ","element":"span"},{"style":{"height":14.8},"width":173.34,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/18-24.png","element":"img","alt":" γ, we have","inline":true}],[{"id":"id-59","style":{"width":"64%"},"width":1026,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/18-25.png","element":"img"}],[{"text":"We will now special case the above equation for fully width convolutional networks.","element":"span"}],[{"text":"From Lemma ","element":"span"},{"href":"#id-36","text":"3, ","element":"a"},{"text":"we have that for all ","element":"span"},{"style":{"height":17.39},"width":261.66,"height":43.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/18-26.png","element":"img","alt":" w = [wl ∈ RD]","inline":true},{"text":", we have ","element":"span"},{"style":{"height":16.79},"width":454.84,"height":41.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/18-27.png","element":"img","alt":" Pconv(w) = F∗Pdiag(Fw)","inline":true,"padRight":true},{"text":"where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F ","element":"span"},{"text":"and ","element":"span"},{"style":{"height":12},"width":48.6,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/18-28.png","element":"img","alt":" F∗","inline":true,"padRight":true},{"text":"denote discrete Fourier matrix and its inverse in appropriate dimensions. Let ","element":"span"},{"style":{"height":17.9},"width":134.05,"height":44.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/18-29.png","element":"img","alt":" {ed}Dd=1","inline":true,"padRight":true},{"text":"denote the standard basis in ","element":"span"},{"style":{"height":13.39},"width":54.78,"height":33.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/19-0.png","element":"img","alt":" RD","inline":true},{"text":". 
We first note that for all ","element":"span"},{"style":{"fontStyle":"italic"},"text":"l ","element":"span"},{"text":"= 1","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"2","element":"span"},{"style":{"fontStyle":"italic"},"text":", . . . , L ","element":"span"},{"text":"and for all ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d ","element":"span"},{"text":"= 1","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"2","element":"span"},{"style":{"fontStyle":"italic"},"text":", . . . , D","element":"span"},{"text":", the following holds","element":"span"}],[{"style":{"width":"91%"},"width":1446,"height":370,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/19-1.png","element":"img"}],[{"text":"This implies, for ","element":"span"},{"style":{"height":16.99},"width":660.69,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/19-2.png","element":"img","alt":" l = 1, 2, . . . , L and any z ∈ RD, we have","inline":true}],[{"style":{"width":"87%"},"width":1393,"height":144,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/19-3.png","element":"img"}],[{"text":"Substituting the above equation in eq. 
","element":"span"},{"href":"#id-59","text":"(49)","element":"a"},{"text":", we have,","element":"span"}],[{"style":{"width":"83%"},"width":1324,"height":73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/19-4.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":20.12},"width":81.51,"height":50.3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/19-5.png","element":"img","alt":"�w∞∗l′","inline":true,"padRight":true},{"text":"denotes the complex conjugate of ","element":"span"},{"style":{"height":20.12},"width":77.5,"height":50.3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/19-6.png","element":"img","alt":"�w∞l′ .","inline":true}],[{"id":"id-60","style":{"width":"99%"},"width":1582,"height":356,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/19-7.png","element":"img"}],[{"text":"Also, by multiplying the LHS of eq. ","element":"span"},{"href":"#id-60","text":"(55) ","element":"a"},{"text":"across all ","element":"span"},{"style":{"fontStyle":"italic"},"text":"l ","element":"span"},{"text":"and taking ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L","element":"span"},{"text":"th root over positive scalars, we have for ","element":"span"},{"style":{"height":14},"width":332.73,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/19-8.png","element":"img","alt":" d = 0, 1, . . . 
, D − 1,","inline":true}],[{"id":"id-61","style":{"width":"66%"},"width":1054,"height":88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/19-9.png","element":"img"}],[{"text":"Finally, let ","element":"span"},{"style":{"height":10.4},"width":22,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/19-10.png","element":"img","alt":" γ","inline":true,"padRight":true},{"text":"be a positive scaling of ","element":"span"},{"style":{"height":20.58},"width":417.98,"height":51.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/19-11.png","element":"img","alt":" β∞ such that �β∞ = γβ∞","inline":true,"padRight":true},{"text":"has unit margin. Let ","element":"span"},{"style":{"height":24.83},"width":250.62,"height":62.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/19-12.png","element":"img","alt":" ��β∞= F �β∞ =","inline":true}],[{"style":{"width":"99%"},"width":1582,"height":180,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/19-13.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"C.1.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Case of ","element":"span"},{"style":{"height":16},"width":355.07,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/19-14.png","element":"img","alt":" L > 2 or p = 2/L < 1","inline":true}],[{"text":"For ","element":"span"},{"style":{"height":16},"width":228.14,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/19-15.png","element":"img","alt":" p = 2/L < 1","inline":true},{"text":", since ","element":"span"},{"style":{"height":22.56},"width":396.7,"height":56.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/19-16.png","element":"img","alt":"�z∞ = �n∈S∞ αnyn�xn","inline":true},{"text":", eq. 
","element":"span"},{"href":"#id-61","text":"(58) ","element":"a"},{"text":"is indeed the first order stationarity ","element":"span"},{"text":"condition for eq. ","element":"span"},{"href":"#id-28","text":"(10) ","element":"a"},{"text":"as described in eq. ","element":"span"},{"href":"#id-62","text":"(11) ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-63","text":"(13)","element":"a"},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"C.1.2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Case of ","element":"span"},{"style":{"height":16},"width":355.09,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/19-17.png","element":"img","alt":" L = 2 or p = 2/L = 1","inline":true}],[{"text":"For the case of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p ","element":"span"},{"text":"= 1","element":"span"},{"text":", in addition to eq. ","element":"span"},{"href":"#id-61","text":"(58)","element":"a"},{"text":", we need to show that ","element":"span"},{"style":{"height":20.18},"width":173.32,"height":50.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/19-18.png","element":"img","alt":" γ|�z∞| ≤ 1","inline":true},{"text":". From eq. 
","element":"span"},{"href":"#id-61","text":"(58)","element":"a"},{"text":", for ","element":"span"},{"style":{"height":32.02},"width":787.83,"height":80.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/19-19.png","element":"img","alt":"L = 2 we have��β∞[d]�� ̸= 0 =⇒ γ|�z∞[d]| = 1.","inline":true}],[{"text":"We need to further show that ","element":"span"},{"style":{"height":32.03},"width":744.59,"height":80.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/19-20.png","element":"img","alt":" ∀d s.t.��β∞[d]�� ∝��β∞[d]�� = 0, γ|�z∞[d]| ≤ 1.","inline":true}],[{"style":{"width":"46%"},"width":740,"height":78,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/20-0.png","element":"img"}],[{"text":"Using Lemma ","element":"span"},{"href":"#id-51","text":"9 ","element":"a"},{"text":"for the special case of ","element":"span"},{"text":"2","element":"span"},{"text":"–layer linear convolutional network, for ","element":"span"},{"style":{"height":13.2},"width":52.88,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/20-1.png","element":"img","alt":" ∀d,","inline":true}],[{"style":{"width":"99%"},"width":1582,"height":255,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/20-2.png","element":"img"}],[{"text":"Further, from eq. 
","element":"span"},{"href":"#id-60","text":"(55)","element":"a"},{"text":", we have ","element":"span"},{"style":{"height":20.18},"width":414.85,"height":50.45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/20-3.png","element":"img","alt":" ∀d, |�w∞1 [d]|2 = |�w∞2 [d]|2","inline":true},{"text":", and hence","element":"span"}],[{"id":"id-65","style":{"width":"66%"},"width":1060,"height":73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/20-4.png","element":"img"}],[{"text":"From the convergence of complex numbers, we have the following:","element":"span"}],[{"id":"id-64","style":{"width":"93%"},"width":1488,"height":755,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/20-5.png","element":"img"}],[{"text":"In the remainder of the proof, we only consider ","element":"span"},{"style":{"height":20.18},"width":313.73,"height":50.45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/20-6.png","element":"img","alt":" d with |�z∞[d]| ̸= 0.","inline":true}],[{"text":"Consider ","element":"span"},{"style":{"height":21.49},"width":61.95,"height":53.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/20-7.png","element":"img","alt":" u(t)d","inline":true,"padRight":true},{"id":"id-72","text":"defined below,","element":"span"}],[{"id":"id-68","style":{"width":"99%"},"width":1581,"height":501,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/20-8.png","element":"img"}],[{"text":"where ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"a","element":"span"},{"text":") ","element":"span"},{"text":"follows from using ","element":"span"},{"style":{"height":17.26},"width":645.48,"height":43.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/20-9.png","element":"img","alt":" eiφ�z∞[d] = eiφ�β∞[d] = eiφ �w∞1 [d] · eiφ �w∞2 [d]","inline":true,"padRight":true},{"text":"whenever 
","element":"span"},{"style":{"height":19.52},"width":177.44,"height":48.79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/20-10.png","element":"img","alt":" β∞[d] ̸= 0","inline":true,"padRight":true},{"text":"(from eq. ","element":"span"},{"href":"#id-64","text":"(62)","element":"a"},{"text":"), and ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"b","element":"span"},{"text":") ","element":"span"},{"text":"follows from eq. ","element":"span"},{"href":"#id-65","text":"(60)","element":"a"},{"text":".","element":"span"}],[{"style":{"width":"99%"},"width":1571,"height":254,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/20-11.png","element":"img"}],[{"text":"Additionally, since ","element":"span"},{"style":{"height":16.69},"width":310.26,"height":41.73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/21-0.png","element":"img","alt":" eiφ�z(t)[d] → eiφ�z∞[d]","inline":true},{"text":", we can write ","element":"span"},{"style":{"height":29.27},"width":732.86,"height":73.17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/21-1.png","element":"img","alt":" e±i�φ�z(t)[d]−φ�z∞[d]�= 1 + δ(t)1,d ± iδ(t)2,d where","inline":true},{"style":{"height":23.89},"width":232.22,"height":59.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/21-2.png","element":"img","alt":"δ(t)1,d, δ(t)2,d → 0","inline":true,"padRight":true},{"text":"are real scalars. 
Substituting in the above equation and rearranging the terms, we have","element":"span"}],[{"id":"id-67","style":{"width":"95%"},"width":1519,"height":168,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/21-3.png","element":"img"}],[{"text":"where in ","element":"span"},{"style":{"height":28.8},"width":948.73,"height":72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/21-4.png","element":"img","alt":" (a) we define τ (t)d = iδ(t)2,d��w(t)∗2 [d] − �w(t)1 [d] · e−iφ�z∞[d]�.","inline":true}],[{"text":"The following intermediate lemma is proved in Appendix ","element":"span"},{"href":"#id-66","text":"C.1.3.","element":"a"}],[{"id":"id-69","style":{"width":"99%"},"width":1584,"height":164,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/21-5.png","element":"img"}],[{"style":{"height":24.48},"width":264.85,"height":61.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/21-6.png","element":"img","alt":"p(t) → |�z∞[d]|","inline":true},{"text":", there exists ","element":"span"},{"style":{"height":23.89},"width":148.34,"height":59.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/21-7.png","element":"img","alt":" δ(t)4,d → 0","inline":true,"padRight":true},{"text":"such that ","element":"span"},{"style":{"height":23.89},"width":543.08,"height":59.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/21-8.png","element":"img","alt":" |�z(t)[d]| = |�z∞[d]|p(t) + δ(t)4,dp(t)","inline":true},{"text":". Substituting ","element":"span"},{"text":"these representations in eq. 
","element":"span"},{"href":"#id-67","text":"(65)","element":"a"},{"text":", we have the following dynamics for ","element":"span"},{"style":{"height":16},"width":99.42,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/21-9.png","element":"img","alt":" ud(t),","inline":true}],[{"style":{"width":"82%"},"width":1302,"height":168,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/21-10.png","element":"img"}],[{"text":"where in ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"a","element":"span"},{"text":") ","element":"span"},{"text":"we have accumulated all diminishing terms into ","element":"span"},{"style":{"height":28.8},"width":545.95,"height":72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/21-11.png","element":"img","alt":" δ(t)d = δ(t)4,d�1 + δ(t)1,d + δ(t)3,d�+","inline":true}],[{"style":{"width":"27%"},"width":440,"height":72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/21-12.png","element":"img"}],[{"text":"Step 2. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Remainder of the proof: ","element":"span"},{"text":"We now prove our theorem by looking at the following quantity: For any ","element":"span"},{"style":{"height":39.09},"width":787.05,"height":97.73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/21-13.png","element":"img","alt":" d, d′ with �z∞[d],�z∞[d′] ̸= 0, define κ(t)d,d′ =��u(t)du(t)d′","inline":true}],[{"text":"We will show that whenever ","element":"span"},{"style":{"height":23.89},"width":616.1,"height":59.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/21-14.png","element":"img","alt":" |�z∞[d]| > |�z∞[d′]|, we get κ(t)d,d′ → ∞","inline":true},{"text":". Along with eq. 
","element":"span"},{"href":"#id-68","text":"(64)","element":"a"},{"text":", this would ","element":"span"},{"text":"imply that ","element":"span"},{"style":{"height":38.4},"width":253.77,"height":96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/21-15.png","element":"img","alt":" limt→∞ κ(t)d,d′ =�","inline":true}],[{"text":"have ","element":"span"},{"style":{"height":20.18},"width":354.28,"height":50.45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/21-16.png","element":"img","alt":" γ|�z∞[d]| ≤ γ|�z∞[d′]|","inline":true},{"text":". Moreover, from eq. ","element":"span"},{"href":"#id-61","text":"(57)","element":"a"},{"text":", we know that ","element":"span"},{"style":{"height":20.18},"width":232.38,"height":50.45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/21-17.png","element":"img","alt":" γ|�z∞[d′]| = 1","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"height":10.8},"width":34.74,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/21-18.png","element":"img","alt":" d′","inline":true,"padRight":true},{"text":"with","element":"span"}],[{"style":{"width":"69%"},"width":1096,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/21-19.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Showing ","element":"span"},{"style":{"fontStyle":"italic"},"text":"|","element":"span"},{"style":{"height":23.89},"width":594.12,"height":59.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/21-20.png","element":"img","alt":"�z∞[d]| > |�z∞[d′]| =⇒ κ(t)d,d′ → ∞:","inline":true}],[{"text":"For any ","element":"span"},{"style":{"height":20.18},"width":638.54,"height":50.45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/21-21.png","element":"img","alt":" 2ϵ > 0, let |�z∞[d]| − |�z∞[d′]| = 2ϵ > 0","inline":true},{"text":". 
We note that since the loss ","element":"span"},{"style":{"height":19.53},"width":309.94,"height":48.82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/21-22.png","element":"img","alt":" L(β(t)) → 0, norm","inline":true,"padRight":true},{"text":"of the gradient ","element":"span"},{"style":{"height":18.18},"width":427.1,"height":45.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/21-23.png","element":"img","alt":" p(t) = ∥z(t)∥ = ∥�zt∥ → 0","inline":true},{"text":". Hence, for any finite step size sequence ","element":"span"},{"style":{"height":16},"width":73.74,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/21-24.png","element":"img","alt":" {ηt}","inline":true},{"text":", there exists ","element":"span"},{"style":{"height":28.8},"width":966.07,"height":72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/21-25.png","element":"img","alt":"t1 such that ∀t ≥ t1 and ∀d, ηtp(t)�|�z∞[d]| + |δ(t)d |�< 0.5","inline":true,"padRight":true},{"text":"and the following inequalities hold,","element":"span"}],[{"style":{"width":"93%"},"width":1484,"height":534,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/21-26.png","element":"img"}],[{"text":"where in ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"a","element":"span"},{"text":") ","element":"span"},{"text":"follows from using ","element":"span"},{"style":{"height":16},"width":288.42,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-0.png","element":"img","alt":"1/(1+x) ≥ (1 − x)","inline":true,"padRight":true},{"text":"for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x < ","element":"span"},{"text":"1 ","element":"span"},{"text":"since ","element":"span"},{"style":{"height":28.8},"width":508.89,"height":72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-1.png","element":"img","alt":" 
ηtp(t)�|�z∞[d]| + |δ(t)d |�< 0.5","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"height":12.8},"width":116.46,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-2.png","element":"img","alt":" t ≥ t1","inline":true},{"text":", and in ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"c","element":"span"},{"text":")","element":"span"},{"text":", we absorbed all ","element":"span"},{"style":{"fontStyle":"italic"},"text":"o","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"p","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":")) ","element":"span"},{"text":"terms as ","element":"span"},{"style":{"height":23.89},"width":143.68,"height":59.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-3.png","element":"img","alt":" δ(t)d,d′p(t)","inline":true,"padRight":true},{"text":"for ","element":"span"},{"style":{"height":23.89},"width":178.29,"height":59.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-4.png","element":"img","alt":" δ(t)d,d′ → 0","inline":true,"padRight":true},{"text":"and used ","element":"span"},{"style":{"height":20.18},"width":470.48,"height":50.45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-5.png","element":"img","alt":"|�z∞[d]| − |�z∞[d′]| = 2ϵ > 0.","inline":true}],[{"id":"id-70","style":{"width":"99%"},"width":1581,"height":148,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-6.png","element":"img"}],[{"text":"Further, from the conditions of the theorem, for almost all initializations, ","element":"span"},{"style":{"height":21.49},"width":222.07,"height":53.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-7.png","element":"img","alt":" |�w(0)l [d]| > 0","inline":true,"padRight":true},{"text":"for all 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"d","element":"span"},{"text":". For step sizes ","element":"span"},{"style":{"height":16},"width":73.74,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-8.png","element":"img","alt":" {ηt}","inline":true,"padRight":true},{"text":"smaller than the local Lipschitz constant, for all finite ","element":"span"},{"style":{"height":10.8},"width":127.27,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-9.png","element":"img","alt":" t′ < ∞","inline":true},{"text":", we also have ","element":"span"},{"style":{"height":21.49},"width":221.6,"height":53.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-10.png","element":"img","alt":"|w(t′)l [d]| > 0","inline":true},{"text":". Moreover from Lemma ","element":"span"},{"href":"#id-69","text":"10, ","element":"a"},{"text":"we have that ","element":"span"},{"style":{"height":21.49},"width":292.8,"height":53.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-11.png","element":"img","alt":" |u(t)d |, |u(t)d′ | → ∞","inline":true,"padRight":true},{"text":"and hence ","element":"span"},{"style":{"height":13.19},"width":52.53,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-12.png","element":"img","alt":" ∃t3","inline":true,"padRight":true},{"text":"such that ","element":"span"},{"style":{"height":21.49},"width":310.19,"height":53.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-13.png","element":"img","alt":"∀t ≥ t3, |u(t)d | > 0","inline":true},{"text":", but for any finite ","element":"span"},{"style":{"height":21.49},"width":337.59,"height":53.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-14.png","element":"img","alt":" t′ < ∞, |u(t′)d′ | < ∞","inline":true},{"text":". 
Thus, for ","element":"span"},{"style":{"height":16},"width":335.34,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-15.png","element":"img","alt":" t0 = max{t1, t2, t3}","inline":true},{"text":", using ","element":"span"},{"text":"the above observations, we have that ","element":"span"},{"style":{"height":39.09},"width":218.19,"height":97.73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-16.png","element":"img","alt":" κ(t0)d,d′ =��u(t0)du(t0)d′","inline":true}],[{"text":"Now, using eq. ","element":"span"},{"href":"#id-70","text":"(71)","element":"a"},{"text":", for all ","element":"span"},{"style":{"height":12.8},"width":109.8,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-17.png","element":"img","alt":" t ≥ t0,","inline":true}],[{"id":"id-71","style":{"width":"100%"},"width":1587,"height":545,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-18.png","element":"img"}],[{"text":"Moreover, we have ","element":"span"},{"style":{"height":21.49},"width":166.4,"height":53.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-19.png","element":"img","alt":" u(t)d → ∞","inline":true,"padRight":true},{"text":"for at least one ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d","element":"span"},{"text":", and for any finite step sizes and finite ","element":"span"},{"style":{"height":21.49},"width":255.88,"height":53.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-20.png","element":"img","alt":" t0, |u(t0)d | < ∞.","inline":true,"padRight":true},{"text":"This then implies that for some ","element":"span"},{"style":{"height":28.8},"width":1063.45,"height":72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-21.png","element":"img","alt":" µ < ∞, exp��tu=t0 µηup(u)�→ ∞ =⇒ �tu=t0 ηup(u) → ∞","inline":true},{"text":". 
","element":"span"},{"text":"Thus, for any ","element":"span"},{"style":{"height":11.6},"width":89.31,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-22.png","element":"img","alt":" ϵ > 0","inline":true},{"text":", we also have ","element":"span"},{"style":{"height":21.59},"width":764.67,"height":53.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-23.png","element":"img","alt":"�tu=t0(1 + ϵηup(u)) ≥ ϵ �tu=t0 ηup(u) → ∞.","inline":true}],[{"text":"From eq. ","element":"span"},{"href":"#id-71","text":"(72) ","element":"a"},{"text":"and above claim, we conclude that for all ","element":"span"},{"style":{"height":23.89},"width":711.91,"height":59.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-24.png","element":"img","alt":" d, d′, if |�z∞[d]| > |�z∞[d′]|, then κ(t)d,d′ → ∞.","inline":true}],[{"text":"This completes the proof of the theorem. ","element":"span"},{"style":{"height":0},"width":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-25.png","element":"img","alt":"□","inline":true}],[{"id":"id-66","style":{"fontWeight":"bold"},"text":"C.1.3 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Proof of Lemma ","element":"span"},{"href":"#id-69","style":{"fontWeight":"bold"},"text":"10","element":"a"}],[{"style":{"width":"95%"},"width":1522,"height":84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-26.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"Recalling ","element":"span"},{"style":{"height":21.49},"width":62.62,"height":53.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-27.png","element":"img","alt":" τ (t)d","inline":true,"padRight":true},{"text":"from eq. 
","element":"span"},{"href":"#id-67","text":"(65) ","element":"a"},{"text":"and ","element":"span"},{"style":{"height":21.49},"width":61.95,"height":53.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-28.png","element":"img","alt":" u(t)d","inline":true,"padRight":true},{"text":"from eq. ","element":"span"},{"href":"#id-72","text":"(63)","element":"a"},{"text":", we have the following:","element":"span"}],[{"id":"id-73","style":{"width":"94%"},"width":1492,"height":238,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-29.png","element":"img"}],[{"text":"For all ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d ","element":"span"},{"text":"if ","element":"span"},{"style":{"height":14},"width":27,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-30.png","element":"img","alt":"�β","inline":true}],[{"href":"#id-65","style":{"height":30.96},"width":673.38,"height":77.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-31.png","element":"img","alt":"| �w(t)2 [d]|/g(t) → | �w∞1 [d]|| �w∞2 [d]| = 1 (from eq. (60)","inline":true},{"text":"), and also that ","element":"span"},{"style":{"height":17.76},"width":671.23,"height":44.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-32.png","element":"img","alt":" e−iφ�z∞[d]+iφ�β(t)[d] → e−iφ�z∞[d]+iφ�β∞[d] =","inline":true,"padRight":true},{"text":"1 ","element":"span"},{"text":"(from eq. ","element":"span"},{"href":"#id-64","text":"(62)","element":"a"},{"text":"). This along with eq. 
","element":"span"},{"href":"#id-73","text":"(73) ","element":"a"},{"text":"gives us ","element":"span"},{"style":{"height":33.28},"width":152.32,"height":83.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/22-33.png","element":"img","alt":"τ (t)du(t)d → 0.","inline":true}],[{"text":"Moreover, since ","element":"span"},{"style":{"height":24.18},"width":279.41,"height":60.45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/23-0.png","element":"img","alt":" |�β(t)[d]| → ∞","inline":true},{"text":", we have ","element":"span"},{"style":{"height":20.93},"width":137.62,"height":52.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/23-1.png","element":"img","alt":" |�w(t)2 [d]|","inline":true,"padRight":true},{"text":"or ","element":"span"},{"style":{"height":20.93},"width":285.49,"height":52.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/23-2.png","element":"img","alt":" |�w(t)2 [d]| → ∞","inline":true},{"text":". 
","element":"span"},{"text":"Further, using ","element":"span"},{"style":{"height":22.17},"width":1390.54,"height":55.42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/23-3.png","element":"img","alt":"e−iφ�z∞[d]+iφ�β(t)[d] → 1, we have |u(t)d | = |�w(t)2 [d]| + |�w(t)1 [d]|e−iφ�z∞[d]+iφ�β(t)[d] → ∞.","inline":true}],[{"text":"We now only need to show that these results also hold for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d ","element":"span"},{"text":"such that ","element":"span"},{"style":{"height":14},"width":27,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/23-4.png","element":"img","alt":"�β","inline":true}],[{"text":"assumptions of the theorem that even when ","element":"span"},{"style":{"height":14},"width":27,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/23-5.png","element":"img","alt":"�β","inline":true}],[{"style":{"height":17.26},"width":123.78,"height":43.15,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/23-6.png","element":"img","alt":"eiφ�β∞[d]","inline":true},{"text":". We now prove the lemma by showing the following steps for ","element":"span"},{"style":{"height":24.17},"width":394.11,"height":60.42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/23-7.png","element":"img","alt":" d such that �β∞[d] = 0. :","inline":true}],[{"style":{"width":"70%"},"width":1112,"height":155,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/23-8.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Proof of lemma assuming Step 1 and Step 2 hold ","element":"span"},{"text":"The above steps would imply that in eq. ","element":"span"},{"href":"#id-73","text":"(73)","element":"a"},{"text":",","element":"span"}],[{"style":{"width":"93%"},"width":1477,"height":480,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/23-9.png","element":"img"}],[{"text":"These eqs. along with eq. 
","element":"span"},{"href":"#id-73","text":"(73) ","element":"a"},{"text":"in turn prove the lemma, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i.e.","element":"span"},{"text":", ","element":"span"},{"style":{"height":33.28},"width":418.16,"height":83.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/23-10.png","element":"img","alt":"τ (t)du(t)d → 0 and |u(t)d | → ∞.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Showing Step 1 and Step 2","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Step ","element":"span"},{"text":"1. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Show ","element":"span"},{"style":{"height":32.48},"width":211.54,"height":81.21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/23-11.png","element":"img","alt":"| �w(t)1 [d]|| �w(t)2 [d]| → 1.","inline":true}],[{"style":{"width":"99%"},"width":1582,"height":227,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/23-12.png","element":"img"}],[{"text":"Note that since ","element":"span"},{"style":{"height":18.18},"width":329.16,"height":45.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/23-13.png","element":"img","alt":" |�z(t)[d]|2 → 0 and ηt","inline":true,"padRight":true},{"text":"are finite, we have that ","element":"span"},{"style":{"height":13.19},"width":52.53,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/23-14.png","element":"img","alt":" ∃t1","inline":true,"padRight":true},{"text":"such that for all ","element":"span"},{"style":{"height":18.18},"width":336.38,"height":45.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/23-15.png","element":"img","alt":" t ≥ t1, ηt|�z(t)[d]|2 ≤","inline":true,"padRight":true},{"text":"1","element":"span"},{"text":". 
From the above equation, we have the following for ","element":"span"},{"style":{"height":12.8},"width":109.79,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/23-16.png","element":"img","alt":" t ≥ t1,","inline":true}],[{"id":"id-74","style":{"width":"95%"},"width":1513,"height":302,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/23-17.png","element":"img"}],[{"text":"where ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"a","element":"span"},{"text":") ","element":"span"},{"text":"follows from iterating over ","element":"span"},{"style":{"height":18.18},"width":579.62,"height":45.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/23-18.png","element":"img","alt":" t and using |�z(t)[d]|2 ≤ 1 for t ≥ t1.","inline":true}],[{"text":"Since ","element":"span"},{"style":{"height":24.43},"width":588.97,"height":61.07,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/23-19.png","element":"img","alt":" |�β(t)[d]| = |�w(t)1 [d]| · |�w(t)2 [d]| → ∞","inline":true},{"text":", at least one of ","element":"span"},{"style":{"height":20.93},"width":293.03,"height":52.33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/23-20.png","element":"img","alt":" |�w(t)1 [d]|, |�w(t)2 [d]|","inline":true,"padRight":true},{"text":"must diverge. Without ","element":"span"},{"text":"loss of generality, let ","element":"span"},{"style":{"height":20.93},"width":1235.98,"height":52.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/23-21.png","element":"img","alt":" |�w(t)2 [d]| → ∞. Let c(t) := |�w(t)1 [d]|2 − |�w(t)2 [d]|2 with |c(t)| < ∞. 
We have","inline":true}],[{"style":{"width":"66%"},"width":1059,"height":118,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/23-22.png","element":"img"}],[{"text":"where the convergence in ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"a","element":"span"},{"text":") ","element":"span"},{"text":"follows since ","element":"span"},{"style":{"height":16},"width":177.9,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/23-23.png","element":"img","alt":" |c(t)| < ∞","inline":true,"padRight":true},{"text":"(from eq. ","element":"span"},{"href":"#id-74","text":"(76)","element":"a"},{"text":") and ","element":"span"},{"style":{"height":20.93},"width":249.53,"height":52.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/23-24.png","element":"img","alt":" |�w(t)2 [d]| → ∞.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Step ","element":"span"},{"text":"2. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Show Re","element":"span"},{"style":{"height":29.52},"width":731.83,"height":73.8,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/24-0.png","element":"img","alt":"(e−iφ�z∞[d]+iφ�β∞[d]) = 2 cos�φ�z∞[d] − φ�β∞[d]","inline":true}],[{"text":"Note that from Step 1 above, we have that ","element":"span"},{"style":{"height":32.48},"width":232.58,"height":81.21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/24-1.png","element":"img","alt":"| �w(t)1 [d]|2| �w(t)2 [d]|2 → 1","inline":true},{"text":", which implies ","element":"span"},{"style":{"height":33.05},"width":339.82,"height":82.62,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/24-2.png","element":"img","alt":" | �w(t)1 [d]|2+| �w(t)2 [d]|22|�β(t)[d]| =","inline":true}],[{"style":{"width":"99%"},"width":1578,"height":163,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/24-3.png","element":"img"}],[{"text":"Also, 
from eq. ","element":"span"},{"href":"#id-75","text":"(44)","element":"a"},{"text":", there exists ","element":"span"},{"style":{"height":23.89},"width":148.25,"height":59.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/24-4.png","element":"img","alt":" δ(t)2,d → 0","inline":true},{"text":", such that","element":"span"}],[{"id":"id-76","style":{"width":"78%"},"width":1246,"height":59,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/24-5.png","element":"img"}],[{"text":"Using the above representations, along with eq. ","element":"span"},{"href":"#id-65","text":"(59)","element":"a"},{"text":", we have the following,","element":"span"}],[{"style":{"width":"95%"},"width":1518,"height":285,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/24-6.png","element":"img"}],[{"text":"where ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"a","element":"span"},{"text":") ","element":"span"},{"text":"follows ","element":"span"},{"text":"from ","element":"span"},{"text":"substituting ","element":"span"},{"text":"eqs. 
","element":"span"},{"href":"#id-76","text":"(79)","element":"a"},{"text":"-","element":"span"},{"href":"#id-76","text":"(80)","element":"a"},{"text":", ","element":"span"},{"text":"and ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"b","element":"span"},{"text":") ","element":"span"},{"text":"follows ","element":"span"},{"text":"from ","element":"span"},{"text":"using ","element":"span"},{"style":{"height":18.18},"width":450.7,"height":45.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/24-7.png","element":"img","alt":"|�z(t)[d]| ≤ p(t) → 0","inline":true,"padRight":true},{"text":"and defining ","element":"span"},{"style":{"height":28.8},"width":868.33,"height":72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/24-8.png","element":"img","alt":" δ(t)3,d = δ(t)2,d�1 + δ(t)1,d + 1/2ηt�z(t)[d]e−iφ�β(t)[d]� +","inline":true},{"style":{"height":28.8},"width":688.5,"height":72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/24-9.png","element":"img","alt":"�z∞[d]�δ(t)1,d + 1/2ηt�z(t)[d]e−iφ�β(t)[d]�→ 0.","inline":true}],[{"text":"Denote ","element":"span"},{"style":{"height":22.62},"width":400.38,"height":56.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/24-10.png","element":"img","alt":" ∆d = φ�β∞[d] − φ�z∞[d]","inline":true},{"text":". 
Additionally, from the assumption in the theorem, we have ","element":"span"},{"style":{"height":17.76},"width":319.24,"height":44.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/24-11.png","element":"img","alt":"eiφ�β(t)[d] → eiφ�β∞[d]","inline":true},{"text":", hence there exists ","element":"span"},{"style":{"height":24.57},"width":873.52,"height":61.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/24-12.png","element":"img","alt":" δ(t)4,d → 0 such that eiφ�β(t)[d]−iφ�z∞[d] = ei∆d(1 + δ(t)4,d).","inline":true}],[{"text":"Now, from the above equation, for any ","element":"span"},{"style":{"height":13.6},"width":207.65,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/24-13.png","element":"img","alt":" t0 and t ≥ t0","inline":true},{"text":", we derive the updates for ","element":"span"},{"style":{"height":24.18},"width":141.6,"height":60.45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/24-14.png","element":"img","alt":" |�β(t)[d]|,","inline":true}],[{"id":"id-77","style":{"width":"106%"},"width":1684,"height":749,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/24-15.png","element":"img"}],[{"text":"(since ","element":"span"},{"style":{"height":23.89},"width":239.71,"height":59.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/24-16.png","element":"img","alt":" p(t), δ(t)3,d → 0","inline":true},{"text":"); in ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"b","element":"span"},{"text":") ","element":"span"},{"text":"we defined ","element":"span"},{"style":{"height":23.89},"width":863.8,"height":59.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/24-17.png","element":"img","alt":" δ(t)6,d = 1/2δ(t)∗4,d ei∆d + 1/2δ(t)3,de−i∆d + δ(t)5,d → 0; (c)","inline":true,"padRight":true},{"text":"is ","element":"span"},{"text":"obtained by iterating over 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":"; and ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"d","element":"span"},{"text":") ","element":"span"},{"text":"follows from using ","element":"span"},{"style":{"height":16},"width":300.19,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/24-18.png","element":"img","alt":" (1 + x) ≤ exp(x).","inline":true}],[{"text":"If possible, let ","element":"span"},{"style":{"height":23.89},"width":619.44,"height":59.73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/24-19.png","element":"img","alt":" cos(∆d) = −2ϵ < 0. Since |δ(t)6,d| → 0","inline":true},{"text":", and for finite step sizes ","element":"span"},{"style":{"height":16},"width":337.79,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/24-20.png","element":"img","alt":" ηtp(t) → 0, ∃t0 such","inline":true,"padRight":true},{"text":"that for all ","element":"span"},{"href":"#id-77","style":{"height":28.8},"width":1183.57,"height":72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/24-21.png","element":"img","alt":" t ≥ t0, |δ(t)6,d| < ϵ|�z∞[d]| and exp�−4ϵ|�z∞[d]|ηtp(t)�≤ 1. From eq. 
(82)","inline":true},{"text":", we now have","element":"span"}],[{"style":{"width":"72%"},"width":1146,"height":121,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/24-22.png","element":"img"}],[{"text":"Finally, for any finite step sizes and finite ","element":"span"},{"style":{"height":12.39},"width":30.39,"height":30.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/25-0.png","element":"img","alt":" t0","inline":true},{"text":", we have ","element":"span"},{"style":{"height":24.18},"width":274.47,"height":60.45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/25-1.png","element":"img","alt":" |�β(t0)[d]|2 < ∞","inline":true,"padRight":true},{"text":"and this creates a contradiction since the LHS in the above equation diverges, ","element":"span"},{"style":{"height":24.18},"width":321.56,"height":60.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/25-2.png","element":"img","alt":" |�β(t+1)[d]|2 → ∞","inline":true},{"text":". ","element":"span"},{"text":"Hence, in order for the updates in eq. ","element":"span"},{"href":"#id-77","text":"(82) ","element":"a"},{"text":"to lead to a divergent ","element":"span"},{"style":{"height":24.18},"width":171.89,"height":60.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/25-3.png","element":"img","alt":" |�β(t+1)[d]|","inline":true},{"text":", we necessarily require that","element":"span"}],[{"style":{"width":"69%"},"width":1096,"height":58,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/25-4.png","element":"img"}],[{"text":"This completes the proof of the lemma.","element":"span"}]]},{"heading":"D Computing RP(β): Proofs of Lemmas in Section 5","paragraphs":[[{"text":"In this appendix we prove the lemmas in Section ","element":"span"},{"text":"5 ","element":"span"},{"text":"that compute the form of induced bias of linear networks in the space of predictors. 
Recall that for linear predictors parameterized as ","element":"span"},{"style":{"height":16},"width":179.43,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/25-5.png","element":"img","alt":" β = P(w)","inline":true},{"text":", ","element":"span"},{"style":{"height":19.06},"width":500.07,"height":47.66,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/25-6.png","element":"img","alt":"RP(β) = minw:P(w)=β∥w∥22.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Lemma 5. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"For fully connected networks of any depth ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L > ","element":"span"},{"text":"0","element":"span"},{"style":{"fontStyle":"italic"},"text":",","element":"span"}],[{"style":{"width":"66%"},"width":1053,"height":78,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/25-7.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"Recall that for fully connected networks of any depth ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L ","element":"span"},{"style":{"fontStyle":"italic"},"text":"> ","element":"span"},{"text":"0 ","element":"span"},{"text":"with parameters ","element":"span"},{"style":{"height":18.3},"width":420.26,"height":45.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/25-8.png","element":"img","alt":"w = [wl ∈ RDl−1×Dl]Ll−1","inline":true},{"text":", the equivalent linear predictor is given by ","element":"span"},{"style":{"height":16.79},"width":434.24,"height":41.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/25-9.png","element":"img","alt":" Pfull(w) = w1w2 . . . 
wL.","inline":true}],[{"id":"id-78","style":{"width":"99%"},"width":1584,"height":393,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/25-10.png","element":"img"}],[{"text":"where ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"a","element":"span"},{"text":") ","element":"span"},{"text":"follows as arithmetic mean is greater than the geometric mean.","element":"span"}],[{"style":{"width":"89%"},"width":1417,"height":308,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/25-11.png","element":"img"}],[{"text":"This ensures that ","element":"span"},{"style":{"height":21.47},"width":867.05,"height":53.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/25-12.png","element":"img","alt":" Pfull(w) = w1w2 . . . wL = β and ∥w∥22 = L∥β∥2/L2 ","inline":true,"padRight":true},{"text":", and hence","element":"span"}],[{"id":"id-79","style":{"width":"74%"},"width":1187,"height":78,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/25-13.png","element":"img"}],[{"text":"Combining eq. ","element":"span"},{"href":"#id-78","text":"(83) ","element":"a"},{"text":"and eq. ","element":"span"},{"href":"#id-79","text":"(84)","element":"a"},{"text":", we get ","element":"span"},{"style":{"height":22.36},"width":372.91,"height":55.9,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/25-14.png","element":"img","alt":" RPfull(β) = L∥β∥2/L2","inline":true}],[{"text":"The proofs of the lemmas for computing ","element":"span"},{"style":{"height":16},"width":125.91,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/25-15.png","element":"img","alt":" RP(w)","inline":true,"padRight":true},{"text":"for diagonal and convolutional networks are similar to those for fully connected networks.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Lemma 6. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"For a depth–","element":"span"},{"style":{"fontStyle":"italic"},"text":"L ","element":"span"},{"style":{"fontStyle":"italic"},"text":"diagonal network with parameters ","element":"span"},{"style":{"height":18.3},"width":463.46,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/25-16.png","element":"img","alt":" w = [wl ∈ RD]Ll−1, we have","inline":true}],[{"style":{"width":"68%"},"width":1087,"height":78,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/25-17.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"Recall that for an ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L","element":"span"},{"text":"–layer linear diagonal networks with parameters ","element":"span"},{"style":{"height":18.3},"width":382.32,"height":45.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/25-18.png","element":"img","alt":" w = [wl ∈ RD]Ll−1, the","inline":true,"padRight":true},{"text":"equivalent linear predictor is given by ","element":"span"},{"style":{"height":16.79},"width":834.54,"height":41.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/25-19.png","element":"img","alt":" Pdiag(w) = diag(w1)diag(w2) . . . 
diag(wL−1)wL.","inline":true}],[{"text":"Let ","element":"span"},{"style":{"height":18.3},"width":350.19,"height":45.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/26-0.png","element":"img","alt":" w⋆(β) = [w⋆l (β)]Ll=1","inline":true,"padRight":true},{"text":"be the minimizer of ","element":"span"},{"style":{"height":19.56},"width":370.88,"height":48.89,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/26-1.png","element":"img","alt":" minw:Pdiag(w)=β∥w∥22","inline":true},{"text":", so that ","element":"span"},{"style":{"height":16.79},"width":317.2,"height":41.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/26-2.png","element":"img","alt":" β = Pdiag(w⋆(β))","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":19.07},"width":394.12,"height":47.67,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/26-3.png","element":"img","alt":" RPdiag(β) = ∥w⋆(β)∥22","inline":true},{"text":". 
We then have,","element":"span"}],[{"id":"id-80","style":{"width":"83%"},"width":1318,"height":218,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/26-4.png","element":"img"}],[{"text":"where ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"a","element":"span"},{"text":") ","element":"span"},{"text":"again follows as arithmetic mean is greater than the geometric mean.","element":"span"}],[{"text":"Similar to the case of fully connected networks, we now choose ","element":"span"},{"style":{"height":16},"width":154.91,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/26-5.png","element":"img","alt":" w = [wl]","inline":true,"padRight":true},{"text":"that satisfies ","element":"span"},{"style":{"height":16.79},"width":236.27,"height":41.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/26-6.png","element":"img","alt":" Pdiag(w) = β","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":24.91},"width":282.66,"height":62.27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/26-7.png","element":"img","alt":" ∥w∥22 = L∥β∥2/L2/L","inline":true},{"text":". This would ensure that,","element":"span"}],[{"style":{"width":"55%"},"width":878,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/26-8.png","element":"img"}],[{"text":"We can check that these properties are satisfied by choosing ","element":"span"},{"style":{"fontWeight":"bold"},"text":"w ","element":"span"},{"text":"as follows: for ","element":"span"},{"style":{"height":14},"width":361.8,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/26-9.png","element":"img","alt":" d = 0, 1, . . . 
D − 1, let","inline":true},{"style":{"height":19.53},"width":1135.22,"height":48.82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/26-10.png","element":"img","alt":"w1[d] = sign(β(d)) |β(d)|1/L and wl[d] = |β(d)|1/L for l = 2, 3, . . . , L.","inline":true}],[{"text":"Combining this argument with eq. ","element":"span"},{"href":"#id-80","text":"85 ","element":"a"},{"text":"concludes the proof.","element":"span"}],[{"text":"For convolutional networks, the argument is exactly the same as that for diagonal networks, adapted for complex vectors. ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Lemma 7. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"For a depth–","element":"span"},{"style":{"fontStyle":"italic"},"text":"L ","element":"span"},{"style":{"fontStyle":"italic"},"text":"convolutional network with parameters ","element":"span"},{"style":{"height":18.3},"width":463.46,"height":45.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/26-11.png","element":"img","alt":" w = [wl ∈ RD]Ll−1, we have","inline":true}],[{"style":{"width":"69%"},"width":1096,"height":77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/26-12.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Proof. 
","element":"span"},{"text":"Denote the Fourier basis coefficients of ","element":"span"},{"style":{"height":17.38},"width":573.71,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/26-13.png","element":"img","alt":" wl ∈ RD and β = Pconv(w) ∈ RD ","inline":true,"padRight":true},{"text":"in polar form as","element":"span"}],[{"style":{"width":"46%"},"width":732,"height":51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/26-14.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":18.92},"width":238.5,"height":47.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/26-15.png","element":"img","alt":" |�wl|, |�β| ∈ RD+","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":21.3},"width":320.56,"height":53.24,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/26-16.png","element":"img","alt":" φ �wl, φ�β ∈ [0, 2π)D","inline":true,"padRight":true},{"text":"are the vectors with magnitudes and phases, respec- ","element":"span"},{"text":"tively, of ","element":"span"},{"style":{"height":14},"width":100.68,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/26-17.png","element":"img","alt":" �wl, �β.","inline":true}],[{"text":"From Lemma ","element":"span"},{"href":"#id-36","text":"3, ","element":"a"},{"text":"the Fourier basis representation of ","element":"span"},{"style":{"height":16},"width":242.34,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/26-18.png","element":"img","alt":" β = Pconv(w)","inline":true,"padRight":true},{"text":"is given by","element":"span"}],[{"style":{"width":"57%"},"width":914,"height":53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/26-19.png","element":"img"}],[{"text":"where we have overloaded the notation 
","element":"span"},{"style":{"height":15.59},"width":88.88,"height":38.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/26-20.png","element":"img","alt":" Pdiag","inline":true,"padRight":true},{"text":"to denote the mapping of diagonal networks in complex vector fields, and ","element":"span"},{"style":{"height":18.3},"width":205.04,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/26-21.png","element":"img","alt":" �w = [�wl]Ll=1","inline":true},{"text":". We thus have for ","element":"span"},{"style":{"height":14},"width":332.72,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/26-22.png","element":"img","alt":" d = 0, 1, . . . , D − 1,","inline":true}],[{"style":{"width":"64%"},"width":1020,"height":120,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/26-23.png","element":"img"}],[{"text":"From orthonormality of discrete Fourier transformation, we have for all ","element":"span"},{"style":{"height":17.38},"width":401.74,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/26-24.png","element":"img","alt":" w, ∥w∥22 = ∥�w∥22. Thus,","inline":true}],[{"style":{"width":"77%"},"width":1231,"height":78,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/26-25.png","element":"img"}],[{"text":"We can now adapt the proof of diagonal networks here. 
Let ","element":"span"},{"style":{"height":18.3},"width":472.89,"height":45.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/26-26.png","element":"img","alt":" �w⋆(β) = [�w⋆l (β) ∈ CD]Ll=1","inline":true,"padRight":true},{"text":"be the ","element":"span"},{"text":"minimizer of ","element":"span"},{"style":{"height":22.49},"width":1355.67,"height":56.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/26-27.png","element":"img","alt":" min �w:�β=Pdiag( �w)∥�w∥22, so that �β = Pdiag(�w⋆(β)) and RPconv(β) = ∥�w⋆(β)∥22, and","inline":true}],[{"id":"id-81","style":{"width":"82%"},"width":1302,"height":224,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/26-28.png","element":"img"}],[{"text":"Similar to the diagonal networks, we can choose the parameters in the Fourier domain ","element":"span"},{"style":{"height":16},"width":181.98,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/27-0.png","element":"img","alt":"�w = [�wl ∈","inline":true},{"style":{"height":17.38},"width":68.68,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/27-1.png","element":"img","alt":"CD]","inline":true,"padRight":true},{"text":"to ensure that ","element":"span"},{"style":{"height":24.91},"width":597.04,"height":62.27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/27-2.png","element":"img","alt":" Pdiag(�w) = �β and ∥�w∥22 = L∥�β∥2/L2/L ","inline":true,"padRight":true},{"text":"as follows: for ","element":"span"},{"style":{"height":14.4},"width":365.71,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/27-3.png","element":"img","alt":" d = 0, 1, . . . D − 1, let","inline":true}],[{"style":{"width":"77%"},"width":1233,"height":238,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/27-4.png","element":"img"}],[{"text":"Combining this with eq. 
","element":"span"},{"href":"#id-81","text":"87 ","element":"a"},{"text":"concludes the proof.","element":"span"}]]},{"heading":"E Background Results","paragraphs":[[{"id":"id-46","style":{"fontWeight":"bold"},"text":"Theorem 11 ","element":"span"},{"text":"(Stolz–Cesaro theorem, proof in Theorem ","element":"span"},{"text":"1","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"},{"text":"22 ","element":"span"},{"text":"of ","element":"span"},{"href":"#id-82","referenceIndex":18,"text":"Muresan ","element":"a"},{"href":"#id-82","referenceIndex":18,"text":"[2009]","element":"a"},{"text":")","element":"span"},{"style":{"fontWeight":"bold"},"text":". ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Assume that ","element":"span"},{"style":{"height":16.51},"width":138.6,"height":41.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/27-5.png","element":"img","alt":"{ak}∞k=1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"height":16.51},"width":134.64,"height":41.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/27-6.png","element":"img","alt":" {bk}∞k=1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"are two sequences of real numbers such that ","element":"span"},{"style":{"height":16.51},"width":134.64,"height":41.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/27-7.png","element":"img","alt":" {bk}∞k=1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is strictly monotonic ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and diverging (i.e., monotone increasing with ","element":"span"},{"style":{"height":13.19},"width":140.16,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/27-8.png","element":"img","alt":" bk → ∞","inline":true},{"style":{"fontStyle":"italic"},"text":", or monotone decreasing with 
","element":"span"},{"style":{"height":13.19},"width":171.16,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/27-9.png","element":"img","alt":" bk → −∞","inline":true},{"style":{"fontStyle":"italic"},"text":"). Additionally, if ","element":"span"},{"style":{"height":21.55},"width":363.2,"height":53.87,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/27-10.png","element":"img","alt":" limk→∞ak+1−akbk+1−bk = L","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"exists, then ","element":"span"},{"style":{"height":18.62},"width":182.2,"height":46.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1806.00468/images/27-11.png","element":"img","alt":" limk→∞akbk ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"exists and is equal to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"}]]}],"_version":"3.3.4"},"paperNode":"$1b:props:children:props:children:0:props:product"}]]]}]}]