1b:["$","$L29",null,{"isWhiteLabelled":false,"children":["$","$Lb",null,{"pt":{"compact":0,"expanded":3},"children":[["$","$L2a",null,{"noStar":true,"publisher":true,"task":true,"params":true,"size":"xl","product":{"id":"eyJwYXBlcklEIjoiMjAwMi4wMjUxNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","updated":"2022-05-24T01:21:30.000Z","paperID":"2002.02515","published":"2020-02-06T21:17:32.000Z","authors":"[\"Feng-Lei Fan\",\"Rongjie Lai\",\"Ge Wang\"]","title":"Quasi-Equivalence of Width and Depth of Neural Networks","scoreTrending":null,"summary":"$2b","lastCheckedForCode":"2022-09-05T03:06:19.063Z","links":[{"id":"eyJ1cmwiOiJodHRwczovL3BhcGVyc3dpdGhjb2RlLmNvbS9wYXBlci9kdWFsaXR5LW9mLXdpZHRoLWFuZC1kZXB0aC1vZi1uZXVyYWwtbmV0d29ya3MifQ==","type":"pwc","url":"https://paperswithcode.com/paper/duality-of-width-and-depth-of-neural-networks","data":null}],"reposConnection":{"edges":[]},"models":[],"tags":[{"id":"eyJuYW1lIjoiZ2VuZXJhbCBjbGFzc2lmaWNhdGlvbiIsInR5cGUiOiJ0YXNrIn0=","name":"general classification","description":"In general classification, the input is a set of labeled data and the output is a model that can predict the labels of new, unseen data. This task is commonly used in various fields such as email spam detection, image recognition, and medical diagnosis, where it helps in categorizing data into predefined classes.","scoreTrending":null,"count":{"stars":2267,"papers":839,"models":1591},"__typename":"Tag"},{"id":"eyJuYW1lIjoiY2xhc3NpZmljYXRpb24iLCJ0eXBlIjoidGFzayJ9","name":"classification","description":"In classification, the input is a set of labeled data and the output is a model that can predict the label of new, unseen data. 
This task is commonly used in real-world applications like spam detection, image recognition, and medical diagnosis.","scoreTrending":null,"count":{"stars":12367,"papers":5355,"models":4048},"__typename":"Tag"}],"summaries":[],"emailsConnection":{"edges":[{"author":"ge wang","node":{"id":"eyJhZGRyZXNzIjoid2FuZ2c2QHJwaS5lZHUifQ==","address":"wangg6@rpi.edu","name":"G. Wang","avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count
":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[],"scholar":[{"thirdPartyID":"xI3IzloAAAAJ"},{"thirdPartyID":"m6ZNDewAAAAJ"},{"thirdPartyID":"VSAoUo4AAAAJ"},{"thirdPartyID":"J4b99-cAAAAJ"},{"thirdPartyID":"nf8ZFpQAAAAJ"},{"thirdPartyID":"iF1M1sIAAAAJ"},{"thirdPartyID":"pjK2mQwAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiJlYzc1YjNhYy03NGRmLTQ1MWMtYWQyOS01ZjRkY2RkMTExZTkifQ==","name":"ronggang wang","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMTcwMi4wMDI4OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1702.00288"},{"id":"eyJwYXBlcklEIjoiMTcwOC4wMDk2MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1708.00961"},{"id":"eyJwYXBlcklEIjoiMjAwMS4wMjUyMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2001.02522"},{"id":"eyJwYXBlcklEIjoiMTgwMi4wNTY1NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1802.05656"},{"id":"eyJwYXBlcklEIjoiMjMxMC4wNzUyMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2310.07521"},{"id":"eyJwYXBlcklEIjoiMTgwNS4wMDU4NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1805.00587"},{"id":"eyJwYXBlcklEIjoiMTgwOC4wNDI1NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1808.04256"},{"id":"eyJwYXBlcklEIjoiMjQwMy4wODMxOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2403.08319"},{"id":"eyJwYXBlcklEIjoiMTkwNi4wMDM2MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1906.00363"},{"id":"eyJwYXBlcklEIjoiMTcwNy4wOTYzNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1707.09636"},{"id":"eyJwYXBlcklEIjoiMjAxMS4wM
zM4NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2011.03384"},{"id":"eyJwYXBlcklEIjoiMjEwMy4wOTM4MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2103.09382"},{"id":"eyJwYXBlcklEIjoiMTkwNy4wMzA2MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1907.03063"},{"id":"eyJwYXBlcklEIjoiMjIwOS4xMjEwNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2209.12104"},{"id":"eyJwYXBlcklEIjoiMjQwMS4wMjA1MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2401.02051"},{"id":"eyJwYXBlcklEIjoiMjEwNC4xNDUyOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2104.14528"},{"id":"eyJwYXBlcklEIjoiMjQwMy4xMjc2NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2403.12766"},{"id":"eyJwYXBlcklEIjoiMTgwOC4wMDA5OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1808.00098"},{"id":"eyJwYXBlcklEIjoiMjMwNS4xMjQyMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2305.12421"},{"id":"eyJwYXBlcklEIjoiMTkwOC4wMTYxMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1908.01612"},{"id":"eyJwYXBlcklEIjoiMjQwNi4xMjk3NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2406.12975"},{"id":"eyJwYXBlcklEIjoiMjIwNS4wMDEyMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2205.00122"},{"id":"eyJwYXBlcklEIjoiMjEwNi4wMTU2MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.01561"},{"id":"eyJwYXBlcklEIjoiMjIwOS4xNTEzNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2209.15136"},{"id":"eyJwYXBlcklEIjoiMjExMC4wMjAyNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.02027"},{"id":"eyJwYXBlcklEIjoiMTkwMi4wNzY4NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1902.07687"},{"id":"eyJwYXBlcklEIjoiMjMwMy4wOTAzOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2303.09038"},{"id":"eyJwYXBlcklEIjoiMTgxMS4wOTAwMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1811.09003"
},{"id":"eyJwYXBlcklEIjoiMjIxMS4xMDM4OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2211.10388"},{"id":"eyJwYXBlcklEIjoiMTgxMC4xMzA1OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1810.13059"},{"id":"eyJwYXBlcklEIjoiMjMwNy4xNTQyMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2307.15421"},{"id":"eyJwYXBlcklEIjoiMTYxMS4wNjY1MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1611.06651"},{"id":"eyJwYXBlcklEIjoiMTgxMi4xMTY3NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1812.11675"},{"id":"eyJwYXBlcklEIjoiMjAwNi4xMjcwMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.12700"},{"id":"eyJwYXBlcklEIjoiMjMwNC4wOTU3MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2304.09571"},{"id":"eyJwYXBlcklEIjoiMTkwMS4wNTU5MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1901.05593"},{"id":"eyJwYXBlcklEIjoiMjMxMC4wNTIzNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2310.05237"},{"id":"eyJwYXBlcklEIjoiMjAwMi4xMTg2MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.11863"},{"id":"eyJwYXBlcklEIjoiMjMwNS4xNzA0MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2305.17041"},{"id":"eyJwYXBlcklEIjoiMTkwNy4wMTI2MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1907.01262"},{"id":"eyJwYXBlcklEIjoiMjMwMy4xMjg2MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2303.12861"},{"id":"eyJwYXBlcklEIjoiMjAwMi4wMjUxNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.02515"},{"id":"eyJwYXBlcklEIjoiMjQwMi4xNjg5MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2402.16891"},{"id":"eyJwYXBlcklEIjoiMjMwNC4wMjY0OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2304.02649"},{"id":"eyJwYXBlcklEIjoiMjMwMS4wODgxNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2301.08815"},{"id":"eyJwYXBlcklEIjoiMjAwNy4wMzExOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publish
er":"arxiv","paperID":"2007.03119"},{"id":"eyJwYXBlcklEIjoiMjIwMS4wMTUwMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2201.01501"},{"id":"eyJwYXBlcklEIjoiMjExMC4wMzU4OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.03588"},{"id":"eyJwYXBlcklEIjoiMTkwNy4wOTk1MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1907.09951"},{"id":"eyJwYXBlcklEIjoiMjEwOS4xMjQ4NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2109.12484"},{"id":"eyJwYXBlcklEIjoiMjEwMy4xMzU1NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2103.13557"},{"id":"eyJwYXBlcklEIjoiMjMxMC4xNTQ5NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2310.15494"},{"id":"eyJwYXBlcklEIjoiMTgxMS4wMzY5MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1811.03691"},{"id":"eyJwYXBlcklEIjoiMjMwMi4xMDYzMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2302.10630"},{"id":"eyJwYXBlcklEIjoiMjAwOC4wNTkyNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2008.05925"},{"id":"eyJwYXBlcklEIjoiMjQwNi4wOTgyMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2406.09822"},{"id":"eyJwYXBlcklEIjoiMTkxMi4wNDI3OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1912.04278"},{"id":"eyJwYXBlcklEIjoiMTkxMS4wNTg4MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1911.05880"},{"id":"eyJwYXBlcklEIjoiMjQwOS4xNDMxNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2409.14316"},{"id":"eyJwYXBlcklEIjoiMjAxMS4xMzY3NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2011.13675"},{"id":"eyJwYXBlcklEIjoiMjEwOC4wNjc0MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2108.06743"},{"id":"eyJwYXBlcklEIjoiMjMwMy4wNTcyNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2303.05725"},{"id":"eyJwYXBlcklEIjoiMjQwMy4xNjA4MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2403.16080"},{"id":"eyJwYXBlcklEIjoiMjIwNy4xMTY3OCIsInB1
Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2207.11678"},{"id":"eyJwYXBlcklEIjoiMjIxMS4wNzI3MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2211.07273"},{"id":"eyJwYXBlcklEIjoiMjMxMC4wNjk0OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2310.06949"},{"id":"eyJwYXBlcklEIjoiMjMwNi4wMjY0NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2306.02644"},{"id":"eyJwYXBlcklEIjoiMjQwNy4wNzA5OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2407.07099"},{"id":"eyJwYXBlcklEIjoiMjIwNC4xMTUxNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2204.11515"},{"id":"eyJwYXBlcklEIjoiMjQwMy4wNjEyOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2403.06128"},{"id":"eyJwYXBlcklEIjoiMjIwMy4xNTcyNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2203.15725"},{"id":"eyJwYXBlcklEIjoiMjIwNC4wNzk5NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2204.07994"},{"id":"eyJwYXBlcklEIjoiMjQwOC4wNDY2NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2408.04665"},{"id":"eyJwYXBlcklEIjoiMjIwNi4wNjQ2MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2206.06461"},{"id":"eyJwYXBlcklEIjoiMTkxMC4wNjc0OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1910.06749"},{"id":"eyJwYXBlcklEIjoiMTcwMy4wNDA4OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1703.04088"},{"id":"eyJwYXBlcklEIjoiMTkwMy4xMDcxNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1903.10716"},{"id":"eyJwYXBlcklEIjoiMTgwMS4wMTExNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1801.01117"},{"id":"eyJwYXBlcklEIjoiMjEwMy4wNTk2MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2103.05961"},{"id":"eyJwYXBlcklEIjoiMjEwOC4xMjQ5MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2108.12492"},{"id":"eyJwYXBlcklEIjoiMjEwOS4xNDE1MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2109.14151"},{"id":"ey
JwYXBlcklEIjoiMjExMS4wODIyNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2111.08227"},{"id":"eyJwYXBlcklEIjoiMjExMS4xNTA0MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2111.15040"},{"id":"eyJwYXBlcklEIjoiMjIwMS4xMTQxMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2201.11410"},{"id":"eyJwYXBlcklEIjoiMjMwMy4wMjk1OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2303.02959"},{"id":"eyJwYXBlcklEIjoiMjIwNi4wNzY2NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2206.07665"},{"id":"eyJwYXBlcklEIjoiMjIwNi4xMTExOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2206.11118"},{"id":"eyJwYXBlcklEIjoiMjMwNS4wNzUxOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2305.07519"},{"id":"eyJwYXBlcklEIjoiMjIwNy4wNzc0MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2207.07743"},{"id":"eyJwYXBlcklEIjoiMjIwNy4wOTAzMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2207.09031"},{"id":"eyJwYXBlcklEIjoiMjMwNS4xNzA1MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2305.17050"},{"id":"eyJwYXBlcklEIjoiMjE0NTAiLCJwdWJsaXNoZXIiOiJjdnByIn0=","publisher":"cvpr","paperID":"21450"},{"id":"eyJwYXBlcklEIjoiMjE2MTMiLCJwdWJsaXNoZXIiOiJjdnByIn0=","publisher":"cvpr","paperID":"21613"},{"id":"eyJwYXBlcklEIjoiMjIwOTgiLCJwdWJsaXNoZXIiOiJjdnByIn0=","publisher":"cvpr","paperID":"22098"},{"id":"eyJwYXBlcklEIjoiMjQwMS4xMzI3MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2401.13270"},{"id":"eyJwYXBlcklEIjoiMjIxMC4xNzAzOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2210.17039"},{"id":"eyJwYXBlcklEIjoiMjQwMS4wODE1NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2401.08154"},{"id":"eyJwYXBlcklEIjoiMjQwMi4wMTI4OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2402.01289"},{"id":"eyJwYXBlcklEIjoiMjMxMC4wMzExOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2310.03118"},{"id":"eyJwYXBlck
lEIjoiMjMxMC4wMjk2NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2310.02964"},{"id":"eyJwYXBlcklEIjoiNzE2NTMiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"71653"},{"id":"eyJwYXBlcklEIjoiMjQwMy4xMjMzMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2403.12331"},{"id":"eyJwYXBlcklEIjoiNzM1ODAiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"73580"},{"id":"eyJwYXBlcklEIjoiMjQwNC4wNjM1MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2404.06353"},{"id":"eyJwYXBlcklEIjoiMjQwNS4xMjI2MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2405.12262"},{"id":"eyJwYXBlcklEIjoiMjQwNi4wMjQ5NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2406.02495"},{"id":"eyJwYXBlcklEIjoiMjQwNi4wNzY0NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2406.07645"},{"id":"eyJwYXBlcklEIjoiMzEyNTciLCJwdWJsaXNoZXIiOiJjdnByIn0=","publisher":"cvpr","paperID":"31257"},{"id":"eyJwYXBlcklEIjoiMzA3NTgiLCJwdWJsaXNoZXIiOiJjdnByIn0=","publisher":"cvpr","paperID":"30758"},{"id":"eyJwYXBlcklEIjoiMjQwNy4wNTgxMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2407.05810"},{"id":"eyJwYXBlcklEIjoiMjQwOS4wMzYzNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2409.03634"},{"id":"eyJwYXBlcklEIjoiMjQwOS4xNTcxNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2409.15715"}]}]}},{"author":"rongjie 
lai","node":{"id":"eyJhZGRyZXNzIjoibGFpckBycGkuZWR1In0=","address":"lair@rpi.edu","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[],"scholar":[{"thirdPartyID":"Wp3DnKUAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiI3ZTBlZmJkOS05MGExLTRlOGUtYjMxZC00N2M5ZDYxNDk4ZDcifQ==","name":"rongjie lai","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMTgwOS4wNzM5OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1809.07399"},{"id":"eyJwYXBlcklEIjoiMTkxMi4xMDA5NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1912.10094"},{"id":"eyJwYXBlcklEIjoiMTgwNS4wNzg1NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1805.07857"},{"id":"eyJwYXBlcklEIjoiMTgwNC4wNDMxMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1804.04310"},{"id":"eyJwYXBlcklEIjoiMjAwOS4wMjQzOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2009.02439"},{"id":"eyJwYXBlcklEIjoiMjExMC4wNjA4MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.06081"},{"id":"eyJwYXBlcklEIjoiMTkwNS4xMjIxOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1905.12218"},{"id":"eyJwYXBlcklEIjoiMjAwMi4wMjUxNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.02515"},{"id":"eyJwYXBlcklEIjoiMjAwNS4xMTYyMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2005.11622"},{"id":"eyJwYXBlcklEIjoiMjMwMy4wOTg2MyIsInB1Ymxpc2hlciI6ImFyeG
l2In0=","publisher":"arxiv","paperID":"2303.09863"},{"id":"eyJwYXBlcklEIjoiMjAwNy4xMzA0OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2007.13049"},{"id":"eyJwYXBlcklEIjoiMjIwNi4xNDk5MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2206.14990"},{"id":"eyJwYXBlcklEIjoiMTgwOC4xMDA3MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1808.10073"},{"id":"eyJwYXBlcklEIjoiMjIwMS4wNTI3OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2201.05279"},{"id":"eyJwYXBlcklEIjoiMTcwMi4wMjY4MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1702.02680"},{"id":"eyJwYXBlcklEIjoiMjAwMy4wODY4MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2003.08680"}]}]}},{"author":"feng lei fan","node":{"id":"eyJhZGRyZXNzIjoiZmFuZjJAcnBpLmVkdSJ9","address":"fanf2@rpi.edu","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[],"scholar":[{"thirdPartyID":"pjK2mQwAAAAJ"},{"thirdPartyID":"YPmyK2wAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiI5ZDlkODMwYy0xY2JmLTRiZWQtYTk2MS1hMzA3ZGFjN2Q3MzEifQ==","name":"fenglei 
fan","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMTgxMS4wOTAwMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1811.09003"},{"id":"eyJwYXBlcklEIjoiMTgxMi4xMTY3NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1812.11675"},{"id":"eyJwYXBlcklEIjoiMTgwNy4wMzIxNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1807.03215"},{"id":"eyJwYXBlcklEIjoiMjAwMi4wMjUxNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.02515"},{"id":"eyJwYXBlcklEIjoiMTgwMi4wNTM4NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1802.05386"}]},{"id":"eyJ1aWQiOiIwOWY5OWRjMC0xODY4LTQ1OWItYjQ3OS0wYTBhODVlOGI4YTEifQ==","name":"feng lei fan","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMjAxMS4wMzM4NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2011.03384"},{"id":"eyJwYXBlcklEIjoiMTgxMS4wOTAwMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1811.09003"},{"id":"eyJwYXBlcklEIjoiMjExMC4wNjA4MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.06081"},{"id":"eyJwYXBlcklEIjoiMTgxMi4xMTY3NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1812.11675"},{"id":"eyJwYXBlcklEIjoiMTgwNy4wMzIxNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1807.03215"},{"id":"eyJwYXBlcklEIjoiMjAwMi4wMjUxNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.02515"},{"id":"eyJwYXBlcklEIjoiMjQwNy4wOTU4MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2407.09580"},{"id":"eyJwYXBlcklEIjoiMjIxMC4wNTQzNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2210.05436"},{"id":"eyJwYXBlcklEIjoiMjIwNC4wMTcwNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2204.01707"},{"id":"eyJwYXBlcklEIjoiMTgwMi4wNTM4NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1802.05386"},{"id":"eyJwYXBlcklEIjoiMjEwOC4xMjg2MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2108.12862"},{"id":"eyJwYXBlcklEI
joiMjMwNS4wNzgxNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2305.07814"},{"id":"eyJwYXBlcklEIjoiMjMwNy4xNjM2MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2307.16363"},{"id":"eyJwYXBlcklEIjoiMjMxMC4xOTQ3NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2310.19477"}]}]}}]},"__typename":"paper","authorArray":["Feng-Lei Fan","Rongjie Lai","Ge Wang"]}}],["$","$L18",null,{"container":true,"columns":100,"spacing":{"compact":0,"expanded":2,"large":3},"children":[["$","$L18",null,{"size":{"compact":100,"expanded":100,"large":68},"children":[["$","$7",null,{"children":["$","$L2c",null,{"publisher":"arxiv","paperID":"2002.02515","product":{"paper":"$1b:props:children:props:children:0:props:product","models":"$1b:props:children:props:children:0:props:product:models"},"isWhiteLabelled":false}]}],["$","$7",null,{"children":["$","$L2d",null,{"article":"$L2e","model":"$undefined"}]}]]}],["$","$L18",null,{"size":"grow","children":["$","$L2f",null,{}]}]]}],["$","$7",null,{"children":null}],[["$","audio",null,{"id":"tts"}],["$","$L30",null,{"paperID":"2002.02515","publisher":"arxiv","paperJSON":{"title":"Quasi-Equivalence of Width and Depth of Neural Networks","paperID":"2002.02515","avgLineHeight":13.55,"imgScale":4,"sections":[{"heading":"Abstract","paragraphs":[[{"text":"$31","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Keywords: ","element":"span"},{"text":"Deep networks, wide networks, ReLU networks, quasi-equivalence, network transformation","element":"span"}]]},{"heading":"1. 
Introduction","paragraphs":[[{"text":"Over the past years, deep learning ","element":"span"},{"href":"#id-0","text":"(Goodfellow et al., ","element":"a"},{"href":"#id-0","text":"2016; ","element":"a"},{"href":"#id-1","text":"Fan et al., ","element":"a"},{"href":"#id-1","text":"2018a) ","element":"a"},{"text":"has become the mainstream approach of machine learning and achieved the state-of-the-art performance in many important tasks ","element":"span"},{"href":"#id-2","text":"(Dahl et al., ","element":"a"},{"href":"#id-2","text":"2011; ","element":"a"},{"href":"#id-3","text":"Kumar et al., ","element":"a"},{"href":"#id-3","text":"2016; ","element":"a"},{"href":"#id-4","text":"Chen et al., ","element":"a"},{"href":"#id-4","text":"2017; ","element":"a"},{"href":"#id-5","text":"Wang, ","element":"a"},{"href":"#id-5","text":"2016)","element":"a"},{"text":". One of the key reasons that accounts for the success of deep learning is the increased depth, which allows a hierarchical representation of features. There are a number of papers dedicated to explaining why deep networks are better than shallow ones. Encouraging progress has been made along this direction. 
The idea to show the superiority of deep networks is basically to find a special family of functions that are very hard to be approximated by a shallow network but easy to be approximated by a deep network, or that a deep network can express complicated functions that a wide network could not ","element":"span"},{"href":"#id-6","text":"(Szymanski and McCane, ","element":"a"},{"href":"#id-6","text":"2014; ","element":"a"},{"href":"#id-7","text":"Cohen et al., ","element":"a"},{"href":"#id-7","text":"2016; ","element":"a"},{"href":"#id-8","text":"Mhaskar and Poggio, ","element":"a"},{"href":"#id-8","text":"2016; ","element":"a"},{"href":"#id-9","text":"Eldan and Shamir, ","element":"a"},{"href":"#id-9","text":"2016; ","element":"a"},{"href":"#id-10","text":"Montufar ","element":"a"},{"href":"#id-10","text":"et al., ","element":"a"},{"href":"#id-10","text":"2014; ","element":"a"},{"href":"#id-11","text":"Bianchini and Scarselli, ","element":"a"},{"href":"#id-11","text":"2014)","element":"a"},{"text":". For example, in ","element":"span"},{"href":"#id-9","text":"Eldan and Shamir ","element":"a"},{"href":"#id-9","text":"(2016)","element":"a"},{"text":", a special class of radial functions was constructed so that a one-hidden-layer network needs to use an exponential number of neurons to obtain a good approximation, but a two-hidden-layer network only requires a polynomial number of neurons for the same purpose. With the number of linear regions as the complexity measure, ","element":"span"},{"href":"#id-10","text":"Montufar et al. ","element":"a"},{"href":"#id-10","text":"(2014) ","element":"a"},{"text":"showed that the number of linear regions grows exponentially with the depth of a network but only polynomially with the width of a network. In ","element":"span"},{"href":"#id-11","text":"Bianchini and Scarselli ","element":"a"},{"href":"#id-11","text":"(2014)","element":"a"},{"text":", a topological measure was utilized to characterize the complexity of functions. 
Then, it was shown that deep networks can represent more complex functions than what the shallow counterparts express. Besides, width-bounded but depth-unbounded universal approximators were also developed ","element":"span"},{"href":"#id-12","text":"(Lu et al., ","element":"a"},{"href":"#id-12","text":"2017; ","element":"a"},{"href":"#id-13","text":"Lin and Jegelka, ","element":"a"},{"href":"#id-13","text":"2018; ","element":"a"},{"href":"#id-14","text":"Fan et al., ","element":"a"},{"href":"#id-14","text":"2018c) ","element":"a"},{"text":"in analogy to the depth-bounded but width-unbounded universal approximators ","element":"span"},{"href":"#id-15","text":"(Funahashi, ","element":"a"},{"href":"#id-15","text":"1989; ","element":"a"},{"href":"#id-16","text":"Hornik et al., ","element":"a"},{"href":"#id-16","text":"1989)","element":"a"},{"text":".","element":"span"}],[{"text":"Recently, the effects of width are discussed by more and more studies ","element":"span"},{"href":"#id-17","text":"(Cheng et al., ","element":"a"},{"href":"#id-17","text":"2016; ","element":"a"},{"href":"#id-18","text":"Chen and Liu, ","element":"a"},{"href":"#id-18","text":"2017; ","element":"a"},{"href":"#id-19","text":"Zagoruyko and Komodakis, ","element":"a"},{"href":"#id-19","text":"2016)","element":"a"},{"text":". Since width and depth are the most basic topology measures of a neural network, exploring the roles of width and depth in neural networks is a problem of strong interest and importance. Currently, there exist both width-bounded and depth-bounded universal approximators. Since both width-bounded and depth-bounded networks can represent any function, they can represent each other as well, which suggests the width-depth equivalence of neural networks. Nevertheless, how a neural network learns a mapping is quite different from the way used in proving the universal approximation. 
Moreover, the core of the width-depth conversion is to employ a network to learn another network instead of any function. Therefore, the width-depth conversion based on universal approximation falls short of capturing the relationship between width and depth.","element":"span"}],[{"text":"Specifically, we argue that the width-depth conversion via universal approximation is simplistic, inefficient, and lacking in insight: 1) (Simplistic) As mentioned earlier, the way used in enabling universal approximation is to divide a target function into many functions over tiny hypercubes. In practice, a network usually does not do so, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"e.g.","element":"span"},{"text":"$32","element":"span"},{"style":{"fontStyle":"italic"},"text":"e.g.","element":"span"},{"text":", how to non-trivially do transformation between a wide and a deep network, and what the mechanism of interaction is between the width and depth of a network.","element":"span"}],[{"text":"To fill this gap, inspired by the De Morgan law, here we demonstrate from two perspectives that the width and depth of neural networks are quasi-equivalent. The first perspective leverages that a ReLU network is a piecewise linear function over polytopes, while the second perspective utilizes the nested structure of deep networks and the parallel structure of wide networks. 
Specifically, in the first perspective, we revisit the De Morgan law:","element":"span"}],[{"style":{"width":"77%"},"width":1343,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/2-0.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":15.42},"width":44.73,"height":38.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/2-1.png","element":"img","alt":" Ai","inline":true,"padRight":true},{"text":"is a propositional rule (","element":"span"},{"style":{"height":17.6},"width":748.4,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/2-2.png","element":"img","alt":"e.g., IF input ∈ [ai, bi]m, THEN input","inline":true,"padRight":true},{"text":"belongs to some class), and such rules are disjoint. A neural network can be linked to a rule-based system such as a collection of propositional rules. Straightforwardly, we can construct either a deep network to realize a union of propositional rules (left side) or a wide network that realizes the complement of the intersection of those rules after complement (right side). As a result, the constructed deep and wide networks are equivalent to each other. Furthermore, we elaborate the quasi-equivalence of general regression and classification networks by constructing two transforms mapping an arbitrary ReLU network to a wide network and a deep network, respectively, thereby verifying a general quasi-equivalence of the width and depth of ReLU networks. ","element":"span"},{"text":"Our constructive scheme is largely based on the fact that a ReLU network partitions the space into polytopes ","element":"span"},{"href":"#id-20","text":"(Chu et al., ","element":"a"},{"href":"#id-20","text":"2018)","element":"a"},{"text":". 
This enables us to have a simplicial complex in the space and then to establish a quasi-equivalence of networks using the essential building blocks, fan-shaped (more generally, hyper-cone-shape) functions, in the form of modularized ReLU networks.","element":"span"}],[{"id":"id-27","text":"Table 1: Network structures and complexities through transformation of regression and ","element":"figcaption","subtype":"caption"},{"text":"classification networks. ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"D ","element":"figcaption","subtype":"caption"},{"text":"is the input dimension, and ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"M ","element":"figcaption","subtype":"caption"},{"text":"is the complexity measure of a function class represented by ReLU networks.","element":"figcaption","subtype":"caption"}],[{"style":{"width":"68%"},"width":1176,"height":223,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/2-3.png","element":"img"}],[{"text":"In the second perspective, we replace the mainstream artificial neuron type with a quadratic counterpart and extend our first perspective by utilizing the factorization and continued fraction representations of the same univariate polynomial to construct wide and deep networks, respectively. Specifically, a univariate polynomial function can be expressed as follows:","element":"span"}],[{"id":"id-25","style":{"width":"89%"},"width":1548,"height":456,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/2-4.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":17.82},"width":552.78,"height":44.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/3-0.png","element":"img","alt":" ai ̸= 0, and rj, sj, tj and bl, cl","inline":true,"padRight":true},{"text":"are related. 
Previously, inspired by neuronal diversity, our group designed the quadratic neuron ","element":"span"},{"href":"#id-1","text":"(Fan et al., ","element":"a"},{"href":"#id-1","text":"2018a) ","element":"a"},{"text":"that replaces the inner product in a conventional neuron with a quadratic function. Due to the merits of the idea, networks based on quadratic neurons have been increasingly studied and applied ","element":"span"},{"href":"#id-21","text":"(Bu and ","element":"a"},{"href":"#id-21","text":"Karpatne, ","element":"a"},{"href":"#id-21","text":"2021; ","element":"a"},{"href":"#id-22","text":"Ji et al., ","element":"a"},{"href":"#id-22","text":"2021; ","element":"a"},{"href":"#id-23","text":"Mantini and Shah, ","element":"a"},{"href":"#id-23","text":"2021; ","element":"a"},{"href":"#id-24","text":"Xu et al., ","element":"a"},{"href":"#id-24","text":"2022)","element":"a"},{"text":". Here, we can construct a wide quadratic network and a deep quadratic network to implement the left side and right side of Eq. ","element":"span"},{"href":"#id-25","text":"(2)","element":"a"},{"text":", respectively. This establishes the equivalence between wide and deep quadratic networks. Finally, we generalize such an equivalence into a multivariate setting based on the Kolmogorov-Arnold theorem ","element":"span"},{"href":"#id-26","text":"(Kolmogorov, ","element":"a"},{"href":"#id-26","text":"1956)","element":"a"},{"text":".","element":"span"}],[{"text":"Our main contribution is the establishment of the width-depth quasi-equivalence of neural networks. We summarize our main results on ReLU networks (the first perspective) and quadratic networks (the second perspective) in Tables ","element":"span"},{"href":"#id-27","text":"1 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-28","text":"2, ","element":"a"},{"text":"respectively. 
Specifically, Table 1 lists the width and depth of the wide and deep networks constructed in our first perspective, where the complexity measure ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M ","element":"span"},{"text":"is the minimum number of simplices needed to cover the polytopes formed by a ReLU network. Table 2 shows the width and depth of the wide and deep quadratic networks constructed in our second perspective, where the complexity measures ","element":"span"},{"style":{"height":15.02},"width":211.65,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/3-1.png","element":"img","alt":" K1 and K2","inline":true,"padRight":true},{"text":"are the degrees of polynomials to represent a function of interest. Clearly, given a complexity measure, the width of the constructed wide network is greater than the depth, while the depth of the constructed deep network is greater than the width.","element":"span"}],[{"id":"id-28","text":"Table 2: Network structures and complexities through the extension of the De Morgan law. ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"D ","element":"figcaption","subtype":"caption"},{"text":"is the input dimension, and ","element":"figcaption","subtype":"caption"},{"style":{"height":15.02},"width":203.16,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/3-2.png","element":"img","alt":" K1 and K2","inline":true,"padRight":true},{"text":"are the complexity measure of a function class.","element":"figcaption","subtype":"caption"}],[{"style":{"width":"68%"},"width":1179,"height":144,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/3-3.png","element":"img"}],[{"text":"To put our contributions in perspective, we would like to mention relevant studies. ","element":"span"},{"href":"#id-29","text":"Jacot ","element":"a"},{"href":"#id-29","text":"et al. 
","element":"a"},{"href":"#id-29","text":"(2018) ","element":"a"},{"text":"proposed the theory of the neural tangent kernel (NTK), which provides a useful lens to understand a network when the width of a network goes to infinity. ","element":"span"},{"href":"#id-30","text":"Kawaguchi ","element":"a"},{"href":"#id-30","text":"et al. ","element":"a"},{"href":"#id-30","text":"(2019) ","element":"a"},{"text":"analyzed the effect of width and depth on the quality of local minima. They showed that the quality of local minima improves toward the global minima as depth and width become larger. ","element":"span"},{"href":"#id-31","text":"Levine et al. ","element":"a"},{"href":"#id-31","text":"(2020) ","element":"a"},{"text":"revealed the width-depth interplay in a self-attention network. ","element":"span"},{"text":"We discuss the width of neural networks as related to NTK in the ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Supplementary Information I. ","element":"span"},{"text":"To the best of our knowledge, our study is the first work to reveal the width-depth quasi-equivalence of neural networks.","element":"span"}]]},{"heading":"2. Quasi-Equivalence by De Morgan’s Law","paragraphs":[[{"style":{"fontWeight":"bold"},"text":"2.1 Preliminaries","element":"span"}],[{"text":"For convenience, we use ","element":"span"},{"style":{"height":17.6},"width":334.74,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/3-4.png","element":"img","alt":" σ(x) = max{0, x}","inline":true,"padRight":true},{"text":"to denote the ReLU function. We mainly discuss ReLU networks in this work, thus all networks in the rest of this paper are referred as ReLU networks unless otherwise specified. 
At the same time, we focus on the fully-connected ReLU networks.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Definition 1 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A regression network is a network with continuous outputs, while a classification network is a network that produces categorical outputs (for example, ","element":"span"},{"style":{"height":17.6},"width":304.4,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/4-0.png","element":"img","alt":" {0, 1, · · · , 9} for","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"digit recognition). The classification network is obtained by thresholding a ReLU network in the last layer. In this study, we investigate a classification network with binary labels without loss of generality.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Definition 2 (Width and depth of a feedforward network ","element":"span"},{"href":"#id-32","style":{"fontWeight":"bold"},"text":"(Arora et al., ","element":"a"},{"href":"#id-32","style":{"fontWeight":"bold"},"text":"2016)","element":"a"},{"style":{"fontWeight":"bold"},"text":") ","element":"span"},{"style":{"fontStyle":"italic"},"text":"For any number of hidden layers ","element":"span"},{"style":{"height":13.2},"width":113.45,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/4-1.png","element":"img","alt":" k ∈ N","inline":true},{"style":{"fontStyle":"italic"},"text":", input and output dimensions ","element":"span"},{"style":{"height":16.38},"width":455.4,"height":40.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/4-2.png","element":"img","alt":" w0, wk+1 ∈ N, a Rw0 →","inline":true},{"style":{"height":12.33},"width":109.48,"height":30.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/4-3.png","element":"img","alt":"Rwk+1 ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"feedforward network is 
given by specifying a sequence of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"style":{"fontStyle":"italic"},"text":"natural numbers ","element":"span"},{"style":{"height":11.2},"width":265.94,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/4-4.png","element":"img","alt":" w1, w2, . . . , wk","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"representing widths of the hidden layers, a set of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"style":{"fontStyle":"italic"},"text":"affine transformations ","element":"span"},{"style":{"height":14.95},"width":318.19,"height":37.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/4-5.png","element":"img","alt":" Ti : Rwi−1 → Rwi","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i ","element":"span"},{"text":"= 1","element":"span"},{"style":{"fontStyle":"italic"},"text":", . . . , k ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and a linear transformation ","element":"span"},{"style":{"height":16.38},"width":386.25,"height":40.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/4-6.png","element":"img","alt":" Tk+1 : Rwk → Rwk+1 ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"corresponding to weights of the hidden layers. 
The function ","element":"span"},{"style":{"height":16.4},"width":270.12,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/4-7.png","element":"img","alt":" f : Rn1 → Rn2 ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"computed or represented by this network is","element":"span"}],[{"style":{"width":"69%"},"width":1194,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/4-8.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where ","element":"span"},{"style":{"height":8},"width":22,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/4-9.png","element":"img","alt":" ◦","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"denotes function composition, and ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/4-10.png","element":"img","alt":" σ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is an activation function. The depth of a ReLU DNN is defined as ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"+ 1","element":"span"},{"style":{"fontStyle":"italic"},"text":". The width of a ReLU DNN is ","element":"span"},{"text":"max ","element":"span"},{"style":{"height":17.6},"width":255.5,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/4-11.png","element":"img","alt":" {w1, . . . 
, wk}.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Definition 3 (Width and depth of a shortcut network) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Given a shortcut network ","element":"span"},{"style":{"height":15.2},"width":52.27,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/4-12.png","element":"img","alt":" Π,","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"we delete a minimum number of links such that the resultant network ","element":"span"},{"style":{"height":12},"width":55.27,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/4-13.png","element":"img","alt":" Π′ ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is a feedforward network without any isolated neuron. Then, we define the width and depth of ","element":"span"},{"style":{"height":12},"width":40,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/4-14.png","element":"img","alt":" Π","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"as the width and depth of ","element":"span"},{"style":{"height":12},"width":55.28,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/4-15.png","element":"img","alt":" Π′","inline":true},{"style":{"fontStyle":"italic"},"text":", respectively.","element":"span"}],[{"text":"Over the past several years, increasingly diversified network architectures, such as randomly wired networks ","element":"span"},{"href":"#id-33","text":"(Xie et al., ","element":"a"},{"href":"#id-33","text":"2019)","element":"a"},{"text":", networks with stochastic structures ","element":"span"},{"href":"#id-34","text":"(Deng et al., ","element":"a"},{"href":"#id-34","text":"2020)","element":"a"},{"text":", etc. are used as backbones for deep learning. Our definitions for width and depth are applicable to many unusual network configurations. 
Moreover, they are also natural extensions of the conventional width and depth definitions and can make sense for common networks.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Definition 4 (Simplicial complex) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D","element":"span"},{"style":{"fontStyle":"italic"},"text":"-simplex ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D","element":"span"},{"style":{"fontStyle":"italic"},"text":"-dimensional convex hull provided by convex combinations of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D ","element":"span"},{"text":"+ 1 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"affinely independent vectors ","element":"span"},{"style":{"height":20.02},"width":451.05,"height":50.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/4-16.png","element":"img","alt":" {vi}Di=0 ⊂ RD. In other","inline":true}],[{"style":{"fontStyle":"italic"},"text":"words, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"text":"=","element":"span"}],[{"style":{"width":"31%"},"width":542,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/4-17.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"called a face of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S","element":"span"},{"style":{"fontStyle":"italic"},"text":". 
A simplicial complex ","element":"span"},{"style":{"height":24.8},"width":140.35,"height":62,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/4-18.png","element":"img","alt":" S =�","inline":true}],[{"style":{"fontStyle":"italic"},"text":"satisfying: 1) every face of a simplex from ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is also in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S","element":"span"},{"style":{"fontStyle":"italic"},"text":"; 2) the non-empty intersection of any two simplices ","element":"span"},{"style":{"height":15.6},"width":193.13,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/4-19.png","element":"img","alt":" S1, S2 ∈ S","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is a face of both ","element":"span"},{"style":{"height":15.02},"width":204.74,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/4-20.png","element":"img","alt":" S1 and S2.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Proposition 1 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Suppose that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":"(","element":"span"},{"style":{"fontWeight":"bold"},"text":"x","element":"span"},{"text":") ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is a function represented by a ReLU network, then ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is a piecewise linear function that splits the space into polytopes, where each polytope is convex and associated with a linear function.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Proof ","element":"span"},{"text":"Inspired by the idea in ","element":"span"},{"href":"#id-20","text":"(Chu et al., 
","element":"a"},{"href":"#id-20","text":"2018)","element":"a"},{"text":", the proof here is for all ReLU networks, including networks using shortcuts. Let a vector ","element":"span"},{"style":{"height":17.6},"width":309.09,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/5-0.png","element":"img","alt":" C = {c1, ..., cN}","inline":true,"padRight":true},{"text":"denote the firing states of all neurons in the network, where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"is the total number of neurons, and ","element":"span"},{"style":{"height":17.6},"width":217.48,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/5-1.png","element":"img","alt":" ci ∈ {0, 1}.","inline":true},{"style":{"height":15.02},"width":473.63,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/5-2.png","element":"img","alt":"ci = 0 means that the i","inline":true},{"text":"-th neuron is not fired and vice versa. The firing state of every neuron is determined by the input ","element":"span"},{"style":{"fontWeight":"bold"},"text":"x","element":"span"},{"text":", then we denote the set of instances that share the same collective neuron firing state ","element":"span"},{"style":{"height":15.24},"width":58.24,"height":38.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/5-3.png","element":"img","alt":" Ch","inline":true,"padRight":true},{"text":"as the polytope ","element":"span"},{"style":{"height":17.6},"width":651.73,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/5-4.png","element":"img","alt":" Ph: Ph = {x | C(x) = Ch}. 
For","inline":true},{"style":{"height":14.84},"width":127.83,"height":37.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/5-5.png","element":"img","alt":"x ∈ Ph","inline":true},{"text":", the output of the neuron ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i ","element":"span"},{"text":"is a linear function, denoted as ","element":"span"},{"style":{"height":20.33},"width":110.03,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/5-6.png","element":"img","alt":" n(i)(x","inline":true},{"text":"). Then, the firing state of the neuron ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i ","element":"span"},{"text":"is controlled by a linear inequality (","element":"span"},{"style":{"height":20.33},"width":603.96,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/5-7.png","element":"img","alt":"n(i)(x) > 0 or n(i)(x) ≤ 0). In","inline":true,"padRight":true},{"text":"total, ","element":"span"},{"style":{"height":17.6},"width":218.5,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/5-8.png","element":"img","alt":" C(x) = Ch","inline":true,"padRight":true},{"text":"is equivalent to a set of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"linear inequality constraints, indicating that ","element":"span"},{"style":{"height":14.84},"width":48.02,"height":37.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/5-9.png","element":"img","alt":"Ph","inline":true,"padRight":true},{"text":"is a convex polytope.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Definition 5 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"We define the complexity of the function represented by a ReLU network as the minimum number of simplices: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M ","element":"span"},{"style":{"fontStyle":"italic"},"text":"that are 
needed to cover each and every polytope to support the function of the ReLU network.","element":"span"}],[{"id":"id-38","style":{"width":"49%"},"width":854,"height":379,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/5-10.png","element":"img"}],[{"text":"Figure 1: (a) A one-hidden-layer network with three neurons to classify concentric rings. (b) A one-hidden-layer network with six neurons to classify concentric rings.","element":"figcaption","subtype":"caption"}],[{"text":"Here, we elaborate why ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M ","element":"span"},{"text":"is a good measure. ","element":"span"},{"text":"Previously, because a deep network with piecewise linear activation is a piecewise linear function, the number of linear regions (polytopes) was intensively studied to measure the complexity of a neural network. For example, ","element":"span"},{"href":"#id-10","text":"Montufar et al. ","element":"a"},{"href":"#id-10","text":"(2014) ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-35","text":"Serra et al. ","element":"a"},{"href":"#id-35","text":"(2018) ","element":"a"},{"text":"estimated the upper and lower bounds of the number of linear regions with respect to the number of neurons at each layer. ","element":"span"},{"href":"#id-36","text":"Xiong et al. ","element":"a"},{"href":"#id-36","text":"(2020) ","element":"a"},{"text":"computed the bounds for convolutional neural networks. ","element":"span"},{"href":"#id-37","text":"Park ","element":"a"},{"href":"#id-37","text":"et al. ","element":"a"},{"href":"#id-37","text":"(2021) ","element":"a"},{"text":"proposed neural activation coding to maximize the number of linear regions to improve the model performance. Despite these results, we find that there exists a problem with the number of linear regions as a complexity measure. It may happen that simple and complex networks realize the same number of regions for a given task. 
As shown in Figure ","element":"span"},{"href":"#id-38","text":"1, ","element":"a"},{"text":"two networks divide the space into two linear regions to separate concentric rings. But one network uses three neurons to define a triangular domain, while the other has six neurons to form a hexagon. In this example, according to the number of linear regions, the two networks have the same complexity but the six-neuron network is apparently more complex than the other. To address this problem, the complexity of a linear region should be taken into account as well. We argue that how many simplices a linear region comprises indicates how complex a linear region is. ","element":"span"},{"text":"Therefore, we propose to use the number of simplices as a legitimate complexity measure. In Figure ","element":"span"},{"href":"#id-38","text":"1, ","element":"a"},{"text":"counting the number of simplices, the complexities of the two networks are 2 and 4, respectively, which is a better characterization.","element":"span"}],[{"text":"Let us estimate the lower bound of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M","element":"span"},{"text":". To this end, we need to take advantage of the lower bound of the number of polytopes. 
Empirical bounds of the number of polytopes (","element":"span"},{"style":{"height":18.22},"width":71.1,"height":45.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/6-0.png","element":"img","alt":"Np)","inline":true,"padRight":true},{"text":"in a feedforward ReLU network were estimated in ","element":"span"},{"href":"#id-10","text":"(Montufar et al., ","element":"a"},{"href":"#id-10","text":"2014; ","element":"a"},{"href":"#id-35","text":"Serra et al., ","element":"a"},{"href":"#id-35","text":"2018; ","element":"a"},{"href":"#id-39","text":"Serra and Ramalingam, ","element":"a"},{"href":"#id-39","text":"2020)","element":"a"},{"text":", where one result in ","element":"span"},{"href":"#id-10","text":"(Montufar et al., ","element":"a"},{"href":"#id-10","text":"2014) ","element":"a"},{"text":"states that let ","element":"span"},{"style":{"height":15.2},"width":288.41,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/6-1.png","element":"img","alt":"ni, i = 1, · · · , L","inline":true},{"text":", be the number of neurons in the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":"-th layer, and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D ","element":"span"},{"text":"be the dimension of the input space, ","element":"span"},{"style":{"height":17.02},"width":53.06,"height":42.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/6-2.png","element":"img","alt":" Np","inline":true,"padRight":true},{"text":"is lower bounded by","element":"span"},{"style":{"height":31.6},"width":489.74,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/6-3.png","element":"img","alt":"\\left(\\prod_{i=1}^{L-1} \\left\\lfloor \\frac{n_i}{D} \\right\\rfloor^{D}\\right) \\cdot \\sum_{j=0}^{D} \\binom{n_L}{j}","inline":true},{"text":". The polytopes constructed therein are hypercubes. 
Because the minimum number of simplices that fill a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D","element":"span"},{"text":"-dimensional hypercube is ","element":"span"},{"style":{"height":28.99},"width":231.66,"height":72.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/6-4.png","element":"img","alt":"\\frac{2^{D} \\cdot D!}{(D+1)^{(D+1)/2}},","inline":true}],[{"id":"id-56","style":{"width":"74%"},"width":1286,"height":136,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/6-5.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Definition 6 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"We define a wide network and a deep network as follows. Let us assume a function that can be sufficiently complex and yet can be represented by a network. When such a function becomes increasingly complex, the structure of this network must also become increasingly complex, depending on the complexity of the function. We call a network wide if its width is larger than its depth by at least an order of magnitude in ","element":"span"},{"style":{"height":19.13},"width":356.08,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/6-6.png","element":"img","alt":" M, e.g., O(M^{α+1})","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"vs ","element":"span"},{"style":{"height":17.6},"width":345.44,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/6-7.png","element":"img","alt":" O(M^{α}), where M","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is the complexity measure and ","element":"span"},{"style":{"height":12.4},"width":118.72,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/6-8.png","element":"img","alt":" α > 0","inline":true},{"style":{"fontStyle":"italic"},"text":". 
Similarly, we call a network deep if its depth is larger than its width by at least an order of magnitude in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"}],[{"text":"It is underscored that we use two different concepts: the complexity of the function class represented by networks and the structural complexity of a network. ","element":"span"},{"text":"The former measures the complexity of the function, while the latter measures the topological structure of a network. ","element":"span"},{"text":"In our transformation scheme, the structures of constructs/networks are determined by the complexity of the function of interest.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Definition 7 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"We call a wide network ","element":"span"},{"style":{"height":15.02},"width":245.98,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/6-9.png","element":"img","alt":" N1 : Ω → R","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is equivalent to a deep network ","element":"span"},{"style":{"height":14.62},"width":87.33,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/6-10.png","element":"img","alt":" N2 :","inline":true,"padRight":true},{"text":"Ω ","element":"span"},{"style":{"height":17.6},"width":639.56,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/6-11.png","element":"img","alt":" → R, if N1(x) = N2(x), ∀x ∈ Ω","inline":true},{"style":{"fontStyle":"italic"},"text":". 
We call a wide network ","element":"span"},{"style":{"height":14.62},"width":56.27,"height":36.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/6-12.png","element":"img","alt":" N1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is quasi-equivalent to a deep network ","element":"span"},{"style":{"height":14.62},"width":56.27,"height":36.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/6-13.png","element":"img","alt":" N2","inline":true},{"style":{"fontStyle":"italic"},"text":", if there is ","element":"span"},{"style":{"height":17.6},"width":931.5,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/6-14.png","element":"img","alt":" δ > 0, m({x ∈ Ω | N1(x) ≠ N2(x)}) < δ, where m","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is a Lebesgue measure defined on ","element":"span"},{"text":"Ω","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"2.2 Motivating Example","element":"span"}],[{"text":"An important school of neural network interpretability research ","element":"span"},{"href":"#id-40","text":"(Fan and Wang, ","element":"a"},{"href":"#id-40","text":"2020; ","element":"a"},{"href":"#id-41","text":"Adadi ","element":"a"},{"href":"#id-41","text":"and Berrada, ","element":"a"},{"href":"#id-41","text":"2018) ","element":"a"},{"text":"is to extract interpretable rules from a network ","element":"span"},{"href":"#id-42","text":"(Thrun, ","element":"a"},{"href":"#id-42","text":"1995; ","element":"a"},{"href":"#id-43","text":"Setiono ","element":"a"},{"href":"#id-43","text":"and Liu, ","element":"a"},{"href":"#id-43","text":"1995; ","element":"a"},{"href":"#id-44","text":"Saad and II, ","element":"a"},{"href":"#id-44","text":"2007) ","element":"a"},{"text":"using decompositional or pedagogical methods ","element":"span"},{"href":"#id-42","text":"(Thrun, 
","element":"a"},{"href":"#id-42","text":"1995)","element":"a"},{"text":". Pedagogical methods decode a set of rules that imitate the input-output relationship of a network, whereas these rules do not necessarily correspond to the parameters of the network. One common type of rules are propositional in the IF-THEN format, where the preconditions are provided as a set of hypercubes with respect to the input: IF ","element":"span"},{"style":{"height":17.6},"width":573.76,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/6-15.png","element":"img","alt":" input ∈ [ai, bi]m, THEN input","inline":true,"padRight":true},{"text":"belongs to some class. Since there is a connection between the rule-based inference and the network-based","element":"span"}],[{"text":"inference, we consider a neural network in terms of propositional rules. Furthermore, we","element":"span"}],[{"style":{"width":"80%"},"width":1383,"height":703,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/7-0.png","element":"img"}],[{"text":"Figure 2: The width and depth equivalence in light of the De Morgan law duality. ","element":"figcaption","subtype":"caption"},{"text":"In this construction, a deep network implements ","element":"figcaption","subtype":"caption"},{"style":{"height":15.42},"width":309.35,"height":38.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/7-1.png","element":"img","alt":" A1 ∨ A2 · · · ∨ An","inline":true,"padRight":true},{"text":"using a trapezoid function, and a wide network implements ","element":"figcaption","subtype":"caption"},{"style":{"height":31.6},"width":710.38,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/7-2.png","element":"img","alt":" ¬�(¬A1) ∧ (¬A2) · · · ∧ (¬An)�using","inline":true,"padRight":true},{"text":"the trap-like function. 
(","element":"figcaption","subtype":"caption"},{"style":{"height":18.73},"width":55.09,"height":46.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/7-3.png","element":"img","alt":"·)+ ","inline":true,"padRight":true},{"text":"denotes the ReLU activation.","element":"figcaption","subtype":"caption"}],[{"text":"know that the De Morgan law holds true for disjoint propositional rules. Mathematically, the De Morgan law is formulated as","element":"span"}],[{"id":"id-45","style":{"width":"77%"},"width":1343,"height":80,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/7-4.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":15.42},"width":44.73,"height":38.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/7-5.png","element":"img","alt":" Ai","inline":true,"padRight":true},{"text":"is a rule, and ","element":"span"},{"style":{"height":15.42},"width":73.82,"height":38.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/7-6.png","element":"img","alt":" ¬Ai","inline":true,"padRight":true},{"text":"is its negation. 
The De Morgan law gives a duality in the sense of binary logic that the operations ","element":"span"},{"style":{"height":12.8},"width":151.42,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/7-7.png","element":"img","alt":" ∨ and ∧","inline":true,"padRight":true},{"text":"are dual, which means that for any propositional rule system described by ","element":"span"},{"style":{"height":15.42},"width":298.81,"height":38.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/7-8.png","element":"img","alt":" A1 ∨A2 · · ·∨An","inline":true},{"text":", there exists an equivalent dual propositional rule system ","element":"span"},{"style":{"height":31.6},"width":596.04,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/7-9.png","element":"img","alt":" ¬((¬A1) ∧ (¬A2) · · · ∧ (¬An)).","inline":true}],[{"text":"Regarding each rule as an indicator function over a hypercube:","element":"span"}],[{"style":{"width":"39%"},"width":679,"height":106,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/7-10.png","element":"img"}],[{"text":"in Figure ","element":"span"},{"href":"#id-45","text":"2, ","element":"a"},{"text":"we construct a deep network that realizes a logic union of propositional rules (the left hand side of Eq. ","element":"span"},{"href":"#id-45","text":"(5)","element":"a"},{"text":") and a wide network that realizes the negation of the logic intersection of those rules after negation (the right hand side of Eq. ","element":"span"},{"href":"#id-45","text":"(5)","element":"a"},{"text":"). As a result, the constructed deep and wide networks are equivalent by the De Morgan law.","element":"span"}],[{"text":"The above motivating example inspires us to consider the width-depth equivalence in a broader domain. First, a ReLU network is a piecewise linear function over polytopes. 
To generate rules, such a piecewise linear function should be divided into simplices instead of hypercubes. As shown in Figure ","element":"span"},{"href":"#id-46","text":"3, ","element":"a"},{"text":"we only need two rules if we build rules over simplices, which is much more efficient than building rules over hypercubes. Second, the network can be a regression network rather than a classification network. Thus, representing a linear function rather than an indicator function is demanded. Based on these two considerations, we generalize an indicator function over a hypercube to a linear function over a simplex ","element":"span"},{"style":{"height":15.02},"width":38.76,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/8-0.png","element":"img","alt":" Si","inline":true,"padRight":true},{"text":"in a bounded domain:","element":"span"}],[{"id":"id-46","style":{"width":"49%"},"width":853,"height":543,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/8-1.png","element":"img"}],[{"text":"Figure 3: Building rules over simplices is more efficient than over hypercubes for a network.","element":"figcaption","subtype":"caption"}],[{"style":{"fontWeight":"bold"},"text":"2.3 Quasi-Equivalence of Width and Depth of Networks","element":"span"}],[{"text":"This section describes the first contribution of our paper. We formulate the transformation from an arbitrary ReLU network to a wide network and a deep network, respectively. We use a network-based building block to represent a linear function over a simplex. Integrating such building blocks can represent any piecewise linear function over polytopes, thereby elaborating a general equivalence of the width and depth of networks. 
","element":"span"},{"text":"Particularly, a regression ReLU network is converted into both a wide and a deep ReLU network (","element":"span"},{"style":{"fontWeight":"bold"},"text":"Theorem ","element":"span"},{"href":"#id-47","style":{"fontWeight":"bold"},"text":"9","element":"a"},{"text":"), while a classification ReLU network is a special case of a regression ReLU network (","element":"span"},{"style":{"fontWeight":"bold"},"text":"Theorem ","element":"span"},{"href":"#id-48","style":{"fontWeight":"bold"},"text":"10","element":"a"},{"text":"). In the regression networks, the transformation of a univariate network is rather different from that of a multivariate network. As a result, the equivalence for the wide and deep networks in the univariate case is precise, whereas the multivariate wide and deep networks are made approximately equivalent up to an arbitrarily small error. What’s more, in the multivariate case, the width of the wide network is not the same as the depth of the deep network. 
This is why we term such an equivalence as a quasi-equivalence.","element":"span"}],[{"text":"2.3.1 Regression Networks","element":"span"}],[{"text":"The sketch of transforming a regression ReLU network is that we first construct either a wide modular network or a deep modular network to represent the corresponding function over each and every simplex, then we aggregate the results into deep or wide networks in series or parallel, respectively, to represent the original network well.","element":"span"}],[{"id":"id-49","style":{"fontWeight":"bold"},"text":"Theorem 8 (Equivalence of Univariate Regression Networks) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Given any ReLU network ","element":"span"},{"style":{"height":17.6},"width":324.69,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/8-2.png","element":"img","alt":" f : [−B, B] → R","inline":true},{"style":{"fontStyle":"italic"},"text":", there is a wide ReLU network ","element":"span"},{"style":{"height":17.6},"width":356.83,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/8-3.png","element":"img","alt":" H1 : [−B, B] → R","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and a deep ReLU network ","element":"span"},{"style":{"height":17.6},"width":358.21,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/8-4.png","element":"img","alt":" H2 : [−B, B] → R","inline":true},{"style":{"fontStyle":"italic"},"text":", such that ","element":"span"},{"style":{"height":17.6},"width":734.14,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/8-5.png","element":"img","alt":" f(x) = H1(x) = H2(x), ∀x ∈ [−B, B].","inline":true}],[{"text":"Our main result is formally summarized as the following quasi-equivalence theorem for the multivariate case.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Theorem 9 (Quasi-Equivalence of Multivariate Regression Networks) 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Suppose that the representation of an arbitrary ReLU network is ","element":"span"},{"style":{"height":19.53},"width":514.78,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/9-0.png","element":"img","alt":" h : [−B, B]D → R, and M","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is the minimum number of simplices to cover the polytopes to support ","element":"span"},{"style":{"height":16.4},"width":326.24,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/9-1.png","element":"img","alt":" h, for any δ > 0","inline":true},{"style":{"fontStyle":"italic"},"text":", there exist a wide ReLU network ","element":"span"},{"style":{"height":21},"width":970.97,"height":52.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/9-2.png","element":"img","alt":" H1 of width O�D(D + 1)(2D − 1)M�and depth D","inline":true},{"style":{"fontStyle":"italic"},"text":", and also a deep ReLU network ","element":"span"},{"style":{"height":19.13},"width":923.22,"height":47.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/9-3.png","element":"img","alt":" H2 of width (D + 1)D2 and depth O [(D + 1)M]","inline":true},{"style":{"fontStyle":"italic"},"text":", satisfying that","element":"span"}],[{"id":"id-47","style":{"width":"65%"},"width":1125,"height":174,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/9-4.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where ","element":"span"},{"style":{"height":17.6},"width":79.55,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/9-5.png","element":"img","alt":" m(·)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is the standard measure in 
","element":"span"},{"text":"[","element":"span"},{"style":{"height":19.53},"width":195.36,"height":48.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/9-6.png","element":"img","alt":"−B, B]D.","inline":true}],[{"text":"We defer the proof of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Theorem ","element":"span"},{"href":"#id-49","style":{"fontWeight":"bold"},"text":"8 ","element":"a"},{"text":"to ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Supplementary Information II","element":"span"},{"text":", and split the proof of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Theorem ","element":"span"},{"href":"#id-47","style":{"fontWeight":"bold"},"text":"9 ","element":"a"},{"text":"into the two-dimensional case (more intuitive) in Appendix and the general case in ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Supplementary Information II ","element":"span"},{"text":"for better readability.","element":"span"}],[{"id":"id-52","style":{"width":"48%"},"width":840,"height":406,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/9-7.png","element":"img"}],[{"text":"Figure 4: Due to the unboundedness and continuity, the representations in ","element":"figcaption","subtype":"caption"},{"href":"#id-50","text":"(Wang and Sun, ","element":"a","subtype":"caption"},{"href":"#id-50","text":"2005; ","element":"a","subtype":"caption"},{"href":"#id-51","text":"He et al., ","element":"a","subtype":"caption"},{"href":"#id-51","text":"2018) ","element":"a","subtype":"caption"},{"text":"is handicapped in representing a function over polytopes that make a non-convex region.","element":"figcaption","subtype":"caption"}],[{"text":"The key idea to represent a linear function over a simplex is to construct high-dimensional fan-shaped functions that are supported in fan-shaped domains, and to use these constructs to eliminate non-zero functional values outside the simplex of interest. 
This is a new and local way to represent a piecewise linear function over polytopes. In contrast, there are two global ways to represent piecewise linear functions ","element":"span"},{"href":"#id-50","text":"(Wang and Sun, ","element":"a"},{"href":"#id-50","text":"2005; ","element":"a"},{"href":"#id-51","text":"He et al., ","element":"a"},{"href":"#id-51","text":"2018)","element":"a"},{"text":". In ","element":"span"},{"href":"#id-50","text":"(Wang and Sun, ","element":"a"},{"href":"#id-50","text":"2005)","element":"a"},{"text":", for every piecewise linear function ","element":"span"},{"style":{"height":16.4},"width":369.7,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/9-8.png","element":"img","alt":" f : Rn → R, there","inline":true,"padRight":true},{"text":"exists a finite set of linear functions ","element":"span"},{"style":{"height":12},"width":194.8,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/9-9.png","element":"img","alt":" g1, · · · , gm","inline":true,"padRight":true},{"text":"and subsets ","element":"span"},{"style":{"height":17.6},"width":326.67,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/9-10.png","element":"img","alt":" T1, · · · , TP ⊆ [m","inline":true},{"text":"] such that ","element":"span"},{"style":{"height":32.06},"width":1121.07,"height":80.15,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/9-11.png","element":"img","alt":"f = Σ_{p=1}^{P} s_p max_{i∈T_p} {g_i}, where s_p ∈ {−1, +1}, p = 1, · · · , P","inline":true},{"text":". 
In ","element":"span"},{"href":"#id-51","text":"(He et al., ","element":"a"},{"href":"#id-51","text":"2018)","element":"a"},{"text":", the ","element":"span"},{"text":"representation is ","element":"span"},{"style":{"height":28.42},"width":351.94,"height":71.04,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/9-12.png","element":"img","alt":" f = max_{1≤p≤P} min_{i∈T_p} {g_i}","inline":true},{"text":". Nevertheless, due to the unboundedness and continuity, ","element":"span"},{"text":"the global representation of a piecewise linear function is handicapped over polytopes that make a non-convex region. Let us use Figure ","element":"span"},{"href":"#id-52","text":"4 ","element":"a"},{"text":"to illustrate our point, where the relations of ","element":"span"},{"style":{"height":12},"width":156.08,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/9-13.png","element":"img","alt":" g1, g2, g3","inline":true,"padRight":true},{"text":"are summarized in Table ","element":"span"},{"href":"#id-53","text":"3, ","element":"a"},{"text":"and the function value over the purple area is zero.","element":"span"}],[{"text":"• ","element":"span"},{"text":"Representation in ","element":"span"},{"href":"#id-50","text":"Wang and Sun ","element":"a"},{"href":"#id-50","text":"(2005)","element":"a"},{"text":"; ","element":"span"},{"href":"#id-51","text":"He et al. 
","element":"a"},{"href":"#id-51","text":"(2018)","element":"a"},{"text":": ","element":"span"},{"style":{"height":17.6},"width":367.29,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/9-14.png","element":"img","alt":" f = max{g1, g2, g3}","inline":true}],[{"text":"Table 3: Regions and relations of functions.","element":"figcaption","subtype":"caption"}],[{"style":{"width":"30%"},"width":528,"height":219,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/10-0.png","element":"img"}],[{"id":"id-53","text":"• ","element":"span"},{"text":"Ours: ","element":"span"},{"style":{"height":20.52},"width":1349.69,"height":51.3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/10-1.png","element":"img","alt":" f = (g1){x∈Ω1} + (g2){x∈Ω2,1} + (g2){x∈Ω2,2} + (g2){x∈Ω2,3} + (g3){x∈Ω3}.","inline":true}],[{"text":"It can be seen that due to the unboundedness and continuity, ","element":"span"},{"style":{"height":17.6},"width":367.29,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/10-2.png","element":"img","alt":" f = max{g1, g2, g3}","inline":true,"padRight":true},{"text":"is inaccurate over the purple area. But our representation is accurate over the purple area because it is local. We highlight the construction of fan-shaped functions, which opens a new door for high-dimensional piecewise function representation. 
","element":"span"},{"text":"Particularly, the employment of fan-shaped functions will enable a neural network to express a manifold more effectively.","element":"span"}],[{"style":{"width":"98%"},"width":1701,"height":370,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/10-3.png","element":"img"}],[{"text":"Figure 5: Typical fan-shaped functions constructed by a modularized network to eliminate non-zero functional values outside the simplex of interest.","element":"figcaption","subtype":"caption"}],[{"id":"id-55","text":"Since such a fan-shaped function is a basic building block in our construction of wide and ","element":"span"},{"text":"deep equivalent networks, let us explain it in a two-dimensional case for easy visualization. An essential building block expressed by a network in Figure ","element":"span"},{"href":"#id-54","text":"5(","element":"a"},{"text":"a) is based on the following function:","element":"span"}],[{"id":"id-54","style":{"width":"67%"},"width":1170,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/10-4.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":23.8},"width":1273.1,"height":59.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/10-5.png","element":"img","alt":" h1(x) = p(1)1 x1 + p(1)2 x2 + r(1), and h2(x) = p(2)1 x1 + p(2)2 x2 + r(2)","inline":true,"padRight":true},{"text":"are provided by ","element":"span"},{"text":"two linearly independent vectors ","element":"span"},{"style":{"height":23.8},"width":588.94,"height":59.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/10-6.png","element":"img","alt":" {(p(1)1 , p(1)2 ), (p(2)1 , p(2)2 )}, and µ","inline":true,"padRight":true},{"text":"is a positive controlling ","element":"span"},{"text":"factor. Eq. ","element":"span"},{"href":"#id-55","text":"(7) ","element":"a"},{"text":"is a ReLU network of depth=2 and width=2 according to our width-depth definition. 
As illustrated in Figure ","element":"span"},{"href":"#id-54","text":"5(","element":"a"},{"text":"b), the support of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F","element":"span"},{"text":"(","element":"span"},{"style":{"fontWeight":"bold"},"text":"x","element":"span"},{"text":") contains three boundaries and four polytopes (two of which only allow zero value of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F","element":"span"},{"text":"). ","element":"span"},{"text":"For convenience, given a linear function ","element":"span"},{"style":{"height":19.13},"width":1378.43,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/10-7.png","element":"img","alt":" ℓ(x) = c1x1 + c2x2 + c3, we define ℓ− = {x ∈ R2 | ℓ(x) < 0} and","inline":true},{"style":{"height":19.13},"width":487.36,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/10-8.png","element":"img","alt":"ℓ+ = {x ∈ R2 | ℓ(x) ≥ 0}","inline":true},{"text":". Thus, we can write Ω","element":"span"},{"style":{"height":20.34},"width":793.83,"height":50.85,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/10-9.png","element":"img","alt":"1 = h+1 ∩ h−2 and Ω2 = (h1 − µh2)− ∩ h+2 .","inline":true,"padRight":true},{"text":"There are three properties of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F","element":"span"},{"text":"(","element":"span"},{"style":{"fontWeight":"bold"},"text":"x","element":"span"},{"text":"). 
First, the common line shared by Ω","element":"span"},{"style":{"height":17.6},"width":405.51,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/10-10.png","element":"img","alt":"1 and Ω2 is h2(x) = 0.","inline":true,"padRight":true},{"text":"Second, the size of Ω","element":"span"},{"style":{"height":8.4},"width":17,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/10-11.png","element":"img","alt":"2","inline":true,"padRight":true},{"text":"is adjustable by controlling ","element":"span"},{"style":{"height":12},"width":26,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/10-12.png","element":"img","alt":" µ","inline":true},{"text":". Note that ","element":"span"},{"style":{"height":17.6},"width":464.75,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/10-13.png","element":"img","alt":" h1(x) − µh2(x) = 0 can","inline":true,"padRight":true},{"text":"move very close to ","element":"span"},{"style":{"height":17.6},"width":402.34,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/10-14.png","element":"img","alt":" h2(x) = 0 as µ → ∞","inline":true},{"text":", which makes Ω","element":"span"},{"style":{"height":8.4},"width":17,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/10-15.png","element":"img","alt":"2","inline":true,"padRight":true},{"text":"negligible. In the limiting case, the support of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F","element":"span"},{"text":"(","element":"span"},{"style":{"fontWeight":"bold"},"text":"x","element":"span"},{"text":") converges to the fan-shaped domain Ω","element":"span"},{"style":{"height":17.6},"width":593.5,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/10-16.png","element":"img","alt":"1. 
Because h1(x) − µh2(x) = 0","inline":true,"padRight":true},{"text":"is almost parallel to ","element":"span"},{"style":{"height":17.6},"width":355.81,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/11-0.png","element":"img","alt":" h2(x) = 0 when µ","inline":true,"padRight":true},{"text":"is big enough, we approximate the area of Ω","element":"span"},{"style":{"height":10.62},"width":75.44,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/11-1.png","element":"img","alt":"2 as","inline":true,"padRight":true},{"text":"the product of the length of ","element":"span"},{"style":{"height":19.13},"width":498.06,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/11-2.png","element":"img","alt":" h2(x) = 0 within [−B, B]2 ","inline":true,"padRight":true},{"text":"and the distance between two lines, which yields ","element":"span"},{"style":{"height":19.64},"width":295.99,"height":49.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/11-3.png","element":"img","alt":" |Ω2| ≤ 2√2B/µ","inline":true},{"text":". Third, the function ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F ","element":"span"},{"text":"over the fan-shaped area Ω","element":"span"},{"style":{"height":15.02},"width":133.42,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/11-4.png","element":"img","alt":"1 is h1.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Remark 1. ","element":"span"},{"text":"As a ReLU network of interest partitions the space into more and more polytopes, the number of needed simplices will go increasingly larger. Because the lower bound of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M ","element":"span"},{"text":"in Eq. 
","element":"span"},{"href":"#id-56","text":"(4) ","element":"a"},{"text":"is far larger than ","element":"span"},{"style":{"height":19.13},"width":293.87,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/11-5.png","element":"img","alt":" D or (D +1)D2","inline":true},{"text":", the width of ","element":"span"},{"style":{"height":17.6},"width":102.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/11-6.png","element":"img","alt":" H1(x","inline":true},{"text":") and the depth of ","element":"span"},{"style":{"height":17.6},"width":102.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/11-7.png","element":"img","alt":" H2(x","inline":true},{"text":") will dominate. Furthermore, the width of ","element":"span"},{"style":{"height":17.6},"width":102.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/11-8.png","element":"img","alt":" H1(x","inline":true},{"text":") is higher than its depth by an order of magnitude in terms of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M","element":"span"},{"text":", and the depth of ","element":"span"},{"style":{"height":17.6},"width":102.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/11-9.png","element":"img","alt":" H2(x","inline":true},{"text":") is higher than its width in a similar way. Therefore, ","element":"span"},{"style":{"height":17.6},"width":102.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/11-10.png","element":"img","alt":" H1(x","inline":true},{"text":") is a wide network, and ","element":"span"},{"style":{"height":17.6},"width":102.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/11-11.png","element":"img","alt":" H2(x","inline":true},{"text":") is a deep network.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Remark 2. 
","element":"span"},{"text":"Compared to universal approximation, our construction, with the use of fan-shaped functions, is valuable in the following aspects: 1) Our construction utilizes the character of ReLU networks. It divides the space into finitely many simplices instead of infinitely many tiny hypercubes; 2) Given a target network, the complexity of our construction does not change with the prescribed error rate. In contrast, the complexity of the construction schemes used in the universal approximation analyses would increase as the preset error decreases. Therefore, our construction is much more efficient; 3) Our construction offers a new and local way to represent a piecewise linear function over polytopes, which is more flexible in representing discontinuous piecewise linear function than the global ways ","element":"span"},{"href":"#id-50","text":"(Wang and Sun, ","element":"a"},{"href":"#id-50","text":"2005; ","element":"a"},{"href":"#id-51","text":"He et al., ","element":"a"},{"href":"#id-51","text":"2018)","element":"a"},{"text":". Furthermore, inspired by the network structure used to construct the proposed fan-shaped function, we find that intra-layer links can enhance the representation capability of a shallow network, closely relevant to the published results on “depth separation”. Because intra-layer links greatly increase the number of pieces represented by a network, a shallow network with intra-layer links can express a complicated piecewise linear function as well as a deep network! 
For more details on this new finding, please see ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Supplementary Information IX","element":"span"},{"text":".","element":"span"}],[{"text":"2.3.2 Classification Networks","element":"span"}],[{"text":"A regression network gives a continuous output, while a classification network produces categorical outputs (for example, ","element":"span"},{"style":{"height":17.6},"width":232.91,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/11-12.png","element":"img","alt":" {0, 1, · · · , 9}","inline":true,"padRight":true},{"text":"for digit recognition). ","element":"span"},{"text":"The classification network is derived by thresholding the output of the last layer of a ReLU network. In this study, we investigate a classification network with binary labels without loss of generality. We can directly build the equivalence for classification networks in the same way as the regression networks.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Theorem 10 (Quasi-Equivalence of Classification Networks) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Without loss of generality for multi-class classification, we assume a binary output. 
Suppose that the representation of an arbitrary ReLU network is ","element":"span"},{"style":{"height":19.53},"width":621.18,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/11-13.png","element":"img","alt":" h : [−B, B]^D → {0, 1}, and M","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is the minimum number of simplices to cover the polytopes to support ","element":"span"},{"style":{"height":16.4},"width":320.41,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/11-14.png","element":"img","alt":" h, for any δ > 0","inline":true},{"style":{"fontStyle":"italic"},"text":", there exist a wide ReLU network ","element":"span"},{"style":{"height":21},"width":977.18,"height":52.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/11-15.png","element":"img","alt":" H1 of width O(D(D + 1)(2^D − 1)M) and depth D","inline":true},{"style":{"fontStyle":"italic"},"text":", and also a deep ReLU network ","element":"span"},{"style":{"height":19.13},"width":923.22,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/11-16.png","element":"img","alt":" H2 of width (D + 1)D^2 and depth O [(D + 1)M]","inline":true},{"style":{"fontStyle":"italic"},"text":", satisfying that","element":"span"}],[{"id":"id-48","style":{"width":"65%"},"width":1125,"height":174,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/11-17.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where ","element":"span"},{"style":{"height":17.6},"width":79.55,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/11-18.png","element":"img","alt":" m(·)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is the standard measure in 
","element":"span"},{"text":"[","element":"span"},{"style":{"height":19.53},"width":195.36,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/11-19.png","element":"img","alt":"−B, B]D.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Proof ","element":"span"},{"text":"The key is to regard the classification network as a special case of regression network. Then, applying the construction techniques used in the proof of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Theorem ","element":"span"},{"href":"#id-47","style":{"fontWeight":"bold"},"text":"9 ","element":"a"},{"text":"will lead to that for any ","element":"span"},{"style":{"height":15.6},"width":113.05,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/12-0.png","element":"img","alt":" δ > 0,","inline":true}],[{"style":{"width":"64%"},"width":1123,"height":169,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/12-1.png","element":"img"}],[{"text":"which verifies the correctness of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Theorem ","element":"span"},{"href":"#id-48","style":{"fontWeight":"bold"},"text":"10","element":"a"},{"text":".","element":"span"}],[{"text":"A classification neural network can be interpreted as a disjoint rule-based system ","element":"span"},{"style":{"height":15.42},"width":90.62,"height":38.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/12-2.png","element":"img","alt":" A1 ∨","inline":true},{"style":{"height":15.42},"width":200.29,"height":38.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/12-3.png","element":"img","alt":"A2 · · ·∨An","inline":true,"padRight":true},{"text":"by splitting the representation of a neural network into many decision polytopes: IF 
(","element":"span"},{"style":{"height":15.6},"width":145.04,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/12-4.png","element":"img","alt":"input ∈","inline":true,"padRight":true},{"text":"certain polytope), THEN (","element":"span"},{"style":{"fontStyle":"italic"},"text":"input ","element":"span"},{"text":"belongs to some class). Furthermore, each rule is a local function supported over a decision region.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Remark 3. ","element":"span"},{"text":"The De Morgan equivalence in Figure ","element":"span"},{"href":"#id-45","text":"2 ","element":"a"},{"text":"can be summarized as","element":"span"}],[{"style":{"width":"77%"},"width":1341,"height":145,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/12-5.png","element":"img"}],[{"text":"when the rules are based on hypercubes. ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Theorem ","element":"span"},{"href":"#id-48","style":{"fontWeight":"bold"},"text":"10 ","element":"a"},{"text":"corresponds to","element":"span"}],[{"style":{"width":"69%"},"width":1206,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/12-6.png","element":"img"}],[{"text":"when rules are based on simplices.","element":"span"}]]},{"heading":"3. 
Quasi-Equivalence Extended to Networks of “Quadratic Neurons”","paragraphs":[[{"style":{"fontWeight":"bold"},"text":"3.1 Preliminaries","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Definition 11 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Quadratic neurons ","element":"span"},{"href":"#id-1","style":{"fontStyle":"italic"},"text":"(Fan et al., ","element":"a"},{"href":"#id-1","style":{"fontStyle":"italic"},"text":"2018a,","element":"a"},{"href":"#id-57","style":{"fontStyle":"italic"},"text":"b, ","element":"a"},{"href":"#id-58","style":{"fontStyle":"italic"},"text":"2020) ","element":"a"},{"style":{"fontStyle":"italic"},"text":"integrate the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n","element":"span"},{"style":{"fontStyle":"italic"},"text":"-variable input ","element":"span"},{"style":{"fontWeight":"bold"},"text":"x ","element":"span"},{"style":{"fontStyle":"italic"},"text":"with a quadratic function as follows before being nonlinearly processed:","element":"span"}],[{"style":{"width":"79%"},"width":1375,"height":195,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/12-7.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where ","element":"span"},{"style":{"height":13.02},"width":209.71,"height":32.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/12-8.png","element":"img","alt":" wr, wg, wb","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"are vectors of the same dimensionality as that of ","element":"span"},{"style":{"height":17.42},"width":190.89,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/12-9.png","element":"img","alt":" x, br, bg, c","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"are biases, and ","element":"span"},{"style":{"height":12},"width":34,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/12-10.png","element":"img","alt":" 
⊙","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is the Hadamard product.","element":"span"}],[{"text":"The network using quadratic neurons is interesting in several ways: 1) Over the past years the design of neural networks has been focusing on architectures, such as shortcut connections, transformer structure, etc. Almost exclusively, the mainstream deep learning models are constructed with neurons of the same type, which are characterized by the inner product and nonlinear activation. Although the artificial neural network was invented via biomimicry, the current artificial networks and a biological neural system are fundamentally different in terms of neuronal diversity and complexity. As we know, a biological neural system coordinates numerous types of neurons to support intellectual behaviors. To fill in this gap, we believe that neuronal diversity should be taken into account in machine learning; 2) Due to the enhanced expressive power at the neuronal level, quadratic neurons have been used in real-world problems and achieved superior performance, such as medical imaging ","element":"span"},{"href":"#id-59","text":"(Fan et al., ","element":"a"},{"href":"#id-59","text":"2019)","element":"a"},{"text":", civil engineering ","element":"span"},{"href":"#id-22","text":"(Ji et al., ","element":"a"},{"href":"#id-22","text":"2021)","element":"a"},{"text":", applied math ","element":"span"},{"href":"#id-21","text":"(Bu and Karpatne, ","element":"a"},{"href":"#id-21","text":"2021)","element":"a"},{"text":", and so on. Given the utility of quadratic neurons, research on networks with quadratic neurons is attractive.","element":"span"}],[{"text":"Hereafter, networks made of quadratic neurons are referred to as quadratic networks. We refer to the neurons using inner-product as the conventional neurons, and corresponding networks are called conventional networks. 
Please note that quadratic neurons are distinct from the conventional neurons that use quadratic activation. The decision boundary of the latter is still linear. The key characteristic of quadratic neurons is the employment of the quadratic function replacing the inner-product, and the choice of activation functions is flexible. Hereafter, we refer to networks made of quadratic neurons as quadratic networks.","element":"span"}],[{"id":"id-64","style":{"fontWeight":"bold"},"text":"Proposition 2 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Any univariate polynomial of degree ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"style":{"fontStyle":"italic"},"text":"can be perfectly computed by a quadratic ReLU network with the depth of ","element":"span"},{"text":"log","element":"span"},{"style":{"height":17.6},"width":92.72,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/13-0.png","element":"img","alt":"2(N)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and the width of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"href":"#id-58","style":{"fontStyle":"italic"},"text":"(Fan et al., ","element":"a"},{"href":"#id-58","style":{"fontStyle":"italic"},"text":"2020)","element":"a"},{"style":{"fontStyle":"italic"},"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Proof ","element":"span"},{"text":"Please refer to ","element":"span"},{"href":"#id-58","text":"(Fan et al., ","element":"a"},{"href":"#id-58","text":"2020) ","element":"a"},{"text":"for a detailed proof. 
Any univariate polynomial of degree ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"can be factorized as ","element":"span"},{"style":{"height":26.41},"width":395.01,"height":66.03,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/13-1.png","element":"img","alt":"∏_{j=1}^{N/2} (r_j x^2 + s_j x + t_j","inline":true},{"text":"), without the incorporation of complex ","element":"span"},{"text":"numbers. Due to the identity: ","element":"span"},{"style":{"height":17.6},"width":500.43,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/13-2.png","element":"img","alt":" f(x) = σ(f(x)) − σ(−f(x","inline":true},{"text":")), every two quadratic neurons ","element":"span"},{"style":{"height":19.75},"width":793.8,"height":49.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/13-3.png","element":"img","alt":"σ(r_j x^2 + s_j x + t_j) and σ(−(r_j x^2 + s_j x + t_j","inline":true},{"text":")) can be ensembled together to perfectly express ","element":"span"},{"style":{"height":19.75},"width":291.49,"height":49.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/13-4.png","element":"img","alt":"r_j x^2 + s_j x + t_j","inline":true,"padRight":true},{"text":"in the first layer of a network, followed by half as many quadratic neurons in the second layer to combine the yields of the first layer, and so on and so forth. 
Consequently, a quadratic ReLU network with a depth of log","element":"span"},{"style":{"height":17.6},"width":74.9,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/13-5.png","element":"img","alt":"2(N","inline":true},{"text":") and a width of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"can express any univariate polynomial of degree ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N","element":"span"},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Proposition 3 (Stone-Weierstrass Theorem ","element":"span"},{"href":"#id-60","style":{"fontWeight":"bold"},"text":"(De Branges, ","element":"a"},{"href":"#id-60","style":{"fontWeight":"bold"},"text":"1959)","element":"a"},{"style":{"fontWeight":"bold"},"text":") ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Let ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":") ","element":"span"},{"style":{"fontStyle":"italic"},"text":"be a continuous real-valued function over ","element":"span"},{"text":"[","element":"span"},{"style":{"fontStyle":"italic"},"text":"a, b","element":"span"},{"text":"]","element":"span"},{"style":{"fontStyle":"italic"},"text":", then for any ","element":"span"},{"style":{"height":12.4},"width":106.68,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/13-6.png","element":"img","alt":" σ > 0","inline":true},{"style":{"fontStyle":"italic"},"text":", there exists a polynomial ","element":"span"},{"style":{"fontStyle":"italic"},"text":"P","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":")","element":"span"},{"style":{"fontStyle":"italic"},"text":", 
satisfying","element":"span"}],[{"id":"id-65","style":{"width":"102%"},"width":1769,"height":526,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/13-7.png","element":"img"}],[{"text":"In the above propositions, ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Proposition ","element":"span"},{"text":"2 shows a quadratic ReLU network can express a univariate polynomial through factorization, which corresponds to the left side of Eq. ","element":"span"},{"href":"#id-61","text":"(111)","element":"a"},{"text":". At the same time, this quadratic network is wide. ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Proposition ","element":"span"},{"text":"3 suggests that a polynomial can approximate any continuous function. ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Proposition ","element":"span"},{"text":"4 informs us that a multivariate function can be expressed as a combination of univariate functions, which serves as a bridge to generalize our result from approximation to univariate functions to the case of multivariate functions.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"3.2 Motivating Example","element":"span"}],[{"text":"In contrast to the factorization representation, there is a continued fraction representation for a general univariate polynomial:","element":"span"}],[{"style":{"width":"98%"},"width":1707,"height":505,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/14-0.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":17.6},"width":1601.72,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/14-1.png","element":"img","alt":" a_i ≠ 0, b_0 = a_0, b_k = a_{2k}/a_{2k−2}, k ≥ 1 and c_0 = a_1, c_k = a_{2k+1}/a_{2k−1}, k ≥ 1. In the","inline":true,"padRight":true},{"text":"right side of Eq. 
","element":"span"},{"href":"#id-61","text":"(111)","element":"a"},{"text":", the first part contains the terms with even powers of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":", while the second part with the odd powers of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":". In ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Supplementary Information V","element":"span"},{"text":", we prove the correctness of Eq. ","element":"span"},{"href":"#id-61","text":"(111) ","element":"a"},{"text":"in detail.","element":"span"}],[{"style":{"width":"80%"},"width":1383,"height":778,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/14-2.png","element":"img"}],[{"text":"Figure 6: The width and depth equivalence for networks of quadratic neurons. In this con- ","element":"figcaption","subtype":"caption"},{"id":"id-63","text":"struction, a deep network is to implement the continued fraction of a polynomial, ","element":"figcaption","subtype":"caption"},{"text":"and a wide network reflects the factorization of the polynomial.","element":"figcaption","subtype":"caption"}],[{"text":"Since both the factorization representation and the continued fraction representation express the same polynomial, we have the following identity:","element":"span"}],[{"style":{"width":"98%"},"width":1707,"height":498,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/15-0.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":17.42},"width":337.68,"height":43.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/15-1.png","element":"img","alt":" rj, sj, tj and bi, ci","inline":true,"padRight":true},{"text":"are intrinsically connected. 
Let ","element":"span"},{"style":{"height":19.75},"width":604.66,"height":49.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/15-2.png","element":"img","alt":" Qj = rjx2 + sjx + tj, Bl(x) =","inline":true}],[{"id":"id-62","style":{"width":"99%"},"width":1724,"height":274,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/15-3.png","element":"img"}],[{"text":"which could be somehow analogized to the De Morgan law by considering that multiplication and composition operations replace ","element":"span"},{"style":{"height":12.8},"width":157.93,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/15-4.png","element":"img","alt":" ∨ and ∧","inline":true,"padRight":true},{"text":"operations, respectively, for either side of Eq. ","element":"span"},{"href":"#id-62","text":"(17)","element":"a"},{"text":". But the objects of those operations are global functions instead of local rules. Eq. ","element":"span"},{"href":"#id-62","text":"(17) ","element":"a"},{"text":"is inspiring in the sense that the left side is of a parallel computational structure, and the right side is with two nested structures. Clearly, the left and right sides of Eq. ","element":"span"},{"href":"#id-62","text":"(17) ","element":"a"},{"text":"suggest a wide network and a deep network, respectively.","element":"span"}],[{"text":"In Figure ","element":"span"},{"href":"#id-63","text":"6, ","element":"a"},{"text":"a deep quadratic network is constructed to represent the right side of Eq. ","element":"span"},{"href":"#id-62","text":"(17)","element":"a"},{"text":", while a wide quadratic network is constructed to represent the left side. ","element":"span"},{"text":"In light of Eq. ","element":"span"},{"href":"#id-62","text":"(17)","element":"a"},{"text":", these two quadratic networks are functionally equivalent to each other. 
In the case of the deep network, if the polynomial is of degree 2","element":"span"},{"style":{"fontStyle":"italic"},"text":"N","element":"span"},{"text":", then the depth of the deep network is ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N","element":"span"},{"text":". The inter-layer relationships in the two branches are, respectively, ","element":"span"},{"style":{"height":35.07},"width":1728.6,"height":87.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/15-5.png","element":"img","alt":"y^{(1)}_{l+1}(x) = b_{N−l} x^2 / (1 + b_{N−l} x^2 − y^{(1)}_l(x)), y^{(1)}_{N+1} = b_0 / (1 − y^{(1)}_N), and y^{(2)}_{l+1}(x) = c_{N−l} x^2 / (1 + c_{N−l} x^2 − y^{(2)}_l(x)), y^{(2)}_{N+1} = c_0 x / (1 − y^{(2)}_N), where","inline":true},{"style":{"height":24.44},"width":239.03,"height":61.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/15-6.png","element":"img","alt":"y^{(1)}_l and y^{(2)}_l","inline":true,"padRight":true},{"text":"are the outputs of the ","element":"span"},{"style":{"height":15.53},"width":46.12,"height":38.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/15-7.png","element":"img","alt":" lth ","inline":true,"padRight":true},{"text":"layer in either branch. Specifically, ","element":"span"},{"style":{"height":23.8},"width":304.49,"height":59.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/15-8.png","element":"img","alt":" y^{(1)}_0 = y^{(2)}_0 = 0.","inline":true,"padRight":true},{"text":"In this deep network, the activation function is in a form of reciprocal relation between two inputs: ","element":"span"},{"style":{"height":21.62},"width":318.31,"height":54.04,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/15-9.png","element":"img","alt":" z(x, y) = ax / (1 + ax − y)","inline":true},{"text":", which can be well approximated by a ReLU function after ","element":"span"},{"text":"a proper normalization operation. 
In ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Supplementary Information V","element":"span"},{"text":", we demonstrate that the reciprocal activation works effectively in the deep network after some twists. As for constructing the wide network, we employ Proposition ","element":"span"},{"href":"#id-64","text":"2 ","element":"a"},{"text":"directly. If the polynomial is of degree 2","element":"span"},{"style":{"fontStyle":"italic"},"text":"N","element":"span"},{"text":", the resultant quadratic network has a width of 2","element":"span"},{"style":{"fontStyle":"italic"},"text":"N ","element":"span"},{"text":"and a depth of only log","element":"span"},{"style":{"height":17.6},"width":96.72,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/15-10.png","element":"img","alt":"2(2N","inline":true},{"text":"), where the width dominates.","element":"span"}],[{"text":"Despite the novelty, the above equivalence only fits univariate polynomials. To make a broader impact, we further demonstrate that in terms of representing a multivariate polynomial, the width and the depth of a neural network are also equivalent to each other.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"3.3 Width-Depth Quasi-Equivalence","element":"span"}],[{"text":"This section delineates the second contribution of our paper. Particularly, we first show that a combination of univariate polynomials can approximate any continuous multivariate function. Then, we leverage the constructed wide and deep quadratic networks for univariate polynomials to represent a general continuous multivariate function. 
As a result, the resultant wide and deep networks offer the same functionality.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Lemma 12 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"For any continuous function ","element":"span"},{"style":{"height":19.53},"width":527.36,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/16-0.png","element":"img","alt":" f : [0, 1]D → R, given δ > 0","inline":true},{"style":{"fontStyle":"italic"},"text":", there exists a function formulated as ","element":"span"},{"style":{"height":22},"width":813.28,"height":55.01,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/16-1.png","element":"img","alt":"∑_{t=0}^{2D} P_t(∑_{s=1}^{D} P_{t,s}(x_s)), where P_{t,s} and P_t","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"are univariate polynomials, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"such that","element":"span"}],[{"style":{"width":"82%"},"width":1425,"height":124,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/16-2.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Proof ","element":"span"},{"text":"First, we apply the famous Kolmogorov-Arnold theorem to express ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":":","element":"span"}],[{"style":{"width":"73%"},"width":1263,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/16-3.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":18.22},"width":300.11,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/16-4.png","element":"img","alt":" φt,s(xs) and Φt","inline":true,"padRight":true},{"text":"are continuous. 
","element":"span"},{"text":"According to Proposition ","element":"span"},{"href":"#id-65","text":"3, ","element":"a"},{"text":"for every function ","element":"span"},{"style":{"height":18.22},"width":123.21,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/16-5.png","element":"img","alt":"φt,s(xs","inline":true},{"text":"), given an arbitrarily small quantity ","element":"span"},{"style":{"height":14.62},"width":110.52,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/16-6.png","element":"img","alt":" ϵt,s >","inline":true,"padRight":true},{"text":"0 , there exists a polynomial ","element":"span"},{"style":{"height":18.22},"width":143.88,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/16-7.png","element":"img","alt":" Pt,s(xs)","inline":true,"padRight":true},{"text":"such that","element":"span"}],[{"style":{"width":"67%"},"width":1168,"height":82,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/16-8.png","element":"img"}],[{"text":"Combining ","element":"span"},{"style":{"height":18.3},"width":531.58,"height":45.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/16-9.png","element":"img","alt":" φt,1(x1), φt,2(x2), ..., φt,D(xD","inline":true},{"text":") and applying the triangle inequality, we have","element":"span"}],[{"style":{"width":"73%"},"width":1277,"height":435,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/16-10.png","element":"img"}],[{"text":"Because Φ","element":"span"},{"style":{"height":8},"width":12,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/16-11.png","element":"img","alt":"t","inline":true,"padRight":true},{"text":"is a continuous function, we can use the property of continuity. 
We choose a sufficiently small ","element":"span"},{"style":{"height":17.02},"width":288.04,"height":42.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/16-12.png","element":"img","alt":" ϵt,s, s = 1, ..., D","inline":true,"padRight":true},{"text":"such that for every Φ","element":"span"},{"style":{"height":8},"width":12,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/16-13.png","element":"img","alt":"t","inline":true},{"text":", the following inequality holds:","element":"span"}],[{"style":{"width":"84%"},"width":1459,"height":129,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/16-14.png","element":"img"}],[{"text":"With respect to the continuous function Φ","element":"span"},{"style":{"height":8},"width":12,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/16-15.png","element":"img","alt":"t","inline":true},{"text":", we can find a polynomial ","element":"span"},{"style":{"height":15.02},"width":235.24,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/16-16.png","element":"img","alt":" Pt such that","inline":true}],[{"style":{"width":"65%"},"width":1139,"height":102,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/16-17.png","element":"img"}],[{"text":"Next, applying the triangle inequality again, we have","element":"span"}],[{"style":{"width":"80%"},"width":1383,"height":549,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/17-0.png","element":"img"}],[{"text":"Finally, integrating Φ","element":"span"},{"style":{"height":15.2},"width":262.34,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/17-1.png","element":"img","alt":"t, t = 0, ..., 2D","inline":true},{"text":", we immediately obtain 
that","element":"span"}],[{"style":{"width":"82%"},"width":1426,"height":348,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/17-2.png","element":"img"}],[{"text":"which concludes the proof.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Theorem 13 (Quasi-Equivalence by the Extension of the De Morgan Law) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Given a continuous function ","element":"span"},{"style":{"height":19.53},"width":586.07,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/17-3.png","element":"img","alt":" h : [0, 1]D → R, for any δ > 0","inline":true},{"style":{"fontStyle":"italic"},"text":", there exists a function expressed as","element":"span"}],[{"style":{"width":"99%"},"width":1722,"height":223,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/17-4.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where ","element":"span"},{"style":{"height":17.42},"width":214.56,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/17-5.png","element":"img","alt":" Pt and Pt,s","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"are polynomials of degrees ","element":"span"},{"text":"deg(","element":"span"},{"style":{"height":18.22},"width":333.22,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/17-6.png","element":"img","alt":"Pt) and deg(Ps,t)","inline":true},{"style":{"fontStyle":"italic"},"text":". 
Correspondingly, let ","element":"span"},{"style":{"height":19.95},"width":937.88,"height":49.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/17-7.png","element":"img","alt":"K1 = maxt[deg(Pt)] and K2 = max{s,t}[deg(Ps,t)]","inline":true},{"style":{"fontStyle":"italic"},"text":", there exist a wide quadratic network ","element":"span"},{"style":{"height":16},"width":54.7,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/17-8.png","element":"img","alt":" Q1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"of a width ","element":"span"},{"text":"max","element":"span"},{"style":{"height":17.6},"width":175.19,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/17-9.png","element":"img","alt":"{K1, K2}","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and a depth ","element":"span"},{"text":"log","element":"span"},{"style":{"height":17.6},"width":164.88,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/17-10.png","element":"img","alt":"2(K1K2)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and a deep quadratic network ","element":"span"},{"style":{"height":16.4},"width":154.47,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/17-11.png","element":"img","alt":" Q2 of a","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"width ","element":"span"},{"text":"2(2","element":"span"},{"style":{"fontStyle":"italic"},"text":"D ","element":"span"},{"text":"+ 1) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and a depth ","element":"span"},{"style":{"height":14.62},"width":163.38,"height":36.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/17-12.png","element":"img","alt":" K1 + K2","inline":true},{"style":{"fontStyle":"italic"},"text":", 
satisfying","element":"span"}],[{"style":{"width":"64%"},"width":1114,"height":191,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/17-13.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Proof ","element":"span"},{"text":"To prototype the wide network, we can use the wide quadratic sub-network scheme in Figure ","element":"span"},{"href":"#id-63","text":"6 ","element":"a"},{"text":"to express ","element":"span"},{"style":{"height":17.42},"width":217.36,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/17-14.png","element":"img","alt":" Pt and Pt,s","inline":true,"padRight":true},{"text":"whose [width, depth] are [deg(","element":"span"},{"style":{"height":17.6},"width":431.89,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/17-15.png","element":"img","alt":"Pt), log2(deg(Pt))] and","inline":true}],[{"text":"[deg(","element":"span"},{"style":{"height":18.22},"width":344.63,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/18-0.png","element":"img","alt":"Pt,s), log2(deg(Pt,s","inline":true},{"text":"))], respectively. A straightforward combination of these wide sub-networks can express ","element":"span"},{"style":{"height":22},"width":405.54,"height":55.01,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/18-1.png","element":"img","alt":"�2Dt=0 Pt(�Ds=1 Pt,s(xt","inline":true},{"text":")). 
Thus, the width of the construction is the ","element":"span"},{"text":"summation of the widths of all the sub-networks: max","element":"span"},{"style":{"height":22},"width":754.67,"height":55.01,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/18-2.png","element":"img","alt":"{∑_{t=0}^{2D} deg(P_t), ∑_{t=0}^{2D} ∑_{s=1}^{D} deg(P_{t,s})} ≥","inline":true,"padRight":true},{"text":"max","element":"span"},{"style":{"height":17.6},"width":175.19,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/18-3.png","element":"img","alt":"{K1, K2}","inline":true},{"text":", while the depth is max","element":"span"},{"style":{"height":18.22},"width":1010.63,"height":45.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/18-4.png","element":"img","alt":"t[log2(deg(Pt))] + maxt,s[log2(deg(Pt,s))] = log2(K1) +","inline":true,"padRight":true},{"text":"log","element":"span"},{"style":{"height":17.6},"width":400.25,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/18-5.png","element":"img","alt":"2(K2) = log2(K1K2).","inline":true}],[{"text":"For the deep network, we use the deep quadratic sub-network shown in Figure ","element":"span"},{"href":"#id-63","text":"6 ","element":"a"},{"text":"to express ","element":"span"},{"style":{"height":17.42},"width":219.26,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/18-6.png","element":"img","alt":" Pt and Pt,s","inline":true,"padRight":true},{"text":"whose [width, depth] are [2","element":"span"},{"style":{"height":18.22},"width":487.14,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/18-7.png","element":"img","alt":", deg(Pt)] and [2, deg(Pt,s","inline":true},{"text":")], respectively. 
Similarly, by integrating these deep sub-networks, we can have a deep network that expresses ","element":"span"},{"style":{"height":22},"width":405.54,"height":55.01,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/18-8.png","element":"img","alt":"�2Dt=0 Pt(�Ds=1 Pt,s(xt","inline":true},{"text":")). As a result, the width of the derived network is 2(2","element":"span"},{"style":{"fontStyle":"italic"},"text":"D ","element":"span"},{"text":"+ 1), ","element":"span"},{"text":"and the depth is max","element":"span"},{"style":{"height":18.22},"width":781.6,"height":45.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/18-9.png","element":"img","alt":"t[deg(Pt)] + maxt,s[deg(Pt,s)] = K1 + K2.","inline":true}],[{"text":"In ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Supplementary Information IV, ","element":"span"},{"text":"we use the quasi-equivalence relationship to construct wide and deep quadratic network variants for the same task, verify their validity on the MNIST dataset, and show empirical hints that a wide network might have an enhanced robustness, since adversarial samples have much less room to perturb latent features and play tricks.","element":"span"}]]},{"heading":"4. Discussions","paragraphs":[[{"style":{"fontWeight":"bold"},"text":"Gains of Equivalence Notion. ","element":"span"},{"text":"$33","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Equivalent Networks. ","element":"span"},{"text":"In a broader sense, our quasi-equivalence studies demonstrate the existence of mutually equivalent networks. We argue that the network equivalence is useful in network design. Although deep networks manifest superb power, their applications can be constrained, for example, when the application is time-critical. In that case, we can convert the deep network to a wide counterpart that can be executed at a high speed. 
A direction for network design optimization is to derive a compact network that maintains a high performance of an original large network through quantization ","element":"span"},{"href":"#id-66","text":"(Wu et al., ","element":"a"},{"href":"#id-66","text":"2016)","element":"a"},{"text":", pruning ","element":"span"},{"href":"#id-67","text":"(Li et al., ","element":"a"},{"href":"#id-67","text":"2016)","element":"a"},{"text":", distillation ","element":"span"},{"href":"#id-68","text":"(Polino et al., ","element":"a"},{"href":"#id-68","text":"2018)","element":"a"},{"text":", low-rank approximation ","element":"span"},{"href":"#id-69","text":"(Zhang ","element":"a"},{"href":"#id-69","text":"et al., ","element":"a"},{"href":"#id-69","text":"2015)","element":"a"},{"text":", etc. We envision that the equivalence of a deep network and a wide network suggests a new means of network design. Ideally, a wide network can replace the well-trained deep network without compromising the performance. Due to the parallel nature, the wide network can be trained on a computing cluster with many machines for the fast training. At the same time, the inference time of the equivalent wide network is shorter than its deep counterpart.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Width-Depth Correlation. 
","element":"span"},{"text":"Every continuous ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n","element":"span"},{"text":"-variable function ","element":"span"},{"style":{"height":17.6},"width":305.5,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/19-0.png","element":"img","alt":" f on [0, 1]n can","inline":true,"padRight":true},{"text":"be in the ","element":"span"},{"style":{"height":14.62},"width":46.7,"height":36.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/19-1.png","element":"img","alt":" L1","inline":true,"padRight":true},{"text":"sense represented by partially separable multivariate functions ","element":"span"},{"href":"#id-70","text":"(Light and ","element":"a"},{"href":"#id-70","text":"Cheney, ","element":"a"},{"href":"#id-70","text":"2006)","element":"a"},{"text":":","element":"span"}],[{"style":{"width":"77%"},"width":1343,"height":131,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/19-2.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/19-3.png","element":"img","alt":" ϵ","inline":true,"padRight":true},{"text":"is an arbitrarily small positive number, ","element":"span"},{"style":{"height":16.4},"width":48.49,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/19-4.png","element":"img","alt":" φli","inline":true,"padRight":true},{"text":"is a continuous function, and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L ","element":"span"},{"text":"is the number of products. 
In the ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Supplementary Information V and VI","element":"span"},{"text":", we justify the suitability of this partially separable representation by showing its boundedness, and comparing it with other representations.","element":"span"}],[{"text":"Further, we correlate the width and depth of a network to the structure of a function to be approximated. In a nutshell, each continuous function ","element":"span"},{"style":{"height":16.4},"width":48.49,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/19-5.png","element":"img","alt":" φli","inline":true,"padRight":true},{"text":"can be approximated by a polynomial of some degree, which can be appropriately represented by quadratic neurons. As a consequence, via a quadratic representation scheme, the width and depth of a network structure must reflect the complexity of ","element":"span"},{"style":{"height":22},"width":318.31,"height":55.01,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/19-6.png","element":"img","alt":"∑_{l=1}^{L} ∏_{i=1}^{n} φ_{li}(x_i","inline":true},{"text":"). In other words, they are ","element":"span"},{"text":"controlled by the nature of a specific task. As the task becomes complicated, the width and depth must increase accordingly, and the combination of the width and depth is not unique. For more details, please see the ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Supplementary Information VII","element":"span"},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Effects of Width on Optimization and Generalization. 
","element":"span"},{"text":"In the ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Supplementary Information VIII","element":"span"},{"text":", we illustrate the importance of width on optimization in the context of over-parameterization, kernel ridge regression, and NTK, and then report our findings that the existing generalization bounds also shed light on the relationship of the width and depth given a fixed complexity.","element":"span"}]]},{"heading":"5. Conclusion","paragraphs":[[{"text":"Inspired by the De Morgan law and through a systematic analysis, we have established the quasi-equivalence between the depth and width of ReLU neural networks from two perspectives. In the first perspective, we have formulated two transforms for mapping an arbitrary regression/classification ReLU network to a wide ReLU network and a deep ReLU network, respectively. In the second perspective, we have extended our quasi-equivalence results from ReLU networks of popular artificial neurons to those of quadratic neurons. This quasi-equivalence represents a step forward in developing a unified deep learning theory. More efforts are needed in the future to refine this quasi-equivalence relationship and find real-world applications.","element":"span"}]]},{"heading":"Acknowledgments","paragraphs":[[{"text":"F. L. Fan is supported by the Rensselaer-IBM AI Research Collaboration Program (","element":"span"},{"href":"http://airc.rpi.edu","style":{"fontFamily":"monospace"},"text":"http: ","element":"a"},{"href":"http://airc.rpi.edu","style":{"fontFamily":"monospace"},"text":"//airc.rpi.edu","element":"a"},{"text":"), part of the IBM AI Horizons Network (","element":"span"},{"href":"http://ibm.biz/AIHorizons","style":{"fontFamily":"monospace"},"text":"http://ibm.biz/AIHorizons","element":"a"},{"text":"), R. Lai is partially supported by an NSF Career Award DMS–1752934 and NSF DMS-2134168. G. 
Wang is partially supported by an NIH R01 CA237267 grant.","element":"span"}]]},{"heading":"Appendix A. Proof of Theorem 9 (2D)","paragraphs":[[{"text":"Here, we show the correctness of Theorem ","element":"span"},{"href":"#id-47","text":"9 ","element":"a"},{"text":"in the 2D case. Regarding the transformation of an arbitrary multivariate network, the situation is more complicated than in the case of univariate networks. Nevertheless, we are able to establish the ","element":"span"},{"style":{"height":12.8},"width":20,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/20-0.png","element":"img","alt":" δ","inline":true},{"text":"-equivalence, which is a slightly relaxed result.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"The sketch of proof: ","element":"span"},{"text":"A ReLU network is a piecewise linear function over polytopes, which can be decomposed into a summation of linear functions over a simplex. ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Lemma ","element":"span"},{"href":"#id-71","text":"14 ","element":"a"},{"text":"shows that a wider network module ","element":"span"},{"style":{"height":17.6},"width":102.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/20-1.png","element":"img","alt":" N1(x","inline":true},{"text":") and a deeper network module ","element":"span"},{"style":{"height":17.6},"width":202.81,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/20-2.png","element":"img","alt":" N2(x) can","inline":true,"padRight":true},{"text":"represent an arbitrary linear function over a simplex. 
Next, in ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Theorem ","element":"span"},{"href":"#id-47","text":"9, ","element":"a"},{"text":"to transform an arbitrary ReLU network into a wide and a deep network, we horizontally aggregate network modules ","element":"span"},{"style":{"height":17.6},"width":102.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/20-3.png","element":"img","alt":" N1(x","inline":true},{"text":") to have a wide network, and we use shortcuts to sequentially establish a deep network with ","element":"span"},{"style":{"height":17.6},"width":130.63,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/20-4.png","element":"img","alt":" N2(x).","inline":true}],[{"text":"A ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D","element":"span"},{"text":"-simplex ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"text":"is a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D","element":"span"},{"text":"-dimensional convex hull provided by convex combinations of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D","element":"span"},{"text":"+1","element":"span"}],[{"id":"id-72","style":{"width":"31%"},"width":540,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/20-5.png","element":"img"}],[{"text":"affinely independent vectors ","element":"span"},{"style":{"height":20.02},"width":258.15,"height":50.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/20-6.png","element":"img","alt":" {vi}Di=0 ⊂ RD","inline":true},{"text":". 
In other words, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"text":"=","element":"span"}],[{"text":"In 2D case, if we write ","element":"span"},{"style":{"height":17.6},"width":547.54,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/20-7.png","element":"img","alt":" V = (v1−v0, v2−v0), then V","inline":true,"padRight":true},{"text":"is invertible and ","element":"span"},{"style":{"height":17.6},"width":460.87,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/20-8.png","element":"img","alt":" S = {v0 + V x | x ∈ ∆},","inline":true},{"style":{"height":20.8},"width":730.29,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/20-9.png","element":"img","alt":"where ∆ = {x ∈ R2 | x ≥ 0, 1⊤x ≤ 1}","inline":true},{"text":"is a template simplex in ","element":"span"},{"style":{"height":14.73},"width":48.52,"height":36.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/20-10.png","element":"img","alt":" R2","inline":true},{"text":". It is clear that the following one-to-one affine mapping between ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"text":"and ∆ exists, which is","element":"span"}],[{"style":{"width":"70%"},"width":1220,"height":51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/20-11.png","element":"img"}],[{"text":"Therefore, we only need to make the construction in the special case where ","element":"span"},{"style":{"height":13.2},"width":201.94,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/20-12.png","element":"img","alt":" S = ∆ to","inline":true,"padRight":true},{"text":"simplify our analysis. ","element":"span"},{"text":"The coordinate transform in Eq. 
","element":"span"},{"href":"#id-72","text":"(29) ","element":"a"},{"text":"can conveniently map the construction from ∆ to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S","element":"span"},{"text":".","element":"span"}],[{"text":"Given a linear function ","element":"span"},{"style":{"height":19.13},"width":1206.22,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/20-13.png","element":"img","alt":" ℓ(x) = c1x1 + c2x2 + c3, we write ℓ− = {x ∈ R2 | ℓ(x) < 0} and","inline":true},{"style":{"height":19.13},"width":478.97,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/20-14.png","element":"img","alt":"ℓ+ = {x ∈ R2 | ℓ(x) ≥ 0}","inline":true},{"text":". ∆ is enclosed by three lines provided by ","element":"span"},{"style":{"height":17.6},"width":438.08,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/20-15.png","element":"img","alt":" ℓ1(x) = x1, ℓ2(x) = x2,","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":17.6},"width":314.74,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/20-16.png","element":"img","alt":" ℓ3(x) = −x1−x2","inline":true},{"text":"+1. 
We write three vertices of ∆ as ","element":"span"},{"style":{"height":17.6},"width":652.49,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/20-17.png","element":"img","alt":" v0 = (0, 0), v1 = (1, 0), v2 = (0, 1).","inline":true,"padRight":true},{"text":"Then, ","element":"span"},{"style":{"height":19.13},"width":343.93,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/20-18.png","element":"img","alt":" f : [−B, B]2 → R","inline":true,"padRight":true},{"text":"supported on ∆ is expressed as follows:","element":"span"}],[{"id":"id-71","style":{"width":"68%"},"width":1191,"height":108,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/20-19.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":17.6},"width":877.96,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/20-20.png","element":"img","alt":" a = (f(v1) − f(v0), f(v2) − f(v0)), b = f(v0).","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Lemma 14 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Suppose that the representation of an arbitrary ReLU network is ","element":"span"},{"style":{"height":19.53},"width":308.57,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/20-21.png","element":"img","alt":" f : [−B, B]D →","inline":true,"padRight":true},{"text":"R ","element":"span"},{"style":{"fontStyle":"italic"},"text":"expressed as Eq. 
","element":"span"},{"href":"#id-71","text":"(30)","element":"a"},{"style":{"fontStyle":"italic"},"text":", for any ","element":"span"},{"style":{"height":13.2},"width":108.58,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/20-22.png","element":"img","alt":" δ > 0","inline":true},{"style":{"fontStyle":"italic"},"text":", there exist a ReLU network ","element":"span"},{"style":{"height":17.6},"width":385.11,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/20-23.png","element":"img","alt":" N1 of width D(D +","inline":true,"padRight":true},{"text":"1)(2","element":"span"},{"style":{"height":19.53},"width":457.94,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/20-24.png","element":"img","alt":"D − 1) + 2 and depth D","inline":true},{"style":{"fontStyle":"italic"},"text":", and also a ReLU network ","element":"span"},{"style":{"height":19.13},"width":655.5,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/20-25.png","element":"img","alt":" N2 of width (D + 1)D2 and depth","inline":true}],[{"style":{"width":"99%"},"width":1723,"height":208,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/20-26.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Proof ","element":"span"},{"text":"(","element":"span"},{"style":{"fontWeight":"bold"},"text":"D=2","element":"span"},{"text":") Our goal is to approximate the given piecewise linear function ","element":"span"},{"style":{"height":16.8},"width":190.01,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/20-27.png","element":"img","alt":" f over ∆;","inline":true,"padRight":true},{"text":"therefore, we need to cancel ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"outside its domain. 
We first index the polytopes separated by three lines ","element":"span"},{"style":{"height":20.93},"width":1444.04,"height":52.33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/20-28.png","element":"img","alt":" ℓ1(x) = 0, ℓ2(x) = 0, and ℓ3(x) = 0 as A(χ1,χ2,χ3) = ℓχ11 ∩ ℓχ22 ∩ ℓχ33 , χi ∈","inline":true,"padRight":true},{"style":{"height":20.33},"width":995.77,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/21-0.png","element":"img","alt":"{+, −}, i = 1, 2, 3. It is clear that ∆ = A(+,+,+).","inline":true,"padRight":true},{"text":"In addition, we use ","element":"span"},{"style":{"height":10.8},"width":29,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/21-1.png","element":"img","alt":" ∨","inline":true,"padRight":true},{"text":"to exclude a component. For instance, ","element":"span"},{"style":{"height":20.93},"width":401.67,"height":52.33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/21-2.png","element":"img","alt":" A(χ1,∨,χ3) = ℓχ11 ∩ ℓχ33 ","inline":true,"padRight":true},{"text":". It can be easily verified that ","element":"span"},{"style":{"height":17.13},"width":227.09,"height":42.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/21-3.png","element":"img","alt":" A(χ1,∨,χ3) =","inline":true},{"style":{"height":17.13},"width":427.72,"height":42.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/21-4.png","element":"img","alt":"A(χ1,+,χ3) ∪ A(χ1,−,χ3).","inline":true}],[{"text":"Constructing ","element":"span"},{"style":{"height":17.6},"width":102.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/21-5.png","element":"img","alt":" N1(x","inline":true},{"text":"): The discontinuity of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"in Eq. 
","element":"span"},{"href":"#id-71","text":"(30) ","element":"a"},{"text":"is a major challenge of representing the function using a ReLU network. To tackle this issue, we start from a linear function ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":19.13},"width":457.86,"height":47.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/21-6.png","element":"img","alt":"f(x) = a⊤x+b, ∀x ∈ R2","inline":true},{"text":", which can be represented by two neurons ","element":"span"},{"style":{"height":20.6},"width":290.59,"height":51.51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/21-7.png","element":"img","alt":" σ◦ ˜f −σ◦(− ˜f).","inline":true,"padRight":true},{"text":"The key idea is to eliminate ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"over all polytopes outside ∆. In other words, ","element":"span"},{"text":"˜","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"over three fan-shaped polytopes ","element":"span"},{"style":{"height":19.13},"width":600.56,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/21-8.png","element":"img","alt":" A(∨,−,+), A(−,+,∨), and A(+,∨,−) ","inline":true,"padRight":true},{"text":"should be cancelled.","element":"span"}],[{"text":"Let us take the polytope ","element":"span"},{"style":{"height":17.13},"width":155.11,"height":42.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/21-9.png","element":"img","alt":" A(+,∨,−) ","inline":true,"padRight":true},{"text":"as an example. 
Note that ","element":"span"},{"style":{"height":17.13},"width":155.11,"height":42.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/21-10.png","element":"img","alt":" A(+,∨,−) ","inline":true,"padRight":true},{"text":"has two boundaries ","element":"span"},{"href":"#id-73","style":{"height":17.6},"width":1728.73,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/21-11.png","element":"img","alt":"ℓ1(x) = 0 and ℓ3(x) = 0 as illustrated in Figure 7(b). We choose a sufficiently large positive","inline":true,"padRight":true},{"text":"number ","element":"span"},{"style":{"height":12},"width":26,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/21-12.png","element":"img","alt":" µ","inline":true,"padRight":true},{"text":"to construct the three fan-shaped functions:","element":"span"}],[{"style":{"width":"77%"},"width":1344,"height":219,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/21-13.png","element":"img"}],[{"text":"where the positive number ","element":"span"},{"style":{"height":12},"width":22,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/21-14.png","element":"img","alt":" η","inline":true,"padRight":true},{"text":"is chosen to be small enough such that the lines ","element":"span"},{"style":{"height":15.6},"width":243.61,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/21-15.png","element":"img","alt":" x1 − ηx2 = 0","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":19.64},"width":1637.7,"height":49.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/21-16.png","element":"img","alt":" x1 − η = 0 are very close to x1 = 0, then m((x1)+ ∩ (x1 − ηx2)−) < 2√2Bη and","inline":true},{"style":{"height":19.64},"width":613.45,"height":49.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/21-17.png","element":"img","alt":"m((x1)+ ∩ (x1 
− η)−) < 2√2Bη.","inline":true}],[{"style":{"width":"77%"},"width":1347,"height":810,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/21-18.png","element":"img"}],[{"text":"Figure 7: Quasi-equivalence analysis in 2D case. Left: The structure of the wide network to represent ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"f ","element":"figcaption","subtype":"caption"},{"text":"over ∆, where two neurons denote ","element":"figcaption","subtype":"caption"},{"style":{"height":19.13},"width":301.28,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/21-19.png","element":"img","alt":" f over [−B, B]2 ","inline":true,"padRight":true},{"id":"id-73","text":"and nine fan- ","element":"figcaption","subtype":"caption"},{"text":"shaped functions handle the polytopes outside ∆. Right: The polytopes outside ∆ comprise of three fan-shaped domains, on which ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"f ","element":"figcaption","subtype":"caption"},{"text":"can be cancelled by three linearly independent fan-shaped functions.","element":"figcaption","subtype":"caption"}],[{"text":"According to the aforementioned properties of fan-shaped functions, we approximately have","element":"span"}],[{"style":{"width":"99%"},"width":1723,"height":518,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/22-0.png","element":"img"}],[{"text":"Based on the property of fan-shaped functions, ","element":"span"},{"style":{"height":23.8},"width":769.55,"height":59.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/22-1.png","element":"img","alt":" F (+,∨,−)1 (x), F (+,∨,−)2 (x), F (+,∨,−)3 (x) are","inline":true,"padRight":true},{"text":"approximately constrained into the region 
","element":"span"},{"style":{"height":20.61},"width":437.96,"height":51.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/22-2.png","element":"img","alt":" A(+,∨,−) such that ˜f(x","inline":true},{"text":") is approximately counteracted over ","element":"span"},{"style":{"height":17.13},"width":155.11,"height":42.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/22-3.png","element":"img","alt":" A(+,∨,−)","inline":true},{"text":". Mathematically, the new function ","element":"span"},{"style":{"height":23.8},"width":594.21,"height":59.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/22-4.png","element":"img","alt":" F (+,∨,−)(x) = ω∗1F (+,∨,−)1 (x) +","inline":true}],[{"style":{"width":"99%"},"width":1724,"height":239,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/22-5.png","element":"img"}],[{"text":"Similarly, we can construct ","element":"span"},{"style":{"height":16.33},"width":401.81,"height":40.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/22-6.png","element":"img","alt":" F (∨,−,+) and F (−,+,∨) ","inline":true,"padRight":true},{"text":"to eliminate ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":19.93},"width":495.94,"height":49.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/22-7.png","element":"img","alt":"f on A(∨,−,+) and A(−,+,∨)","inline":true,"padRight":true},{"text":"respectively. 
Finally, these fan-shaped functions are aggregated to form the following ReLU network ","element":"span"},{"style":{"height":14.62},"width":56.28,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/22-8.png","element":"img","alt":" N1","inline":true,"padRight":true},{"text":"(illustrated in Figure ","element":"span"},{"href":"#id-73","text":"7(","element":"a"},{"text":"a)):","element":"span"}],[{"style":{"width":"78%"},"width":1353,"height":126,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/22-9.png","element":"img"}],[{"text":"where the width and depth of the network are 2 + 3 ","element":"span"},{"style":{"height":16.4},"width":696.92,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/22-10.png","element":"img","alt":" × 3 × 2 = 20 and 2 respectively. In","inline":true,"padRight":true},{"text":"addition, due to the 9 fan-shaped functions being utilized and the effect of the ","element":"span"},{"style":{"height":12},"width":22,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/22-11.png","element":"img","alt":" η","inline":true},{"text":", the total area of the regions suffering from errors is no more than","element":"span"}],[{"style":{"width":"59%"},"width":1023,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/22-12.png","element":"img"}],[{"text":"Therefore, for any ","element":"span"},{"style":{"height":13.2},"width":67.16,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/22-13.png","element":"img","alt":" δ >","inline":true,"padRight":true},{"text":"0, as long as we choose ","element":"span"},{"style":{"height":16.4},"width":148.62,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/22-14.png","element":"img","alt":" η and 
µ","inline":true,"padRight":true},{"text":"satisfying","element":"span"}],[{"style":{"width":"70%"},"width":1220,"height":107,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/22-15.png","element":"img"}],[{"text":"the constructed network ","element":"span"},{"style":{"fontWeight":"bold"},"text":"N ","element":"span"},{"text":"will have","element":"span"}],[{"style":{"width":"68%"},"width":1184,"height":56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/22-16.png","element":"img"}],[{"text":"Constructing ","element":"span"},{"style":{"height":17.6},"width":102.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/22-17.png","element":"img","alt":" N2(x","inline":true},{"text":"): Allowing more layers in a network provides an alternate way to represent ","element":"span"},{"style":{"height":17.6},"width":1543.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/22-18.png","element":"img","alt":" f. Let F(x1, x2) = σ◦(x1−µσ◦(−x2)) and F′(x1, x2) = σ◦(x1−νx2−µσ◦(−x2)),","inline":true,"padRight":true},{"text":"both of which are approximately enclosed by boundaries ","element":"span"},{"style":{"height":15.6},"width":645.84,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/22-19.png","element":"img","alt":" x1 = 0 and x2 = 0. Therefore, the","inline":true}],[{"id":"id-74","style":{"width":"79%"},"width":1369,"height":741,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/23-0.png","element":"img"}],[{"text":"Figure 8: Quasi-equivalence analysis in 2D case. Left: The structure of the deep network to represent ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"f ","element":"figcaption","subtype":"caption"},{"text":"over ∆. 
Right: The polytopes outside ∆ comprise three fan-shaped domains, on which ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"f ","element":"figcaption","subtype":"caption"},{"text":"can be cancelled by three linearly independent functions over ∆.","element":"figcaption","subtype":"caption"}],[{"text":"fan-shaped regions of ","element":"span"},{"style":{"height":17.6},"width":446.93,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/23-1.png","element":"img","alt":" F(x1, x2) and F′(x1, x2","inline":true},{"text":") almost overlap as ","element":"span"},{"style":{"height":8},"width":24,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/23-2.png","element":"img","alt":" ν","inline":true,"padRight":true},{"text":"is small. The negative sign for ","element":"span"},{"style":{"height":10.62},"width":41.94,"height":26.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/23-3.png","element":"img","alt":" x2","inline":true,"padRight":true},{"text":"is to make sure that the fan-shaped region is ∆. 
To obtain the third boundary ","element":"span"},{"style":{"height":17.6},"width":1728.37,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/23-4.png","element":"img","alt":"ℓ3(x) = 0 for building the simplex ∆, we stack one more layer with only one neuron to","inline":true,"padRight":true},{"text":"separate the fan-shaped region of ","element":"span"},{"style":{"height":17.6},"width":156.29,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/23-5.png","element":"img","alt":" F(x1, x2","inline":true},{"text":") with the boundary ","element":"span"},{"style":{"height":15.02},"width":527.92,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/23-6.png","element":"img","alt":" −x1 − x2 + 1 = 0 as follows:","inline":true}],[{"style":{"width":"99%"},"width":1724,"height":383,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/23-7.png","element":"img"}],[{"text":"Thus, ","element":"span"},{"style":{"height":19.13},"width":174.07,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/23-8.png","element":"img","alt":" E1(x1, x2","inline":true},{"text":") will represent the function ","element":"span"},{"style":{"height":10.62},"width":181.3,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/23-9.png","element":"img","alt":" −x1 − x2","inline":true,"padRight":true},{"text":"+ 1 over ∆ and zero in the rest area. The depth and width of ","element":"span"},{"style":{"height":19.13},"width":174.07,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/23-10.png","element":"img","alt":" E1(x1, x2","inline":true},{"text":") are 3 and 4 respectively. 
Similarly, due to the employment of the two fan-shaped functions and the effect of ","element":"span"},{"style":{"height":8},"width":24,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/23-11.png","element":"img","alt":" ν","inline":true},{"text":", the area of the region with errors is estimated as","element":"span"}],[{"style":{"width":"59%"},"width":1020,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/23-12.png","element":"img"}],[{"text":"To acquire ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"over ∆, similarly we need three linear independent functions as linear independent bases. ","element":"span"},{"text":"We modify ","element":"span"},{"style":{"height":15.02},"width":35.18,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/23-13.png","element":"img","alt":" ℓ3","inline":true,"padRight":true},{"text":"slightly to get ","element":"span"},{"style":{"height":17.08},"width":729.22,"height":42.7,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/23-14.png","element":"img","alt":" ℓ′3 = ℓ3 − τ ′x1 and ℓ′′3 = ℓ3 − τ ′′x2.","inline":true,"padRight":true},{"text":"Repeating the procedure described in (1), for ","element":"span"},{"style":{"height":17.08},"width":35.18,"height":42.69,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/23-15.png","element":"img","alt":" ℓ′3 ","inline":true,"padRight":true},{"text":"we construct the network ","element":"span"},{"style":{"height":19.13},"width":288.52,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/23-16.png","element":"img","alt":" E2(x1, x2) that","inline":true,"padRight":true},{"text":"is ","element":"span"},{"style":{"height":20.34},"width":573.88,"height":50.85,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/23-17.png","element":"img","alt":" ℓ3 − τ ′x1 over ℓ+1 ∩ ℓ+2 ∩ 
(ℓ′3)+","inline":true},{"text":", while for ","element":"span"},{"style":{"height":17.08},"width":38.93,"height":42.69,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/23-18.png","element":"img","alt":" ℓ′′3 ","inline":true,"padRight":true},{"text":"we construct the network ","element":"span"},{"style":{"height":19.13},"width":334.33,"height":47.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/23-19.png","element":"img","alt":" E3(x1, x2) that is","inline":true},{"style":{"height":20.34},"width":548.52,"height":50.85,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/23-20.png","element":"img","alt":"ℓ3 − τ ′x1 over ℓ+1 ∩ ℓ+2 ∩ (ℓ′′3)+","inline":true},{"text":". We set positive numbers ","element":"span"},{"style":{"height":12.8},"width":181.13,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/23-21.png","element":"img","alt":" τ ′ and τ ′′","inline":true,"padRight":true},{"text":"small enough to have two","element":"span"}],[{"text":"triangular domains ","element":"span"},{"style":{"height":20.34},"width":641.7,"height":50.85,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/24-0.png","element":"img","alt":" ℓ+1 ∩ℓ+2 ∩(ℓ′3)+ and ℓ+1 ∩ℓ+2 ∩(ℓ′′3)+","inline":true,"padRight":true},{"text":"almost identical with ∆. 
In addition, ","element":"span"},{"text":"let ","element":"span"},{"style":{"height":16.4},"width":315.18,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/24-1.png","element":"img","alt":" τ ′ and τ ′′ satisfy","inline":true}],[{"style":{"width":"71%"},"width":1234,"height":158,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/24-2.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":17.01},"width":161.33,"height":42.53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/24-3.png","element":"img","alt":" ρ∗1, ρ∗2, ρ∗3 ","inline":true,"padRight":true},{"text":"are solutions. As a consequence, the deep network (illustrated in Figure ","element":"span"},{"href":"#id-74","text":"8 ","element":"a"},{"text":"(c)):","element":"span"}],[{"style":{"width":"71%"},"width":1231,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/24-4.png","element":"img"}],[{"text":"produces ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"on ∆. The depth and width of the network are 3 and 12. 
Similarly, the area of the region with errors is bounded above by","element":"span"}],[{"style":{"width":"65%"},"width":1136,"height":56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/24-5.png","element":"img"}],[{"text":"Therefore, for any ","element":"span"},{"style":{"height":13.2},"width":67.16,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/24-6.png","element":"img","alt":" δ >","inline":true,"padRight":true},{"text":"0, if we choose","element":"span"}],[{"style":{"width":"75%"},"width":1311,"height":107,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/24-7.png","element":"img"}],[{"text":"then the constructed network ","element":"span"},{"style":{"height":14.62},"width":56.27,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/24-8.png","element":"img","alt":" N2","inline":true,"padRight":true},{"text":"will satisfy","element":"span"}],[{"style":{"width":"67%"},"width":1174,"height":130,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/24-9.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Proof ","element":"span"},{"text":"(","element":"span"},{"style":{"fontWeight":"bold"},"text":"Theorem ","element":"span"},{"href":"#id-47","style":{"fontWeight":"bold"},"text":"9","element":"a"},{"text":", ","element":"span"},{"style":{"fontWeight":"bold"},"text":"D ","element":"span"},{"text":"= ","element":"span"},{"style":{"fontWeight":"bold"},"text":"2","element":"span"},{"text":") The network ","element":"span"},{"style":{"fontStyle":"italic"},"text":"h ","element":"span"},{"text":"is piecewise linear and splits the space into polytopes. It is feasible to employ a number of simplices to represent the polytopes defined by ","element":"span"},{"style":{"fontStyle":"italic"},"text":"h ","element":"span"},{"href":"#id-75","text":"Ehrenborg ","element":"a"},{"href":"#id-75","text":"(2007)","element":"a"},{"text":". 
Given that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M ","element":"span"},{"text":"is the minimum number of required simplices, we have","element":"span"}],[{"style":{"width":"60%"},"width":1049,"height":118,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/24-10.png","element":"img"}],[{"text":"where","element":"span"}],[{"style":{"width":"79%"},"width":1373,"height":117,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/24-11.png","element":"img"}],[{"text":"and ","element":"span"},{"style":{"height":16.33},"width":255.99,"height":40.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/24-12.png","element":"img","alt":" S(m) is the m","inline":true},{"text":"-th simplex. For construction of a wide network, we use network modules to represent ","element":"span"},{"style":{"height":20.33},"width":128.33,"height":50.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/24-13.png","element":"img","alt":" f(m)(x","inline":true},{"text":") and then horizontally aggregate them into a wide network. 
In contrast, for construction of a deep network, we sequentially express ","element":"span"},{"style":{"height":20.33},"width":128.33,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/24-14.png","element":"img","alt":" f(m)(x","inline":true},{"text":") in terms of a network module without linking them to the input.","element":"span"}],[{"text":"Representing ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"with a wide ReLU network: ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Lemma ","element":"span"},{"href":"#id-71","style":{"fontWeight":"bold"},"text":"14 ","element":"a"},{"text":"suggests that a wide network module ","element":"span"},{"style":{"height":14.62},"width":56.27,"height":36.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/24-15.png","element":"img","alt":" N1","inline":true,"padRight":true},{"text":"can generically represent a function over a template simplex. To represent ","element":"span"},{"style":{"height":19.93},"width":82.2,"height":49.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/24-16.png","element":"img","alt":" f(m)","inline":true,"padRight":true},{"text":"over ","element":"span"},{"style":{"height":16.33},"width":85.41,"height":40.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/24-17.png","element":"img","alt":" S(m)","inline":true},{"text":", we need to use Eq. ","element":"span"},{"href":"#id-72","text":"(29) ","element":"a"},{"text":"to transform the function from the barycentric coordinate system to the Euclidean coordinate system. 
Let three vertices of ","element":"span"},{"style":{"height":23.8},"width":496.82,"height":59.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/24-18.png","element":"img","alt":" S(m) be {v(m)0 , v(m)1 , v(m)2 }","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":23.8},"width":831.14,"height":59.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/24-19.png","element":"img","alt":" V (m) = (v(m)1 − v(m)0 , v(m)2 − v(m)0 ), we have","inline":true}],[{"style":{"width":"69%"},"width":1205,"height":61,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/24-20.png","element":"img"}],[{"style":{"width":"95%"},"width":1642,"height":890,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/25-0.png","element":"img"}],[{"text":"Figure 9: Illustration of deep networks in 1D and 2D cases.","element":"figcaption","subtype":"caption"}],[{"text":"satisfying","element":"span"}],[{"style":{"width":"71%"},"width":1239,"height":73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/25-1.png","element":"img"}],[{"text":"By aggregating the network ","element":"span"},{"style":{"height":23.8},"width":141.54,"height":59.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/25-2.png","element":"img","alt":" N(m)1 (x","inline":true},{"text":") horizontally, we have the following wide network:","element":"span"}],[{"id":"id-76","style":{"width":"62%"},"width":1072,"height":129,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/25-3.png","element":"img"}],[{"text":"Therefore, the constructed wide network ","element":"span"},{"style":{"height":17.6},"width":102.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/25-4.png","element":"img","alt":" H1(x","inline":true},{"text":") is of width 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"O","element":"span"},{"text":"(20","element":"span"},{"style":{"fontStyle":"italic"},"text":"M","element":"span"},{"text":") and depth 2. It is clear that the width ","element":"span"},{"style":{"fontStyle":"italic"},"text":"O","element":"span"},{"text":"(20","element":"span"},{"style":{"fontStyle":"italic"},"text":"M","element":"span"},{"text":") of the wide network ","element":"span"},{"style":{"height":17.6},"width":102.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/25-5.png","element":"img","alt":" H1(x","inline":true},{"text":") dominates, as the number of needed simplices goes larger and larger.","element":"span"}],[{"text":"Representing ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"with a deep ReLU network: For a deep construction, the fundamental difficulty is how to sequentially express each ","element":"span"},{"style":{"height":19.93},"width":175.34,"height":49.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/25-6.png","element":"img","alt":" f(m), i.e.","inline":true},{"text":", the input of each block can only come from the earlier block instead of the input, like what we did in one-dimensional case (Figure ","element":"span"},{"href":"#id-76","text":"9(","element":"a"},{"text":"a)). Let us derive via induction how to sequentially represent each ","element":"span"},{"style":{"height":19.93},"width":260.17,"height":49.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/25-7.png","element":"img","alt":" f(m). We still","inline":true,"padRight":true},{"text":"adopt the idea of modularized networks, but now each network module has two outputs. 
","element":"span"},{"style":{"fontWeight":"bold"},"text":"Lemma ","element":"span"},{"href":"#id-71","style":{"fontWeight":"bold"},"text":"14 ","element":"a"},{"text":"suggests that a deep network module ","element":"span"},{"style":{"height":14.62},"width":56.28,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/25-8.png","element":"img","alt":" N2","inline":true,"padRight":true},{"text":"can generically represent a function over a template simplex.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Step 1. ","element":"span"},{"text":"Assume that the two outputs of the first block are ","element":"span"},{"style":{"height":23.8},"width":255.79,"height":59.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/25-9.png","element":"img","alt":" N(1)2 and Ω(1)","inline":true},{"text":". To represent ","element":"span"},{"style":{"height":19.93},"width":249.79,"height":49.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/25-10.png","element":"img","alt":"f(1) over S(1)","inline":true},{"text":", similarly, we need to transform the function from the barycentric coordinate system to the Euclidean coordinate system. 
Let three vertices of ","element":"span"},{"style":{"height":23.8},"width":452.87,"height":59.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/25-11.png","element":"img","alt":" S(1) be {v(1)0 , v(1)1 , v(1)2 }","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":23.8},"width":766.02,"height":59.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/25-12.png","element":"img","alt":" V (1) = (v(1)1 − v(1)0 , v(1)2 − v(1)0 ), we have","inline":true}],[{"style":{"width":"66%"},"width":1155,"height":61,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/25-13.png","element":"img"}],[{"text":"which is one output of the first block.","element":"span"}],[{"text":"Next, we derive the other output Ω","element":"span"},{"style":{"height":20.33},"width":89.24,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-0.png","element":"img","alt":"(1)(x","inline":true},{"text":"). Note that we are not allowed to use the input directly. 
Encouraged by the inversion idea in the univariate case, we invert the function domain into the input domain to get a function that is approximately only supported over ","element":"span"},{"style":{"height":16.33},"width":86.55,"height":40.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-1.png","element":"img","alt":"S(1):","inline":true}],[{"style":{"width":"65%"},"width":1135,"height":94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-2.png","element":"img"}],[{"text":"To do so, recall that we have ","element":"span"},{"style":{"height":17.93},"width":192.55,"height":44.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-3.png","element":"img","alt":" E1, E2, E3 ","inline":true,"padRight":true},{"text":"in constructing ","element":"span"},{"style":{"height":14.62},"width":56.27,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-4.png","element":"img","alt":" N2","inline":true},{"text":", we will have (","element":"span"},{"style":{"height":17.88},"width":275.82,"height":44.69,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-5.png","element":"img","alt":"ξ∗1, ξ∗2, ξ∗3) such","inline":true,"padRight":true},{"text":"that","element":"span"}],[{"style":{"width":"98%"},"width":1702,"height":61,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-6.png","element":"img"}],[{"text":"As shown in Figure ","element":"span"},{"href":"#id-76","text":"9(","element":"a"},{"text":"b), use the residual connection, we compute Ω","element":"span"},{"style":{"height":20.86},"width":347.76,"height":52.15,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-7.png","element":"img","alt":"(1)(x) = x − ˜xS(1),","inline":true,"padRight":true},{"text":"which is zero over 
","element":"span"},{"style":{"height":15.13},"width":177.34,"height":37.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-8.png","element":"img","alt":" S1 and x","inline":true,"padRight":true},{"text":"for other regions. Ω","element":"span"},{"style":{"height":20.33},"width":89.25,"height":50.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-9.png","element":"img","alt":"(1)(x","inline":true},{"text":") will be used to feed the next block to construct ","element":"span"},{"style":{"height":19.93},"width":83.34,"height":49.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-10.png","element":"img","alt":" f(2).","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Step 2","element":"span"},{"text":". ","element":"span"},{"text":"Suppose that the two output functions of the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m","element":"span"},{"text":"-th block are ","element":"span"},{"style":{"height":17.88},"width":222.24,"height":44.69,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-11.png","element":"img","alt":" Nm2 (x) and","inline":true,"padRight":true},{"text":"Ω","element":"span"},{"style":{"height":21.1},"width":609.44,"height":52.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-12.png","element":"img","alt":"(m)(x) = x − �mi=1 ˜xS(i). Ωm(x","inline":true},{"text":") is fed into the (","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"+ 1)-th block as the input. 
Because ","element":"span"},{"style":{"height":16.33},"width":128.67,"height":40.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-13.png","element":"img","alt":"S(m+1) ","inline":true,"padRight":true},{"text":"is outside the domain of ","element":"span"},{"style":{"height":16.33},"width":311.52,"height":40.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-14.png","element":"img","alt":" S(1) ∪ · · · ∪ S(m)","inline":true},{"text":", with the same technique in ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Step 1","element":"span"},{"text":", we construct","element":"span"}],[{"style":{"width":"78%"},"width":1365,"height":50,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-15.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":23.8},"width":468.9,"height":59.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-16.png","element":"img","alt":" {v(m+1)0 , v(m+1)1 , v(m+1)2 }","inline":true,"padRight":true},{"text":"are three vertices of ","element":"span"},{"style":{"height":23.8},"width":677.62,"height":59.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-17.png","element":"img","alt":" S(m+1), and V (m+1) = (v(m+1)1 −","inline":true},{"style":{"height":23.8},"width":450.98,"height":59.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-18.png","element":"img","alt":"v(m+1)0 , v(m+1)2 − v(m+1)0","inline":true,"padRight":true},{"text":") to express ","element":"span"},{"style":{"height":20.33},"width":615.4,"height":50.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-19.png","element":"img","alt":" f(m+1) over S(m+1) well. Ω(m)(x","inline":true},{"text":") is functionally equivalent to ","element":"span"},{"style":{"fontWeight":"bold"},"text":"x ","element":"span"},{"text":"for two reasons. First, all simplices do not overlap. 
Ω","element":"span"},{"style":{"height":20.33},"width":250.54,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-20.png","element":"img","alt":"(m+1)(x) is x","inline":true,"padRight":true},{"text":"outside the domain of ","element":"span"},{"style":{"height":16.33},"width":296.83,"height":40.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-21.png","element":"img","alt":" S(1) ∪ · · · ∪ S(m)","inline":true},{"text":"; therefore zero value over ","element":"span"},{"style":{"height":16.33},"width":296.83,"height":40.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-22.png","element":"img","alt":" S(1) ∪ · · · ∪ S(m) ","inline":true,"padRight":true},{"text":"has no effect. Second, w.l.o.g., we assume that (0","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"0) is outside of all simplices or lies on the boundary of a certain simplex. As such, we have ","element":"span"},{"style":{"height":23.8},"width":450.16,"height":59.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-23.png","element":"img","alt":" N(m+1)2 ((0, 0)) = 0, ∀m","inline":true},{"text":", therefore, ","element":"span"},{"style":{"height":23.8},"width":291.61,"height":59.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-24.png","element":"img","alt":" N(m+1)2 (Ω(m)(x","inline":true},{"text":")) will not erroneously produce a non-zero constant over ","element":"span"},{"style":{"height":16.33},"width":292.52,"height":40.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-25.png","element":"img","alt":" S(1) ∪· · ·∪S(m)","inline":true},{"text":". 
Furthermore, we also obtain a function ˜","element":"span"},{"style":{"height":12.92},"width":135.5,"height":32.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-26.png","element":"img","alt":"xS(m+1)","inline":true,"padRight":true},{"text":"inside the (","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"+ 1)-th block. Applying the residual operation, we have","element":"span"}],[{"style":{"width":"76%"},"width":1314,"height":129,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-27.png","element":"img"}],[{"text":"Lastly, similar to the one-dimensional deep network, we use the shortcut connection to aggregate ","element":"span"},{"style":{"height":23.8},"width":291.61,"height":59.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-28.png","element":"img","alt":" N(m)2 (Ω(m−1)(x","inline":true},{"text":")) to obtain the following deep network:","element":"span"}],[{"style":{"width":"66%"},"width":1156,"height":130,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-29.png","element":"img"}],[{"text":"Therefore, the constructed deep network ","element":"span"},{"style":{"height":17.6},"width":102.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-30.png","element":"img","alt":" H2(x","inline":true},{"text":") is of depth ","element":"span"},{"style":{"fontStyle":"italic"},"text":"O","element":"span"},{"text":"(3","element":"span"},{"style":{"fontStyle":"italic"},"text":"M","element":"span"},{"text":") and width 12. Please note that in the above equation, the summation is made by shortcuts. 
It is clear that the depth of ","element":"span"},{"style":{"height":17.6},"width":102.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-31.png","element":"img","alt":" H2(x","inline":true},{"text":") dominates over the width.","element":"span"}],[{"style":{"width":"1%"},"width":29,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/26-32.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"I. Width as Related to Neural Tangent Kernel (NTK)","element":"span"}],[{"text":"NTK sheds light on the power of a network when the width of this network goes to infinity. Previous results have suggested a link between the neural network and the Gaussian distribution when the network width increases infinitely. In ","element":"span"},{"href":"#id-77","text":"Neal ","element":"a"},{"href":"#id-77","text":"(1996)","element":"a"},{"text":"; ","element":"span"},{"href":"#id-78","text":"Lee et al. ","element":"a"},{"href":"#id-78","text":"(2018)","element":"a"},{"text":"; ","element":"span"},{"href":"#id-79","text":"Matthews et al. ","element":"a"},{"href":"#id-79","text":"(2018)","element":"a"},{"text":"; ","element":"span"},{"href":"#id-80","text":"Novak et al. ","element":"a"},{"href":"#id-80","text":"(2018)","element":"a"},{"text":"; ","element":"span"},{"href":"#id-81","text":"Arora et al. ","element":"a"},{"href":"#id-81","text":"(2019)","element":"a"},{"text":", the Gaussian process is shown in the two-layer networks, the deep networks, and the convolutional networks. Weakly-trained networks ","element":"span"},{"href":"#id-81","text":"(Arora et al., ","element":"a"},{"href":"#id-81","text":"2019) ","element":"a"},{"text":"refer to the networks where only the top layer is trained after other layers are randomly initialized. 
Let ","element":"span"},{"style":{"height":17.6},"width":216.6,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/27-0.png","element":"img","alt":" f(θ, x) ∈ R","inline":true,"padRight":true},{"text":"denote the output of the network on input ","element":"span"},{"style":{"height":12.8},"width":188.96,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/27-1.png","element":"img","alt":" x where θ","inline":true,"padRight":true},{"text":"denotes the parameters of the network, and ","element":"span"},{"style":{"height":12},"width":25,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/27-2.png","element":"img","alt":" θ","inline":true,"padRight":true},{"text":"is an initialization over ","element":"span"},{"style":{"height":12.8},"width":39,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/27-3.png","element":"img","alt":" Θ","inline":true},{"text":". In the above context, training the top layer with the ","element":"span"},{"style":{"height":15.02},"width":30.02,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/27-4.png","element":"img","alt":" l2","inline":true,"padRight":true},{"text":"penalty reduces to the kernel regression:","element":"span"}],[{"style":{"width":"70%"},"width":1211,"height":66,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/27-5.png","element":"img"}],[{"text":"where (","element":"span"},{"style":{"fontStyle":"italic"},"text":"F","element":"span"},{"text":") denotes the functional space. On the other hand, an infinitely wide network is considered as an over-parameterized network, with the motivation to explain why over-parameterized networks scale. 
There were extensive studies on this topic ","element":"span"},{"href":"#id-82","text":"(Du et al., ","element":"a"},{"href":"#id-82","text":"2018; ","element":"a"},{"href":"#id-83","text":"Allen-Zhu et al., ","element":"a"},{"href":"#id-83","text":"2019; ","element":"a"},{"href":"#id-84","text":"Cao and Gu, ","element":"a"},{"href":"#id-84","text":"2019)","element":"a"},{"text":". Notably, these studies indicate that the training renders relatively small changes when the network is sufficiently wide. ","element":"span"},{"text":"Such a concentration behavior is justified by NTK in terms of the Gaussian process. The NTK is defined as the following:","element":"span"}],[{"style":{"width":"71%"},"width":1237,"height":95,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/27-6.png","element":"img"}],[{"text":"where (","element":"span"},{"style":{"height":13.2},"width":25,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/27-7.png","element":"img","alt":"∂","inline":true},{"text":") denotes the gradient space. Along this line, ","element":"span"},{"href":"#id-85","text":"Huang et al. ","element":"a"},{"href":"#id-85","text":"(2020) ","element":"a"},{"text":"compared the NTK of the ResNet and the deep chain-like network, and found that the NTK of the chain-like network converges to a non-informative kernel as the depth goes to infinity, while the counterpart of ResNet does not. ","element":"span"},{"href":"#id-86","text":"Du and Hu ","element":"a"},{"href":"#id-86","text":"(2019) ","element":"a"},{"text":"showed that the width provably matters in networks using the piecewise linear activation.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"II. 
Transformation of an Arbitrary Network","element":"span"}],[{"text":"Here, we provide the proof for ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Theorem ","element":"span"},{"href":"#id-87","style":{"fontWeight":"bold"},"text":"15 ","element":"a"},{"text":"and the experiment that shows the feasibility of the procedures in the proof of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Theorem ","element":"span"},{"href":"#id-88","style":{"fontWeight":"bold"},"text":"16","element":"a"},{"text":". The similar derivation to the proof for ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Theorem ","element":"span"},{"href":"#id-87","style":{"fontWeight":"bold"},"text":"15 ","element":"a"},{"text":"is also seen in ","element":"span"},{"href":"#id-14","text":"Fan et al. ","element":"a"},{"href":"#id-14","text":"(2018c)","element":"a"},{"text":".","element":"span"}],[{"id":"id-87","style":{"fontWeight":"bold"},"text":"Theorem 15 (Equivalence of Univariate Regression Networks) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Given any ReLU network ","element":"span"},{"style":{"height":17.6},"width":338.1,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/28-0.png","element":"img","alt":" f : [−B, B] → R","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"with one dimensional input and output variables. 
There is a wide ReLU network ","element":"span"},{"style":{"height":17.6},"width":357.8,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/28-1.png","element":"img","alt":" H1 : [−B, B] → R","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and a deep ReLU network ","element":"span"},{"style":{"height":17.6},"width":357.8,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/28-2.png","element":"img","alt":" H2 : [−B, B] → R","inline":true},{"style":{"fontStyle":"italic"},"text":", such that ","element":"span"},{"style":{"height":17.6},"width":734.13,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/28-3.png","element":"img","alt":"f(x) = H1(x) = H2(x), ∀x ∈ [−B, B].","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Proof ","element":"span"},{"text":"(","element":"span"},{"style":{"fontWeight":"bold"},"text":"Theorem ","element":"span"},{"href":"#id-87","style":{"fontWeight":"bold"},"text":"15","element":"a"},{"text":") Since the function represented by a network ","element":"span"},{"style":{"height":17.6},"width":385.36,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/28-4.png","element":"img","alt":" f : [−B, B] → R is","inline":true,"padRight":true},{"text":"piecewise linear, we express ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":") as","element":"span"}],[{"style":{"width":"76%"},"width":1317,"height":341,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/28-5.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":40.8},"width":1338.22,"height":102,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/28-6.png","element":"img","alt":" −B = x0 < x1 < x2 < · · · < xn < xn+1 = B, w(i) = f(xi+1) − f(xi)xi+1 − 
xi","inline":true,"padRight":true},{"text":"to guarantee continuity, and the slopes of neighboring pieces are different; otherwise they can be fused together.","element":"span"}],[{"text":"We first construct a wide ReLU network ","element":"span"},{"style":{"height":17.6},"width":100.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/28-7.png","element":"img","alt":" H1(x","inline":true},{"text":") to represent ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":". This can be straightforward as follows:","element":"span"}],[{"id":"id-89","style":{"width":"74%"},"width":1289,"height":119,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/28-8.png","element":"img"}],[{"text":"where specially ","element":"span"},{"style":{"height":16.33},"width":713.05,"height":40.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/28-9.png","element":"img","alt":" w(−1) = 0. This network is of width n","inline":true},{"text":"+1. Now, we verify this claim. 
Given ","element":"span"},{"style":{"height":18.22},"width":236.07,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/28-10.png","element":"img","alt":"x ∈ [xj, xj+1","inline":true},{"text":"), for some ","element":"span"},{"style":{"height":16},"width":66.59,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/28-11.png","element":"img","alt":" j ≥","inline":true,"padRight":true},{"text":"0, we have the following:","element":"span"}],[{"style":{"width":"74%"},"width":1292,"height":662,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/28-12.png","element":"img"}],[{"id":"id-90","style":{"width":"36%"},"width":637,"height":615,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/29-0.png","element":"img"}],[{"text":"Figure 10: Equivalent wide (a) and deep (b) univariate ReLU networks.","element":"figcaption","subtype":"caption"}],[{"text":"Next, we construct an equivalent deep network dual to the above wide network. Note that ","element":"span"},{"href":"#id-89","text":"(62) ","element":"a"},{"text":"is the same as","element":"span"}],[{"style":{"width":"80%"},"width":1395,"height":123,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/29-1.png","element":"img"}],[{"text":"where sgn(","element":"span"},{"style":{"height":20.34},"width":1526.35,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/29-2.png","element":"img","alt":"x < 0) = −1 and sgn(x > 0) = 1. 
If we write Ri(x) = σ(|w(i)−w(i−1)|(x−xi)), i =","inline":true,"padRight":true},{"text":"0","element":"span"},{"style":{"height":11.2},"width":174.13,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/29-3.png","element":"img","alt":", · · · , n −","inline":true,"padRight":true},{"text":"1, then it is clear that the following recursive relation holds:","element":"span"}],[{"style":{"width":"85%"},"width":1474,"height":107,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/29-4.png","element":"img"}],[{"text":"Thanks to the recurrent relation, each ","element":"span"},{"style":{"height":14.62},"width":45.13,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/29-5.png","element":"img","alt":" Ri","inline":true,"padRight":true},{"text":"can accurately represent a small monotonically increasing piece over [","element":"span"},{"style":{"height":11.82},"width":137.66,"height":29.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/29-6.png","element":"img","alt":"xi, xi+1","inline":true},{"text":"]. We aggregate the outputs of those ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n","element":"span"},{"text":"+1 pieces in the output neuron to get the desired deep ReLU network ","element":"span"},{"style":{"height":17.6},"width":129.08,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/29-7.png","element":"img","alt":" H2(x):","inline":true}],[{"style":{"width":"74%"},"width":1288,"height":123,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/29-8.png","element":"img"}],[{"text":"which is the same as ","element":"span"},{"style":{"height":17.6},"width":100.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/29-9.png","element":"img","alt":" H1(x","inline":true},{"text":"). 
To construct ","element":"span"},{"style":{"height":17.6},"width":167.53,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/29-10.png","element":"img","alt":" Rn(x), n","inline":true,"padRight":true},{"text":"+ 1 consecutive neurons are connected in series. Therefore, such a one-neuron-wide network has ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n ","element":"span"},{"text":"+ 2 layers. We plot two types of neural networks ","element":"span"},{"href":"#id-90","style":{"height":17.6},"width":620.07,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/29-11.png","element":"img","alt":" H1(x) and H2(x) in Figure 10.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Theorem 16 (Quasi-Equivalence of Multivariate Regression Networks) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Suppose that the representation of an arbitrary ReLU network is ","element":"span"},{"style":{"height":19.53},"width":514.78,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/29-12.png","element":"img","alt":" h : [−B, B]D → R, and M","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is the minimum number of simplices to cover the polytopes to support ","element":"span"},{"style":{"height":16.4},"width":326.24,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/29-13.png","element":"img","alt":" h, for any δ > 0","inline":true},{"style":{"fontStyle":"italic"},"text":", there exist a wide ReLU network ","element":"span"},{"style":{"height":21},"width":970.97,"height":52.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/29-14.png","element":"img","alt":" H1 of width O�D(D + 1)(2D − 1)M�and depth D","inline":true},{"style":{"fontStyle":"italic"},"text":", and also a deep","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"ReLU network 
","element":"span"},{"style":{"height":19.13},"width":923.22,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/30-0.png","element":"img","alt":" H2 of width (D + 1)D2 and depth O [(D + 1)M]","inline":true},{"style":{"fontStyle":"italic"},"text":", satisfying that","element":"span"}],[{"id":"id-88","style":{"width":"65%"},"width":1125,"height":174,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/30-1.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where ","element":"span"},{"style":{"height":17.6},"width":79.55,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/30-2.png","element":"img","alt":" m(·)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is the standard measure in ","element":"span"},{"text":"[","element":"span"},{"style":{"height":19.53},"width":195.36,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/30-3.png","element":"img","alt":"−B, B]D.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"The sketch of proof: ","element":"span"},{"text":"A ReLU network is a piecewise linear function over polytopes, which can be decomposed into a summation of linear functions over a simplex. ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Lemma ","element":"span"},{"href":"#id-91","text":"17 ","element":"a"},{"text":"shows that a wider network module ","element":"span"},{"style":{"height":17.6},"width":102.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/30-4.png","element":"img","alt":" N1(x","inline":true},{"text":") and a deeper network module ","element":"span"},{"style":{"height":17.6},"width":202.8,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/30-5.png","element":"img","alt":" N2(x) can","inline":true,"padRight":true},{"text":"represent an arbitrary linear function over a simplex. 
Next, in ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Theorem ","element":"span"},{"href":"#id-88","text":"16, ","element":"a"},{"text":"to transform an arbitrary ReLU network into a wide and a deep network, we horizontally aggregate network modules ","element":"span"},{"style":{"height":17.6},"width":102.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/30-6.png","element":"img","alt":" N1(x","inline":true},{"text":") to have a wide network, and we use shortcuts to sequentially establish a deep network.","element":"span"}],[{"text":"A ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D","element":"span"},{"text":"-simplex ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"text":"is a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D","element":"span"},{"text":"-dimensional convex hull provided by convex combinations of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D","element":"span"},{"text":"+1","element":"span"}],[{"id":"id-94","style":{"width":"31%"},"width":540,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/30-7.png","element":"img"}],[{"text":"affinely independent vectors ","element":"span"},{"style":{"height":20.02},"width":258.15,"height":50.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/30-8.png","element":"img","alt":" {vi}Di=0 ⊂ RD","inline":true},{"text":". 
In other words, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"text":"=","element":"span"}],[{"text":"If we write ","element":"span"},{"style":{"height":17.6},"width":938.52,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/30-9.png","element":"img","alt":" V = (v1 − v0, v2 − v0, · · · , vD − v0), then V","inline":true,"padRight":true},{"text":"is invertible, and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"text":"= ","element":"span"},{"style":{"height":21},"width":1140.94,"height":52.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/30-10.png","element":"img","alt":"{v0 + V x | x ∈ ∆} where ∆ =�x ∈ RD | x ≥ 0, 1⊤x ≤ 1�","inline":true},{"text":"is a template simplex in ","element":"span"},{"style":{"height":15.13},"width":74.24,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/30-11.png","element":"img","alt":" RD.","inline":true,"padRight":true},{"text":"It is clear that the following one-to-one affine mapping between ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"text":"and ∆ exists, which is","element":"span"}],[{"id":"id-92","style":{"width":"99%"},"width":1725,"height":427,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/30-12.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":17.6},"width":1257.84,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/30-13.png","element":"img","alt":" a = (f(v1) − f(v0), f(v2) − f(v0), · · · , f(vD) − f(v0)), b = f(v0","inline":true},{"text":"). We denote the domain of a network as [","element":"span"},{"style":{"height":19.53},"width":164.03,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/30-14.png","element":"img","alt":"−B, B]D","inline":true},{"text":". 
Given a linear function ","element":"span"},{"style":{"height":17.6},"width":618.51,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/30-15.png","element":"img","alt":" ℓ(x) = c1x1 +c2x2 +· · ·+cnxn +","inline":true},{"style":{"height":19.53},"width":1448.9,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/30-16.png","element":"img","alt":"cn+1, we write ℓ− = {x ∈ RD | ℓ(x) < 0} and ℓ+ = {x ∈ RD | ℓ(x) ≥ 0}. S","inline":true,"padRight":true},{"text":"is enclosed by ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D ","element":"span"},{"text":"+ 1 hyperplanes provided by ","element":"span"},{"style":{"height":17.6},"width":1128.74,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/30-17.png","element":"img","alt":" ℓi(x) = xi, i = 1, · · · , D, and ℓD+1(x) = −x1 − · · · − xD + 1.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Lemma 17 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Suppose that the representation of an arbitrary ReLU network is ","element":"span"},{"style":{"height":19.53},"width":308.57,"height":48.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/30-18.png","element":"img","alt":" f : [−B, B]D →","inline":true,"padRight":true},{"text":"R ","element":"span"},{"style":{"fontStyle":"italic"},"text":"expressed as Eq. 
","element":"span"},{"href":"#id-92","text":"(69)","element":"a"},{"style":{"fontStyle":"italic"},"text":", for any ","element":"span"},{"style":{"height":13.2},"width":108.58,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/30-19.png","element":"img","alt":" δ > 0","inline":true},{"style":{"fontStyle":"italic"},"text":", there exist a ReLU network ","element":"span"},{"style":{"height":17.6},"width":385.11,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/30-20.png","element":"img","alt":" N1 of width D(D +","inline":true,"padRight":true},{"text":"1)(2","element":"span"},{"style":{"height":19.53},"width":457.94,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/30-21.png","element":"img","alt":"D − 1) + 2 and depth D","inline":true},{"style":{"fontStyle":"italic"},"text":", and also a ReLU network ","element":"span"},{"style":{"height":19.13},"width":655.49,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/30-22.png","element":"img","alt":" N2 of width (D + 1)D2 and depth","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"D ","element":"span"},{"text":"+ 1","element":"span"},{"style":{"fontStyle":"italic"},"text":", satisfying that","element":"span"}],[{"id":"id-91","style":{"width":"65%"},"width":1125,"height":174,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/30-23.png","element":"img"}],[{"href":"#id-91","style":{"height":17.6},"width":535.61,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-0.png","element":"img","alt":"Proof (Lemma 17, D ≥ 2","inline":true},{"text":") Our goal is to approximate the given piecewise linear function ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"using ReLU networks. 
","element":"span"},{"text":"We first index the polytopes separated by ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D ","element":"span"},{"text":"+ 1 hyperplanes ","element":"span"},{"style":{"height":22.63},"width":1728.06,"height":56.57,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-1.png","element":"img","alt":"ℓi(x) = 0, i = 1, · · · , D+1 as A(χ1,··· ,χi,··· ,χD+1) = ℓχ11 ∩· · ·∩ℓχii ∩· · ·∩ℓχD+1D+1 , χi ∈ {+, −}, i =","inline":true,"padRight":true},{"text":"1","element":"span"},{"style":{"height":15.2},"width":140.27,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-2.png","element":"img","alt":", · · · , D","inline":true},{"text":"+1. It is clear to see that ","element":"span"},{"style":{"height":17.13},"width":289.6,"height":42.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-3.png","element":"img","alt":" S = A(+,+,··· ,+)","inline":true},{"text":". In addition, we use ","element":"span"},{"style":{"height":10.8},"width":29,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-4.png","element":"img","alt":" ∨","inline":true,"padRight":true},{"text":"to denote exclusion of certain component. For instance, ","element":"span"},{"style":{"height":22.63},"width":802.26,"height":56.57,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-5.png","element":"img","alt":" A(χ1,∨,χ3,··· ,χD+1) = ℓχ11 ∩ ℓχ33 ∩ · · · ∩ ℓχD+1D+1 ","inline":true,"padRight":true},{"text":". 
It can be ","element":"span"},{"text":"easily verified that","element":"span"}],[{"id":"id-93","style":{"width":"80%"},"width":1398,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-6.png","element":"img"}],[{"text":"Please note that ","element":"span"},{"style":{"height":19.13},"width":470.92,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-7.png","element":"img","alt":" A(−,−,··· ,−) = ∅. Thus, D","inline":true},{"text":"+1 hyperplanes create in total 2","element":"span"},{"style":{"height":15.13},"width":111.38,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-8.png","element":"img","alt":"D+1−","inline":true},{"text":"1 polytopes in the [","element":"span"},{"style":{"height":19.53},"width":178.76,"height":48.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-9.png","element":"img","alt":"−B, B]D.","inline":true}],[{"text":"Now we recursively define an essential building block, a D-dimensional fan-shaped ReLU network ","element":"span"},{"style":{"height":17.6},"width":131.22,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-10.png","element":"img","alt":" FD(x):","inline":true}],[{"style":{"width":"84%"},"width":1455,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-11.png","element":"img"}],[{"text":"where the set of linear functions ","element":"span"},{"style":{"height":20.45},"width":451.8,"height":51.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-12.png","element":"img","alt":" {hk(x) = p⊤k x + rk}Dk=1 ","inline":true,"padRight":true},{"text":"are provided by ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D ","element":"span"},{"text":"linearly inde- ","element":"span"},{"text":"pendent vectors 
","element":"span"},{"style":{"height":20.45},"width":298.52,"height":51.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-13.png","element":"img","alt":" {pk}Dk=1, and µ","inline":true,"padRight":true},{"text":"is a large positive number (","element":"span"},{"style":{"height":18.73},"width":246.85,"height":46.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-14.png","element":"img","alt":"µj denotes µ","inline":true,"padRight":true},{"text":"with the power ","element":"span"},{"text":"to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j","element":"span"},{"text":"). Note that the network ","element":"span"},{"style":{"height":14.7},"width":56.06,"height":36.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-15.png","element":"img","alt":" FD","inline":true,"padRight":true},{"text":"is of width ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D ","element":"span"},{"text":"and depth ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D","element":"span"},{"text":". This network enjoys the following key characteristics: 1) As ","element":"span"},{"style":{"height":13.2},"width":138.44,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-16.png","element":"img","alt":" µ → ∞","inline":true},{"text":", the hyperplane ","element":"span"},{"style":{"height":19.75},"width":579.46,"height":49.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-17.png","element":"img","alt":" h1 − µh2 − · · · − µjhj+1 = 0 is","inline":true,"padRight":true},{"text":"approximate to the hyperplane ","element":"span"},{"style":{"height":19.75},"width":539.43,"height":49.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-18.png","element":"img","alt":" hj+1 = 0 as the term µjhj+1","inline":true,"padRight":true},{"text":"dominates. 
Thus, the support of ","element":"span"},{"style":{"height":17.6},"width":102.75,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-19.png","element":"img","alt":" FD(x","inline":true},{"text":") converges to ","element":"span"},{"style":{"height":20.84},"width":635.06,"height":52.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-20.png","element":"img","alt":" h+1 ∩ h−2 ∩ · · · ∩ h−D which is a D","inline":true},{"text":"-dimensional fan-shaped function. ","element":"span"},{"text":"2) Let ","element":"span"},{"style":{"fontStyle":"italic"},"text":"C ","element":"span"},{"text":"be the maximum area of hyperplanes in [","element":"span"},{"style":{"height":19.53},"width":180.85,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-21.png","element":"img","alt":"−B, B]D","inline":true},{"text":". Because the real boundary ","element":"span"},{"style":{"height":19.75},"width":1730.65,"height":49.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-22.png","element":"img","alt":"h1−µh2−· · ·−µjhj+1 = 0 is almost parallel to the ideal boundary hj+1 = 0, the measure of","inline":true,"padRight":true},{"text":"the imprecise domain caused by ","element":"span"},{"style":{"height":19.13},"width":583.96,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-23.png","element":"img","alt":" µj is at most C/µj, where 1/µj ","inline":true,"padRight":true},{"text":"is the approximate distance between the real and ideal boundaries. 
In total, the measure of the inaccurate region in building ","element":"span"},{"style":{"height":17.6},"width":102.75,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-24.png","element":"img","alt":" FD(x","inline":true},{"text":") is at most ","element":"span"},{"style":{"height":24.4},"width":456.3,"height":61.01,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-25.png","element":"img","alt":" C �D−1j=1 1/µj ≤ C/(µ −","inline":true,"padRight":true},{"text":"1). 3) The function over ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D","element":"span"},{"text":"-dimensional ","element":"span"},{"text":"fan-shaped domain is ","element":"span"},{"style":{"height":19.35},"width":444.32,"height":48.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-26.png","element":"img","alt":" h1, since (hj)+ = 0, j ≥","inline":true,"padRight":true},{"text":"2 over the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D","element":"span"},{"text":"-dimensional fan-shaped domain.","element":"span"}],[{"text":"Constructing ","element":"span"},{"style":{"height":14.62},"width":56.27,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-27.png","element":"img","alt":" N1","inline":true},{"text":": Discontinuity of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"in Eq.","element":"span"},{"href":"#id-92","text":"(69) ","element":"a"},{"text":"is one of the major challenges of representing it using a ReLU network. 
To tackle this issue, we start from a linear function ˜","element":"span"},{"style":{"height":19.53},"width":479.67,"height":48.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-28.png","element":"img","alt":"f(x) = a⊤x + b, ∀x ∈ RD","inline":true},{"text":", which can be represented by two neurons ","element":"span"},{"style":{"height":20.6},"width":416.72,"height":51.51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-29.png","element":"img","alt":" σ ◦ ˜f − σ ◦ (− ˜f). The","inline":true,"padRight":true},{"text":"key idea is to eliminate ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"over all 2","element":"span"},{"style":{"height":15.13},"width":119.07,"height":37.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-30.png","element":"img","alt":"D+1 −","inline":true,"padRight":true},{"text":"2 polytopes outside ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"text":"using the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D","element":"span"},{"text":"-dimensional fan-shaped functions.","element":"span"}],[{"text":"Let us use ","element":"span"},{"style":{"height":17.14},"width":646.45,"height":42.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-31.png","element":"img","alt":" A(+,+,+,−,··· ,−) and A(+,+,−,−,··· ,−) ","inline":true,"padRight":true},{"text":"to show how to cancel the function ","element":"span"},{"text":"˜","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"over the polytopes outside ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S","element":"span"},{"text":". 
According to ","element":"span"},{"href":"#id-93","text":"(71)","element":"a"},{"text":", ","element":"span"},{"style":{"height":19.93},"width":787.26,"height":49.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-32.png","element":"img","alt":" A(+,+,+,−,··· ,−) and A(+,+,−,−,··· ,−) satisfy","inline":true}],[{"style":{"width":"77%"},"width":1331,"height":53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-33.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":17.13},"width":414.41,"height":42.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-34.png","element":"img","alt":" A(+,+,∨,−,··· ,−) is a D","inline":true},{"text":"-dimensional fan-shaped domain. Without loss of generality, a number ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D ","element":"span"},{"text":"+ 1 of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D","element":"span"},{"text":"-dimensional fan-shaped functions over ","element":"span"},{"style":{"height":17.13},"width":269.9,"height":42.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-35.png","element":"img","alt":" A(+,+,∨,−,··· ,−) ","inline":true,"padRight":true},{"text":"are needed as the group of linear independent bases to cancel ","element":"span"},{"text":"˜","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":", where the ","element":"span"},{"style":{"height":15.53},"width":56.34,"height":38.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/31-36.png","element":"img","alt":" kth ","inline":true,"padRight":true},{"text":"fan-shaped function is","element":"span"}],[{"style":{"width":"99%"},"width":1723,"height":493,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/32-0.png","element":"img"}],[{"text":"where we let 
","element":"span"},{"style":{"height":16.4},"width":930.72,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/32-1.png","element":"img","alt":" xD+1 = 1 for consistency, the negative sign for x2","inline":true,"padRight":true},{"text":"is to make sure that the fan-shaped region ","element":"span"},{"style":{"height":25.49},"width":1455.07,"height":63.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/32-2.png","element":"img","alt":" ℓ+1 ∩(−ℓ2)− ∩ℓ−4 ∩· · ·∩ℓ−D+1 of F (k)D is A(+,+,∨,−,··· ,−), η1 = 0, and ηk = η, k =","inline":true,"padRight":true},{"text":"2","element":"span"},{"style":{"fontStyle":"italic"},"text":", ..., D ","element":"span"},{"text":"+ 1 represents a small shift for ","element":"span"},{"style":{"height":18.73},"width":969.32,"height":46.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/32-3.png","element":"img","alt":" x1 = 0 such that m((x1)+ ∩ (x1 − ηkxk)−) < Cηk.","inline":true,"padRight":true},{"text":"The constructed function over ","element":"span"},{"style":{"height":17.13},"width":315.74,"height":42.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/32-4.png","element":"img","alt":" A(+,+,∨,−,··· ,−) is","inline":true}],[{"style":{"width":"69%"},"width":1197,"height":61,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/32-5.png","element":"img"}],[{"text":"which is approximately over","element":"span"}],[{"style":{"width":"75%"},"width":1301,"height":80,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/32-6.png","element":"img"}],[{"text":"Let us find ","element":"span"},{"style":{"height":18.77},"width":460.8,"height":46.93,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/32-7.png","element":"img","alt":" ω∗1, · · · , ω∗D+1 by 
solving","inline":true}],[{"style":{"width":"73%"},"width":1270,"height":289,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/32-8.png","element":"img"}],[{"text":"and then the new function ","element":"span"},{"style":{"height":24.29},"width":689.87,"height":60.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/32-9.png","element":"img","alt":" F (+,+,∨,−,··· ,−)(x) = �D+1k=1 ω∗kF (k)D (x","inline":true},{"text":") satisfies that","element":"span"}],[{"style":{"width":"80%"},"width":1390,"height":320,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/32-10.png","element":"img"}],[{"text":"Similarly, we can construct other functions","element":"span"}],[{"text":"cancel ","element":"span"},{"text":"˜","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"over other polytopes. Finally, these ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D","element":"span"},{"text":"-dimenional fan-shaped functions are aggregated to form the following wide ReLU network ","element":"span"},{"style":{"height":17.6},"width":130.63,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/32-11.png","element":"img","alt":" N1(x):","inline":true}],[{"style":{"width":"75%"},"width":1302,"height":195,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/32-12.png","element":"img"}],[{"text":"where the width and depth of the network are ","element":"span"},{"style":{"height":19.53},"width":288.98,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/33-0.png","element":"img","alt":" D(D + 1)(2D −","inline":true,"padRight":true},{"text":"1) + 2 and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D ","element":"span"},{"text":"respectively. 
In addition, because there are 2","element":"span"},{"style":{"height":15.13},"width":74.82,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/33-1.png","element":"img","alt":"D −","inline":true,"padRight":true},{"text":"1 polytopes being cancelled, the total area of the regions suffering from errors is no more than","element":"span"}],[{"style":{"width":"63%"},"width":1095,"height":99,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/33-2.png","element":"img"}],[{"text":"Therefore, for any ","element":"span"},{"style":{"height":13.2},"width":67.16,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/33-3.png","element":"img","alt":" δ >","inline":true,"padRight":true},{"text":"0, as long as we choose appropriate ","element":"span"},{"style":{"height":12},"width":67.73,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/33-4.png","element":"img","alt":" µ, η","inline":true,"padRight":true},{"text":"that fulfill","element":"span"}],[{"style":{"width":"85%"},"width":1482,"height":102,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/33-5.png","element":"img"}],[{"text":"the constructed network ","element":"span"},{"style":{"height":17.6},"width":102.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/33-6.png","element":"img","alt":" N1(x","inline":true},{"text":") will have","element":"span"}],[{"style":{"width":"68%"},"width":1190,"height":57,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/33-7.png","element":"img"}],[{"text":"Constructing ","element":"span"},{"style":{"height":14.62},"width":56.27,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/33-8.png","element":"img","alt":" N2","inline":true},{"text":": Allowing more layers in a network provides an alternate way to represent 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":". The fan-shaped functions remain to be used. The whole pipeline can be divided into two steps: (1) build a function over ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S","element":"span"},{"text":"; and (2) represent ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"over ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"text":"by slightly moving one boundary of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"text":"to create linear independent bases.","element":"span"}],[{"text":"(1) We construct a number ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D ","element":"span"},{"text":"of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D","element":"span"},{"text":"-dimensional fan-shaped functions. Without loss of generality, the ","element":"span"},{"style":{"height":15.53},"width":56.34,"height":38.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/33-9.png","element":"img","alt":" kth ","inline":true,"padRight":true},{"text":"fan-shaped function is constructed as","element":"span"}],[{"style":{"width":"72%"},"width":1250,"height":276,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/33-10.png","element":"img"}],[{"text":"whose fan-shaped region is approximately (","element":"span"},{"style":{"height":18.73},"width":883.44,"height":46.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/33-11.png","element":"img","alt":"ℓ1 − νkxk)+ ∩ (−ℓ2)− ∩ · · · ∩ (−ℓD)− = (ℓ1 −","inline":true},{"style":{"height":20.84},"width":429.49,"height":52.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/33-12.png","element":"img","alt":"νkxk)+ ∩ ℓ+2 ∩ · · · ∩ ℓ+D","inline":true},{"text":", which almost overlaps with 
","element":"span"},{"style":{"height":21.43},"width":722.62,"height":53.57,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/33-13.png","element":"img","alt":" A(+,··· ,+,∨) = ℓ+1 ∩ ℓ+2 ∩ · · · ∩ ℓ+D as νk","inline":true,"padRight":true},{"text":"becomes sufficiently small. The output of ","element":"span"},{"style":{"height":24.29},"width":601.75,"height":60.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/33-14.png","element":"img","alt":" F (k)D is x1 − νkxk, k = 1, · · · , D","inline":true},{"text":". To obtain the last boundary ","element":"span"},{"style":{"height":17.6},"width":1451.1,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/33-15.png","element":"img","alt":" ℓD+1(x) = −x1 − · · · − xD + 1 = 0 so as to construct the simplex S, we stack","inline":true,"padRight":true},{"text":"one more layer with only one neuron as follows:","element":"span"}],[{"style":{"width":"99%"},"width":1724,"height":545,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/33-16.png","element":"img"}],[{"text":"Thus, ","element":"span"},{"style":{"height":19.13},"width":95.87,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/33-17.png","element":"img","alt":" E1(x","inline":true},{"text":") will approximately represent the linear function ","element":"span"},{"style":{"height":15.1},"width":524.01,"height":37.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/33-18.png","element":"img","alt":" −x1 − · · · − xD + 1 over S","inline":true,"padRight":true},{"text":"and zero elsewhere. The depth and width of the network are ","element":"span"},{"style":{"height":16.33},"width":273.8,"height":40.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/33-19.png","element":"img","alt":" D + 1 and D2 ","inline":true,"padRight":true},{"text":"respectively. 
Similarly, due to the employment of a number ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D ","element":"span"},{"text":"of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D","element":"span"},{"text":"-dimensional fan-shaped functions and the effect of ","element":"span"},{"style":{"height":10.84},"width":39.56,"height":27.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/34-0.png","element":"img","alt":" νk","inline":true},{"text":", the area of the region with errors is estimated as","element":"span"}],[{"style":{"width":"62%"},"width":1082,"height":131,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/34-1.png","element":"img"}],[{"text":"(2) To acquire an arbitrary linear function, similarly we need ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D ","element":"span"},{"text":"+ 1 linear independent functions as linear independent bases. Other than the one obtained in step (1), we further modify ","element":"span"},{"style":{"height":16.3},"width":90.24,"height":40.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/34-2.png","element":"img","alt":" ℓD+1","inline":true,"padRight":true},{"text":"a little bit ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D ","element":"span"},{"text":"times to get ","element":"span"},{"style":{"height":21.51},"width":638.15,"height":53.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/34-3.png","element":"img","alt":" ℓlD+1 = ℓD+1 − τlxl, l = 1, · · · , D","inline":true},{"text":". 
Repeating the ","element":"span"},{"text":"same procedure described in step (1), for ","element":"span"},{"style":{"height":21.51},"width":90.24,"height":53.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/34-4.png","element":"img","alt":" ℓlD+1","inline":true},{"text":", we can construct the network ","element":"span"},{"style":{"height":19.53},"width":242.75,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/34-5.png","element":"img","alt":" El(x) that is","inline":true},{"style":{"height":21.51},"width":213.88,"height":53.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/34-6.png","element":"img","alt":"ℓlD+1 − τlxl","inline":true,"padRight":true},{"text":"approximately over ","element":"span"},{"style":{"height":21.71},"width":895.17,"height":54.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/34-7.png","element":"img","alt":" ℓ+1 ∩ ℓ+2 ∩ · · · ∩ (ℓlD+1)+, where τl, l = 1, · · · , D","inline":true,"padRight":true},{"text":"is small to ","element":"span"},{"text":"render these domains almost identical to ","element":"span"},{"style":{"height":15.6},"width":425.88,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/34-8.png","element":"img","alt":" S, and τl, l = 1, · · · , D","inline":true,"padRight":true},{"text":"satisfies that","element":"span"}],[{"style":{"width":"99%"},"width":1724,"height":476,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/34-9.png","element":"img"}],[{"text":"producing ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"on ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S","element":"span"},{"text":". 
The depth and width of ","element":"span"},{"style":{"height":19.13},"width":530.07,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/34-10.png","element":"img","alt":" N2(x) are D + 1 and D2(D","inline":true,"padRight":true},{"text":"+ 1), respectively. Similarly, the area of the region with errors is bounded above by","element":"span"}],[{"style":{"width":"76%"},"width":1327,"height":131,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/34-11.png","element":"img"}],[{"text":"Therefore, for any ","element":"span"},{"style":{"height":13.2},"width":67.16,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/34-12.png","element":"img","alt":" δ >","inline":true,"padRight":true},{"text":"0, if we choose ","element":"span"},{"style":{"height":12},"width":136.19,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/34-13.png","element":"img","alt":" µ, νk, τl","inline":true,"padRight":true},{"text":"appropriately such that","element":"span"}],[{"style":{"width":"78%"},"width":1363,"height":166,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/34-14.png","element":"img"}],[{"text":"then the constructed network ","element":"span"},{"style":{"height":17.6},"width":102.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/34-15.png","element":"img","alt":" N2(x","inline":true},{"text":") will satisfy","element":"span"}],[{"style":{"width":"69%"},"width":1194,"height":133,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/34-16.png","element":"img"}],[{"href":"#id-88","style":{"height":17.6},"width":581.61,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/34-17.png","element":"img","alt":"Proof (Theorem 16, D ≥ 2","inline":true},{"text":") The network ","element":"span"},{"style":{"fontStyle":"italic"},"text":"h ","element":"span"},{"text":"is piecewise linear and splits 
the space into polytopes. It is feasible to employ a number of simplices to fill the polytopes defined by ","element":"span"},{"style":{"fontStyle":"italic"},"text":"h ","element":"span"},{"href":"#id-75","text":"(Ehrenborg, ","element":"a"},{"href":"#id-75","text":"2007)","element":"a"},{"text":". Given that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"M ","element":"span"},{"text":"is the minimum number of required simplices, we have","element":"span"}],[{"style":{"width":"60%"},"width":1049,"height":130,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/34-18.png","element":"img"}],[{"text":"where","element":"span"}],[{"style":{"width":"75%"},"width":1306,"height":123,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/35-0.png","element":"img"}],[{"text":"and ","element":"span"},{"style":{"height":16.33},"width":251.66,"height":40.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/35-1.png","element":"img","alt":" S(m) is the m","inline":true},{"text":"-th simplex. The core of the wide construction is to horizontally aggregate network modules representing ","element":"span"},{"style":{"height":20.33},"width":128.33,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/35-2.png","element":"img","alt":" f(m)(x","inline":true},{"text":") into a wide network. 
In contrast, the core of the deep construction is to sequentially express ","element":"span"},{"style":{"height":20.33},"width":128.33,"height":50.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/35-3.png","element":"img","alt":" f(m)(x","inline":true},{"text":") in a network module without linking these modules to the input.","element":"span"}],[{"text":"Representing ","element":"span"},{"style":{"fontStyle":"italic"},"text":"h ","element":"span"},{"text":"with a wide ReLU network: ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Lemma ","element":"span"},{"href":"#id-91","style":{"fontWeight":"bold"},"text":"17 ","element":"a"},{"text":"suggests that a wide network module ","element":"span"},{"style":{"height":14.62},"width":56.27,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/35-4.png","element":"img","alt":" N1","inline":true,"padRight":true},{"text":"can generically represent a function over a template simplex. ","element":"span"},{"text":"To represent ","element":"span"},{"style":{"height":19.93},"width":284.55,"height":49.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/35-5.png","element":"img","alt":"f(m) over S(m)","inline":true},{"text":", we need to use Eq. ","element":"span"},{"href":"#id-94","text":"(68) ","element":"a"},{"text":"to transform the function from the barycentric coordinate system to the Euclidean coordinate system. 
","element":"span"},{"text":"Let ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D ","element":"span"},{"text":"+ 1 vertices of ","element":"span"},{"style":{"height":16.33},"width":154.26,"height":40.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/35-6.png","element":"img","alt":" S(m) be","inline":true}],[{"style":{"width":"99%"},"width":1721,"height":169,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/35-7.png","element":"img"}],[{"text":"satisfying","element":"span"}],[{"style":{"width":"72%"},"width":1245,"height":80,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/35-8.png","element":"img"}],[{"text":"By aggregating the network ","element":"span"},{"style":{"height":23.8},"width":141.54,"height":59.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/35-9.png","element":"img","alt":" N(m)1 (x","inline":true},{"text":") horizontally, we have the following wide network:","element":"span"}],[{"style":{"width":"62%"},"width":1072,"height":129,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/35-10.png","element":"img"}],[{"text":"Therefore, the constructed wide network ","element":"span"},{"style":{"height":17.6},"width":102.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/35-11.png","element":"img","alt":" H1(x","inline":true},{"text":") is of width ","element":"span"},{"style":{"height":19.54},"width":558.85,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/35-12.png","element":"img","alt":" O(D(D + 1)(2D − 1)M) and","inline":true,"padRight":true},{"text":"depth ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D","element":"span"},{"text":". 
It is clear that the width ","element":"span"},{"style":{"height":19.53},"width":453.12,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/35-13.png","element":"img","alt":" O(D(D + 1)(2D − 1)M","inline":true},{"text":") of the wide network ","element":"span"},{"style":{"height":17.6},"width":118.66,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/35-14.png","element":"img","alt":" H1(x)","inline":true,"padRight":true},{"text":"dominates, as the number of needed simplices goes larger and larger.","element":"span"}],[{"text":"Representing ","element":"span"},{"style":{"fontStyle":"italic"},"text":"h ","element":"span"},{"text":"with a deep ReLU network: For a deep construction, the fundamental difficulty is how to sequentially express each ","element":"span"},{"style":{"height":19.94},"width":175.34,"height":49.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/35-15.png","element":"img","alt":" f(m), i.e.","inline":true},{"text":", the input of each block can only come from the earlier block instead of the input, like what we did in one-dimensional case (Figure ","element":"span"},{"href":"#id-76","text":"9(","element":"a"},{"text":"a)). Let us use mathematical deduction to derive how to sequentially represent each ","element":"span"},{"style":{"height":19.93},"width":82.2,"height":49.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/35-16.png","element":"img","alt":" f(m)","inline":true},{"text":". We still adopt the idea of modularized networks, but now each network module has two outputs. 
","element":"span"},{"style":{"fontWeight":"bold"},"text":"Lemma ","element":"span"},{"href":"#id-91","style":{"fontWeight":"bold"},"text":"17 ","element":"a"},{"text":"suggests that a deep network module ","element":"span"},{"style":{"height":14.62},"width":56.27,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/35-17.png","element":"img","alt":" N2","inline":true,"padRight":true},{"text":"can generically represent a function over a template simplex.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Step 1. ","element":"span"},{"text":"Assume that the two outputs of the first block are ","element":"span"},{"style":{"height":23.8},"width":255.79,"height":59.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/35-18.png","element":"img","alt":" N(1)2 and Ω(1)","inline":true},{"text":". To represent ","element":"span"},{"style":{"height":19.93},"width":249.79,"height":49.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/35-19.png","element":"img","alt":"f(1) over S(1)","inline":true},{"text":", similarly, we need to transform the function from the barycentric coordinate system to the Euclidean coordinate system. 
Let ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D","element":"span"},{"text":"+1 vertices of ","element":"span"},{"style":{"height":24.29},"width":524.26,"height":60.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/35-20.png","element":"img","alt":" S(1) be {v(1)0 , v(1)1 , · · · , v(1)D }","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":24.29},"width":1068.5,"height":60.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/35-21.png","element":"img","alt":" V (1) = (v(1)1 − v(1)0 , v(1)2 − v(1)0 , · · · , v(1)D − v(1)0 ), we have","inline":true}],[{"style":{"width":"66%"},"width":1155,"height":61,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/35-22.png","element":"img"}],[{"text":"which is one output of the first block. Next, we derive the other output Ω","element":"span"},{"style":{"height":20.33},"width":89.24,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/35-23.png","element":"img","alt":"(1)(x","inline":true},{"text":"). Note that we are not allowed to use the input directly. 
Encouraged by the inversion idea in the univariate case, we invert the function","element":"span"}],[{"text":"domain into the input domain to get a function that is approximately only supported over ","element":"span"},{"style":{"height":16.33},"width":86.55,"height":40.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-0.png","element":"img","alt":"S(1):","inline":true}],[{"style":{"width":"65%"},"width":1135,"height":94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-1.png","element":"img"}],[{"text":"To do so, recall that we have ","element":"span"},{"style":{"height":18.33},"width":332.47,"height":45.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-2.png","element":"img","alt":" E1, E2, · · · , ED+1 ","inline":true,"padRight":true},{"text":"in constructing ","element":"span"},{"style":{"height":14.62},"width":56.27,"height":36.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-3.png","element":"img","alt":" N2","inline":true},{"text":", we will have a coefficient vector (","element":"span"},{"style":{"height":18.77},"width":294.84,"height":46.93,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-4.png","element":"img","alt":"ξ∗1, ξ∗2, · · · , ξ∗D+1","inline":true},{"text":") such that","element":"span"}],[{"style":{"width":"102%"},"width":1763,"height":108,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-5.png","element":"img"}],[{"text":"As shown in Figure ","element":"span"},{"href":"#id-76","text":"9(","element":"a"},{"text":"b), use the residual connection, we compute Ω","element":"span"},{"style":{"height":20.86},"width":347.76,"height":52.14,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-6.png","element":"img","alt":"(1)(x) = x − ˜xS(1),","inline":true,"padRight":true},{"text":"which is zero over 
","element":"span"},{"style":{"height":15.13},"width":177.34,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-7.png","element":"img","alt":" S1 and x","inline":true,"padRight":true},{"text":"for other regions. Ω","element":"span"},{"style":{"height":20.33},"width":89.25,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-8.png","element":"img","alt":"(1)(x","inline":true},{"text":") will be used to feed the next block to construct ","element":"span"},{"style":{"height":19.93},"width":83.34,"height":49.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-9.png","element":"img","alt":" f(2).","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Step 2","element":"span"},{"text":". ","element":"span"},{"text":"Suppose that the two output functions of the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m","element":"span"},{"text":"-th block are ","element":"span"},{"style":{"height":17.88},"width":222.24,"height":44.69,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-10.png","element":"img","alt":" Nm2 (x) and","inline":true,"padRight":true},{"text":"Ω","element":"span"},{"style":{"height":21.1},"width":609.44,"height":52.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-11.png","element":"img","alt":"(m)(x) = x − �mi=1 ˜xS(i). Ωm(x","inline":true},{"text":") is fed into the (","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"+ 1)-th block as the input. 
Because ","element":"span"},{"style":{"height":16.33},"width":128.67,"height":40.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-12.png","element":"img","alt":"S(m+1) ","inline":true,"padRight":true},{"text":"is outside the domain of ","element":"span"},{"style":{"height":16.33},"width":311.52,"height":40.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-13.png","element":"img","alt":" S(1) ∪ · · · ∪ S(m)","inline":true},{"text":", with the same technique in ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Step 1","element":"span"},{"text":", we construct","element":"span"}],[{"style":{"width":"78%"},"width":1365,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-14.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":24.29},"width":553.75,"height":60.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-15.png","element":"img","alt":" {v(m+1)0 , v(m+1)1 , · · · , v(m+1)D }","inline":true,"padRight":true},{"text":"are three vertices of ","element":"span"},{"style":{"height":23.8},"width":634.34,"height":59.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-16.png","element":"img","alt":" S(m+1), and V (m+1) = (v(m+1)1 −","inline":true},{"style":{"height":24.29},"width":860.87,"height":60.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-17.png","element":"img","alt":"v(m+1)0 , v(m+1)2 − v(m+1)0 , · · · , v(m+1)D − v(m+1)0","inline":true,"padRight":true},{"text":") to express ","element":"span"},{"style":{"height":20.33},"width":632.16,"height":50.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-18.png","element":"img","alt":" f(m+1) over S(m+1) well. Ω(m)(x)","inline":true,"padRight":true},{"text":"is functionally equivalent to ","element":"span"},{"style":{"fontWeight":"bold"},"text":"x ","element":"span"},{"text":"for two reasons. 
First, all simplices do not overlap. Ω","element":"span"},{"style":{"height":20.33},"width":162.04,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-19.png","element":"img","alt":"(m+1)(x)","inline":true,"padRight":true},{"text":"is ","element":"span"},{"style":{"fontWeight":"bold"},"text":"x ","element":"span"},{"text":"outside the domain of ","element":"span"},{"style":{"height":16.33},"width":292.7,"height":40.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-20.png","element":"img","alt":" S(1) ∪· · ·∪S(m)","inline":true},{"text":"; therefore zero value over ","element":"span"},{"style":{"height":16.33},"width":429.71,"height":40.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-21.png","element":"img","alt":" S(1) ∪· · ·∪S(m) has no","inline":true,"padRight":true},{"text":"effect. Second, w.l.o.g., we assume that ","element":"span"},{"style":{"fontWeight":"bold"},"text":"0 ","element":"span"},{"text":"is outside of all simplices or lies on the boundary of a certain simplex. As such, we have ","element":"span"},{"style":{"height":23.8},"width":362.52,"height":59.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-22.png","element":"img","alt":" N(m+1)2 (0) = 0, ∀m","inline":true},{"text":", therefore, ","element":"span"},{"style":{"height":23.8},"width":406.25,"height":59.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-23.png","element":"img","alt":" N(m+1)2 (Ω(m)(x)) will","inline":true,"padRight":true},{"text":"not erroneously produce a non-zero constant over ","element":"span"},{"style":{"height":16.33},"width":312.37,"height":40.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-24.png","element":"img","alt":" S(1) ∪ · · · ∪ S(m)","inline":true},{"text":". 
Furthermore, we also obtain a function ˜","element":"span"},{"style":{"height":12.92},"width":135.5,"height":32.31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-25.png","element":"img","alt":"xS(m+1)","inline":true,"padRight":true},{"text":"inside the (","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"+ 1)-th block. Apply the residual operation, we have","element":"span"}],[{"style":{"width":"76%"},"width":1314,"height":118,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-26.png","element":"img"}],[{"text":"Lastly, simlilar to the one-dimensional deep network, we use the shortcut connection to aggregate ","element":"span"},{"style":{"height":23.8},"width":291.61,"height":59.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-27.png","element":"img","alt":" N(m)2 (Ω(m−1)(x","inline":true},{"text":")) to obtain the following deep network:","element":"span"}],[{"style":{"width":"66%"},"width":1156,"height":129,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-28.png","element":"img"}],[{"text":"Therefore, the constructed deep network ","element":"span"},{"style":{"height":17.6},"width":102.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-29.png","element":"img","alt":" H2(x","inline":true},{"text":") is of depth ","element":"span"},{"style":{"fontStyle":"italic"},"text":"O","element":"span"},{"text":"((","element":"span"},{"style":{"fontStyle":"italic"},"text":"D ","element":"span"},{"text":"+ 1)","element":"span"},{"style":{"fontStyle":"italic"},"text":"M","element":"span"},{"text":") and width (","element":"span"},{"style":{"fontStyle":"italic"},"text":"D ","element":"span"},{"text":"+ 1)","element":"span"},{"style":{"height":14.73},"width":54.34,"height":36.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-30.png","element":"img","alt":"D2","inline":true},{"text":". 
Please note that in the above equation, the summation is made by shortcuts. It is clear that the depth of ","element":"span"},{"style":{"height":17.6},"width":102.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-31.png","element":"img","alt":" H2(x","inline":true},{"text":") dominates over the width.","element":"span"}],[{"style":{"width":"1%"},"width":29,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-32.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Experiment: ","element":"span"},{"text":"A 2-6-2-1 network was trained on the moon dataset. When the training process was finished, the accuracy of the network was 0.94. Then, we transformed the original network into a wide network (","element":"span"},{"style":{"height":17.6},"width":1087.27,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/36-33.png","element":"img","alt":"µ = 80) and a deep network (µ = 5) using the procedures","inline":true,"padRight":true},{"text":"described in the proof of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Theorem ","element":"span"},{"href":"#id-88","style":{"fontWeight":"bold"},"text":"16","element":"a"},{"text":". 
The patterns of outputs of the constructed wide","element":"span"}],[{"style":{"width":"100%"},"width":1737,"height":521,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/37-0.png","element":"img"}],[{"text":"Figure 11: With the procedure in the proof for ","element":"figcaption","subtype":"caption"},{"id":"id-95","style":{"fontWeight":"bold"},"text":"Theorem ","element":"figcaption","subtype":"caption"},{"href":"#id-88","style":{"fontWeight":"bold"},"text":"16","element":"a","subtype":"caption"},{"text":", an exemplary 2-6-2-1 network is transformed into a wide network (","element":"figcaption","subtype":"caption"},{"style":{"height":17.6},"width":772.22,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/37-1.png","element":"img","alt":"µ = 80) and a deep network (µ = 5)","inline":true,"padRight":true},{"text":"respectively.","element":"figcaption","subtype":"caption"}],[{"text":"network and deep network are respectively shown in ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Figure ","element":"span"},{"href":"#id-95","style":{"fontWeight":"bold"},"text":"11","element":"a"},{"text":". It is seen that the constructed networks can approximate the original network well, and some expected artifacts can be squeezed by further increasing ","element":"span"},{"style":{"height":12},"width":26,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/37-2.png","element":"img","alt":" µ","inline":true,"padRight":true},{"text":"or using a post-processing network that can identify streaks and remove them.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"III. Width-Depth equivalence by the De Morgan Law","element":"span"}],[{"text":"We have the following formal statement for the quasi-equivalence in light of the De Morgan law. 
The proof of this theorem is constructive.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Proposition 5 (Proposition C.1 in ","element":"span"},{"href":"#id-13","style":{"fontWeight":"bold"},"text":"(Lin and Jegelka, ","element":"a"},{"href":"#id-13","style":{"fontWeight":"bold"},"text":"2018)","element":"a"},{"style":{"fontWeight":"bold"},"text":") ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Given a piecewise constant function ","element":"span"},{"style":{"height":15.53},"width":220.56,"height":38.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/38-0.png","element":"img","alt":" h : Rd → R","inline":true},{"style":{"fontStyle":"italic"},"text":", for any small enough ","element":"span"},{"style":{"height":12.4},"width":102.01,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/38-1.png","element":"img","alt":" ϵ > 0","inline":true},{"style":{"fontStyle":"italic"},"text":", there exists a ResNet ","element":"span"},{"style":{"height":17.6},"width":220.73,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/38-2.png","element":"img","alt":" H2(x) such","inline":true,"padRight":true},{"id":"id-99","style":{"fontStyle":"italic"},"text":"that","element":"span"}],[{"style":{"width":"65%"},"width":1138,"height":66,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/38-3.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"The sketch of proof. ","element":"span"},{"text":"The detailed proof can be referred to ","element":"span"},{"href":"#id-13","text":"Lin and Jegelka ","element":"a"},{"href":"#id-13","text":"(2018)","element":"a"},{"text":". We only show the one-dimensional case here for a brief introduction of the idea. The operations that are realizable by a residual network block are","element":"span"}],[{"text":"1. 
shifting: ","element":"span"},{"style":{"height":18.33},"width":548.19,"height":45.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/38-4.png","element":"img","alt":" G+ = G + c, for any c ∈ R","inline":true},{"text":". This operation is to shift a function with a constant.","element":"span"}],[{"text":"2. min or max: ","element":"span"},{"style":{"height":18.73},"width":1360.2,"height":46.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/38-5.png","element":"img","alt":" G+ = min{G, c} = G − σ(G − c) or G+ = max{G, c} = G + σ(c − G),","inline":true,"padRight":true},{"text":"for any ","element":"span"},{"style":{"height":12.8},"width":104.22,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/38-6.png","element":"img","alt":" c ∈ R","inline":true},{"text":". This operation allows us to threshold a function.","element":"span"}],[{"text":"3. min or max with a linear transformation: ","element":"span"},{"style":{"height":18.73},"width":837.88,"height":46.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/38-7.png","element":"img","alt":" G+ = min{G, αG+β} = G−σ(G−(αG+β))","inline":true,"padRight":true},{"text":"or ","element":"span"},{"style":{"height":18.73},"width":1250.23,"height":46.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/38-8.png","element":"img","alt":" G+ = max{G, αG + β} = G + σ((αG + β) − G) for any α, β ∈ R","inline":true},{"text":". 
This operation can adjust the slope of trapezoid functions.","element":"span"}],[{"style":{"width":"99%"},"width":1712,"height":511,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/38-9.png","element":"img"}],[{"text":"Figure 12: To approximate any univariate function, (a) we construct an increasing trape- ","element":"figcaption","subtype":"caption"},{"id":"id-96","text":"zoidal function; (b) we adjust the height of each trapezoid function to its corre- ","element":"figcaption","subtype":"caption"},{"text":"sponding height.","element":"figcaption","subtype":"caption"}],[{"text":"As Figure ","element":"span"},{"href":"#id-96","text":"12 ","element":"a"},{"text":"shows, we first construct an increasing trapezoidal function with a very high plateau over each trapezoid. Then, we adjust the height of each trapezoid function to its corresponding height. The procedures of constructing an increasing function are shown in Figure ","element":"span"},{"href":"#id-97","text":"13, ","element":"a"},{"text":"which are based on mathematical induction. Suppose ","element":"span"},{"style":{"height":15.02},"width":226.76,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/38-10.png","element":"img","alt":" Gm satisfies","inline":true}],[{"text":"1. ","element":"span"},{"style":{"height":17.6},"width":540.36,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/39-0.png","element":"img","alt":" Gm = 0, when x ∈ [−∞, a1].","inline":true}],[{"text":"2. 
","element":"span"},{"style":{"height":15.02},"width":64.31,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/39-1.png","element":"img","alt":" Gm","inline":true,"padRight":true},{"text":"is a trapezoid function over each [","element":"span"},{"style":{"height":17.6},"width":159.98,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/39-2.png","element":"img","alt":"ai, ai+1].","inline":true}],[{"text":"3. ","element":"span"},{"style":{"height":17.6},"width":260.48,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/39-3.png","element":"img","alt":" Gm = k∥G∥∞","inline":true,"padRight":true},{"text":"over an interval [","element":"span"},{"style":{"height":17.6},"width":771.84,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/39-4.png","element":"img","alt":"ak + δ, ak+1 − δ] for any k = 1, 2, · · · , m.","inline":true}],[{"text":"4. 0 ","element":"span"},{"style":{"height":17.6},"width":662.16,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/39-5.png","element":"img","alt":" ≤ Gm ≤ m∥G∥∞ over [−∞, am+1].","inline":true}],[{"text":"5. 
","element":"span"},{"style":{"height":24.22},"width":805.5,"height":60.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/39-6.png","element":"img","alt":" Gm = − m∥G∥∞δ (x − am+1) over [am+1, ∞].","inline":true}],[{"id":"id-97","style":{"width":"49%"},"width":849,"height":629,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/39-7.png","element":"img"}],[{"text":"Figure 13: An illustrative plot to show how to derive ","element":"figcaption","subtype":"caption"},{"style":{"height":16.22},"width":305.51,"height":40.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/39-8.png","element":"img","alt":" Gm+1 from Gm.","inline":true}],[{"style":{"width":"95%"},"width":1655,"height":269,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/39-9.png","element":"img"}],[{"text":"The procedure of adjusting the height of each trapezoid function is shown in Figure ","element":"span"},{"href":"#id-96","text":"12(","element":"a"},{"text":"b). An important consideration is that we need to keep the function on previous subdivisions unchanged while twisting the current trapezoid function. This is realized by the fact that ","element":"span"},{"style":{"height":17.6},"width":116.79,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/39-10.png","element":"img","alt":" ||G||∞","inline":true,"padRight":true},{"text":"is large. 
Suppose that ","element":"span"},{"style":{"height":16.4},"width":44,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/39-11.png","element":"img","alt":" φk","inline":true,"padRight":true},{"text":"is the target value over the interval [","element":"span"},{"style":{"height":17.6},"width":237.97,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/39-12.png","element":"img","alt":"ak, ak+1], we","inline":true,"padRight":true},{"text":"adjust the values of intervals sequentially by","element":"span"}],[{"style":{"width":"99%"},"width":1725,"height":227,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/39-13.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Theorem 18 (Quasi-equivalence in light of the De Morgan law) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Given a disjoint rule system ","element":"span"},{"style":{"height":18.09},"width":144.76,"height":45.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/39-14.png","element":"img","alt":" {Ai}ni=1","inline":true},{"style":{"fontStyle":"italic"},"text":", where each rule ","element":"span"},{"style":{"height":15.42},"width":44.73,"height":38.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/39-15.png","element":"img","alt":" Ai","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is characterized by an indicator function ","element":"span"},{"style":{"height":17.6},"width":94.8,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/39-16.png","element":"img","alt":" gi(x)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"over a hypercube ","element":"span"},{"text":"Γ","element":"span"},{"style":{"height":19.53},"width":730.9,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/39-17.png","element":"img","alt":"i = [a1i, b1i] × · · · × [aDi, bDi] ∈ [0, 
1]D:","inline":true}],[{"style":{"width":"28%"},"width":490,"height":106,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/39-18.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"fulfilling the De Morgan law","element":"span"}],[{"style":{"width":"77%"},"width":1343,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/40-0.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"we can construct a wide ReLU network ","element":"span"},{"style":{"height":17.6},"width":118.66,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/40-1.png","element":"img","alt":" H1(x)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"to represent the right hand side of the De Morgan law and a deep ReLU network ","element":"span"},{"style":{"height":17.6},"width":118.66,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/40-2.png","element":"img","alt":" H2(x)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"to represent the left hand side of the De Morgan law, such that for any ","element":"span"},{"style":{"height":14.8},"width":110.71,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/40-3.png","element":"img","alt":" ϵ > 0,","inline":true}],[{"style":{"width":"66%"},"width":1155,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/40-4.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where ","element":"span"},{"text":"m ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is a measurement in ","element":"span"},{"text":"[0","element":"span"},{"style":{"height":19.53},"width":97.06,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/40-5.png","element":"img","alt":", 
1]D.","inline":true}],[{"id":"id-98","style":{"width":"100%"},"width":1729,"height":987,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/40-6.png","element":"img"}],[{"text":"Figure 14: A wide network to realize the negation of the logic intersection of the negation of propositional rules in a high-dimensional space. The negation of each propositional rule is associated with a trap-like indicator function.","element":"figcaption","subtype":"caption"}],[{"style":{"fontWeight":"bold"},"text":"Proof ","element":"span"},{"text":"The defined rule system is a piecewise constant function over a hypercube. The wide and deep networks are constructed as follows:","element":"span"}],[{"text":"Wide network ","element":"span"},{"style":{"height":17.6},"width":102.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/40-7.png","element":"img","alt":" H1(x","inline":true},{"text":"): As illustrated in Figure ","element":"span"},{"href":"#id-98","text":"14, ","element":"a"},{"text":"to represent the rule ","element":"span"},{"style":{"height":15.42},"width":44.73,"height":38.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/40-8.png","element":"img","alt":" Ai","inline":true,"padRight":true},{"text":"that is equivalent to ","element":"span"},{"style":{"height":17.6},"width":78.31,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/40-9.png","element":"img","alt":" gi(x","inline":true},{"text":"), a trap-like function is constructed to represent ","element":"span"},{"style":{"height":15.42},"width":73.82,"height":38.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/40-10.png","element":"img","alt":" ¬Ai","inline":true,"padRight":true},{"text":"that is equivalent to 1 ","element":"span"},{"style":{"height":17.6},"width":121.34,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/40-11.png","element":"img","alt":" − 
gi(x","inline":true},{"text":"). In our construction, for each hypercube the measure of ","element":"span"},{"style":{"height":17.6},"width":448.12,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/40-12.png","element":"img","alt":" {x ∈ Γi|H1(x) ≠ gi(x)}","inline":true,"padRight":true},{"text":"is no more than ","element":"span"},{"style":{"height":17.6},"width":582.12,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/40-13.png","element":"img","alt":" vol(Γi)(1 − 2δ)n, where vol(Γi","inline":true},{"text":") is the volume of a hypercube Γ","element":"span"},{"style":{"height":15.02},"width":178.64,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/40-14.png","element":"img","alt":"i. There-","inline":true,"padRight":true},{"text":"fore, the total measure of errors for all the hypercubes is less than ","element":"span"},{"style":{"height":18.8},"width":432.75,"height":47.01,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/40-15.png","element":"img","alt":"∑ni vol(Γi)(1 − 2δ)n =","inline":true}],[{"style":{"width":"99%"},"width":1720,"height":227,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/41-0.png","element":"img"}],[{"text":"Deep network ","element":"span"},{"style":{"height":17.6},"width":436.49,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/41-1.png","element":"img","alt":" H2(x): A1∨A2 · · ·∨An","inline":true,"padRight":true},{"text":"is a piecewise constant function over a hypercube, based on ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Proposition ","element":"span"},{"href":"#id-99","style":{"fontWeight":"bold"},"text":"5","element":"a"},{"text":", we have","element":"span"}],[{"id":"id-101","style":{"width":"69%"},"width":1199,"height":122,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/41-2.png","element":"img"}],[{"text":"Combining Eqs. 
","element":"span"},{"href":"#id-100","text":"(108) ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-101","text":"(109) ","element":"a"},{"text":"leads to ","element":"span"},{"style":{"height":31.6},"width":597.92,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/41-3.png","element":"img","alt":" m�{x |H1(x)} ̸= H2(x)}�< ϵ","inline":true},{"text":", which concludes our proof.","element":"span"}],[{"id":"id-100","style":{"width":"1%"},"width":29,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/41-4.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"IV. Quasi-equivalence of Quadratic Networks.","element":"span"}],[{"text":"In this section, we first prove the correctness of the continued fraction representation of a polynomial. Then, we report the experimental results on the quasi-equivalency of quadratic networks.","element":"span"}],[{"style":{"width":"50%"},"width":868,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/42-0.png","element":"img"}],[{"text":"For a general univariate polynomial, there is a continued fraction representation as follows:","element":"span"}],[{"style":{"width":"99%"},"width":1712,"height":505,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/42-1.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":17.6},"width":1601.72,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/42-2.png","element":"img","alt":" ai ̸= 0, b0 = a0, bk = a2k/a2k−2, k ≥ 1 and c0 = a1, ck = a2k+1/a2k−1, k ≥ 1. 
In the","inline":true,"padRight":true},{"text":"right side of ","element":"span"},{"href":"#id-61","text":"(111)","element":"a"},{"text":", the first part contains the terms with even powers of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":", while the second part is for the odd powers of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":". Because both even and odd parts are in the essentially ","element":"span"},{"id":"id-61","text":"same format, we only show the correctness of the even part:","element":"span"}],[{"style":{"width":"93%"},"width":1611,"height":314,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/42-3.png","element":"img"}],[{"text":"Specially, we can derive the continued fraction as follows:","element":"span"}],[{"style":{"width":"94%"},"width":1638,"height":1369,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/43-0.png","element":"img"}],[{"text":"The following example helps illustrate the correctness of ","element":"span"},{"href":"#id-61","text":"(111)","element":"a"},{"text":":","element":"span"}],[{"style":{"width":"103%"},"width":1796,"height":281,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/43-1.png","element":"img"}],[{"text":"If ","element":"span"},{"style":{"height":15.02},"width":389.15,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/43-2.png","element":"img","alt":" ai = 0 for certain i","inline":true},{"text":", we can derive an approximate continued fraction representation by complementing ","element":"span"},{"style":{"height":22},"width":183.67,"height":55.01,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/43-3.png","element":"img","alt":"�2Ni=0 aixi","inline":true,"padRight":true},{"text":"with a term 
","element":"span"},{"style":{"height":17.93},"width":265.64,"height":44.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/43-4.png","element":"img","alt":" δixi, where δi","inline":true,"padRight":true},{"text":"is a small constant. For a ","element":"span"},{"text":"given approximation requirement, we can always choose a sufficient small, nonzero ","element":"span"},{"style":{"height":15.02},"width":90.54,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/43-5.png","element":"img","alt":" δi to","inline":true,"padRight":true},{"text":"accommodate the accuracy requirement since ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x ","element":"span"},{"text":"is bounded.","element":"span"}],[{"style":{"width":"19%"},"width":333,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/43-6.png","element":"img"}],[{"text":"This subsection reports the experimental results on quasi-equivalence of quadratic networks in two steps. In the first step, we show the feasibility of the reciprocal activation function in the network according to the continued fraction representation. In the second step, we compare the accuracy and robustness between wide and deep networks.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Experimental Design: ","element":"span"},{"text":"We first preprocessed the MNIST dataset using image deskewing and dimension deduction techniques. ","element":"span"},{"text":"Image deskewing (","element":"span"},{"href":"https://fsix.github.io/mnist/Deskewing.html","style":{"fontFamily":"monospace"},"text":"https://fsix.github.io/ ","element":"a"},{"href":"https://fsix.github.io/mnist/Deskewing.html","style":{"fontFamily":"monospace"},"text":"mnist/Deskewing.html","element":"a"},{"text":") straightens the digits that are written in a crooked manner. 
Mathematically, skewing is modeled as an affine transformation: ","element":"span"},{"style":{"height":17.6},"width":548.82,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/44-0.png","element":"img","alt":" Image′ = A(Image) + b, in","inline":true,"padRight":true},{"text":"which the center of mass of the image is computed to estimate how much offset is needed, and the covariance matrix is estimated to approximate by how much an image is skewed. Furthermore, the center and covariance matrix are employed for the inverse affine transformation, which is referred to as deskewing. Then, we used t-SNE ","element":"span"},{"href":"#id-102","text":"(Van der Maaten and Hin- ","element":"a"},{"href":"#id-102","text":"ton, ","element":"a"},{"href":"#id-102","text":"2008) ","element":"a"},{"text":"to reduce the dimension of the MNIST from 28","element":"span"},{"style":{"height":8.8},"width":34,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/44-1.png","element":"img","alt":"×","inline":true},{"text":"28 to 2, as the two-dimensional embedding space. Figure ","element":"span"},{"href":"#id-103","text":"15 ","element":"a"},{"text":"presents the effect of deskewing on t-SNE.","element":"span"}],[{"style":{"width":"99%"},"width":1728,"height":807,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/44-2.png","element":"img"}],[{"text":"Figure 15: The effect of the deskewing operation on the t-SNE result of the MNIST data. ","element":"figcaption","subtype":"caption"},{"id":"id-103","text":"Deskewing significantly improves the embedding quality.","element":"figcaption","subtype":"caption"}],[{"text":"The embedding data can be naturally divided into the training and testing sets based on the partition in the original space. 
To illustrate the aforementioned quasi-equivalence between quadratic networks, we built three representative quadratic deep learning models, using two wide sub-networks (by the factorization representation), four deep sub-networks with the reciprocal activation (by the continued fraction), and four deep sub-networks using the ReLU activation (representing the first-order approximation to the continued fraction representation). Each wide sub-network takes one element of the input, and two paired deep sub-networks take the same element of the input. Each wide sub-network has four layers with 8","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"4","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"2","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"1 neurons sequentially, while each deep sub-network also has four layers with one neuron in each layer. These sub-networks are followed by a fully connected ReLU layer of the same structure. The fully connected network has three layers with 300, 200, and 10 neurons respectively.","element":"span"}],[{"text":"We performed the batch training with 1","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"000 instances per batch in each iteration. All the parameters were initialized with a truncated Gaussian distribution with mean 0 and variance 0","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"},{"text":"1. The learning rate was set to 0","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"},{"text":"002, and the whole network was optimized with Adam ","element":"span"},{"href":"#id-104","text":"(Kingma and Ba, ","element":"a"},{"href":"#id-104","text":"2015)","element":"a"},{"text":". 
The number of epochs was 200.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Feasibility of the Reciprocal Activation: ","element":"span"},{"text":"At first glance, the training of a deep model using the reciprocal activation function is subject to instability. For example, when the value of the input is large, ","element":"span"},{"style":{"height":27.08},"width":119.54,"height":67.69,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/45-0.png","element":"img","alt":"bN x^2 / (1 + bN x^2)","inline":true,"padRight":true},{"text":"gets closer to 1, which will lead to zero in the ","element":"span"},{"text":"denominator in the next fraction, and undermine the training convergence. However, we argue that when data are appropriately normalized, the training process can be made stable. We did the following experiment to study how the normalization can help. Specifically, the input was normalized by the formula: ","element":"span"},{"style":{"height":31.6},"width":650.43,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/45-1.png","element":"img","alt":" xnew = (xmax/ζ) · (x − xmin)/(xmax − xmin), where ζ","inline":true,"padRight":true},{"text":"is a scaling factor. We evaluated four scaling factors ","element":"span"},{"style":{"fontStyle":"italic"},"text":"{","element":"span"},{"text":"1","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"4","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"8","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"16","element":"span"},{"style":{"fontStyle":"italic"},"text":"} ","element":"span"},{"text":"by repeating 20 times the training process for each factor. The criterion is whether the training process converges or not. In this experiment, as long as ’nan’ does not appear, the training will converge because we used a sufficiently large number of epochs. 
Table ","element":"span"},{"href":"#id-105","text":"4 ","element":"a"},{"text":"shows the success rate of training for different ","element":"span"},{"style":{"height":16.4},"width":21,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/45-2.png","element":"img","alt":" ζ","inline":true,"padRight":true},{"text":"scales. It can be seen that without scaling the training process always fails, while the success rate reaches 80% when ","element":"span"},{"style":{"height":16.4},"width":1282.82,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/45-3.png","element":"img","alt":" ζ = 16. The point is that the training process can be stabilized with","inline":true,"padRight":true},{"text":"an appropriate scaling operation.","element":"span"}],[{"id":"id-105","text":"Table 4: The success rates of the training process with reciprocal activation after normal- ","element":"figcaption","subtype":"caption"},{"text":"ization.","element":"figcaption","subtype":"caption"}],[{"style":{"width":"45%"},"width":791,"height":114,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/45-4.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Accuracy and Robustness of the Three Models: ","element":"span"},{"text":"We repeated the training process 10 times and computed the accuracy of the three models on the test dataset. As shown in Table ","element":"span"},{"href":"#id-106","text":"5, ","element":"a"},{"text":"all three models achieved state-of-the-art results. 
The performance of the deep model using ReLU was only slightly lower than that of the other two models.","element":"span"}],[{"id":"id-106","text":"Table 5: Performance of the three models on the test dataset after 10 repetitions.","element":"figcaption","subtype":"caption"}],[{"style":{"width":"88%"},"width":1523,"height":171,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/45-5.png","element":"img"}],[{"text":"Furthermore, we used the following four popular adversarial attack methods to evaluate the robustness of the deep learning models: (1) fast gradient method (FGM); (2) fast sign gradient method (FSGM ","element":"span"},{"href":"#id-107","text":"(Goodfellow et al., ","element":"a"},{"href":"#id-107","text":"2014b)","element":"a"},{"text":"); (3) iterative fast sign gradient method (I-FSGM ","element":"span"},{"href":"#id-108","text":"(Kurakin et al., ","element":"a"},{"href":"#id-108","text":"2016)","element":"a"},{"text":"); and (4) DeepFool ","element":"span"},{"href":"#id-109","text":"(Moosavi-Dezfooli et al., ","element":"a"},{"href":"#id-109","text":"2016)","element":"a"},{"text":". 
Let ","element":"span"},{"style":{"height":12.8},"width":21,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/45-6.png","element":"img","alt":" θ","inline":true,"padRight":true},{"text":"denote the model parameters, ","element":"span"},{"style":{"fontWeight":"bold"},"text":"x ","element":"span"},{"text":"be the input, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"y ","element":"span"},{"text":"be the target pertaining to ","element":"span"},{"style":{"height":17.6},"width":296.55,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/45-7.png","element":"img","alt":" x and L(θ, x, y)","inline":true,"padRight":true},{"text":"the loss, FGM generates an adversary as","element":"span"}],[{"style":{"width":"65%"},"width":1133,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/45-8.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/46-0.png","element":"img","alt":" ϵ","inline":true,"padRight":true},{"text":"is an amplitude factor. FGM is plausible to find the attack along the direction of the gradient. 
FSGM computes an adversary based on","element":"span"}],[{"style":{"width":"67%"},"width":1175,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/46-1.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/46-2.png","element":"img","alt":" ϵ","inline":true,"padRight":true},{"text":"is also a factor, and ","element":"span"},{"style":{"height":17.6},"width":113.03,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/46-3.png","element":"img","alt":" sign(·","inline":true},{"text":") outputs 1 for a positive argument and ","element":"span"},{"style":{"height":4.8},"width":34,"height":12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/46-4.png","element":"img","alt":" −","inline":true},{"text":"1 otherwise. I-FSGM iteratively derives an adversary with the FSGM formula:","element":"span"}],[{"style":{"width":"79%"},"width":1367,"height":138,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/46-5.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":18.22},"width":167.92,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/46-6.png","element":"img","alt":" Clipx,α(·","inline":true},{"text":") is a thresholding function such that the maximum value of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"x ","element":"span"},{"text":"is no more","element":"span"}],[{"text":"than a preset limit ","element":"span"},{"style":{"height":8.4},"width":28,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/46-7.png","element":"img","alt":" α","inline":true},{"text":". 
In the experiment, we set this limit ","element":"span"},{"style":{"height":8.4},"width":28,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/46-8.png","element":"img","alt":" α","inline":true,"padRight":true},{"text":"to the maximum value of the","element":"span"}],[{"style":{"width":"99%"},"width":1723,"height":239,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/46-9.png","element":"img"}],[{"text":"Table ","element":"span"},{"href":"#id-110","text":"6 ","element":"a"},{"text":"shows the performance of the three models on the adversarial samples generated based on the test dataset using the aforementioned attacking methods. We bold-faced the best scores under attack. It can be seen that the wide model shows the highest accuracy under most attacks. The only exceptions are the cases of FSGM(","element":"span"},{"style":{"height":17.6},"width":484.01,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/46-10.png","element":"img","alt":"ϵ = 3) and FSGM(ϵ = 5),","inline":true,"padRight":true},{"text":"where the wide model is the second best.","element":"span"}],[{"id":"id-110","text":"Table 6: Accuracy of the three models under attacks, where ","element":"figcaption","subtype":"caption"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/46-11.png","element":"img","alt":" ϵ","inline":true,"padRight":true},{"text":"is the amplitude factor and ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"N ","element":"figcaption","subtype":"caption"},{"text":"is the number of iterations.","element":"figcaption","subtype":"caption"}],[{"style":{"width":"104%"},"width":1811,"height":729,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/46-12.png","element":"img"}],[{"text":"Next, we probed the robustness of each model by examining the needed strength of the adversarial attack such that 
the network performance will drop by a pre-specified percentage. ","element":"span"},{"text":"Using the absolute performance drop as a reference is better than using the performance compromised by adversarial attacks, as it eliminates the bias induced by the original network performance. Suppose that the original performance (without attack) is ","element":"span"},{"style":{"fontStyle":"italic"},"text":"O","element":"span"},{"text":", with each attacking method, we reduced ","element":"span"},{"style":{"height":12.8},"width":309.01,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/47-0.png","element":"img","alt":" O to O′ = O −","inline":true,"padRight":true},{"text":"5% by gradually increasing the strength of attack. The reason to select a 5% drop is that if the drop is too high the comparison is not sensitive, while if the drop is too low the attack is too weak. Specifically, we gradually increased the strength by a fixed step for each attacking method until the performance drop is over 5%. For FGM, FSGM, I-FSGM and DeepFool, the steps are 50, 1, 1 and 1 respectively. Table ","element":"span"},{"href":"#id-111","text":"7 ","element":"a"},{"text":"shows the needed strength values. The higher the needed strength, the more robust the model is. We bold-faced the best scores. Overall, the wide model shows the highest robustness. The only exception is the case of I-FSGM, where the wide model is the second best. These data suggest that the wide model seems more robust than the deep model. 
Further studies are needed to understand the underlying mechanisms and derive practical guidelines.","element":"span"}],[{"id":"id-111","text":"Table 7: Needed strength values for each of the four attacking methods to reduce the ","element":"figcaption","subtype":"caption"},{"text":"unattacked network performance by 5%","element":"figcaption","subtype":"caption"}],[{"style":{"width":"94%"},"width":1629,"height":282,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/47-1.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"V. Bounds for the Partially Separable Representation","element":"span"}],[{"text":"The purpose of this section is to show that the partially separable function can be realized by a quadratic network whose structure is bounded. Let us first introduce the necessary preliminaries.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Partially Separable Representation: ","element":"span"},{"text":"Approximating a multivariate function ","element":"span"},{"style":{"height":17.6},"width":318.88,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/48-0.png","element":"img","alt":" f(x1, x2, · · · , xn)","inline":true,"padRight":true},{"text":"by a set of functions of fewer variables is a basic problem in approximation theory ","element":"span"},{"href":"#id-70","text":"(Light ","element":"a"},{"href":"#id-70","text":"and Cheney, ","element":"a"},{"href":"#id-70","text":"2006)","element":"a"},{"text":". 
Despite that some function ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"is directly separable in the form of","element":"span"}],[{"style":{"width":"73%"},"width":1273,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/48-1.png","element":"img"}],[{"text":"a more general formulation to express a multivariate function ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"is","element":"span"}],[{"style":{"width":"68%"},"width":1181,"height":131,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/48-2.png","element":"img"}],[{"text":"If ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L ","element":"span"},{"text":"is permitted to be sufficiently large, then such a model is rather universal. For bivariate functions, an inspiring theorem has been proved ","element":"span"},{"href":"#id-70","text":"(Light and Cheney, ","element":"a"},{"href":"#id-70","text":"2006)","element":"a"},{"text":": Let ","element":"span"},{"style":{"height":17.6},"width":168.28,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/48-3.png","element":"img","alt":" {un}n∈N","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":17.6},"width":164.46,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/48-4.png","element":"img","alt":" {vn}n∈N","inline":true,"padRight":true},{"text":"are orthornomal bases of ","element":"span"},{"style":{"height":19.13},"width":318.28,"height":47.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/48-5.png","element":"img","alt":" L2(X) and L2(Y","inline":true},{"text":") respectively, then ","element":"span"},{"style":{"height":20.18},"width":300.58,"height":50.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/48-6.png","element":"img","alt":" {umvn}(m,n)∈N 
2","inline":true,"padRight":true},{"text":"is an orthornomal basis of ","element":"span"},{"style":{"height":19.13},"width":223.78,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/48-7.png","element":"img","alt":" L2(X × Y).","inline":true}],[{"text":"To approximate a general multivariate function, we relax the restrictive equality to the approximation in the ","element":"span"},{"style":{"height":14.62},"width":46.7,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/48-8.png","element":"img","alt":" L1","inline":true,"padRight":true},{"text":"sense and assume that, for every continuous ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n","element":"span"},{"text":"-variable function ","element":"span"},{"style":{"height":17.6},"width":437.38,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/48-9.png","element":"img","alt":"f(x1, · · · , xn) on [0, 1]n","inline":true},{"text":", given any positive number ","element":"span"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/48-10.png","element":"img","alt":" ϵ","inline":true},{"text":", there exists a group of ","element":"span"},{"style":{"height":16.4},"width":48.49,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/48-11.png","element":"img","alt":" φli","inline":true},{"text":", satisfying:","element":"span"}],[{"style":{"width":"77%"},"width":1343,"height":131,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/48-12.png","element":"img"}],[{"text":"The key to demonstrate boundedness of the partially separable representation with a quadratic network is to use the Taylor’s expansion, and estimate the remainder, leading to a realization of the partially separable representation. 
Based on such a realization, we obtain an upper bound of the needed width and depth for the partially separable representation. Let ","element":"span"},{"style":{"height":23.86},"width":1306.89,"height":59.66,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/48-13.png","element":"img","alt":" ∂αf(x) = ∂α1∂x1∂α2∂x2 · · · ∂αn∂xn f(x), α! = �ni=1 αi!, and xα = xα11 · · · xαnn ","inline":true,"padRight":true},{"text":", for the function ","element":"span"},{"text":"class ","element":"span"},{"style":{"height":19.53},"width":1053.99,"height":48.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/48-14.png","element":"img","alt":" Fn,k = {f ∈ Ck+1([0, 1]n) | ∥∂αf∥∞ ≤ M, ∀|α| ≤ k}","inline":true},{"text":", the following theorem holds.","element":"span"}],[{"style":{"height":19.54},"width":881.73,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/48-15.png","element":"img","alt":"Theorem 19 For any f ∈ Fn,k, if x ∈ [0, 1]n","inline":true},{"style":{"fontStyle":"italic"},"text":", there exists a quadratic network ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Q","element":"span"},{"text":"(","element":"span"},{"style":{"fontWeight":"bold"},"text":"x","element":"span"},{"text":") ","element":"span"},{"style":{"fontStyle":"italic"},"text":"with the width no more than ","element":"span"},{"style":{"height":22.57},"width":129.48,"height":56.42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/48-16.png","element":"img","alt":" k�n+kn �","inline":true},{"style":{"fontStyle":"italic"},"text":"and the depth no more than ","element":"span"},{"text":"log","element":"span"},{"style":{"height":17.6},"width":110.46,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/48-17.png","element":"img","alt":"2 (kn)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"to represent ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"in the partially separable representation, satisfying that","element":"span"}],[{"style":{"width":"70%"},"width":1214,"height":107,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/48-18.png","element":"img"}],[{"style":{"height":19.53},"width":656.66,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/48-19.png","element":"img","alt":"Proof Let f ∈ Fn,k, if x ∈ [0, 1]n","inline":true},{"text":", then based on classical multivariate calculus ","element":"span"},{"href":"#id-112","text":"(Widder, ","element":"a"},{"href":"#id-112","text":"1989)","element":"a"},{"id":"id-113","text":",","element":"span"}],[{"style":{"width":"68%"},"width":1185,"height":123,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/48-20.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":18.44},"width":127.6,"height":46.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/49-0.png","element":"img","alt":" Rx,k(x","inline":true},{"text":") is the remainder term. For any ","element":"span"},{"style":{"height":17.6},"width":367.21,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/49-1.png","element":"img","alt":" x ∈ [0, 1]n, we have","inline":true}],[{"id":"id-114","style":{"width":"84%"},"width":1453,"height":101,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/49-2.png","element":"img"}],[{"text":"It is observed from Eqs. 
","element":"span"},{"href":"#id-113","text":"122 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-114","text":"123 ","element":"a"},{"text":"that, given ","element":"span"},{"style":{"height":19.13},"width":175.51,"height":47.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/49-3.png","element":"img","alt":" f ∈ Fn,k","inline":true},{"text":", the Taylor expansion forms a partially separable representation ","element":"span"},{"style":{"height":24.91},"width":561.4,"height":62.26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/49-4.png","element":"img","alt":"�|α|≤k∂αα! f(0)xα with�n+kn �","inline":true},{"text":"products, and the error is no more than ","element":"span"},{"style":{"height":25.12},"width":190.32,"height":62.8,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/49-5.png","element":"img","alt":"M(k+1)!nk+1","inline":true},{"text":". Along this line, we apply the quadratic network to express","element":"span"}],[{"style":{"width":"99%"},"width":1725,"height":50,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/49-6.png","element":"img"}],[{"text":"network using the ReLU function with the depth of log","element":"span"},{"style":{"height":17.6},"width":74.9,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/49-7.png","element":"img","alt":"2(N","inline":true},{"text":") and the width of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N","element":"span"},{"text":". To approximate a general term ","element":"span"},{"style":{"height":22.49},"width":582.87,"height":56.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/49-8.png","element":"img","alt":"∂αα! f(0)xα = ∂αα! 
f(0)xα11 · · · xαnn ","inline":true,"padRight":true},{"text":", the depth of the quadratic network is ","element":"span"},{"text":"no more than log","element":"span"},{"style":{"height":17.6},"width":543.01,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/49-9.png","element":"img","alt":"2(max{α1, · · · , αn}) ≤ log2 k","inline":true},{"text":", and the width is no more than ","element":"span"},{"style":{"height":18.8},"width":245.86,"height":47.01,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/49-10.png","element":"img","alt":"�ni=1 αi = k.","inline":true,"padRight":true},{"text":"In addition, the depth of a network that is used to express the product of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n ","element":"span"},{"text":"terms is log","element":"span"},{"style":{"height":12.19},"width":64.4,"height":30.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/49-11.png","element":"img","alt":"2 n.","inline":true,"padRight":true},{"text":"Therefore, the depth is no more than log","element":"span"},{"style":{"height":17.6},"width":423.81,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/49-12.png","element":"img","alt":"2 k + log2 n = log2 (kn","inline":true},{"text":"). In contrast, the width is no more than ","element":"span"},{"style":{"height":22.56},"width":129.48,"height":56.41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/49-13.png","element":"img","alt":" k�n+kn �","inline":true},{"text":", considering that there are","element":"span"},{"style":{"height":24.91},"width":648.3,"height":62.27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/49-14.png","element":"img","alt":"�n+kn �terms in �|α|≤k∂αα! f(x)xα.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"VI. 
Properties of the Partially Separable Representation","element":"span"}],[{"text":"Let us compare the properties of the partially separable representation with those of the piecewise polynomial representation and the Kolmogorov–Arnold representation in terms of universality, finiteness, decomposability and smoothness. As a result, the partially separable representation is well justified.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"1. ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Universality: ","element":"span"},{"text":"The representation should have a sufficient ability to express the functions. A piecewise polynomial representation is capable of representing any continuous function in a local way. The Kolmogorov–Arnold representation theorem states that every multivariate continuous function can be represented as a combination of continuous univariate functions, which solves a generalized version of Hilbert’s thirteenth problem. Therefore, the Kolmogorov–Arnold representation also possesses universality. As far as the partially separable representation is concerned, its universality in the ","element":"span"},{"style":{"height":14.62},"width":46.7,"height":36.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/50-0.png","element":"img","alt":" L1","inline":true,"padRight":true},{"text":"distance is analyzed in the above Taylor expansion analysis. In summary, these three representation classes are all powerful enough to express continuous functions.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"2. Finiteness: ","element":"span"},{"text":"The multivariate Taylor expansion can approximate a general continuous function ","element":"span"},{"style":{"height":19.13},"width":163.05,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/50-1.png","element":"img","alt":" f ∈ Fn,k ","inline":true,"padRight":true},{"text":"with a finite number of terms. 
Although the proof in the manuscript devises a specific strategy of obtaining a partially separable representation, the partially separable representation actually allows a variety of constructions in addition to the Taylor expansion. There is no fundamental reason that the partially separable representation will necessarily have exponentially many terms in a majority of practical tasks after reasonable assumptions are incorporated.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"3. Decomposability: ","element":"span"},{"text":"Decomposability is to decode the functionality of the model in terms of its building units, i.e., cells, layers, blocks, and so on. A plethora of engineering examples, such as software development and optical system design, have shown that modularized analysis is effective. Particularly, modularizing a neural network is advantageous for the model interpretability. For example, ","element":"span"},{"href":"#id-115","text":"Chen et al. ","element":"a"},{"href":"#id-115","text":"(2016) ","element":"a"},{"text":"developed InfoGAN to enhance decomposability of features learned by GAN ","element":"span"},{"href":"#id-116","text":"(Goodfellow et al., ","element":"a"},{"href":"#id-116","text":"2014a)","element":"a"},{"text":", where InfoGAN maximizes the mutual information between the latent code and the observations, encouraging the use of noise to encode the semantic concept. Since the partially separable representation is more decomposable than its counterparts, the network constructed in reference to the partially separable representation is easily interpretable. For instance, such a network has simpler partial derivatives that can simplify interpretability analysis ","element":"span"},{"href":"#id-117","text":"(Lipton, ","element":"a"},{"href":"#id-117","text":"2018)","element":"a"},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"4. 
Smoothness: ","element":"span"},{"text":"A useful representation should be as smooth as possible such that the approximation can suppress high-frequency oscillations and be robust to noise. In the last eighties, Girosi and Poggio claimed that the use of the Kolmogorov-Arnold representation is doomed because the inner functions and the outer functions are highly non-smooth ","element":"span"},{"href":"#id-118","text":"(Girosi ","element":"a"},{"href":"#id-118","text":"and Poggio, ","element":"a"},{"href":"#id-118","text":"1989)","element":"a"},{"text":". In the piecewise polynomial representation, the situation gets better since at least over each interval the representation is smooth. As far as the partially separable representation is concerned, because the expression is greatly relaxed (from the exact computation to approximate representation), it is feasible to make each element smooth as ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L ","element":"span"},{"text":"goes large, and extract meaningful structures underlying big data.","element":"span"}],[{"id":"id-119","text":"Table 8: Comparison of the three functional representations, where ”","element":"figcaption","subtype":"caption"},{"style":{"height":12.8},"width":37,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/51-0.png","element":"img","alt":"✓","inline":true},{"text":"” means ”yes”, ”","element":"figcaption","subtype":"caption"},{"style":{"height":12.4},"width":51.08,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/51-1.png","element":"img","alt":"X”","inline":true,"padRight":true},{"text":"means ”no”, and ”–” means ”mediocre”.","element":"figcaption","subtype":"caption"}],[{"style":{"width":"105%"},"width":1831,"height":225,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/51-2.png","element":"img"}],[{"text":"In summary, we compare the three representation schemes in terms of universality, finiteness, 
decomposability, and smoothness in ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Table ","element":"span"},{"href":"#id-119","style":{"fontWeight":"bold"},"text":"8","element":"a"},{"text":". Clearly, the partially separable representation is suitable for machine learning tasks.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"VII. Width-Depth Correlation Through Partially Separable Representation","element":"span"}],[{"text":"Now, let us analyze the complexity of the quadratic network in light of the aforementioned partially separable representation scheme as shown in ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Figure ","element":"span"},{"href":"#id-120","style":{"fontWeight":"bold"},"text":"16","element":"a"},{"text":". Suppose that the polynomial ","element":"span"},{"style":{"height":14.84},"width":50.51,"height":37.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/52-0.png","element":"img","alt":" Pli","inline":true,"padRight":true},{"text":"is of degree ","element":"span"},{"style":{"height":14.84},"width":57.55,"height":37.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/52-1.png","element":"img","alt":" Nli","inline":true},{"text":", then the representation of each ","element":"span"},{"style":{"height":14.84},"width":50.51,"height":37.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/52-2.png","element":"img","alt":" Pli","inline":true,"padRight":true},{"text":"can be done with a network of width ","element":"span"},{"style":{"height":22},"width":276.79,"height":55.01,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/52-3.png","element":"img","alt":"∑_{l=1}^{L}∑_{i=1}^{n} N_{li}","inline":true,"padRight":true},{"text":"and depth max","element":"span"},{"style":{"height":18.44},"width":245.55,"height":46.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/52-4.png","element":"img","alt":"l,i{log2(Nli)}","inline":true},{"text":". 
Next, the multiplication demands an ","element":"span"},{"text":"additional network of width ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Ln ","element":"span"},{"text":"and depth log","element":"span"},{"style":{"height":17.6},"width":61.9,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/52-5.png","element":"img","alt":"2(n","inline":true},{"text":"). Therefore, the overall quadratic network architecture will be of width max","element":"span"},{"style":{"height":22},"width":397.42,"height":55.01,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/52-6.png","element":"img","alt":"{�Ll=1�ni=1 Nli, Ln}","inline":true,"padRight":true},{"text":"and depth max","element":"span"},{"style":{"height":18.44},"width":287.78,"height":46.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/52-7.png","element":"img","alt":"l,i{log2(Nli)} +","inline":true,"padRight":true},{"text":"log","element":"span"},{"style":{"height":17.6},"width":91.07,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/52-8.png","element":"img","alt":"2(n).","inline":true,"padRight":true},{"text":"Because the depth scales with a log function, which changes slowly when the dimensionality of the input is large. For simplicity, we take an approximation for depth max","element":"span"},{"style":{"height":22.09},"width":1646.67,"height":55.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/52-9.png","element":"img","alt":"l,i{log2(Nli)} + log2(n) = log2(maxl,i{Nli}) + log2(n) ≈ α log2�Ll=1�ni=1 Nli + log2(n),","inline":true,"padRight":true},{"text":"where ","element":"span"},{"style":{"height":8.4},"width":28,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/52-10.png","element":"img","alt":" α","inline":true,"padRight":true},{"text":"is a positive constant. 
Let ","element":"span"},{"style":{"height":22},"width":401.32,"height":55.01,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/52-11.png","element":"img","alt":"�Ll=1�ni=1 Nli = NΣ","inline":true},{"text":", which describes the overall com- ","element":"span"},{"text":"plexity of the function to be expressed, then the formulas to compute the width and depth are simplified as follows:","element":"span"}],[{"id":"id-120","style":{"width":"78%"},"width":1350,"height":1037,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/52-12.png","element":"img"}],[{"text":"Figure 16: Use of a quadratic network to represent a partially separable representation. Suppose that the polynomial ","element":"figcaption","subtype":"caption"},{"style":{"height":14.84},"width":50.51,"height":37.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/52-13.png","element":"img","alt":" Pli","inline":true,"padRight":true},{"text":"is of degree ","element":"figcaption","subtype":"caption"},{"style":{"height":14.84},"width":57.55,"height":37.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/52-14.png","element":"img","alt":" Nli","inline":true},{"text":", the width and depth of the quadratic network to approximate ","element":"figcaption","subtype":"caption"},{"style":{"height":17.6},"width":447.71,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/52-15.png","element":"img","alt":" Pli are Nli and log2(Nli","inline":true},{"text":") respectively.","element":"figcaption","subtype":"caption"}],[{"text":"One interesting point from ","element":"span"},{"href":"#id-120","text":"(124) ","element":"a"},{"text":"is that the lower bounds for depth and width to realize a partially separable representation are also suggested. 
As shown in ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Figure ","element":"span"},{"href":"#id-121","style":{"fontWeight":"bold"},"text":"17","element":"a"},{"text":", we plot the width and depth as ","element":"span"},{"style":{"height":15.13},"width":64.81,"height":37.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/52-16.png","element":"img","alt":" NΣ ","inline":true,"padRight":true},{"text":"changes. There are two highlights in ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Figure ","element":"span"},{"href":"#id-121","style":{"fontWeight":"bold"},"text":"17","element":"a"},{"text":". The first is that the width is generally larger than the depth, which is different from the superficial impression on deep learning. The second is that, as the ","element":"span"},{"style":{"height":15.13},"width":64.82,"height":37.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/52-17.png","element":"img","alt":" NΣ ","inline":true,"padRight":true},{"text":"goes up, the width/depth ratio is accordingly increased.","element":"span"}],[{"style":{"width":"48%"},"width":833,"height":609,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/53-0.png","element":"img"}],[{"text":"Figure 17: Width and depth versus ","element":"figcaption","subtype":"caption"},{"style":{"height":19.53},"width":1035.03,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/53-1.png","element":"img","alt":" NΣ changes (L = 4, n = 5, and α = 1 without loss of","inline":true,"padRight":true},{"text":"generality) assuming the partially separable representation.","element":"figcaption","subtype":"caption"}],[{"text":"Table 9: Descriptions of different building blocks. 
Modules ","element":"figcaption","subtype":"caption"},{"text":"Degree ","element":"figcaption","subtype":"caption"},{"text":"Operation ","element":"figcaption","subtype":"caption"},{"text":"Width ","element":"figcaption","subtype":"caption"},{"id":"id-121","text":"Depth","element":"figcaption","subtype":"caption"}],[{"style":{"width":"61%"},"width":1056,"height":148,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/53-2.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Remark: ","element":"span"},{"text":"Through the complexity analysis, we realize that the width and depth of a network depend on the structure or complexity of the function to be approximated. In other words, they are controlled by the nature of a specific task. As the task becomes complicated, the width and depth must increase accordingly, and the combination of the width and depth is not unique.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"VIII. Effects of Width on Optimization and Generalization","element":"span"}],[{"text":"We first illustrate the importance of width on optimization in the context of over-parameterization, kernel ridge regression, and NTK, and then report our findings that the existing generalization bounds and VC dimension results somehow suggest the width and depth equivalence for a given complexity of networks.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"1. Optimization: ","element":"span"},{"text":"The optimization mechanism is the key to understanding the training process of neural networks ","element":"span"},{"href":"#id-122","text":"(Ge et al., ","element":"a"},{"href":"#id-122","text":"2015)","element":"a"},{"text":". From the view of optimization, the fact that randomly-initialized first-order methods can find well-generalizable minima on the training data is quite intriguing. 
","element":"span"},{"text":"Given the theme of this paper, we put our endeavor into the importance of width on the optimization of neural networks in hope to provide insights for practitioners. We divide the relevant literature into the three categories:","element":"span"}],[{"text":"(1) Increase width for over-parameterization: Brutzkus et al. showed that a wide two-layer network using the hinge loss can generalize well on linearly separable data with stochastic gradient descent (SGD) ","element":"span"},{"href":"#id-123","text":"(Brutzkus et al., ","element":"a"},{"href":"#id-123","text":"2017)","element":"a"},{"text":". Li and Liang showed that when data is normalized and fulfills a separability condition, a two-layer over-parameterized network can learn these data in a polynomial time ","element":"span"},{"href":"#id-124","text":"(Li and Liang, ","element":"a"},{"href":"#id-124","text":"2018)","element":"a"},{"text":". ","element":"span"},{"href":"#id-83","text":"Allen-Zhu et al. ","element":"a"},{"href":"#id-83","text":"(2019) ","element":"a"},{"text":"showed that if the width of hidden layers is ","element":"span"},{"style":{"height":31.6},"width":598.5,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/54-0.png","element":"img","alt":" O�n30D30 log30(1/ϵ)�, where n","inline":true,"padRight":true},{"text":"is the number of samples, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D ","element":"span"},{"text":"is the network depth, and ","element":"span"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/54-1.png","element":"img","alt":" ϵ","inline":true,"padRight":true},{"text":"is an expected error, then the gradient descent search converges with the error ","element":"span"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/54-2.png","element":"img","alt":" ϵ","inline":true},{"text":". 
This bound was further reduced to ","element":"span"},{"style":{"height":15.53},"width":94.94,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/54-3.png","element":"img","alt":" n42D ","inline":true,"padRight":true},{"text":"for the network using non-linear smooth activation functions such as soft-plus ","element":"span"},{"href":"#id-82","text":"(Du et al., ","element":"a"},{"href":"#id-82","text":"2018)","element":"a"},{"text":". These papers established that a pre-specified small training error can be achieved by gradient descent when the network is very wide. The secret therein is that the weight matrix is very close to the initialization due to NTK ","element":"span"},{"href":"#id-29","text":"(Jacot et al., ","element":"a"},{"href":"#id-29","text":"2018)","element":"a"},{"text":".","element":"span"}],[{"text":"(2) Kernel ridge regression: A neural network tends to give a Gaussian process when the width goes infinitely large. In this situation, only training the top layer of a network (weak training) will reduce to Kernel ridge regression. In a classical way, the weak training is to minimize the quadratic loss:","element":"span"}],[{"style":{"width":"71%"},"width":1240,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/54-4.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":17.6},"width":276.29,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/54-5.png","element":"img","alt":" X = [x1; ...; xn","inline":true},{"text":"] is a collection of data ","element":"span"},{"style":{"height":18.33},"width":403.17,"height":45.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/54-6.png","element":"img","alt":" xi ∈ R1×d, i = 1, ..., n","inline":true},{"text":". 
Taking derivatives with respect to ","element":"span"},{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"w ","element":"span"},{"text":"and equating them to zero gives","element":"span"}],[{"style":{"width":"77%"},"width":1334,"height":51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/54-7.png","element":"img"}],[{"text":"Given a new example ","element":"span"},{"style":{"height":12.73},"width":41.94,"height":31.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/54-8.png","element":"img","alt":" x∗","inline":true},{"text":", the prediction is","element":"span"}],[{"style":{"width":"65%"},"width":1134,"height":51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/54-9.png","element":"img"}],[{"text":"We replace the data samples with the high-dimensional representations of the neural network: ","element":"span"},{"style":{"height":20.15},"width":1014.3,"height":50.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/54-10.png","element":"img","alt":" x → gθ(x) ∈ R1×k and let Φg(X) = [gθ(x1); ...; gθ(xn","inline":true},{"text":")]. By a similar derivation, we have","element":"span"}],[{"style":{"width":"101%"},"width":1757,"height":243,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/54-11.png","element":"img"}],[{"text":"(3) Neural Tangent Kernel: In the above argument, we have justified the legitimacy of training the top layer by the large network width. How about training the entire network? 
Given the dataset ","element":"span"},{"style":{"height":18.09},"width":226.76,"height":45.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/55-0.png","element":"img","alt":" {(xi, yi)}ni=1","inline":true},{"text":", we also consider training the network with the quadratic ","element":"span"},{"text":"loss for regression: ","element":"span"},{"style":{"height":21.29},"width":585.17,"height":53.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/55-1.png","element":"img","alt":" l(θ) = 12�ni=1(fθ(xi)) − yi)2.","inline":true,"padRight":true},{"text":"Consider the gradient descent in an infinitesimally small learning rate, then","element":"span"}],[{"style":{"width":"80%"},"width":1388,"height":122,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/55-2.png","element":"img"}],[{"id":"id-125","text":"Next, we can describe the dynamics of the model output ","element":"span"},{"style":{"height":17.6},"width":64.93,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/55-3.png","element":"img","alt":" y(θ","inline":true},{"text":") given the input ","element":"span"},{"style":{"height":13.02},"width":41.48,"height":32.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/55-4.png","element":"img","alt":" xj","inline":true,"padRight":true},{"text":"as follows:","element":"span"}],[{"style":{"width":"83%"},"width":1436,"height":239,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/55-5.png","element":"img"}],[{"text":"Let us consider ","element":"span"},{"style":{"height":19.95},"width":450.47,"height":49.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/55-6.png","element":"img","alt":" u(t) = (fθ(t)(xi))i=1,...,n","inline":true,"padRight":true},{"text":"for all samples at time 
","element":"span"},{"style":{"height":18.22},"width":514.14,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/55-7.png","element":"img","alt":" t and y = (yi)i=1,...,n is the","inline":true,"padRight":true},{"text":"output, ","element":"span"},{"href":"#id-125","text":"(130) ","element":"a"},{"text":"can be written in a compact way:","element":"span"}],[{"style":{"width":"64%"},"width":1121,"height":93,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/55-8.png","element":"img"}],[{"text":"where the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"pq","element":"span"},{"text":"-entry of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"H","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":") is","element":"span"}],[{"style":{"width":"70%"},"width":1220,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/55-9.png","element":"img"}],[{"text":"In the width limit, the Gaussian process shows that ","element":"span"},{"style":{"fontWeight":"bold"},"text":"H","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":") becomes a constant ","element":"span"},{"style":{"height":12.33},"width":56.27,"height":30.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/55-10.png","element":"img","alt":" H∗","inline":true},{"text":". Therefore,","element":"span"}],[{"style":{"width":"64%"},"width":1106,"height":94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/55-11.png","element":"img"}],[{"text":"which characterizes the trajectory of the training in the functional space instead of the parameter space.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"2. Generalization: ","element":"span"},{"text":"Analysis of generalization bounds is a powerful tool to explain the excellent performance of neural networks. 
Traditional wisdom suggests that the increased model complexity will cause over-fitting to training data, which contradicts the fact that deep networks can easily fit random labels to the data and yet practically generalize well ","element":"span"},{"href":"#id-126","text":"(Zhang et al., ","element":"a"},{"href":"#id-126","text":"2016)","element":"a"},{"text":". ","element":"span"},{"text":"Recently, the generalization theory has gained increasingly more traction. ","element":"span"},{"text":"In reference to ","element":"span"},{"href":"#id-127","text":"Li et al. ","element":"a"},{"href":"#id-127","text":"(2018)","element":"a"},{"text":", we have summarized the state-of-the-art generalization bounds in ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Table ","element":"span"},{"href":"#id-128","style":{"fontWeight":"bold"},"text":"10 ","element":"a"},{"text":"and provided their complexities. In ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Table ","element":"span"},{"href":"#id-128","style":{"fontWeight":"bold"},"text":"10","element":"a"},{"text":", we boldface the bounds of interest which the width and depth dominate. As far as ","element":"span"},{"href":"#id-129","text":"Neyshabur ","element":"a"},{"href":"#id-129","text":"et al. ","element":"a"},{"href":"#id-129","text":"(2015) ","element":"a"},{"text":"is concerned, due to the exponential dependence on the depth, we will not focus on it. Instead, we argue that these bold-faced bounds somehow suggest the width and depth equivalence under a given complexity. 
","element":"span"},{"text":"Now, we use ","element":"span"},{"style":{"height":20.64},"width":446.74,"height":51.61,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/55-12.png","element":"img","alt":" B1(L, p) = log(p)√L3,","inline":true},{"style":{"height":20.8},"width":908.55,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/55-13.png","element":"img","alt":"B2(L, p) = log(Lp)�L3p, and B3(L, p) = √Lp","inline":true,"padRight":true},{"text":"to denote the bounds of interest. ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Figure ","element":"span"},{"href":"#id-130","style":{"fontWeight":"bold"},"text":"18 ","element":"a"},{"text":"shows the contour plots of ","element":"span"},{"style":{"height":15.6},"width":295.85,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/55-14.png","element":"img","alt":" B1, B2 and B3","inline":true},{"text":". Along a contour, the width and depth change for the fixed bound complexity. 
In other words, the depth and width of a neural","element":"span"}],[{"id":"id-128","text":"Table 10: Representative bounds for chain-like neural networks, where ","element":"figcaption","subtype":"caption"},{"style":{"height":17.65},"width":389.48,"height":44.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/56-0.png","element":"img","alt":" Bl,2, Bl,F and Bl,2→1","inline":true,"padRight":true},{"text":"to denote the upper bounds of the spectral norm ","element":"figcaption","subtype":"caption"},{"style":{"height":17.6},"width":136.78,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/56-1.png","element":"img","alt":" ||Wd||2","inline":true},{"text":", Frobenius norm ","element":"figcaption","subtype":"caption"},{"style":{"height":18.22},"width":424.67,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/56-2.png","element":"img","alt":"||Wd||F , and ||Wd||2,1","inline":true,"padRight":true},{"text":"of the rank-","element":"figcaption","subtype":"caption"},{"style":{"height":15.53},"width":277.22,"height":38.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/56-3.png","element":"img","alt":"r matrix in lth ","inline":true,"padRight":true},{"text":"layer. 
Generally, ","element":"figcaption","subtype":"caption"},{"style":{"height":17.6},"width":233.48,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/56-4.png","element":"img","alt":" ||Wd||F and","inline":true},{"style":{"height":18.39},"width":277.38,"height":45.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/56-5.png","element":"img","alt":"||Wd||2,1 is √r","inline":true},{"text":"-times larger than ","element":"figcaption","subtype":"caption"},{"style":{"height":17.6},"width":188.39,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/56-6.png","element":"img","alt":" ||Wd||2, γ","inline":true,"padRight":true},{"text":"means the ramp loss function is 1","element":"figcaption","subtype":"caption"},{"style":{"height":17.6},"width":61.84,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/56-7.png","element":"img","alt":"/γ-","inline":true,"padRight":true},{"text":"Lipschitz function, Γ is the lower bound for the product of the spectral norm of all the layers and the ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"m ","element":"figcaption","subtype":"caption"},{"text":"is the size of data.","element":"figcaption","subtype":"caption"}],[{"style":{"width":"97%"},"width":1680,"height":1469,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/56-8.png","element":"img"}],[{"text":"Figure 18: Contours of ","element":"figcaption","subtype":"caption"},{"style":{"height":15.6},"width":294.49,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/56-9.png","element":"img","alt":" B1, B2 and B3","inline":true},{"id":"id-130","text":", where numbers on the curves of each sub-figure ","element":"figcaption","subtype":"caption"},{"text":"represent 
log(","element":"figcaption","subtype":"caption"},{"style":{"height":20.64},"width":552.76,"height":51.61,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/56-10.png","element":"img","alt":"p)√L3, log(p)√L3 and √Lp","inline":true,"padRight":true},{"text":"respectively. Along a contour, the width and depth change to give the same bound.","element":"figcaption","subtype":"caption"}],[{"text":"network are mutually transformable without impacting the overall generalization ability theoretically.","element":"span"}],[{"text":"Therefore, increasing either width or depth can boost the hypothesis space of a neural network. In other words, when it comes to promoting the expressive power of a network, increasing the width is essentially equivalent to increasing the depth in the sense of VC dimension, which also implies the equivalence of the width and depth of neural networks.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"IX. Rethinking the Depth Separation with Intra-layer Links","element":"span"}],[{"text":"Depth separation highlights the representation ability of a deep network, which has been intensively investigated over the past years. The idea is to show that a deep network can more efficiently approximate a complicated function than a shallow network. One notable depth separation result is from ","element":"span"},{"href":"#id-32","text":"Arora et al. 
","element":"a"},{"href":"#id-32","text":"(2016)","element":"a"},{"text":", showing that ","element":"span"},{"style":{"height":16.4},"width":484.92,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/58-0.png","element":"img","alt":" ”for every pair of natural","inline":true},{"style":{"height":16.4},"width":1728.48,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/58-1.png","element":"img","alt":"numbers k ≥ 1, w ≥ 2, there exists a family of hard functions representable by a R → R","inline":true,"padRight":true},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"+ 1)","element":"span"},{"style":{"fontStyle":"italic"},"text":"-layer feedforward ReLU DNN of width ","element":"span"},{"style":{"fontStyle":"italic"},"text":"w ","element":"span"},{"style":{"fontStyle":"italic"},"text":"such that if it is also representable by a ","element":"span"},{"text":"(","element":"span"},{"style":{"height":17.6},"width":1713.79,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/58-2.png","element":"img","alt":"k′ + 1)-layer feedforward ReLU DNN for any k′ ≤ k, then this (k′ + 1)-layer feedforward","inline":true},{"style":{"height":25.64},"width":773.08,"height":64.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/58-3.png","element":"img","alt":"ReLU DNN has size at least 12k′wkk′ −1”.","inline":true,"padRight":true},{"text":"Suppose that the number of neurons, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i.e.","element":"span"},{"text":", the size of a ReLU network is ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"text":", and the piecewise linear function of this ReLU network has ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p ","element":"span"},{"text":"pieces, the core of the depth separation theorem established in 
","element":"span"},{"href":"#id-32","text":"Arora et al. ","element":"a"},{"href":"#id-32","text":"(2016) ","element":"a"},{"text":"is summarized as the size-piece relationship: ","element":"span"},{"style":{"height":21.95},"width":348.44,"height":54.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/58-4.png","element":"img","alt":" s ≥ 12k(2p)1/k − 1.","inline":true}],[{"text":"In the proposed network structure for the width-depth conversion, intra-layer links are employed for a network to flexibly represent arbitrary piecewise linear functions over polytopes using fan-shaped functions. Here, we find that adding intra-layer links can greatly increase the maximum number of pieces represented by a shallow network such that it can express as a complicated function as a deep network could. As shown in Table ","element":"span"},{"href":"#id-131","text":"11, ","element":"a"},{"text":"let ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n ","element":"span"},{"text":"be the number of intra-linked neurons, the size-piece inequality ","element":"span"},{"style":{"height":25.38},"width":405.32,"height":63.45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/58-5.png","element":"img","alt":" s ≥ n2(2n−1)k(2p)1/k−","inline":true},{"text":"1 trivially ","element":"span"},{"text":"holds true. As such, it cannot be used to demonstrate depth separation any more. Instead, we need to rethink the depth separation and re-characterize the power of a network when intra-layer links are used. Our detailed proof is as follows.","element":"span"}],[{"id":"id-131","text":"Table 11: Network structures and their associated size-piece relationships. The number of ","element":"figcaption","subtype":"caption"},{"text":"neurons, i.e., the size of a ReLU network is ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"s","element":"figcaption","subtype":"caption"},{"text":". 
","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"k ","element":"figcaption","subtype":"caption"},{"text":"is the number of hidden layers. The piecewise linear function of this ReLU network has ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"p ","element":"figcaption","subtype":"caption"},{"text":"pieces. ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"n ","element":"figcaption","subtype":"caption"},{"text":"is the number of intra-linked neurons.","element":"figcaption","subtype":"caption"}],[{"style":{"width":"87%"},"width":1505,"height":238,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/58-6.png","element":"img"}],[{"text":"Note: In the first row, we re-calculate the inequality and obtain a slightly different inequality from the original. The previously-established inequality in ","element":"span"},{"href":"#id-32","text":"(Arora et al., ","element":"a"},{"href":"#id-32","text":"2016) ","element":"a"},{"text":"is ","element":"span"},{"style":{"height":21.95},"width":217.53,"height":54.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/58-7.png","element":"img","alt":" s ≥ 12kp1/k.","inline":true}],[{"text":"The result listed in the first row of Table ","element":"span"},{"href":"#id-131","text":"11 ","element":"a"},{"text":"is obtained from the following two lemmas and proofs in ","element":"span"},{"href":"#id-32","text":"(Arora et al., ","element":"a"},{"href":"#id-32","text":"2016)","element":"a"},{"text":".","element":"span"}],[{"href":"#id-32","style":{"height":17.6},"width":1337.12,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/58-8.png","element":"img","alt":"Lemma 20 (Lemma D.5 in (Arora et al., 2016)) Let f : R → R","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"be a function represented by a 
","element":"span"},{"style":{"height":12.4},"width":142.38,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/58-9.png","element":"img","alt":" R → R","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"ReLU DNN, as shown in Figure ","element":"span"},{"href":"#id-132","style":{"fontStyle":"italic"},"text":"19(","element":"a"},{"style":{"fontStyle":"italic"},"text":"a), with depth ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"+ 1 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and widths ","element":"span"},{"style":{"height":16.4},"width":286.83,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/58-10.png","element":"img","alt":"w1, . . . , wk of k","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"hidden layers. Then ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is a PWL function with at most ","element":"span"},{"text":"2","element":"span"},{"style":{"height":19.53},"width":347.39,"height":48.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/58-11.png","element":"img","alt":"k−1 · (w1 + 1) · w2 ·","inline":true},{"style":{"height":15.6},"width":272.06,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/58-12.png","element":"img","alt":". . . · wk pieces.","inline":true}],[{"href":"#id-32","style":{"height":17.6},"width":1331.39,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/58-13.png","element":"img","alt":"Lemma 21 (Lemma D.6 in (Arora et al., 2016)) Let f : R → R","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"be a piecewise linear function with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p ","element":"span"},{"style":{"fontStyle":"italic"},"text":"pieces. 
If ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is represented by a ReLU DNN, as shown in Figure ","element":"span"},{"href":"#id-132","style":{"fontStyle":"italic"},"text":"19(","element":"a"},{"style":{"fontStyle":"italic"},"text":"a), with","element":"span"}],[{"id":"id-132","style":{"width":"79%"},"width":1369,"height":436,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-0.png","element":"img"}],[{"text":"Figure 19: (a) the feedforward architecture; (b)-(c) the structures with intra-layer links.","element":"figcaption","subtype":"caption"}],[{"style":{"fontStyle":"italic"},"text":"depth ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"+ 1","element":"span"},{"style":{"fontStyle":"italic"},"text":", then it must have size at least ","element":"span"},{"style":{"height":21.95},"width":201.26,"height":54.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-1.png","element":"img","alt":"12kp1/k − 1","inline":true},{"style":{"fontStyle":"italic"},"text":". 
Conversely, any piecewise linear ","element":"span"},{"style":{"fontStyle":"italic"},"text":"function ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"style":{"fontStyle":"italic"},"text":"represented by a ReLU DNN of depth ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"+ 1 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and size at most ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"style":{"fontStyle":"italic"},"text":", can have at most","element":"span"}],[{"style":{"width":"13%"},"width":232,"height":57,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-2.png","element":"img"}],[{"id":"id-133","text":"Now, we prove the result listed in the second row of Table ","element":"span"},{"href":"#id-131","text":"11 ","element":"a"},{"text":"for ReLU DNN with intra-links.","element":"span"}],[{"style":{"height":17.6},"width":1013.5,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-3.png","element":"img","alt":"Lemma 22 (Our 1st New Result) Let f : R → R","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"be a function represented by a ","element":"span"},{"style":{"height":12.4},"width":131.39,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-4.png","element":"img","alt":" R → R","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"ReLU DNN, as shown in Figure ","element":"span"},{"href":"#id-132","style":{"fontStyle":"italic"},"text":"19(","element":"a"},{"style":{"fontStyle":"italic"},"text":"b), with depth ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"+ 1 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and widths 
","element":"span"},{"style":{"height":16.4},"width":430.56,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-5.png","element":"img","alt":" w1, . . . , wk of k hidden","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"layers. Then ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is a PWL function with at most ","element":"span"},{"text":"3","element":"span"},{"style":{"height":21.6},"width":663.1,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-6.png","element":"img","alt":"k−1 ·� 3w12 + 1�· w2 · . . . · wk pieces.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Proof ","element":"span"},{"text":"As shown in Figure ","element":"span"},{"href":"#id-132","text":"19(","element":"a"},{"text":"b), let us connect every two neurons with an intra-layer link. Suppose that one neuron is ","element":"span"},{"style":{"height":17.6},"width":176.38,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-7.png","element":"img","alt":" σ(wx + b","inline":true},{"text":") which can create 1 breakpoint at ","element":"span"},{"style":{"height":17.6},"width":105.48,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-8.png","element":"img","alt":" −b/w","inline":true},{"text":", while the other neuron is ","element":"span"},{"style":{"height":17.6},"width":403.78,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-9.png","element":"img","alt":" σ(w′x + b′ + σ(wx + b","inline":true},{"text":")) creating at most 3 breakpoints ","element":"span"},{"style":{"height":17.6},"width":385.29,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-10.png","element":"img","alt":" −b/w, −b′/w′, −(b 
+","inline":true},{"style":{"height":17.6},"width":211.13,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-11.png","element":"img","alt":"b′)/(w +w′","inline":true},{"text":"). Thus, we can get at most ","element":"span"},{"style":{"height":21.6},"width":209.55,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-12.png","element":"img","alt":"w12 ·3 = 3w12","inline":true,"padRight":true},{"text":"distinct breakpoints, i.e., ","element":"span"},{"style":{"height":21.6},"width":265.18,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-13.png","element":"img","alt":"3w12 +1 pieces.","inline":true}],[{"text":"Let us estimate the number of pieces via induction. Assume that for some ","element":"span"},{"style":{"height":16.4},"width":205.13,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-14.png","element":"img","alt":" k ≥ 1, any","inline":true},{"style":{"height":12.4},"width":131.39,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-15.png","element":"img","alt":"R → R","inline":true,"padRight":true},{"text":"ReLU DNN, as shown in Figure ","element":"span"},{"href":"#id-132","text":"19(","element":"a"},{"text":"b), with depth ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k","element":"span"},{"text":"+1 and widths ","element":"span"},{"style":{"height":15.6},"width":318.14,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-16.png","element":"img","alt":" w1, . . . , wk of the","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"hidden layers produces at most ","element":"span"},{"style":{"height":21.6},"width":612.79,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-17.png","element":"img","alt":" Ak = 3k−1·� 3w12 + 1�·w2·. . 
.·wk","inline":true,"padRight":true},{"text":"pieces. We add one more layer of ","element":"span"},{"style":{"height":12.04},"width":93.06,"height":30.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-18.png","element":"img","alt":" wk+1","inline":true,"padRight":true},{"text":"neurons to this network such that its pre-activation is actually the output of a ","element":"span"},{"style":{"height":12.4},"width":131.39,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-19.png","element":"img","alt":"R → R","inline":true,"padRight":true},{"text":"ReLU DNN with depth ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k","element":"span"},{"text":"+1 and widths ","element":"span"},{"style":{"height":11.2},"width":196.38,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-20.png","element":"img","alt":" w1, . . . , wk","inline":true},{"text":". Suppose that the preactivation of the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":"-th neuron is ","element":"span"},{"style":{"height":17.6},"width":238.8,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-21.png","element":"img","alt":" fi, i ∈ [wk+1","inline":true},{"text":"]. By the induction hypothesis, ","element":"span"},{"style":{"height":16.4},"width":33.36,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-22.png","element":"img","alt":" fi","inline":true,"padRight":true},{"text":"is a piecewise linear function with at most ","element":"span"},{"style":{"height":15.64},"width":50.73,"height":39.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-23.png","element":"img","alt":" Ak","inline":true,"padRight":true},{"text":"pieces. 
Without loss of generality, let the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":"-th and (","element":"span"},{"style":{"fontStyle":"italic"},"text":"i ","element":"span"},{"text":"+ 1)-th neurons be connected by an intra-layer link. Their outputs are ","element":"span"},{"style":{"height":17.6},"width":489.16,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-24.png","element":"img","alt":" σ(fi) and σ(fi+1 + σ(fi))","inline":true,"padRight":true},{"text":"whose total number of pieces equals to that of three functions ","element":"span"},{"style":{"height":17.6},"width":513.59,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-25.png","element":"img","alt":" σ(fi), σ(fi+1), σ(fi + fi+1),","inline":true,"padRight":true},{"text":"which is at most 6","element":"span"},{"style":{"height":15.64},"width":50.73,"height":39.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-26.png","element":"img","alt":"Ak","inline":true},{"text":". Because we have ","element":"span"},{"style":{"height":12.04},"width":93.06,"height":30.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-27.png","element":"img","alt":" wk+1","inline":true,"padRight":true},{"text":"neurons in the last hidden layer, we can get at most 6","element":"span"},{"style":{"height":21.6},"width":879.86,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-28.png","element":"img","alt":"Ak ·(wk+1/2) = 3k ·� 3w12 + 1�·w2 ·. . . 
wk ·wk+1","inline":true,"padRight":true},{"text":"pieces, which concludes the induction step.","element":"span"}],[{"style":{"width":"1%"},"width":29,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-29.png","element":"img"}],[{"id":"id-135","style":{"height":17.6},"width":1048.34,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-30.png","element":"img","alt":"Lemma 23 (Our 2nd New Result) Let f : R → R","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"be a piecewise linear function with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p ","element":"span"},{"style":{"fontStyle":"italic"},"text":"pieces. If ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is represented by a ReLU DNN, as shown in Figure ","element":"span"},{"href":"#id-132","style":{"fontStyle":"italic"},"text":"19(","element":"a"},{"style":{"fontStyle":"italic"},"text":"b), with depth ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"+ 1","element":"span"},{"style":{"fontStyle":"italic"},"text":", then it must have size at least ","element":"span"},{"style":{"height":21.96},"width":254.72,"height":54.89,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-31.png","element":"img","alt":"13k(2p)1/k − 23","inline":true},{"style":{"fontStyle":"italic"},"text":". 
Conversely, any piecewise linear function ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"style":{"fontStyle":"italic"},"text":"represented by a ReLU DNN of depth ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"+ 1 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and size at most ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"style":{"fontStyle":"italic"},"text":", can have at most ","element":"span"},{"style":{"height":24.86},"width":129.16,"height":62.15,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/59-32.png","element":"img","alt":"12� 3sk�k","inline":true}],[{"style":{"fontStyle":"italic"},"text":"pieces.","element":"span"}],[{"id":"id-134","style":{"fontWeight":"bold"},"text":"Proof ","element":"span"},{"text":"Let widths of the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"hidden layers be ","element":"span"},{"href":"#id-133","style":{"height":16},"width":527.46,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/60-0.png","element":"img","alt":" w1, . . . , wk. By Lemma 22","inline":true},{"text":", we must have","element":"span"}],[{"style":{"width":"92%"},"width":1600,"height":105,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/60-1.png","element":"img"}],[{"text":"By the AM-GM inequality, we minimize the size subject to ","element":"span"},{"href":"#id-134","text":"(134)","element":"a"},{"text":",","element":"span"}],[{"style":{"width":"71%"},"width":1234,"height":458,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/60-2.png","element":"img"}],[{"text":"where the inequality is achieved at ","element":"span"},{"style":{"height":22.08},"width":675.48,"height":55.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/60-3.png","element":"img","alt":"3w12 + 1 = 3w22 = . . . 
= 3wk2 ≥ 12p1/k","inline":true},{"text":". This leads to the ","element":"span"},{"text":"first statement. The second statement follows by reversing the above equation.","element":"span"}],[{"style":{"width":"57%"},"width":993,"height":183,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/60-4.png","element":"img"}],[{"style":{"height":17.6},"width":1788.66,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/60-5.png","element":"img","alt":"Lemma 24 (The 1st New Result Extended to n Intra-Linked Neurons) Let f : R →","inline":true,"padRight":true},{"text":"R ","element":"span"},{"style":{"fontStyle":"italic"},"text":"be a function represented by a ","element":"span"},{"style":{"height":12.4},"width":131.39,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/60-6.png","element":"img","alt":" R → R","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"ReLU DNN, as shown in Figure ","element":"span"},{"href":"#id-132","style":{"fontStyle":"italic"},"text":"19(","element":"a"},{"style":{"fontStyle":"italic"},"text":"c), with depth ","element":"span"},{"style":{"height":16.4},"width":548.9,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/60-7.png","element":"img","alt":"k + 1, widths w1, . . . , wk of k","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"hidden layers and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n ","element":"span"},{"style":{"fontStyle":"italic"},"text":"intra-linked neurons in each layer. Then ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is a PWL function with at most ","element":"span"},{"text":"2","element":"span"},{"style":{"height":31.6},"width":979.92,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/60-8.png","element":"img","alt":"k−1 ·�(2n−1)w1n + 1�· (2n−1)w2n · . . . 
· (2n−1)wkn pieces.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Proof ","element":"span"},{"text":"For the first layer, the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n ","element":"span"},{"text":"intra-linked neuron can create 2","element":"span"},{"style":{"height":11.93},"width":69.14,"height":29.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/60-9.png","element":"img","alt":"n −","inline":true,"padRight":true},{"text":"1 breaking points. Let us derive this fact via induction. Suppose that ","element":"span"},{"style":{"height":15.02},"width":98.21,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/60-10.png","element":"img","alt":" Gn−1","inline":true,"padRight":true},{"text":"is the piecewise linear function of ","element":"span"},{"style":{"height":8.4},"width":73.6,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/60-11.png","element":"img","alt":" n −","inline":true,"padRight":true},{"text":"1 intra-linked neurons, then ","element":"span"},{"style":{"height":17.6},"width":1017.44,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/60-12.png","element":"img","alt":" Gn = σ(wx + b + Gn−1). When Gn−1 ≥ 0, Gn =","inline":true},{"style":{"height":17.6},"width":333.66,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/60-13.png","element":"img","alt":"σ(wx + b + Gn−1","inline":true},{"text":"), generating at most 2","element":"span"},{"style":{"height":14.73},"width":111.83,"height":36.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/60-14.png","element":"img","alt":"n−1 −","inline":true,"padRight":true},{"text":"1 breaking points (same as ","element":"span"},{"style":{"height":17.6},"width":271.54,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/60-15.png","element":"img","alt":" Gn−1). 
When","inline":true},{"style":{"height":17.6},"width":555.34,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/60-16.png","element":"img","alt":"Gn−1 ≤ 0, Gn = σ(wx + b","inline":true},{"text":") has one breaking point. ","element":"span"},{"text":"Without repetition, the number of breaking points of ","element":"span"},{"style":{"height":15.02},"width":55.31,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/60-17.png","element":"img","alt":" Gn","inline":true,"padRight":true},{"text":"is the summation of these newly-generated breaking points and breaking points of ","element":"span"},{"style":{"height":15.02},"width":98.21,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/60-18.png","element":"img","alt":" Gn−1","inline":true},{"text":", which is 2(2","element":"span"},{"style":{"height":19.13},"width":380.8,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/60-19.png","element":"img","alt":"n−1 − 1) + 1 = 2n −","inline":true,"padRight":true},{"text":"1. Because the first layer has ","element":"span"},{"style":{"height":10.62},"width":48.24,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/60-20.png","element":"img","alt":" w1","inline":true,"padRight":true},{"text":"neurons, at most ","element":"span"},{"style":{"height":17.6},"width":330.12,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/60-21.png","element":"img","alt":" w1/n groups of n","inline":true,"padRight":true},{"text":"intra-linked neurons are available. 
Thus, we can get at most ","element":"span"},{"style":{"height":24.22},"width":146.27,"height":60.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/60-22.png","element":"img","alt":"(2n−1)w1n","inline":true,"padRight":true},{"text":"+ 1 pieces.","element":"span"}],[{"text":"Computing the number of pieces for the hidden layers shares the same spirit with the first layer, combining the proof of Lemma ","element":"span"},{"href":"#id-133","text":"22, ","element":"a"},{"text":"we conclude that a ","element":"span"},{"style":{"height":15.2},"width":408.78,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/60-23.png","element":"img","alt":" R → R ReLU DNN,","inline":true,"padRight":true},{"text":"as shown in Figure ","element":"span"},{"href":"#id-132","text":"19(","element":"a"},{"text":"c), with depth ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"+ 1, widths ","element":"span"},{"style":{"height":15.6},"width":291.87,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/60-24.png","element":"img","alt":" w1, . . . , wk of k","inline":true,"padRight":true},{"text":"hidden layers, and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n ","element":"span"},{"text":"intra-linked neurons in each layer, has at most 2","element":"span"},{"style":{"height":31.6},"width":809.7,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/60-25.png","element":"img","alt":"k−1 ·�(2n−1)w1n + 1�· (2n−1)w2n · . . . 
· (2n−1)wkn","inline":true,"padRight":true},{"text":"pieces.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Lemma 25 (The 2nd New Result Extended to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Intra-Linked Neurons) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Let ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":": ","element":"span"},{"style":{"height":12.4},"width":131.39,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/61-0.png","element":"img","alt":"R → R","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"be a piecewise linear function with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p ","element":"span"},{"style":{"fontStyle":"italic"},"text":"pieces. If ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is represented by a ReLU DNN, as shown in Figure ","element":"span"},{"href":"#id-132","style":{"fontStyle":"italic"},"text":"19(","element":"a"},{"style":{"fontStyle":"italic"},"text":"c), with depth ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"+ 1 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n ","element":"span"},{"style":{"fontStyle":"italic"},"text":"neurons intra-linked in each hidden layer, then it must have size at least ","element":"span"},{"style":{"height":25.38},"width":429.81,"height":63.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/61-1.png","element":"img","alt":"n2(2n−1)k(2p)1/k − n2n−1","inline":true},{"style":{"fontStyle":"italic"},"text":". 
Conversely, any piecewise linear ","element":"span"},{"style":{"fontStyle":"italic"},"text":"function ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"style":{"fontStyle":"italic"},"text":"represented by a ReLU DNN of depth ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"+ 1 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and size at most ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s","element":"span"},{"style":{"fontStyle":"italic"},"text":", can have at most","element":"span"}],[{"style":{"width":"22%"},"width":387,"height":83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/61-2.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Proof ","element":"span"},{"text":"Please refer to our proof for Lemma ","element":"span"},{"href":"#id-135","text":"23.","element":"a"}]]},{"heading":"Reference","paragraphs":[[{"id":"id-41","text":"A. Adadi and M. Berrada. Peeking inside the black-box: A survey on explainable artificial ","element":"span"},{"text":"intelligence (xai). ","element":"span"},{"style":{"fontStyle":"italic"},"text":"IEEE Access","element":"span"},{"text":", 6:52138–60, 2018.","element":"span"}],[{"id":"id-83","text":"Zeyuan Allen-Zhu, Yuanzhi Li, and Yingyu Liang. Learning and generalization in overpa- ","element":"span"},{"text":"rameterized neural networks, going beyond two layers. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in neural information processing systems","element":"span"},{"text":", pages 6155–6166, 2019.","element":"span"}],[{"id":"id-32","text":"Raman Arora, Amitabh Basu, Poorya Mianjy, and Anirbit Mukherjee. Understanding deep ","element":"span"},{"text":"neural networks with rectified linear units. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1611.01491","element":"span"},{"text":", 2016.","element":"span"}],[{"id":"id-81","text":"Sanjeev Arora, Simon S Du, Wei Hu, Zhiyuan Li, Russ R Salakhutdinov, and Ruosong ","element":"span"},{"text":"Wang. On exact computation with an infinitely wide neural net. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":", pages 8139–8148, 2019.","element":"span"}],[{"text":"Peter L Bartlett, Dylan J Foster, and Matus J Telgarsky. Spectrally-normalized margin ","element":"span"},{"text":"bounds for neural networks. ","element":"span"},{"text":"In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":", pages 6240–6249, 2017.","element":"span"}],[{"id":"id-11","text":"Monica Bianchini and Franco Scarselli. On the complexity of neural network classifiers: ","element":"span"},{"text":"A comparison between shallow and deep architectures. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"IEEE transactions on neural networks and learning systems","element":"span"},{"text":", 25(8):1553–1565, 2014.","element":"span"}],[{"id":"id-123","text":"Alon Brutzkus, Amir Globerson, Eran Malach, and Shai Shalev-Shwartz. Sgd learns over- ","element":"span"},{"text":"parameterized networks that provably generalize on linearly separable data, 2017.","element":"span"}],[{"id":"id-21","text":"Jie Bu and Anuj Karpatne. Quadratic residual networks: A new class of neural networks ","element":"span"},{"text":"for solving forward and inverse problems in physics involving pdes. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 2021 SIAM International Conference on Data Mining (SDM)","element":"span"},{"text":", pages 675–683. SIAM, 2021.","element":"span"}],[{"id":"id-84","text":"Yuan Cao and Quanquan Gu. 
","element":"span"},{"text":"A generalization theory of gradient descent for learning over-parameterized deep relu networks, 2019.","element":"span"}],[{"id":"id-18","text":"CL Philip Chen and Zhulin Liu. Broad learning system: An effective and efficient incre- ","element":"span"},{"text":"mental learning system without the need for deep architecture. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"IEEE transactions on neural networks and learning systems","element":"span"},{"text":", 29(1):10–24, 2017.","element":"span"}],[{"id":"id-4","text":"Hu Chen, Yi Zhang, Mannudeep K Kalra, Feng Lin, Yang Chen, Peixi Liao, Jiliu Zhou, ","element":"span"},{"text":"and Ge Wang. Low-dose ct with a residual encoder-decoder convolutional neural network. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"IEEE transactions on medical imaging","element":"span"},{"text":", 36(12):2524–2535, 2017.","element":"span"}],[{"id":"id-115","text":"Xi Chen, Yan Duan, Rein Houthooft, John Schulman, Ilya Sutskever, and Pieter Abbeel. ","element":"span"},{"text":"Infogan: Interpretable representation learning by information maximizing generative adversarial nets. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in neural information processing systems","element":"span"},{"text":", pages 2172–2180, 2016.","element":"span"}],[{"id":"id-17","text":"Heng-Tze Cheng, Levent Koc, Jeremiah Harmsen, Tal Shaked, Tushar Chandra, Hrishi ","element":"span"},{"text":"Aradhye, Glen Anderson, Greg Corrado, Wei Chai, Mustafa Ispir, et al. Wide & deep learning for recommender systems. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 1st workshop on deep learning for recommender systems","element":"span"},{"text":", pages 7–10, 2016.","element":"span"}],[{"id":"id-20","text":"Lingyang Chu, Xia Hu, Juhua Hu, Lanjun Wang, and Jian Pei. 
","element":"span"},{"text":"Exact and consistent","element":"span"}],[{"style":{"width":"97%"},"width":1685,"height":149,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/63-0.png","element":"img"}],[{"id":"id-7","text":"Nadav Cohen, Or Sharir, and Amnon Shashua. On the expressive power of deep learning: ","element":"span"},{"text":"A tensor analysis. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Conference on learning theory","element":"span"},{"text":", pages 698–728. PMLR, 2016.","element":"span"}],[{"id":"id-2","text":"George E Dahl, Dong Yu, Li Deng, and Alex Acero. Context-dependent pre-trained deep ","element":"span"},{"text":"neural networks for large-vocabulary speech recognition. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"IEEE Transactions on audio, speech, and language processing","element":"span"},{"text":", 20(1):30–42, 2011.","element":"span"}],[{"id":"id-60","text":"Louis De Branges. The stone-weierstrass theorem. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the American Mathematical Society","element":"span"},{"text":", 10(5):822–824, 1959.","element":"span"}],[{"id":"id-34","text":"Zhijie Deng, Yinpeng Dong, Shifeng Zhang, and Jun Zhu. Understanding and exploring ","element":"span"},{"text":"the network with stochastic architectures. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":", 33, 2020.","element":"span"}],[{"id":"id-86","text":"Simon Du and Wei Hu. Width provably matters in optimization for deep linear neural ","element":"span"},{"text":"networks. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Machine Learning","element":"span"},{"text":", pages 1655–1664. PMLR, 2019.","element":"span"}],[{"id":"id-82","text":"Simon S Du, Xiyu Zhai, Barnabas Poczos, and Aarti Singh. 
Gradient descent provably","element":"span"}],[{"style":{"width":"97%"},"width":1686,"height":94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/63-1.png","element":"img"}],[{"id":"id-75","text":"Richard Ehrenborg. ","element":"span"},{"text":"The perles-shephard identity for non-convex polytopes. ","element":"span"},{"text":"Technical report, Technical Report, University of Kentucky, 2007.","element":"span"}],[{"id":"id-9","text":"Ronen Eldan and Ohad Shamir. The power of depth for feedforward neural networks. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Conference on learning theory","element":"span"},{"text":", pages 907–940. PMLR, 2016.","element":"span"}],[{"id":"id-40","text":"F. Fan and G. Wang. Fuzzy logic interpretation of quadratic networks. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Neurocomputing","element":"span"},{"text":", 374:10–21, 2020.","element":"span"}],[{"id":"id-1","text":"Fenglei Fan, Wenxiang Cong, and Ge Wang. A new type of neurons for machine learn- ","element":"span"},{"text":"ing. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International journal for numerical methods in biomedical engineering","element":"span"},{"text":", 34(2):e2920, 2018a.","element":"span"}],[{"id":"id-57","text":"Fenglei Fan, Wenxiang Cong, and Ge Wang. Generalized backpropagation algorithm for ","element":"span"},{"text":"training second-order neural networks. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International journal for numerical methods in biomedical engineering","element":"span"},{"text":", 34(5):e2956, 2018b.","element":"span"}],[{"id":"id-14","text":"Fenglei Fan, Dayang Wang, Hengtao Guo, Qikui Zhu, Pingkun Yan, Ge Wang, and Hengy- ","element":"span"},{"text":"ong Yu. 
On a sparse shortcut topology of artificial neural networks, 2018c.","element":"span"}],[{"id":"id-59","text":"Fenglei Fan, Hongming Shan, Mannudeep K Kalra, Ramandeep Singh, Guhan Qian, ","element":"span"},{"text":"Matthew Getzin, Yueyang Teng, Juergen Hahn, and Ge Wang. Quadratic autoencoder (q-ae) for low-dose ct denoising. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"IEEE transactions on medical imaging","element":"span"},{"text":", 39(6):2035–2050, 2019.","element":"span"}],[{"id":"id-58","text":"Fenglei Fan, Jinjun Xiong, and Ge Wang. Universal approximation with quadratic deep ","element":"span"},{"text":"networks. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Neural Networks","element":"span"},{"text":", 124:383–392, 2020.","element":"span"}],[{"id":"id-15","text":"Ken-Ichi Funahashi. ","element":"span"},{"text":"On the approximate realization of continuous mappings by neural networks. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Neural networks","element":"span"},{"text":", 2(3):183–192, 1989.","element":"span"}],[{"id":"id-122","text":"Rong Ge, Furong Huang, Chi Jin, and Yang Yuan. Escaping from saddle points—online ","element":"span"},{"text":"stochastic gradient for tensor decomposition. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Conference on Learning Theory","element":"span"},{"text":", pages 797–842, 2015.","element":"span"}],[{"id":"id-118","text":"Federico Girosi and Tomaso Poggio. Representation properties of networks: Kolmogorov’s ","element":"span"},{"text":"theorem is irrelevant. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Neural Computation","element":"span"},{"text":", 1(4):465–469, 1989.","element":"span"}],[{"text":"Noah Golowich, Alexander Rakhlin, and Ohad Shamir. 
Size-independent sample complexity ","element":"span"},{"text":"of neural networks, 2017.","element":"span"}],[{"id":"id-116","text":"Ian Goodfellow, Jean Pouget-Abadie, Mehdi Mirza, Bing Xu, David Warde-Farley, Sherjil ","element":"span"},{"text":"Ozair, Aaron Courville, and Yoshua Bengio. Generative adversarial nets. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in neural information processing systems","element":"span"},{"text":", pages 2672–2680, 2014a.","element":"span"}],[{"id":"id-0","text":"Ian Goodfellow, Yoshua Bengio, Aaron Courville, and Yoshua Bengio. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Deep learning","element":"span"},{"text":", volume 1. MIT press Cambridge, 2016.","element":"span"}],[{"id":"id-107","text":"Ian J Goodfellow, Jonathon Shlens, and Christian Szegedy. ","element":"span"},{"text":"Explaining and harnessing adversarial examples, 2014b.","element":"span"}],[{"id":"id-51","text":"Juncai He, Lin Li, Jinchao Xu, and Chunyue Zheng. Relu deep neural networks and linear ","element":"span"},{"text":"finite elements, 2018.","element":"span"}],[{"id":"id-16","text":"Kurt Hornik, Maxwell Stinchcombe, and Halbert White. Multilayer feedforward networks ","element":"span"},{"text":"are universal approximators. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Neural networks","element":"span"},{"text":", 2(5):359–366, 1989.","element":"span"}],[{"id":"id-85","text":"Kaixuan Huang, Yuqing Wang, Molei Tao, and Tuo Zhao. Why do deep residual networks ","element":"span"},{"text":"generalize better than deep feedforward networks?–a neural tangent kernel perspective, 2020.","element":"span"}],[{"id":"id-29","text":"Arthur Jacot, Franck Gabriel, and Cl´ement Hongler. Neural tangent kernel: Convergence ","element":"span"},{"text":"and generalization in neural networks. 
","element":"span"},{"text":"In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in neural information processing systems","element":"span"},{"text":", pages 8571–8580, 2018.","element":"span"}],[{"id":"id-22","text":"Duofa Ji, Jin Liu, Weiping Wen, Changhai Zhai, Wei Wang, and Evangelos I Katsanos. ","element":"span"},{"text":"Prediction of cumulative absolute velocity based on refined second-order deep neural network. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Earthquake Engineering","element":"span"},{"text":", pages 1–20, 2021.","element":"span"}],[{"id":"id-30","text":"Kenji Kawaguchi, Jiaoyang Huang, and Leslie Pack Kaelbling. Effect of depth and width ","element":"span"},{"text":"on local minima in deep learning. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Neural computation","element":"span"},{"text":", 31(7):1462–1498, 2019.","element":"span"}],[{"id":"id-104","text":"Diederik P Kingma and Jimmy Ba. Adam: A method for stochastic optimization. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"ICLR","element":"span"}],[{"style":{"width":"15%"},"width":275,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/65-0.png","element":"img"}],[{"id":"id-26","text":"A. N. Kolmogorov. On the representation of continuous functions of several variables by ","element":"span"},{"text":"superpositions of continuous functions of a smaller number of variables. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the USSR Academy of Sciences","element":"span"},{"text":", 108:179–182, 1956.","element":"span"}],[{"id":"id-3","text":"Ankit Kumar, Ozan Irsoy, Peter Ondruska, Mohit Iyyer, James Bradbury, Ishaan Gulrajani, ","element":"span"},{"text":"Victor Zhong, Romain Paulus, and Richard Socher. Ask me anything: Dynamic memory networks for natural language processing. 
In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International conference on machine learning","element":"span"},{"text":", pages 1378–1387. PMLR, 2016.","element":"span"}],[{"id":"id-108","text":"Alexey Kurakin, Ian Goodfellow, Samy Bengio, et al. Adversarial examples in the physical ","element":"span"},{"text":"world, 2016.","element":"span"}],[{"id":"id-78","text":"Jaehoon Lee, Yasaman Bahri, Roman Novak, Samuel S Schoenholz, Jeffrey Pennington,","element":"span"}],[{"style":{"width":"97%"},"width":1686,"height":95,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/65-1.png","element":"img"}],[{"id":"id-31","text":"Yoav Levine, Noam Wies, Or Sharir, Hofit Bata, and Amnon Shashua. Limits to depth ","element":"span"},{"text":"efficiencies of self-attention, 2020.","element":"span"}],[{"id":"id-67","text":"Hao Li, Asim Kadav, Igor Durdanovic, Hanan Samet, and Hans Peter Graf. Pruning filters ","element":"span"},{"text":"for efficient convnets, 2016.","element":"span"}],[{"id":"id-127","text":"Xingguo Li, Junwei Lu, Zhaoran Wang, Jarvis Haupt, and Tuo Zhao. On tighter general- ","element":"span"},{"text":"ization bound for deep neural networks: Cnns, resnets, and beyond, 2018.","element":"span"}],[{"id":"id-124","text":"Yuanzhi Li and Yingyu Liang. Learning overparameterized neural networks via stochas- ","element":"span"},{"text":"tic gradient descent on structured data. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":", pages 8157–8166, 2018.","element":"span"}],[{"id":"id-70","text":"William A Light and Elliot W Cheney. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Approximation theory in tensor product spaces","element":"span"},{"text":", volume 1169. Springer, 2006.","element":"span"}],[{"id":"id-13","text":"Hongzhou Lin and Stefanie Jegelka. 
Resnet with one-neuron hidden layers is a universal ","element":"span"},{"text":"approximator. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":", 31:6169–6178, 2018.","element":"span"}],[{"id":"id-117","text":"Zachary C Lipton. The mythos of model interpretability. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Queue","element":"span"},{"text":", 16(3):31–57, 2018.","element":"span"}],[{"id":"id-12","text":"Zhou Lu, Hongming Pu, Feicheng Wang, Zhiqiang Hu, and Liwei Wang. The expressive ","element":"span"},{"text":"power of neural networks: a view from the width. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 31st International Conference on Neural Information Processing Systems","element":"span"},{"text":", pages 6232–6240, 2017.","element":"span"}],[{"id":"id-23","text":"Pranav Mantini and Shishr K Shah. Cqnn: Convolutional quadratic neural networks. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"2020 25th International Conference on Pattern Recognition (ICPR)","element":"span"},{"text":", pages 9819–9826. IEEE, 2021.","element":"span"}],[{"id":"id-79","text":"Alexander G de G Matthews, Jiri Hron, Mark Rowland, Richard E Turner, and Zoubin","element":"span"}],[{"style":{"width":"97%"},"width":1686,"height":95,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/66-0.png","element":"img"}],[{"id":"id-8","text":"Hrushikesh N Mhaskar and Tomaso Poggio. Deep vs. shallow networks: An approximation ","element":"span"},{"text":"theory perspective. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Analysis and Applications","element":"span"},{"text":", 14(06):829–848, 2016.","element":"span"}],[{"id":"id-10","text":"Guido F Montufar, Razvan Pascanu, Kyunghyun Cho, and Yoshua Bengio. On the number ","element":"span"},{"text":"of linear regions of deep neural networks. 
In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in neural information processing systems","element":"span"},{"text":", pages 2924–2932, 2014.","element":"span"}],[{"id":"id-109","text":"Seyed-Mohsen Moosavi-Dezfooli, Alhussein Fawzi, and Pascal Frossard. Deepfool: a simple ","element":"span"},{"text":"and accurate method to fool deep neural networks. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the IEEE conference on computer vision and pattern recognition","element":"span"},{"text":", pages 2574–2582, 2016.","element":"span"}],[{"id":"id-77","text":"Radford M Neal. Priors for infinite networks. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Bayesian Learning for Neural Networks","element":"span"},{"text":", pages 29–53. Springer, 1996.","element":"span"}],[{"id":"id-129","text":"Behnam Neyshabur, Ryota Tomioka, and Nathan Srebro. Norm-based capacity control in ","element":"span"},{"text":"neural networks. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Conference on Learning Theory","element":"span"},{"text":", pages 1376–1401, 2015.","element":"span"}],[{"text":"Behnam Neyshabur, Srinadh Bhojanapalli, and Nathan Srebro. A pac-bayesian approach","element":"span"}],[{"style":{"width":"97%"},"width":1685,"height":95,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/66-1.png","element":"img"}],[{"id":"id-80","text":"Roman Novak, Lechao Xiao, Yasaman Bahri, Jaehoon Lee, Greg Yang, Jiri Hron, Daniel A","element":"span"}],[{"style":{"width":"97%"},"width":1686,"height":150,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/66-2.png","element":"img"}],[{"id":"id-37","text":"Yookoon Park, Sangho Lee, Gunhee Kim, and David Blei. Unsupervised representation ","element":"span"},{"text":"learning via neural activation coding. 
In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Machine Learning","element":"span"},{"text":", pages 8391–8400. PMLR, 2021.","element":"span"}],[{"id":"id-68","text":"Antonio Polino, Razvan Pascanu, and Dan Alistarh. Model compression via distillation and ","element":"span"},{"text":"quantization. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Learning Representations","element":"span"},{"text":", 2018.","element":"span"}],[{"id":"id-44","text":"E. W. Saad and D. C. Wunsch II. Neural network explanation using inversion. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Neural networks","element":"span"},{"text":", 20(1):78–93, 2007.","element":"span"}],[{"id":"id-39","text":"Thiago Serra and Srikumar Ramalingam. Empirical bounds on linear regions of deep rectifier ","element":"span"},{"text":"networks. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"AAAI","element":"span"},{"text":", pages 5628–5635, 2020.","element":"span"}],[{"id":"id-35","text":"Thiago Serra, Christian Tjandraatmadja, and Srikumar Ramalingam. Bounding and count- ","element":"span"},{"text":"ing linear regions of deep neural networks. ","element":"span"},{"text":"In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Machine Learning","element":"span"},{"text":", pages 4558–4566. PMLR, 2018.","element":"span"}],[{"id":"id-43","text":"Rudy Setiono and Huan Liu. Understanding neural networks via rule extraction. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"IJCAI","element":"span"},{"text":", volume 1, pages 480–485. Citeseer, 1995.","element":"span"}],[{"id":"id-6","text":"Lech Szymanski and Brendan McCane. Deep networks are effective encoders of periodicity. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"IEEE transactions on neural networks and learning systems","element":"span"},{"text":", 25(10):1816–1827, 2014.","element":"span"}],[{"id":"id-42","text":"Sebastian Thrun. Extracting rules from artificial neural networks with distributed repre- ","element":"span"},{"text":"sentations. pages 505–512. MORGAN KAUFMANN PUBLISHERS, 1995.","element":"span"}],[{"id":"id-102","text":"Laurens Van der Maaten and Geoffrey Hinton. Visualizing data using t-sne. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of machine learning research","element":"span"},{"text":", 9(11), 2008.","element":"span"}],[{"id":"id-5","text":"Ge Wang. A perspective on deep imaging. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Ieee Access","element":"span"},{"text":", 4:8914–8924, 2016.","element":"span"}],[{"id":"id-50","text":"Shuning Wang and Xusheng Sun. Generalization of hinging hyperplanes. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"IEEE Transactions on Information Theory","element":"span"},{"text":", 51(12):4425–4431, 2005.","element":"span"}],[{"id":"id-112","text":"David Vernon Widder. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advanced calculus","element":"span"},{"text":". Courier Corporation, 1989.","element":"span"}],[{"id":"id-66","text":"Jiaxiang Wu, Cong Leng, Yuhang Wang, Qinghao Hu, and Jian Cheng. Quantized convo- ","element":"span"},{"text":"lutional neural networks for mobile devices. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","element":"span"},{"text":", pages 4820–4828, 2016.","element":"span"}],[{"id":"id-33","text":"Saining Xie, Alexander Kirillov, Ross Girshick, and Kaiming He. ","element":"span"},{"text":"Exploring randomly wired neural networks for image recognition. 
In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the IEEE International Conference on Computer Vision","element":"span"},{"text":", pages 1284–1293, 2019.","element":"span"}],[{"id":"id-36","text":"Huan Xiong, Lei Huang, Mengyang Yu, Li Liu, Fan Zhu, and Ling Shao. On the number of ","element":"span"},{"text":"linear regions of convolutional neural networks. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Machine Learning","element":"span"},{"text":", pages 10514–10523. PMLR, 2020.","element":"span"}],[{"id":"id-24","text":"Zirui Xu, Fuxun Yu, Jinjun Xiong, and Xiang Chen. Quadralib: A performant quadratic","element":"span"}],[{"style":{"width":"97%"},"width":1686,"height":94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.02515/images/67-0.png","element":"img"}],[{"id":"id-19","text":"Sergey Zagoruyko and Nikos Komodakis. Wide residual networks. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"BMVC","element":"span"},{"text":", 2016.","element":"span"}],[{"id":"id-126","text":"Chiyuan Zhang, Samy Bengio, Moritz Hardt, Benjamin Recht, and Oriol Vinyals. Under- ","element":"span"},{"text":"standing deep learning requires rethinking generalization, 2016.","element":"span"}],[{"id":"id-69","text":"Xiangyu Zhang, Jianhua Zou, Xiang Ming, Kaiming He, and Jian Sun. Efficient and ac- ","element":"span"},{"text":"curate approximations of nonlinear convolutional networks. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the IEEE Conference on Computer Vision and pattern Recognition","element":"span"},{"text":", pages 1984–1992, 2015.","element":"span"}]]}],"_version":"3.3.4"},"paperNode":"$1b:props:children:props:children:0:props:product"}]]]}]}]