1b:["$","$L29",null,{"isWhiteLabelled":false,"children":["$","$Lb",null,{"pt":{"compact":0,"expanded":3},"children":[["$","$L2a",null,{"noStar":true,"publisher":true,"task":true,"params":true,"size":"xl","product":{"id":"eyJwYXBlcklEIjoiMTkwNy4wNDEwOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","updated":"2022-04-12T14:31:47.000Z","paperID":"1907.04108","published":"2019-07-09T12:17:03.000Z","authors":"[\"Justin Sirignano\",\"Konstantinos Spiliopoulos\"]","title":"Scaling Limit of Neural Networks with the Xavier Initialization and Convergence to a Global Minimum","scoreTrending":null,"summary":"$2b","lastCheckedForCode":"2022-09-05T14:27:40.849Z","links":[{"id":"eyJ1cmwiOiJodHRwczovL3BhcGVyc3dpdGhjb2RlLmNvbS9wYXBlci9zY2FsaW5nLWxpbWl0LW9mLW5ldXJhbC1uZXR3b3Jrcy13aXRoLXRoZSJ9","type":"pwc","url":"https://paperswithcode.com/paper/scaling-limit-of-neural-networks-with-the","data":"{\"date\":\"2022-08-04T19:19:26.127Z\"}"}],"reposConnection":{"edges":[]},"models":[],"tags":[],"summaries":[],"emailsConnection":{"edges":[{"author":"konstantinos spiliopoulos","node":{"id":"eyJhZGRyZXNzIjoia3NwaWxpb3BAbWF0aC5idS5lZHUifQ==","address":"kspiliop@math.bu.edu","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[{"avatar":"https://avatars.githubusercontent.com/u/106413949?v=4","username":"kspiliopoulos"}],"scholar":[{"thirdPartyID":"-NeMa-IAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiI2OGJkMTYzNy1mNDQxLTQwZDItYjgxZS1hNzMzNTZmNDAzMzUifQ==","name":"konstantinos 
spiliopoulos","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMTcwOC4wNzQ2OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1708.07469"},{"id":"eyJwYXBlcklEIjoiMTgwOC4wOTM3MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1808.09372"},{"id":"eyJwYXBlcklEIjoiMTkwMy4wNDQ0MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1903.04440"},{"id":"eyJwYXBlcklEIjoiMjEwNS4wODYzMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2105.08633"},{"id":"eyJwYXBlcklEIjoiMTYxMS4wNTU0NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1611.05545"},{"id":"eyJwYXBlcklEIjoiMTcxMC4wNDI3MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1710.04273"},{"id":"eyJwYXBlcklEIjoiMTkxMS4wNzMwNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1911.07304"},{"id":"eyJwYXBlcklEIjoiMTkwNy4wNDEwOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1907.04108"},{"id":"eyJwYXBlcklEIjoiMjMwOC4xNDU1NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2308.14555"},{"id":"eyJwYXBlcklEIjoiMjMwMi4wNzIyNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2302.07227"},{"id":"eyJwYXBlcklEIjoiMjAxMS4xMDQ4NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2011.10487"}]}]}},{"author":"justin 
sirignano","node":{"id":"eyJhZGRyZXNzIjoiamFzaXJpZ25AaWxsaW5vaXMuZWR1In0=","address":"jasirign@illinois.edu","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[{"avatar":"https://avatars.githubusercontent.com/u/13497644?v=4","username":"jasirign"}],"scholar":[{"thirdPartyID":"k6C3VjYAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiIwNjA3MzkwMy05NzI5LTRmMTgtODQ2My03NDA3ZGFiMjY1ODQifQ==","name":"Justin Sirignano","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMTcwOC4wNzQ2OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1708.07469"},{"id":"eyJwYXBlcklEIjoiMTgwOC4wOTM3MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1808.09372"},{"id":"eyJwYXBlcklEIjoiMTkwMy4wNDQ0MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1903.04440"},{"id":"eyJwYXBlcklEIjoiMjEwNS4wODYzMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2105.08633"},{"id":"eyJwYXBlcklEIjoiMjEwNS4wMTAzMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2105.01030"},{"id":"eyJwYXBlcklEIjoiMTkxMS4wOTE0NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1911.09145"},{"id":"eyJwYXBlcklEIjoiMTYxMS4wNTU0NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1611.05545"},{"id":"eyJwYXBlcklEIjoiMTcxMC4wNDI3MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1710.04273"},{"id":"eyJwYXBlcklEIjoiMjEwOC4wODY1NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2108.08655"},{"id":"eyJwYXBlcklEIjoiMTkxMS4wNzMwNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1911.07304"},{"id":"eyJwYXBlcklEIjoiMjIwMi4wNjYzNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2202.06637"},{"id
":"eyJwYXBlcklEIjoiMjIwNy4wNDQ5NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2207.04496"},{"id":"eyJwYXBlcklEIjoiMjIwOC4wMzQ5OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2208.03498"},{"id":"eyJwYXBlcklEIjoiMTkwNy4wNDEwOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1907.04108"},{"id":"eyJwYXBlcklEIjoiMjMwOC4xNDU1NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2308.14555"},{"id":"eyJwYXBlcklEIjoiMjQwMy4xNjgyNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2403.16825"}]}]}}]},"__typename":"paper","authorArray":["Justin Sirignano","Konstantinos Spiliopoulos"]}}],["$","$L18",null,{"container":true,"columns":100,"spacing":{"compact":0,"expanded":2,"large":3},"children":[["$","$L18",null,{"size":{"compact":100,"expanded":100,"large":68},"children":[["$","$7",null,{"children":["$","$L2c",null,{"publisher":"arxiv","paperID":"1907.04108","product":{"paper":"$1b:props:children:props:children:0:props:product","models":"$1b:props:children:props:children:0:props:product:models"},"isWhiteLabelled":false}]}],["$","$7",null,{"children":["$","$L2d",null,{"article":"$L2e","model":"$undefined"}]}]]}],["$","$L18",null,{"size":"grow","children":["$","$L2f",null,{}]}]]}],["$","$7",null,{"children":null}],[["$","audio",null,{"id":"tts"}],["$","$L30",null,{"paperID":"1907.04108","publisher":"arxiv","paperJSON":{"title":"Scaling Limit of Neural Networks with the Xavier Initialization and Convergence to a Global Minimum","paperID":"1907.04108","avgLineHeight":12,"imgScale":4,"sections":[{"heading":"Abstract","paragraphs":[[{"text":"We analyze single-layer neural networks with the Xavier initialization in the asymptotic regime of large numbers of hidden units and large numbers of stochastic gradient descent training steps. 
The evolution of the neural network during training can be viewed as a stochastic system and, using techniques from stochastic analysis, we prove the neural network converges in distribution to a random ODE with a Gaussian distribution. The limit is completely different than in the typical mean-field results for neural networks due to the ","element":"span"},{"style":{"height":19.63},"width":49,"height":49.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/0-0.png","element":"img","alt":"1√N ","inline":true,"padRight":true},{"text":"normalization factor in the Xavier initialization (versus the ","element":"span"},{"style":{"height":17.15},"width":25,"height":42.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/0-1.png","element":"img","alt":" 1N ","inline":true,"padRight":true},{"text":"factor in the typical ","element":"span"},{"text":"mean-field framework). Although the pre-limit problem of optimizing a neural network is non-convex (and therefore the neural network may converge to a local minimum), the limit equation minimizes a (quadratic) convex objective function and therefore converges to a global minimum. 
Furthermore, under reasonable assumptions, the matrix in the limiting quadratic objective function is positive definite and thus the neural network (in the limit) will converge to a global minimum with zero loss on the training set.","element":"span"}]]},{"heading":"1 Introduction","paragraphs":[[{"text":"Consider a single-layer neural network with the Xavier initialization:","element":"span"}],[{"style":{"width":"29%"},"width":561,"height":118,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/0-2.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":16.56},"width":464.84,"height":41.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/0-3.png","element":"img","alt":" Ci ∈ R, W i ∈ Rd, x ∈ Rd","inline":true},{"text":", and ","element":"span"},{"style":{"height":16},"width":50.36,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/0-4.png","element":"img","alt":" σ(·","inline":true},{"text":") : ","element":"span"},{"style":{"height":11.2},"width":125.96,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/0-5.png","element":"img","alt":" R → R","inline":true},{"text":". The number of hidden units is ","element":"span"},{"text":"N ","element":"span"},{"text":"and the output is scaled by a factor ","element":"span"},{"style":{"height":22.63},"width":53.4,"height":56.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/0-6.png","element":"img","alt":"1√N","inline":true,"padRight":true},{"text":"(the widely-used Xavier initialization, see ","element":"span"},{"href":"#id-0","referenceIndex":1,"text":"[2]","element":"a"},{"text":"). 
","element":"span"},{"text":"The objective function is","element":"span"}],[{"style":{"width":"29%"},"width":553,"height":96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/0-7.png","element":"img"}],[{"text":"where the data (","element":"span"},{"style":{"height":16},"width":443.72,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/0-8.png","element":"img","alt":"X, Y ) ∼ π(dx, dy), Y ∈ R","inline":true},{"text":", and the parameters ","element":"span"},{"style":{"height":10.8},"width":19,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/0-9.png","element":"img","alt":" θ","inline":true,"padRight":true},{"text":"= (","element":"span"},{"style":{"height":18.16},"width":668.16,"height":45.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/0-10.png","element":"img","alt":"C1, . . . , CN, W 1, . . . , W N) ∈ RN×(1+d)","inline":true},{"text":". For notational convenience, we may refer to ","element":"span"},{"style":{"height":17.36},"width":125.56,"height":43.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/0-11.png","element":"img","alt":" gN(x; θ","inline":true},{"text":") as ","element":"span"},{"style":{"height":17.36},"width":88.76,"height":43.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/0-12.png","element":"img","alt":" gN(x","inline":true},{"text":") in our analysis below.","element":"span"}],[{"text":"The model parameters ","element":"span"},{"style":{"height":10.8},"width":19,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/0-13.png","element":"img","alt":" θ","inline":true,"padRight":true},{"text":"are trained using stochastic gradient descent. 
The parameter updates are given by:","element":"span"}],[{"id":"id-19","style":{"width":"99%"},"width":1869,"height":369,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/0-14.png","element":"img"}],[{"text":"for ","element":"span"},{"text":"k ","element":"span"},{"text":"= 0","element":"span"},{"text":", ","element":"span"},{"text":"1","element":"span"},{"text":", . . ., T N ","element":"span"},{"text":"where ","element":"span"},{"text":"T > ","element":"span"},{"text":"0. ","element":"span"},{"style":{"height":13.36},"width":52.44,"height":33.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-0.png","element":"img","alt":" αN","inline":true,"padRight":true},{"text":"is the learning rate (which may depend upon ","element":"span"},{"text":"N","element":"span"},{"text":"). The data samples are (","element":"span"},{"style":{"height":10},"width":96.68,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-1.png","element":"img","alt":"xk, yk","inline":true},{"text":") are i.i.d. samples from a distribution ","element":"span"},{"style":{"height":16},"width":140.96,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-2.png","element":"img","alt":" π(dx, dy","inline":true},{"text":"). We impose the following assumption.","element":"span"}],[{"id":"id-1","style":{"width":"30%"},"width":576,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-3.png","element":"img"}],[{"text":"• ","element":"span"},{"text":"The activation function ","element":"span"},{"style":{"height":17.97},"width":119.68,"height":44.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-4.png","element":"img","alt":" σ ∈ C2b","inline":true,"padRight":true},{"text":"(","element":"span"},{"text":"R","element":"span"},{"text":"), i.e. 
","element":"span"},{"style":{"height":6.8},"width":23,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-5.png","element":"img","alt":" σ","inline":true,"padRight":true},{"text":"is twice continuously differentiable and bounded.","element":"span"}],[{"text":"• ","element":"span"},{"text":"The randomly initialized parameters (","element":"span"},{"style":{"height":16.99},"width":118.52,"height":42.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-6.png","element":"img","alt":"Ci0, W i0","inline":true},{"text":") are i.i.d., mean-zero random variables with a distribution ","element":"span"},{"style":{"height":16},"width":162.44,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-7.png","element":"img","alt":"µ0(dc, dw","inline":true},{"text":").","element":"span"}],[{"text":"• ","element":"span"},{"text":"The random variable ","element":"span"},{"style":{"height":16.8},"width":44.32,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-8.png","element":"img","alt":" Ci0","inline":true,"padRight":true},{"text":"has compact support and ","element":"span"},{"style":{"height":16},"width":259.84,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-9.png","element":"img","alt":" ⟨∥w∥ , µ0⟩ < ∞","inline":true},{"text":".","element":"span"}],[{"text":"• ","element":"span"},{"text":"The sequence of data samples (","element":"span"},{"style":{"height":10},"width":96.68,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-10.png","element":"img","alt":"xk, yk","inline":true},{"text":") is i.i.d. 
from the probability distribution ","element":"span"},{"style":{"height":16},"width":140.96,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-11.png","element":"img","alt":" π(dx, dy","inline":true},{"text":").","element":"span"}],[{"style":{"width":"1%"},"width":26,"height":5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-12.png","element":"img"}],[{"text":"• ","element":"span"},{"text":"There is a fixed dataset of ","element":"span"},{"text":"M ","element":"span"},{"text":"data samples (","element":"span"},{"style":{"height":18.38},"width":204.64,"height":45.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-13.png","element":"img","alt":"x(i), y(i))Mi=1","inline":true,"padRight":true},{"text":"and therefore ","element":"span"},{"style":{"height":16},"width":140.96,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-14.png","element":"img","alt":" π(dx, dy","inline":true},{"text":") = 1","element":"span"},{"text":"M","element":"span"}],[{"style":{"width":"2%"},"width":47,"height":3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-15.png","element":"img"}],[{"text":"Note that the last assumption also implies that ","element":"span"},{"style":{"height":16},"width":140.96,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-16.png","element":"img","alt":" π(dx, dy","inline":true},{"text":") has compact support.","element":"span"}],[{"text":"We will study the limiting behavior of the network output ","element":"span"},{"style":{"height":17.78},"width":47.64,"height":44.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-17.png","element":"img","alt":" gNk","inline":true,"padRight":true},{"text":"(","element":"span"},{"text":"x","element":"span"},{"text":") for 
","element":"span"},{"style":{"height":18.16},"width":450.56,"height":45.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-18.png","element":"img","alt":" x ∈ D = {x(1), . . . , x(M)}","inline":true,"padRight":true},{"text":"as the ","element":"span"},{"text":"number of hidden units ","element":"span"},{"text":"N ","element":"span"},{"text":"and stochastic gradient descent steps ","element":"span"},{"text":"T N ","element":"span"},{"text":"simultaneously become large. The","element":"span"}],[{"text":"network output converges in distribution to the solution of a random ODE as ","element":"span"},{"style":{"height":11.2},"width":138.4,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-19.png","element":"img","alt":" N → ∞","inline":true},{"text":".","element":"span"}],[{"text":"1.1 ","element":"span"},{"text":"Main Results","element":"span"}],[{"text":"Define the empirical measure","element":"span"}],[{"style":{"width":"18%"},"width":346,"height":117,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-20.png","element":"img"}],[{"text":"Note that the neural network output can be written as the inner-product","element":"span"}],[{"style":{"width":"27%"},"width":514,"height":73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-21.png","element":"img"}],[{"text":"Due to Assumption ","element":"span"},{"href":"#id-1","text":"1.1, ","element":"a"},{"text":"as ","element":"span"},{"style":{"height":11.2},"width":138.4,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-22.png","element":"img","alt":" N → ∞","inline":true,"padRight":true},{"text":"and for ","element":"span"},{"style":{"height":11.6},"width":102.52,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-23.png","element":"img","alt":" x ∈ 
D","inline":true},{"text":",","element":"span"}],[{"id":"id-5","style":{"width":"58%"},"width":1100,"height":53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-24.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":15.36},"width":135.68,"height":38.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-25.png","element":"img","alt":" G ∈ RM","inline":true,"padRight":true},{"text":"is a Gaussian random variable. We also of course have that","element":"span"}],[{"style":{"width":"13%"},"width":253,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-26.png","element":"img"}],[{"text":"Define the scaled processes","element":"span"}],[{"style":{"width":"90%"},"width":1687,"height":258,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-27.png","element":"img"}],[{"text":"We will study convergence in distribution of the random process (","element":"span"},{"style":{"height":17.39},"width":122.04,"height":43.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-28.png","element":"img","alt":"µNt , hNt","inline":true,"padRight":true},{"text":") as ","element":"span"},{"style":{"height":11.2},"width":155.68,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-29.png","element":"img","alt":" N → ∞","inline":true,"padRight":true},{"text":"in the space ","element":"span"},{"style":{"height":13.1},"width":57.12,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-30.png","element":"img","alt":"DE","inline":true},{"text":"([0","element":"span"},{"text":", T ","element":"span"},{"text":"]) where ","element":"span"},{"style":{"height":17.36},"width":497.76,"height":43.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/1-31.png","element":"img","alt":" E = M(R1+d) × RM. 
DE","inline":true},{"text":"([0","element":"span"},{"text":", T ","element":"span"},{"text":"]) is the Skorokhod space and ","element":"span"},{"text":"M","element":"span"},{"text":"(","element":"span"},{"text":"S","element":"span"},{"text":") is the space of probability measures on ","element":"span"},{"text":"S","element":"span"},{"text":".","element":"span"}],[{"text":"The main contribution of our work is a rigorous proof that a neural network with the Xavier initialization and trained with stochastic gradient descent converges in distribution to a random ODE as the number of units and training steps become large. In addition, our convergence analysis will also address several interesting questions:","element":"span"}],[{"text":"• ","element":"span"},{"text":"Our results provide a rigorous convergence guarantee for the Xavier initialization (i.e., the ","element":"span"},{"style":{"height":22.82},"width":53.4,"height":57.04,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/2-0.png","element":"img","alt":"1√N","inline":true,"padRight":true},{"text":"nor- ","element":"span"},{"text":"malization factor), which is almost universally used in deep learning models. 
","element":"span"},{"text":"A priori ","element":"span"},{"text":"it is unclear if the neural network ","element":"span"},{"style":{"height":17.97},"width":47.64,"height":44.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/2-1.png","element":"img","alt":" gNk","inline":true,"padRight":true},{"text":"(","element":"span"},{"text":"x","element":"span"},{"text":") will converge as ","element":"span"},{"style":{"height":11.2},"width":138.4,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/2-2.png","element":"img","alt":" N → ∞","inline":true,"padRight":true},{"text":"since, for ","element":"span"},{"text":"k > ","element":"span"},{"text":"0, the ","element":"span"},{"style":{"height":16.96},"width":191,"height":42.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/2-3.png","element":"img","alt":" Ciσ(W i · x","inline":true},{"text":") is correlated with ","element":"span"},{"style":{"height":16.96},"width":202.04,"height":42.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/2-4.png","element":"img","alt":"Cjσ(W j · x","inline":true},{"text":") and therefore a limit may not exist. If a limit did not exist, this would imply that the neural network model could have poor numerical behavior for large ","element":"span"},{"text":"N","element":"span"},{"text":". 
We prove that a limit does exist.","element":"span"}],[{"text":"• ","element":"span"},{"text":"Although the pre-limit problem of optimizing a neural network is non-convex (and therefore the neural network may converge to a local minimum), the limit equation minimizes a quadratic objective function.","element":"span"}],[{"text":"• ","element":"span"},{"text":"We show that the matrix in the limiting quadratic objective function is positive definite, and therefore the neural network (in the limit) will converge to a global minimum with zero loss on the training set.","element":"span"}],[{"text":"Convergence to a global minimum for a neural network has been recently proven in ","element":"span"},{"href":"#id-2","referenceIndex":3,"text":"[3]","element":"a"},{"text":", ","element":"span"},{"href":"#id-3","referenceIndex":4,"text":"[4]","element":"a"},{"text":", and ","element":"span"},{"href":"#id-3","referenceIndex":4,"text":"[5]","element":"a"},{"text":". Our work contributes to this growing literature by showing that convergence to a global minimum is a simple consequence of the mean-field limit for neural networks. A detailed discussion of these papers and other related literature is provided in Section ","element":"span"},{"href":"#id-4","text":"1.2.","element":"a"}],[{"text":"Our main results are presented below.","element":"span"}],[{"id":"id-38","text":"Theorem 1.2. 
","element":"span"},{"text":"The process (","element":"span"},{"style":{"height":17.39},"width":122.04,"height":43.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/2-5.png","element":"img","alt":"µNt , hNt","inline":true,"padRight":true},{"text":") converges in distribution in the space ","element":"span"},{"style":{"height":13.11},"width":57.12,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/2-6.png","element":"img","alt":" DE","inline":true},{"text":"([0","element":"span"},{"text":", T ","element":"span"},{"text":"]) as ","element":"span"},{"style":{"height":11.2},"width":138.4,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/2-7.png","element":"img","alt":" N → ∞","inline":true,"padRight":true},{"text":"to (","element":"span"},{"style":{"height":14},"width":91.2,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/2-8.png","element":"img","alt":"µt, ht","inline":true},{"text":") ","element":"span"},{"text":"which satisfies, for every ","element":"span"},{"style":{"height":17.36},"width":221,"height":43.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/2-9.png","element":"img","alt":" f ∈ Cb2(R1+d","inline":true},{"text":"), the random ODE","element":"span"}],[{"id":"id-6","style":{"width":"84%"},"width":1583,"height":319,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/2-10.png","element":"img"}],[{"text":"Proof. 
","element":"span"},{"text":"See Sections ","element":"span"},{"text":"3, ","element":"span"},{"text":"4, ","element":"span"},{"text":"and ","element":"span"},{"text":"5.","element":"span"}],[{"text":"Recall that ","element":"span"},{"style":{"height":15.55},"width":138.08,"height":38.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/2-11.png","element":"img","alt":" G ∈ RM","inline":true,"padRight":true},{"text":"is a Gaussian random variable; see equation ","element":"span"},{"href":"#id-5","text":"(1.2)","element":"a"},{"text":". In addition, note that ¯","element":"span"},{"style":{"height":10},"width":36,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/2-12.png","element":"img","alt":"µt","inline":true,"padRight":true},{"text":"in the limit equation ","element":"span"},{"href":"#id-6","text":"(1.3) ","element":"a"},{"text":"is a constant, i.e. ","element":"span"},{"style":{"height":10},"width":131.2,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/2-13.png","element":"img","alt":" µt = µ0","inline":true,"padRight":true},{"text":"for ","element":"span"},{"style":{"height":10.8},"width":52.44,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/2-14.png","element":"img","alt":" t ∈","inline":true,"padRight":true},{"text":"[0","element":"span"},{"text":", T ","element":"span"},{"text":"]. 
Therefore, ","element":"span"},{"href":"#id-6","text":"(1.3) ","element":"a"},{"text":"reduces to","element":"span"}],[{"id":"id-7","style":{"width":"86%"},"width":1614,"height":275,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/2-15.png","element":"img"}],[{"text":"Since ","element":"span"},{"href":"#id-7","text":"(1.4) ","element":"a"},{"text":"is a linear equation in ","element":"span"},{"style":{"height":14.07},"width":74.48,"height":35.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/2-16.png","element":"img","alt":" CRM","inline":true,"padRight":true},{"text":"([0","element":"span"},{"text":", T ","element":"span"},{"text":"]), the solution ","element":"span"},{"style":{"height":13.11},"width":35.04,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/2-17.png","element":"img","alt":" ht","inline":true,"padRight":true},{"text":"is unique.","element":"span"}],[{"text":"To better understand ","element":"span"},{"href":"#id-7","text":"(1.4)","element":"a"},{"text":", define the matrix ","element":"span"},{"style":{"height":14.35},"width":197.6,"height":35.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/2-18.png","element":"img","alt":" A ∈ RM×M","inline":true,"padRight":true},{"text":"where","element":"span"}],[{"style":{"width":"65%"},"width":1222,"height":72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/2-19.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":14.8},"width":213.84,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/2-20.png","element":"img","alt":" x, x′ ∈ D. A","inline":true,"padRight":true},{"text":"is finite-dimensional since we fixed a training set of size ","element":"span"},{"text":"M ","element":"span"},{"text":"in the beginning. 
Then, ","element":"span"},{"href":"#id-7","text":"(1.4) ","element":"a"},{"text":"becomes","element":"span"}],[{"style":{"width":"22%"},"width":417,"height":148,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/2-21.png","element":"img"}],[{"text":"where ","element":"span"},{"text":"ˆ","element":"span"},{"text":"Y ","element":"span"},{"text":"= (","element":"span"},{"style":{"height":17.36},"width":231.36,"height":43.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/3-0.png","element":"img","alt":"y(1), . . . , y(M)","inline":true},{"text":"). Therefore, ","element":"span"},{"style":{"height":13.1},"width":35.04,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/3-1.png","element":"img","alt":" ht","inline":true,"padRight":true},{"text":"is the solution to a continuous-time gradient descent algorithm which minimizes a quadratic objective function.","element":"span"}],[{"style":{"width":"30%"},"width":572,"height":245,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/3-2.png","element":"img"}],[{"text":"Therefore, even though the pre-limit optimization problem is non-convex, the neural network’s limit will minimize a quadratic objective function.","element":"span"}],[{"text":"An interesting question is whether ","element":"span"},{"style":{"height":13.1},"width":90.4,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/3-3.png","element":"img","alt":" ht →","inline":true,"padRight":true},{"text":"ˆ","element":"span"},{"text":"Y ","element":"span"},{"text":"as ","element":"span"},{"style":{"height":10.4},"width":120.64,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/3-4.png","element":"img","alt":" t → ∞","inline":true},{"text":". That is, in the limit of large numbers of hidden units and many training steps, does the neural network model converge to a global minimum with zero training error. 
Theorem ","element":"span"},{"href":"#id-8","text":"1.3 ","element":"a"},{"text":"shows that ","element":"span"},{"style":{"height":13.1},"width":88,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/3-5.png","element":"img","alt":" ht →","inline":true,"padRight":true},{"text":"ˆ","element":"span"},{"text":"Y ","element":"span"},{"text":"as ","element":"span"},{"style":{"height":10.4},"width":116.8,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/3-6.png","element":"img","alt":" t → ∞","inline":true,"padRight":true},{"text":"if ","element":"span"},{"text":"A ","element":"span"},{"text":"is positive definite. Corollary ","element":"span"},{"href":"#id-9","text":"1.4 ","element":"a"},{"text":"proves that, under reasonable hyperparameter choices and if the data samples are in distinct directions (see ","element":"span"},{"text":"[1]","element":"span"},{"text":"), ","element":"span"},{"text":"A ","element":"span"},{"text":"will be positive definite.","element":"span"}],[{"id":"id-8","text":"Theorem 1.3. ","element":"span"},{"text":"If ","element":"span"},{"text":"A ","element":"span"},{"text":"is positive definite, then","element":"span"}],[{"style":{"width":"20%"},"width":377,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/3-7.png","element":"img"}],[{"text":"Proof. ","element":"span"},{"text":"Consider the transformation ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":13.11},"width":166.84,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/3-8.png","element":"img","alt":"ht = ht −","inline":true,"padRight":true},{"text":"ˆ","element":"span"},{"text":"Y ","element":"span"},{"text":". 
Then,","element":"span"}],[{"style":{"width":"16%"},"width":308,"height":105,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/3-9.png","element":"img"}],[{"text":"Then, ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":13.1},"width":88,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/3-10.png","element":"img","alt":"ht →","inline":true,"padRight":true},{"text":"0 (and consequently ","element":"span"},{"style":{"height":13.1},"width":88,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/3-11.png","element":"img","alt":" ht →","inline":true,"padRight":true},{"text":"ˆ","element":"span"},{"text":"Y ","element":"span"},{"text":") as ","element":"span"},{"style":{"height":10.4},"width":116.8,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/3-12.png","element":"img","alt":" t → ∞","inline":true,"padRight":true},{"text":"if ","element":"span"},{"text":"A ","element":"span"},{"text":"is positive definite.","element":"span"}],[{"style":{"width":"1%"},"width":29,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/3-13.png","element":"img"}],[{"id":"id-9","text":"Corollary 1.4. ","element":"span"},{"text":"Assume Assumption ","element":"span"},{"href":"#id-1","text":"1.1. 
","element":"a"},{"text":"A sufficient condition for ","element":"span"},{"text":"A ","element":"span"},{"text":"to be positive definite is ","element":"span"},{"style":{"height":16},"width":50.36,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/3-14.png","element":"img","alt":" σ(·","inline":true},{"text":") is nonpolynomial and slowly increasing (i.e., lim","element":"span"},{"style":{"height":21.46},"width":156.96,"height":53.64,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/3-15.png","element":"img","alt":"x→∞σ(x)xa","inline":true,"padRight":true},{"text":"= 0 for every ","element":"span"},{"text":"a > ","element":"span"},{"text":"0), ","element":"span"},{"style":{"height":10},"width":40,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/3-16.png","element":"img","alt":" µ0","inline":true,"padRight":true},{"text":"is positive when evaluated on sets of positive Lebesgue measure and the data samples ","element":"span"},{"style":{"height":14.16},"width":58.08,"height":35.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/3-17.png","element":"img","alt":" x(i)","inline":true,"padRight":true},{"text":"are in distinct directions (as defined on page 192 of ","element":"span"},{"text":"[1]","element":"span"},{"text":").","element":"span"}],[{"text":"Proof. 
","element":"span"},{"text":"See Section ","element":"span"},{"text":"6.","element":"span"}],[{"style":{"width":"1%"},"width":29,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/3-18.png","element":"img"}],[{"text":"Examples of activation units ","element":"span"},{"style":{"height":16},"width":50.36,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/3-19.png","element":"img","alt":" σ(·","inline":true},{"text":") satisfying the conditions in Corollary ","element":"span"},{"href":"#id-9","text":"1.4 ","element":"a"},{"text":"include sigmoid functions and hyperbolic tangent functions. Using a normal distribution for the initialization of the parameters in the neural network is a common choice in practice (covered by the requirements of Corollary ","element":"span"},{"href":"#id-9","text":"1.4)","element":"a"},{"text":".","element":"span"}],[{"text":"Remark ","element":"span"},{"text":"1.5","element":"span"},{"text":". ","element":"span"},{"text":"For presentation purposes we have not explicitly denoted the bias term in the model. However, it is clear that this can be handled by requiring the first component of the vector ","element":"span"},{"text":"x ","element":"span"},{"text":"to be equal to one for example. This would result in the neural network taking the form ","element":"span"},{"style":{"height":17.36},"width":125.56,"height":43.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/3-20.png","element":"img","alt":" gN(x; θ","inline":true},{"text":") = ","element":"span"},{"style":{"height":24.55},"width":443,"height":61.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/3-21.png","element":"img","alt":"1√N�Ni=1 Ciσ(W i · x + bi","inline":true},{"text":"). 
","element":"span"},{"text":"We leave the rest of the details to the interested reader.","element":"span"}],[{"id":"id-4","text":"1.2 ","element":"span"},{"text":"Literature Review","element":"span"}],[{"href":"#id-10","referenceIndex":11,"text":"[11]","element":"a"},{"text":", ","element":"span"},{"href":"#id-11","referenceIndex":12,"text":"[12]","element":"a"},{"text":", ","element":"span"},{"href":"#id-12","referenceIndex":13,"text":"[13]","element":"a"},{"text":", and ","element":"span"},{"href":"#id-13","referenceIndex":15,"text":"[15] ","element":"a"},{"text":"study the asymptotics of single-layer neural networks with a ","element":"span"},{"style":{"height":19.31},"width":27,"height":48.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/3-22.png","element":"img","alt":"1N","inline":true,"padRight":true},{"text":"normalization; that ","element":"span"},{"text":"is, ","element":"span"},{"style":{"height":17.36},"width":125.08,"height":43.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/3-23.png","element":"img","alt":" gN(x; θ","inline":true},{"text":") = ","element":"span"},{"style":{"height":21.23},"width":335.96,"height":53.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/3-24.png","element":"img","alt":"1N�Ni=1 Ciσ(W i · x","inline":true},{"text":"). ","element":"span"},{"href":"#id-14","referenceIndex":14,"text":"[14] ","element":"a"},{"text":"studies the asymptotics of deep (i.e., multi-layer) neural networks ","element":"span"},{"text":"with a ","element":"span"},{"style":{"height":19.31},"width":27,"height":48.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/3-25.png","element":"img","alt":"1N","inline":true,"padRight":true},{"text":"normalization in each hidden layer. In the single layer case, the limit for the neural network satisfies ","element":"span"},{"text":"a partial differential equation. 
As discussed in ","element":"span"},{"href":"#id-10","referenceIndex":11,"text":"[11]","element":"a"},{"text":", it is ","element":"span"},{"text":"not ","element":"span"},{"text":"necessarily true that the limiting equation (a PDE in this case) will converge to the global minimum of an objective function with zero training error.","element":"span"}],[{"text":"The ","element":"span"},{"style":{"height":19.31},"width":27,"height":48.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/3-26.png","element":"img","alt":"1N","inline":true,"padRight":true},{"text":"normalization studied in ","element":"span"},{"href":"#id-10","referenceIndex":11,"text":"[11]","element":"a"},{"text":", ","element":"span"},{"href":"#id-11","referenceIndex":12,"text":"[12]","element":"a"},{"text":", ","element":"span"},{"href":"#id-12","referenceIndex":13,"text":"[13]","element":"a"},{"text":", and ","element":"span"},{"href":"#id-13","referenceIndex":15,"text":"[15] ","element":"a"},{"text":"is convenient since the single-layer neural network ","element":"span"},{"text":"is then in a traditional mean-field framework where it can be described via an empirical measure of the parameters. However, the ","element":"span"},{"style":{"height":22.82},"width":53.4,"height":57.04,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/4-0.png","element":"img","alt":"1√N","inline":true,"padRight":true},{"text":"normalization that we study in this paper is more widely-used in practice (it ","element":"span"},{"text":"is referred to as the Xavier initialization and was first introduced in ","element":"span"},{"href":"#id-0","referenceIndex":1,"text":"[2]","element":"a"},{"text":"). 
The ","element":"span"},{"style":{"height":22.63},"width":52.92,"height":56.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/4-1.png","element":"img","alt":"1√N","inline":true,"padRight":true},{"text":"normalization requires ","element":"span"},{"text":"different analysis than the standard mean-field analysis ","element":"span"},{"style":{"height":19.31},"width":27,"height":48.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/4-2.png","element":"img","alt":"1N","inline":true,"padRight":true},{"text":", and it produces a completely different limit. ","element":"span"},{"text":"Importantly, under reasonable conditions, the limit equation converges to a global minimum with zero training error. In addition, for the limit to hold, we show that the ","element":"span"},{"style":{"height":22.82},"width":53.4,"height":57.04,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/4-3.png","element":"img","alt":"1√N","inline":true,"padRight":true},{"text":"normalization requires the effective ","element":"span"},{"text":"learning rate for the parameters to be of the order ","element":"span"},{"style":{"height":14.16},"width":109.6,"height":35.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/4-4.png","element":"img","alt":" N −3/2","inline":true},{"text":".","element":"span"}],[{"text":"Convergence to a global minimum for a neural network has been recently proven in ","element":"span"},{"href":"#id-2","referenceIndex":3,"text":"[3]","element":"a"},{"text":", ","element":"span"},{"href":"#id-3","referenceIndex":4,"text":"[4]","element":"a"},{"text":", and ","element":"span"},{"href":"#id-3","referenceIndex":4,"text":"[5]","element":"a"},{"text":". 
Although it has been long understood that neural networks have universal approximation properties (see ","element":"span"},{"href":"#id-15","referenceIndex":7,"text":"[8]","element":"a"},{"text":", ","element":"span"},{"href":"#id-16","referenceIndex":9,"text":"[9]","element":"a"},{"text":", and ","element":"span"},{"href":"#id-17","referenceIndex":10,"text":"[10]","element":"a"},{"text":"), it has until recently been commonly believed that training algorithms for neural networks (e.g., gradient descent) may converge to a local minimum (and not a global minimum) since neural networks are non-convex. ","element":"span"},{"href":"#id-2","referenceIndex":3,"text":"[3]","element":"a"},{"text":", ","element":"span"},{"href":"#id-3","referenceIndex":4,"text":"[4]","element":"a"},{"text":", and ","element":"span"},{"href":"#id-3","referenceIndex":4,"text":"[5] ","element":"a"},{"text":"showed that neural networks (under suitable conditions) will converge to a global minimum during training. This result is quite remarkable considering the optimization problem is non-convex, and it provides an important mathematical guarantee for the field of deep learning.","element":"span"}],[{"href":"#id-2","referenceIndex":3,"text":"[3]","element":"a"},{"text":", ","element":"span"},{"href":"#id-3","referenceIndex":4,"text":"[4]","element":"a"},{"text":", and ","element":"span"},{"href":"#id-3","referenceIndex":4,"text":"[5] ","element":"a"},{"text":"do not study the mean-field limit of a neural network with the Xavier initialization, which is the focus of our paper. Once the mean field limit is established, we show that convergence to a global minimum is a simple consequence of the limit equation. 
There are also some differences between our assumptions and the assumptions required for the theorems of ","element":"span"},{"href":"#id-2","referenceIndex":3,"text":"[3]","element":"a"},{"text":", ","element":"span"},{"href":"#id-3","referenceIndex":4,"text":"[4]","element":"a"},{"text":", and ","element":"span"},{"href":"#id-3","referenceIndex":4,"text":"[5]","element":"a"},{"text":". ","element":"span"},{"href":"#id-2","referenceIndex":3,"text":"[3] ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-3","referenceIndex":4,"text":"[4] ","element":"a"},{"text":"study gradient descent while our paper studies stochastic gradient descent, which introduces additional technical challenges due to the stochastic dynamics. ","element":"span"},{"href":"#id-3","referenceIndex":4,"text":"[5] ","element":"a"},{"text":"studies stochastic gradient descent for a framework where the neural network’s output layer parameters are not trained. In their paper, the ","element":"span"},{"style":{"height":12.96},"width":42.2,"height":32.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/4-5.png","element":"img","alt":" Ci","inline":true,"padRight":true},{"text":"parameters are randomly generated and then frozen (i.e., they do not change during training). In practice, all of the parameters in the neural network, including the output layer parameters, are trained with stochastic gradient descent and therefore it is worthwhile to consider the more general case. ","element":"span"},{"href":"#id-3","referenceIndex":4,"text":"[5] ","element":"a"},{"text":"also imposes an assumption that the loss function vanishes at infinity. 
","element":"span"},{"href":"#id-2","referenceIndex":3,"text":"[3]","element":"a"},{"text":", ","element":"span"},{"href":"#id-3","referenceIndex":4,"text":"[4]","element":"a"},{"text":", and ","element":"span"},{"href":"#id-3","referenceIndex":4,"text":"[5] ","element":"a"},{"text":"all require that every data sample has the same magnitude, i.e.","element":"span"},{"style":{"height":20},"width":105.04,"height":50,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/4-6.png","element":"img","alt":"��x(i)��","inline":true,"padRight":true},{"text":"= 1 for every ","element":"span"},{"text":"i ","element":"span"},{"text":"= 1","element":"span"},{"text":", . . . , M","element":"span"},{"text":". We do not require this assumption.","element":"span"}],[{"href":"#id-18","referenceIndex":5,"text":"[6] ","element":"a"},{"text":"proved a limit for neural networks with a Xavier initialization when they are trained with continuous-time gradient descent. Our paper proves a limit for neural networks trained with the (standard) discrete-time stochastic gradient descent algorithm which is used in practice. Our method of proof is also different than the approach of ","element":"span"},{"href":"#id-18","referenceIndex":5,"text":"[6]","element":"a"},{"text":". Whereas ","element":"span"},{"href":"#id-18","referenceIndex":5,"text":"[6] ","element":"a"},{"text":"begins their analysis in continuous time (due to their framework being continuous-time gradient descent), our paper rigorously passes from discrete time (where the stochastic gradient descent updates evolve) to continuous time through weak convergence analysis of appropriate stochastic processes and measure-valued processes. 
In ","element":"span"},{"href":"#id-18","referenceIndex":5,"text":"[6]","element":"a"},{"text":", the authors directly study the evolution of the derivatives of the output with respect to the parameters, while we address the limiting behavior of the underlying associated stochastic processes and measure-valued processes.","element":"span"}],[{"text":"1.3 ","element":"span"},{"text":"Organization of Paper","element":"span"}],[{"text":"Section ","element":"span"},{"text":"2 ","element":"span"},{"text":"derives equations describing the evolution of the pre-limit process (","element":"span"},{"style":{"height":16.56},"width":122.04,"height":41.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/4-7.png","element":"img","alt":"µN, hN","inline":true},{"text":"). Relative compactness of the family of processes (","element":"span"},{"style":{"height":16.56},"width":122.52,"height":41.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/4-8.png","element":"img","alt":"µN, hN","inline":true},{"text":") is proven in Section ","element":"span"},{"text":"3. ","element":"span"},{"text":"Section ","element":"span"},{"text":"4 ","element":"span"},{"text":"proves that any limit point of the process must satisfy the equation ","element":"span"},{"href":"#id-6","text":"(1.3)","element":"a"},{"text":". These results are collected together in Section ","element":"span"},{"text":"5 ","element":"span"},{"text":"to prove that (","element":"span"},{"style":{"height":16.56},"width":122.04,"height":41.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/4-9.png","element":"img","alt":"µN, hN","inline":true},{"text":") converges in distribution to the solution of equation ","element":"span"},{"href":"#id-6","text":"(1.3)","element":"a"},{"text":". 
Corollary ","element":"span"},{"href":"#id-9","text":"1.4 ","element":"a"},{"text":"is proven in Section ","element":"span"},{"text":"6.","element":"span"}]]},{"heading":"2 Evolution of the Pre-limit Process","paragraphs":[[{"text":"We begin by analyzing evolution of the network output ","element":"span"},{"style":{"height":17.78},"width":47.64,"height":44.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/4-10.png","element":"img","alt":" gNk","inline":true,"padRight":true},{"text":"(","element":"span"},{"text":"x","element":"span"},{"text":"). Using a Taylor expansion,","element":"span"}],[{"id":"id-20","style":{"width":"91%"},"width":1718,"height":651,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/5-0.png","element":"img"}],[{"text":"for points ","element":"span"},{"style":{"height":19.68},"width":79.84,"height":49.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/5-1.png","element":"img","alt":" W i,∗k","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":19.68},"width":105.76,"height":49.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/5-2.png","element":"img","alt":" W i,∗,∗k","inline":true,"padRight":true},{"text":"in the line segment connecting the points ","element":"span"},{"style":{"height":17.38},"width":54.44,"height":43.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/5-3.png","element":"img","alt":" W ik","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":18.98},"width":95.2,"height":47.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/5-4.png","element":"img","alt":" W ik+1","inline":true},{"text":". Let ","element":"span"},{"style":{"height":18.93},"width":144.12,"height":47.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/5-5.png","element":"img","alt":" αN = αN","inline":true,"padRight":true},{"text":". 
Substi- ","element":"span"},{"text":"tuting ","element":"span"},{"href":"#id-19","text":"(1.1) ","element":"a"},{"text":"into ","element":"span"},{"href":"#id-20","text":"(2.1) ","element":"a"},{"text":"yields","element":"span"}],[{"id":"id-21","style":{"width":"96%"},"width":1808,"height":552,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/5-6.png","element":"img"}],[{"text":"Using ","element":"span"},{"href":"#id-21","text":"(2.3)","element":"a"},{"text":", we can write the evolution of ","element":"span"},{"style":{"height":17.39},"width":50.04,"height":43.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/5-7.png","element":"img","alt":" hNt","inline":true,"padRight":true},{"text":"for ","element":"span"},{"style":{"height":10.8},"width":52.44,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/5-8.png","element":"img","alt":" t ∈","inline":true,"padRight":true},{"text":"[0","element":"span"},{"text":", T ","element":"span"},{"text":"] as","element":"span"}],[{"style":{"width":"61%"},"width":1154,"height":482,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/5-9.png","element":"img"}],[{"text":"Next, we decompose the summations into a drift and martingale component.","element":"span"}],[{"style":{"width":"105%"},"width":1970,"height":741,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/6-0.png","element":"img"}],[{"text":"For convenience, we define the martingale terms (the third and fourth terms in the equation above) as ","element":"span"},{"style":{"height":19.31},"width":95.68,"height":48.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/6-1.png","element":"img","alt":"M N,1t","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":19.31},"width":95.68,"height":48.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/6-2.png","element":"img","alt":" M 
N,2t","inline":true,"padRight":true},{"text":", respectively. The equation for ","element":"span"},{"style":{"height":17.39},"width":50.04,"height":43.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/6-3.png","element":"img","alt":" hNt","inline":true,"padRight":true},{"text":"can be re-written in terms of a Riemann integral and the scaled measure ","element":"span"},{"style":{"height":17.2},"width":51,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/6-4.png","element":"img","alt":" µNt","inline":true,"padRight":true},{"text":", yielding","element":"span"}],[{"id":"id-36","style":{"width":"85%"},"width":1597,"height":284,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/6-5.png","element":"img"}],[{"text":"In addition, using conditional independence of the terms in the series for ","element":"span"},{"style":{"height":19.12},"width":95.68,"height":47.8,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/6-6.png","element":"img","alt":" M N,1t","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":19.12},"width":95.68,"height":47.8,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/6-7.png","element":"img","alt":" M N,2t","inline":true,"padRight":true},{"text":"as well as the bounds from Lemmas ","element":"span"},{"href":"#id-22","text":"3.2 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-23","text":"3.1, ","element":"a"},{"text":"we have that","element":"span"}],[{"style":{"width":"20%"},"width":385,"height":208,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/6-8.png","element":"img"}],[{"text":"We can also analyze the evolution of the empirical measure ","element":"span"},{"style":{"height":17.97},"width":49.08,"height":44.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/6-9.png","element":"img","alt":" νNk","inline":true,"padRight":true},{"text":"in terms of test 
functions ","element":"span"},{"style":{"height":17.97},"width":119.68,"height":44.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/6-10.png","element":"img","alt":" f ∈ C2b","inline":true,"padRight":true},{"text":"(","element":"span"},{"style":{"height":13.55},"width":86.12,"height":33.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/6-11.png","element":"img","alt":"R1+d","inline":true},{"text":"). ","element":"span"},{"text":"Using a Taylor expansion, we find that","element":"span"}],[{"id":"id-24","style":{"width":"101%"},"width":1899,"height":537,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/6-12.png","element":"img"}],[{"text":"for points ( ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":18.24},"width":120.68,"height":45.6,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/6-13.png","element":"img","alt":"Cik, ¯W ik","inline":true},{"text":"), ( ˆ","element":"span"},{"style":{"height":19.44},"width":120.2,"height":48.6,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/6-14.png","element":"img","alt":"Cik, ˆW ik","inline":true},{"text":") and ( ˜","element":"span"},{"style":{"height":19.44},"width":120.2,"height":48.6,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/6-15.png","element":"img","alt":"Cik, ˜W ik","inline":true},{"text":") in the segments connecting ","element":"span"},{"style":{"height":19.17},"width":86.08,"height":47.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/6-16.png","element":"img","alt":" Cik+1","inline":true,"padRight":true},{"text":"with ","element":"span"},{"style":{"height":17.57},"width":45.32,"height":43.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/6-17.png","element":"img","alt":" Cik","inline":true,"padRight":true},{"text":"and 
","element":"span"},{"style":{"height":19.17},"width":95.2,"height":47.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/6-18.png","element":"img","alt":" W ik+1","inline":true,"padRight":true},{"text":"with ","element":"span"},{"style":{"height":17.57},"width":54.44,"height":43.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/6-19.png","element":"img","alt":" W ik","inline":true},{"text":", ","element":"span"},{"text":"respectively.","element":"span"}],[{"text":"Substituting ","element":"span"},{"href":"#id-19","text":"(1.1) ","element":"a"},{"text":"into ","element":"span"},{"href":"#id-24","text":"(2.5) ","element":"a"},{"text":"yields","element":"span"}],[{"style":{"width":"88%"},"width":1657,"height":395,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/7-0.png","element":"img"}],[{"text":"Therefore,","element":"span"}],[{"id":"id-37","style":{"width":"90%"},"width":1694,"height":416,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/7-1.png","element":"img"}]]},{"heading":"3 Relative Compactness","paragraphs":[[{"text":"In this section we prove that the family of processes ","element":"span"},{"style":{"height":17.36},"width":192.12,"height":43.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/7-2.png","element":"img","alt":" {µN, hN}N","inline":true,"padRight":true},{"text":"is relatively compact. Section ","element":"span"},{"href":"#id-25","text":"3.1 ","element":"a"},{"text":"proves compact containment. Section ","element":"span"},{"href":"#id-26","text":"3.2 ","element":"a"},{"text":"proves regularity. 
Section ","element":"span"},{"href":"#id-27","text":"3.3 ","element":"a"},{"text":"combines these results to prove relative compactness.","element":"span"}],[{"id":"id-25","text":"3.1 ","element":"span"},{"text":"Compact Containment","element":"span"}],[{"id":"id-23","style":{"width":"61%"},"width":1153,"height":293,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/7-3.png","element":"img"}],[{"text":"Proof. ","element":"span"},{"text":"The unimportant finite constant ","element":"span"},{"style":{"height":11.6},"width":124.48,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/7-4.png","element":"img","alt":" C < ∞","inline":true,"padRight":true},{"text":"may change from line to line. We first observe that","element":"span"}],[{"style":{"width":"49%"},"width":926,"height":188,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/7-5.png","element":"img"}],[{"text":"where the last inequality follows from the definition of ","element":"span"},{"style":{"height":17.97},"width":47.64,"height":44.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/7-6.png","element":"img","alt":" gNk","inline":true,"padRight":true},{"text":"(","element":"span"},{"text":"x","element":"span"},{"text":") and the uniform boundedness assumption on ","element":"span"},{"style":{"height":16},"width":50.36,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/7-7.png","element":"img","alt":"σ(·","inline":true},{"text":"). 
Then, we subsequently obtain that","element":"span"}],[{"style":{"width":"43%"},"width":806,"height":415,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/7-8.png","element":"img"}],[{"text":"This implies that","element":"span"}],[{"style":{"width":"51%"},"width":970,"height":193,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/8-0.png","element":"img"}],[{"text":"Let us now define ","element":"span"},{"style":{"height":17.97},"width":62.04,"height":44.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/8-1.png","element":"img","alt":" mNk","inline":true,"padRight":true},{"text":"= ","element":"span"},{"style":{"height":19.31},"width":27,"height":48.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/8-2.png","element":"img","alt":"1N","inline":true}],[{"style":{"width":"20%"},"width":378,"height":55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/8-3.png","element":"img"}],[{"text":"have that ","element":"span"},{"style":{"height":19.31},"width":27,"height":48.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/8-4.png","element":"img","alt":"1N","inline":true}],[{"style":{"width":"50%"},"width":945,"height":193,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/8-5.png","element":"img"}],[{"text":"By the discrete Gronwall lemma and using ","element":"span"},{"style":{"height":16},"width":158.12,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/8-6.png","element":"img","alt":" k/N ≤ T","inline":true,"padRight":true},{"text":",","element":"span"}],[{"id":"id-28","style":{"width":"96%"},"width":1809,"height":708,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/8-7.png","element":"img"}],[{"text":"where the last inequality follows from the random variables 
","element":"span"},{"style":{"height":16.99},"width":44.32,"height":42.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/8-8.png","element":"img","alt":" Ci0","inline":true,"padRight":true},{"text":"taking values in a compact set. ","element":"span"},{"text":"Now, we turn to the bound for ","element":"span"},{"style":{"height":17.57},"width":119.36,"height":43.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/8-9.png","element":"img","alt":" ∥ W ik ∥","inline":true},{"text":". We start with the bound (using Young’s inequality)","element":"span"}],[{"style":{"width":"68%"},"width":1289,"height":464,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/8-10.png","element":"img"}],[{"text":"for a constant ","element":"span"},{"style":{"height":11.6},"width":124.48,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/8-11.png","element":"img","alt":" C < ∞","inline":true,"padRight":true},{"text":"that may change from line to line. 
Taking an expectation, using Assumption ","element":"span"},{"href":"#id-1","text":"1.1, ","element":"a"},{"text":"the bound ","element":"span"},{"href":"#id-28","text":"(3.2)","element":"a"},{"text":", and using the fact that ","element":"span"},{"style":{"height":16},"width":158.12,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/8-12.png","element":"img","alt":" k/N ≤ T","inline":true,"padRight":true},{"text":", we obtain","element":"span"}],[{"style":{"width":"17%"},"width":332,"height":46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/8-13.png","element":"img"}],[{"text":"for all ","element":"span"},{"style":{"height":11.6},"width":91.4,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/8-14.png","element":"img","alt":" i ∈ N","inline":true,"padRight":true},{"text":"and all ","element":"span"},{"text":"k ","element":"span"},{"text":"such that ","element":"span"},{"style":{"height":16},"width":158.6,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/8-15.png","element":"img","alt":" k/N ≤ T","inline":true,"padRight":true},{"text":", concluding the proof of the lemma.","element":"span"}],[{"style":{"width":"1%"},"width":29,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/8-16.png","element":"img"}],[{"text":"Using the bounds from Lemma ","element":"span"},{"href":"#id-23","text":"3.1, ","element":"a"},{"text":"we can now establish a bound for ","element":"span"},{"style":{"height":17.97},"width":47.64,"height":44.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/8-17.png","element":"img","alt":" gNk","inline":true,"padRight":true},{"text":"(","element":"span"},{"text":"x","element":"span"},{"text":") for ","element":"span"},{"style":{"height":11.6},"width":102.52,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/8-18.png","element":"img","alt":" x ∈ 
D","inline":true},{"text":".","element":"span"}],[{"id":"id-22","text":"Lemma 3.2. ","element":"span"},{"text":"For all ","element":"span"},{"style":{"height":11.6},"width":91.4,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/9-0.png","element":"img","alt":" i ∈ N","inline":true},{"text":", all ","element":"span"},{"text":"k ","element":"span"},{"text":"such that ","element":"span"},{"style":{"height":16},"width":158.12,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/9-1.png","element":"img","alt":" k/N ≤ T","inline":true,"padRight":true},{"text":", and any ","element":"span"},{"style":{"height":11.6},"width":102.52,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/9-2.png","element":"img","alt":" x ∈ D","inline":true},{"text":",","element":"span"}],[{"style":{"width":"24%"},"width":456,"height":97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/9-3.png","element":"img"}],[{"text":"Proof. 
","element":"span"},{"text":"Recall equation ","element":"span"},{"href":"#id-21","text":"(2.2)","element":"a"},{"text":", which describes the evolution of ","element":"span"},{"style":{"height":17.97},"width":47.64,"height":44.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/9-4.png","element":"img","alt":" gNk","inline":true,"padRight":true},{"text":"(","element":"span"},{"text":"x","element":"span"},{"text":").","element":"span"}],[{"style":{"width":"71%"},"width":1340,"height":257,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/9-5.png","element":"img"}],[{"text":"This leads to the bound","element":"span"}],[{"style":{"width":"79%"},"width":1492,"height":218,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/9-6.png","element":"img"}],[{"text":"We now square both sides of the above inequality.","element":"span"}],[{"style":{"width":"85%"},"width":1595,"height":518,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/9-7.png","element":"img"}],[{"text":"Then, using a telescoping series,","element":"span"}],[{"style":{"width":"49%"},"width":928,"height":413,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/9-8.png","element":"img"}],[{"text":"Taking expectations,","element":"span"}],[{"style":{"width":"54%"},"width":1029,"height":122,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/9-9.png","element":"img"}],[{"text":"Taking advantage of the fact that ","element":"span"},{"style":{"height":11.5},"width":35.56,"height":28.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/9-10.png","element":"img","alt":" xj","inline":true,"padRight":true},{"text":"is sampled from a fixed dataset ","element":"span"},{"text":"D ","element":"span"},{"text":"of ","element":"span"},{"text":"M ","element":"span"},{"text":"data 
samples,","element":"span"}],[{"id":"id-30","style":{"width":"79%"},"width":1496,"height":123,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/9-11.png","element":"img"}],[{"text":"and therefore","element":"span"}],[{"id":"id-29","style":{"width":"85%"},"width":1602,"height":269,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/10-0.png","element":"img"}],[{"text":"Recall that","element":"span"}],[{"style":{"width":"71%"},"width":1341,"height":564,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/10-1.png","element":"img"}],[{"text":"Combining this bound with the bound ","element":"span"},{"href":"#id-29","text":"(3.4) ","element":"a"},{"text":"and using the discrete Gronwall lemma yields, for any 0 ","element":"span"},{"style":{"height":13.2},"width":182.4,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/10-2.png","element":"img","alt":" ≤ k ≤ T N","inline":true},{"text":",","element":"span"}],[{"style":{"width":"19%"},"width":356,"height":108,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/10-3.png","element":"img"}],[{"text":"Substituting this bound into equation ","element":"span"},{"href":"#id-30","text":"(3.3) ","element":"a"},{"text":"produces the desired bound","element":"span"}],[{"style":{"width":"16%"},"width":304,"height":96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/10-4.png","element":"img"}],[{"text":"for any 0 ","element":"span"},{"style":{"height":13.2},"width":182.88,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/10-5.png","element":"img","alt":" ≤ k ≤ T N","inline":true},{"text":".","element":"span"}],[{"id":"id-33","style":{"width":"99%"},"width":1870,"height":324,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/10-6.png","element":"img"}],[{"text":"Proof. 
","element":"span"},{"text":"For each ","element":"span"},{"text":"L > ","element":"span"},{"text":"0, define ","element":"span"},{"style":{"height":13.11},"width":56.08,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/10-7.png","element":"img","alt":" KL","inline":true,"padRight":true},{"text":"= [","element":"span"},{"style":{"height":17.36},"width":172.04,"height":43.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/10-8.png","element":"img","alt":"−L, L]1+d","inline":true},{"text":". Then, we have that ","element":"span"},{"style":{"height":13.11},"width":56.08,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/10-9.png","element":"img","alt":" KL","inline":true,"padRight":true},{"text":"is a compact subset of ","element":"span"},{"style":{"height":13.36},"width":86.12,"height":33.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/10-10.png","element":"img","alt":" R1+d","inline":true},{"text":", and for each ","element":"span"},{"style":{"height":12.8},"width":56.44,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/10-11.png","element":"img","alt":" t ≥","inline":true,"padRight":true},{"text":"0 and ","element":"span"},{"style":{"height":11.6},"width":113.96,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/10-12.png","element":"img","alt":" N ∈ N","inline":true},{"text":",","element":"span"}],[{"style":{"width":"56%"},"width":1060,"height":117,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/10-13.png","element":"img"}],[{"text":"where we have used Markov’s inequality and the bounds from Lemma ","element":"span"},{"href":"#id-23","text":"3.1. 
","element":"a"},{"text":"We define the compact subsets of ","element":"span"},{"style":{"height":17.36},"width":149.48,"height":43.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/10-14.png","element":"img","alt":" M(R1+d","inline":true},{"text":")","element":"span"}],[{"style":{"width":"51%"},"width":959,"height":88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/10-15.png","element":"img"}],[{"text":"and we observe that","element":"span"}],[{"style":{"width":"79%"},"width":1486,"height":256,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/11-0.png","element":"img"}],[{"text":"Given that lim","element":"span"},{"style":{"height":22.69},"width":333.2,"height":56.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/11-1.png","element":"img","alt":"L→∞ ∑∞j=1 C/(L+j)3/2","inline":true,"padRight":true},{"text":"= 0, we have that, for each ","element":"span"},{"style":{"height":12.4},"width":67,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/11-2.png","element":"img","alt":" η >","inline":true,"padRight":true},{"text":"0, there exists a compact set ˆ","element":"span"},{"style":{"height":13.11},"width":56.08,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/11-3.png","element":"img","alt":"KL","inline":true,"padRight":true},{"text":"such ","element":"span"},{"text":"that","element":"span"}],[{"style":{"width":"25%"},"width":485,"height":84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/11-4.png","element":"img"}],[{"text":"Due to Lemma ","element":"span"},{"href":"#id-22","text":"3.2 ","element":"a"},{"text":"and Markov’s inequality, we also know that, for each ","element":"span"},{"style":{"height":12.4},"width":65.56,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/11-5.png","element":"img","alt":" η >","inline":true,"padRight":true},{"text":"0, there exists a compact 
set ","element":"span"},{"text":"U ","element":"span"},{"text":"= [","element":"span"},{"style":{"height":17.36},"width":156.32,"height":43.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/11-6.png","element":"img","alt":"−B, B]M","inline":true,"padRight":true},{"text":"such that","element":"span"}],[{"style":{"width":"24%"},"width":457,"height":84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/11-7.png","element":"img"}],[{"text":"Therefore, for each ","element":"span"},{"style":{"height":12.4},"width":63.16,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/11-8.png","element":"img","alt":" η >","inline":true,"padRight":true},{"text":"0, there exists a compact set ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":17.36},"width":361.24,"height":43.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/11-9.png","element":"img","alt":"KL × [−B, B]M ⊂ E","inline":true,"padRight":true},{"text":"such that","element":"span"}],[{"style":{"width":"73%"},"width":1374,"height":153,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/11-10.png","element":"img"}],[{"id":"id-26","text":"3.2 ","element":"span"},{"text":"Regularity","element":"span"}],[{"id":"id-34","style":{"width":"99%"},"width":1869,"height":338,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/11-11.png","element":"img"}],[{"text":"Proof. 
","element":"span"},{"text":"We start by noticing that a Taylor expansion gives for 0 ","element":"span"},{"style":{"height":13.2},"width":210.44,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/11-12.png","element":"img","alt":" ≤ s ≤ t ≤ T","inline":true}],[{"style":{"width":"84%"},"width":1591,"height":488,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/11-13.png","element":"img"}],[{"text":"for points ( ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":16.35},"width":55.64,"height":40.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/11-14.png","element":"img","alt":"Ci,","inline":true,"padRight":true},{"text":"¯","element":"span"},{"style":{"height":13.15},"width":54.2,"height":32.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/11-15.png","element":"img","alt":"W i","inline":true},{"text":") and ( ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":16.35},"width":55.64,"height":40.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/11-16.png","element":"img","alt":"Ci,","inline":true,"padRight":true},{"text":"ˆ","element":"span"},{"style":{"height":13.15},"width":54.2,"height":32.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/11-17.png","element":"img","alt":"W i","inline":true},{"text":") in the segments connecting ","element":"span"},{"style":{"height":20.94},"width":101.4,"height":52.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/11-18.png","element":"img","alt":" Ci⌊Ns⌋","inline":true,"padRight":true},{"text":"with ","element":"span"},{"style":{"height":20.94},"width":98.52,"height":52.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/11-19.png","element":"img","alt":" Ci⌊Nt⌋","inline":true,"padRight":true},{"text":"and 
","element":"span"},{"style":{"height":20.94},"width":110.52,"height":52.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/11-20.png","element":"img","alt":" W i⌊Ns⌋","inline":true,"padRight":true},{"text":"with ","element":"span"},{"style":{"height":20.94},"width":107.16,"height":52.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/11-21.png","element":"img","alt":" W i⌊Nt⌋","inline":true},{"text":", ","element":"span"},{"text":"respectively.","element":"span"}],[{"text":"Let’s now establish a bound on ","element":"span"},{"style":{"height":20.75},"width":273.56,"height":51.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/12-0.png","element":"img","alt":" |Ci⌊Nt⌋ − Ci⌊Ns⌋|","inline":true,"padRight":true},{"text":"for ","element":"span"},{"style":{"height":13.2},"width":168.2,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/12-1.png","element":"img","alt":" s < t ≤ T","inline":true,"padRight":true},{"text":"with 0 ","element":"span"},{"style":{"height":14},"width":237.88,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/12-2.png","element":"img","alt":" < t − s ≤ δ <","inline":true,"padRight":true},{"text":"1.","element":"span"}],[{"id":"id-31","style":{"width":"99%"},"width":1872,"height":1590,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/12-3.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":11.6},"width":124.48,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/12-4.png","element":"img","alt":" C < ∞","inline":true,"padRight":true},{"text":"is some unimportant constant. 
Then, the statement of the Lemma follows.","element":"span"}],[{"text":"We next establish regularity of the process ","element":"span"},{"style":{"height":17.39},"width":50.04,"height":43.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/12-5.png","element":"img","alt":" hNt","inline":true,"padRight":true},{"text":"in ","element":"span"},{"style":{"height":14.06},"width":79.28,"height":35.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/12-6.png","element":"img","alt":" DRM","inline":true,"padRight":true},{"text":"([0","element":"span"},{"text":", T ","element":"span"},{"text":"]). For the purposes of the following lemma, ","element":"span"},{"text":"let the function ","element":"span"},{"style":{"height":16},"width":123.52,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/12-7.png","element":"img","alt":" q(z1, z2","inline":true},{"text":") = min","element":"span"},{"style":{"height":16},"width":246.08,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/12-8.png","element":"img","alt":"{∥z1 − z2∥ , 1}","inline":true,"padRight":true},{"text":"where ","element":"span"},{"style":{"height":16.56},"width":200,"height":41.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/12-9.png","element":"img","alt":" z1, z2 ∈ RM","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":16},"width":392.12,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/12-10.png","element":"img","alt":" ∥z∥ = |z1| + · · · + |zM|","inline":true},{"text":".","element":"span"}],[{"id":"id-35","text":"Lemma 3.5. 
","element":"span"},{"text":"For any ","element":"span"},{"style":{"height":12.4},"width":61.08,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/12-11.png","element":"img","alt":" δ ∈","inline":true,"padRight":true},{"text":"(0","element":"span"},{"text":", ","element":"span"},{"text":"1), there is a constant ","element":"span"},{"style":{"height":11.6},"width":132.16,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/12-12.png","element":"img","alt":" C < ∞","inline":true,"padRight":true},{"text":"such that for 0 ","element":"span"},{"style":{"height":14},"width":194.2,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/12-13.png","element":"img","alt":" ≤ u ≤ δ <","inline":true,"padRight":true},{"text":"1, 0 ","element":"span"},{"style":{"height":14},"width":207.44,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/12-14.png","element":"img","alt":" ≤ v ≤ δ ∧ t","inline":true},{"text":", ","element":"span"},{"style":{"height":10.8},"width":52.44,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/12-15.png","element":"img","alt":"t ∈","inline":true,"padRight":true},{"text":"[0","element":"span"},{"text":", T ","element":"span"},{"text":"],","element":"span"}],[{"style":{"width":"40%"},"width":751,"height":84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/12-16.png","element":"img"}],[{"text":"Proof. 
","element":"span"},{"text":"Recall that","element":"span"}],[{"style":{"width":"82%"},"width":1544,"height":117,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/12-17.png","element":"img"}],[{"text":"Therefore,","element":"span"}],[{"style":{"width":"90%"},"width":1697,"height":356,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/13-0.png","element":"img"}],[{"text":"This yields the bound","element":"span"}],[{"style":{"width":"66%"},"width":1240,"height":286,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/13-1.png","element":"img"}],[{"text":"where we have used the boundedness of ","element":"span"},{"style":{"height":16},"width":61.88,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/13-2.png","element":"img","alt":" σ′(·","inline":true},{"text":") (from Assumption ","element":"span"},{"href":"#id-1","text":"1.1) ","element":"a"},{"text":"and the bounds from Lemma ","element":"span"},{"href":"#id-23","text":"3.1. ","element":"a"},{"text":"Taking expectations,","element":"span"}],[{"style":{"width":"78%"},"width":1474,"height":131,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/13-3.png","element":"img"}],[{"text":"Using the bounds ","element":"span"},{"href":"#id-31","text":"(3.6) ","element":"a"},{"text":"and (3.7),","element":"span"}],[{"id":"id-32","style":{"width":"78%"},"width":1475,"height":219,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/13-4.png","element":"img"}],[{"text":"The bound ","element":"span"},{"href":"#id-32","text":"(3.7) ","element":"a"},{"text":"holds for each ","element":"span"},{"style":{"height":11.6},"width":102.52,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/13-5.png","element":"img","alt":" x ∈ D","inline":true},{"text":". 
Therefore,","element":"span"}],[{"style":{"width":"35%"},"width":658,"height":98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/13-6.png","element":"img"}],[{"text":"The statement of the Lemma then follows.","element":"span"}],[{"style":{"width":"1%"},"width":29,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/13-7.png","element":"img"}],[{"id":"id-27","text":"3.3 ","element":"span"},{"text":"Combining our results to prove relative compactness","element":"span"}],[{"text":"Lemma 3.6. ","element":"span"},{"text":"The family of processes ","element":"span"},{"style":{"height":17.36},"width":235.04,"height":43.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/13-8.png","element":"img","alt":" {µN, hN}N∈N","inline":true,"padRight":true},{"text":"is relatively compact in ","element":"span"},{"style":{"height":13.1},"width":57.12,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/13-9.png","element":"img","alt":" DE","inline":true},{"text":"([0","element":"span"},{"text":", T ","element":"span"},{"text":"]).","element":"span"}],[{"text":"Proof. 
","element":"span"},{"text":"Combining Lemmas ","element":"span"},{"href":"#id-33","text":"3.3 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-34","text":"3.4, ","element":"a"},{"text":"and Theorem 8.6 of Chapter 3 of ","element":"span"},{"href":"#id-15","referenceIndex":7,"text":"[7] ","element":"a"},{"text":"proves that ","element":"span"},{"style":{"height":17.36},"width":164.48,"height":43.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/13-10.png","element":"img","alt":" {µN}N∈N","inline":true,"padRight":true},{"text":"is relatively compact in ","element":"span"},{"style":{"height":16.86},"width":166.08,"height":42.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/13-11.png","element":"img","alt":" DM(R1+d)","inline":true},{"text":"([0","element":"span"},{"text":", T ","element":"span"},{"text":"]). (See also Remark 8.7 B of Chapter 3 of ","element":"span"},{"href":"#id-15","referenceIndex":7,"text":"[7] ","element":"a"},{"text":"regarding replacing sup","element":"span"},{"style":{"height":7.6},"width":27,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/13-12.png","element":"img","alt":"N","inline":true,"padRight":true},{"text":"with lim","element":"span"},{"style":{"height":7.6},"width":27,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/13-13.png","element":"img","alt":"N","inline":true,"padRight":true},{"text":"in the regularity condition B of Theorem 8.6.) 
Similarly, combining Lemmas ","element":"span"},{"href":"#id-33","text":"3.3 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-35","text":"3.5 ","element":"a"},{"text":"proves that ","element":"span"},{"style":{"height":17.36},"width":163.52,"height":43.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/13-14.png","element":"img","alt":" {hN}N∈N","inline":true,"padRight":true},{"text":"is relatively compact in ","element":"span"},{"style":{"height":14.06},"width":79.28,"height":35.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/13-15.png","element":"img","alt":" DRM","inline":true,"padRight":true},{"text":"([0","element":"span"},{"text":", T ","element":"span"},{"text":"]). Since relative compactness is equivalent to tightness, we have that the probability measures of the family","element":"span"}],[{"text":"of processes ","element":"span"},{"style":{"height":17.55},"width":164.48,"height":43.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/13-16.png","element":"img","alt":" {µN}N∈N","inline":true,"padRight":true},{"text":"are tight. Similarly, we have that the probability measures of the family of processes","element":"span"}],[{"style":{"height":17.36},"width":163.04,"height":43.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/13-17.png","element":"img","alt":"{hN}N∈N","inline":true,"padRight":true},{"text":"are tight. Therefore, ","element":"span"},{"style":{"height":17.36},"width":235.04,"height":43.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/13-18.png","element":"img","alt":" {µN, hN}N∈N","inline":true,"padRight":true},{"text":"is tight. 
Then, ","element":"span"},{"style":{"height":17.36},"width":235.52,"height":43.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/13-19.png","element":"img","alt":" {µN, hN}N∈N","inline":true,"padRight":true},{"text":"is also relatively compact.","element":"span"}],[{"style":{"width":"1%"},"width":29,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/13-20.png","element":"img"}]]},{"heading":"4 Identification of the Limit","paragraphs":[[{"text":"Let ","element":"span"},{"style":{"height":13.36},"width":51,"height":33.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-0.png","element":"img","alt":" πN","inline":true,"padRight":true},{"text":"be the probability measure of a convergent subsequence of","element":"span"},{"style":{"height":22.4},"width":262.52,"height":56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-1.png","element":"img","alt":"(µN, hN)0≤t≤T","inline":true,"padRight":true},{"text":". Each ","element":"span"},{"style":{"height":13.36},"width":51,"height":33.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-2.png","element":"img","alt":" πN","inline":true,"padRight":true},{"text":"takes values ","element":"span"},{"text":"in the set of probability measures ","element":"span"},{"style":{"height":19.2},"width":123.36,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-3.png","element":"img","alt":" M(DE","inline":true},{"text":"([0","element":"span"},{"text":", T ","element":"span"},{"text":"])","element":"span"},{"style":{"height":19.2},"width":18,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-4.png","element":"img","alt":")","inline":true},{"text":". 
Relative compactness, proven in Section ","element":"span"},{"text":"3, ","element":"span"},{"text":"implies that there is a subsequence ","element":"span"},{"style":{"height":13.55},"width":64.44,"height":33.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-5.png","element":"img","alt":" πNk","inline":true,"padRight":true},{"text":"which weakly converges. We must prove that any limit point ","element":"span"},{"style":{"height":6.8},"width":23,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-6.png","element":"img","alt":" π","inline":true,"padRight":true},{"text":"of a convergent subsequence ","element":"span"},{"style":{"height":13.36},"width":64.44,"height":33.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-7.png","element":"img","alt":" πNk","inline":true,"padRight":true},{"text":"will satisfy the evolution equation ","element":"span"},{"href":"#id-6","text":"(1.3)","element":"a"},{"text":".","element":"span"}],[{"text":"Lemma 4.1. ","element":"span"},{"text":"Let ","element":"span"},{"style":{"height":13.36},"width":64.44,"height":33.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-8.png","element":"img","alt":" πNk","inline":true,"padRight":true},{"text":"be a convergent subsequence with a limit point ","element":"span"},{"style":{"height":6.8},"width":23,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-9.png","element":"img","alt":" π","inline":true},{"text":". 
","element":"span"},{"text":"Then, ","element":"span"},{"style":{"height":6.8},"width":23,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-10.png","element":"img","alt":" π","inline":true,"padRight":true},{"text":"is a Dirac measure concentrated on (","element":"span"},{"style":{"height":16},"width":186.24,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-11.png","element":"img","alt":"µ, h) ∈ DE","inline":true},{"text":"([0","element":"span"},{"text":", T ","element":"span"},{"text":"]) and (","element":"span"},{"style":{"height":14},"width":64.76,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-12.png","element":"img","alt":"µ, h","inline":true},{"text":") satisfies equation ","element":"span"},{"href":"#id-6","text":"(1.3)","element":"a"},{"text":".","element":"span"}],[{"text":"Proof. ","element":"span"},{"text":"We define a map ","element":"span"},{"style":{"height":16},"width":111.32,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-13.png","element":"img","alt":" F(µ, h","inline":true},{"text":") : ","element":"span"},{"style":{"height":13.1},"width":57.12,"height":32.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-14.png","element":"img","alt":" DE","inline":true},{"text":"([0","element":"span"},{"text":", T ","element":"span"},{"text":"]) ","element":"span"},{"style":{"height":14.7},"width":104.68,"height":36.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-15.png","element":"img","alt":" → R+","inline":true,"padRight":true},{"text":"for each ","element":"span"},{"style":{"height":10.8},"width":52.44,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-16.png","element":"img","alt":" t ∈","inline":true,"padRight":true},{"text":"[0","element":"span"},{"text":", T ","element":"span"},{"text":"], 
","element":"span"},{"style":{"height":17.78},"width":119.68,"height":44.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-17.png","element":"img","alt":" f ∈ C2b","inline":true,"padRight":true},{"text":"(","element":"span"},{"style":{"height":18.06},"width":490.76,"height":45.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-18.png","element":"img","alt":"R1+d), g1, · · · , gp ∈ Cb(R1+d","inline":true},{"text":"), ","element":"span"},{"style":{"height":18.06},"width":336.32,"height":45.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-19.png","element":"img","alt":"q1, · · · , qp ∈ Cb(RM","inline":true},{"text":"), and 0 ","element":"span"},{"style":{"height":15.1},"width":335.6,"height":37.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-20.png","element":"img","alt":" ≤ s1 < · · · < sp ≤ t","inline":true},{"text":".","element":"span"}],[{"style":{"width":"98%"},"width":1844,"height":347,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-21.png","element":"img"}],[{"text":"Then, using equations ","element":"span"},{"href":"#id-36","text":"(2.4) ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-37","text":"(2.6)","element":"a"},{"text":", we obtain","element":"span"}],[{"style":{"width":"83%"},"width":1558,"height":760,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-22.png","element":"img"}],[{"text":"Since ","element":"span"},{"style":{"height":16},"width":57.56,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-23.png","element":"img","alt":" F(·","inline":true},{"text":") is continuous and ","element":"span"},{"style":{"height":17.36},"width":97.56,"height":43.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-24.png","element":"img","alt":" F(µN","inline":true},{"text":") is uniformly bounded (due to the uniform boundedness results of Section 
","element":"span"},{"text":"3)","element":"span"},{"text":",","element":"span"}],[{"style":{"width":"14%"},"width":278,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-25.png","element":"img"}],[{"text":"Since this holds for each ","element":"span"},{"style":{"height":10.8},"width":54.36,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-26.png","element":"img","alt":" t ∈","inline":true,"padRight":true},{"text":"[0","element":"span"},{"text":", T ","element":"span"},{"text":"], ","element":"span"},{"style":{"height":17.97},"width":124,"height":44.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-27.png","element":"img","alt":" f ∈ C2b","inline":true,"padRight":true},{"text":"(","element":"span"},{"style":{"height":13.55},"width":86.12,"height":33.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-28.png","element":"img","alt":"R1+d","inline":true},{"text":") and ","element":"span"},{"style":{"height":18.26},"width":553.16,"height":45.64,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-29.png","element":"img","alt":" g1, · · · , gp, q1, · · · , qp ∈ Cb(R1+d","inline":true},{"text":"), (","element":"span"},{"style":{"height":14},"width":64.76,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-30.png","element":"img","alt":"µ, h","inline":true},{"text":") satisfies the ","element":"span"},{"text":"evolution equation ","element":"span"},{"href":"#id-6","text":"(1.3)","element":"a"},{"text":".","element":"span"}]]},{"heading":"5 Proof of Convergence","paragraphs":[[{"text":"We now combine the previous results of Sections ","element":"span"},{"text":"3 ","element":"span"},{"text":"and ","element":"span"},{"text":"4 ","element":"span"},{"text":"to prove Theorem ","element":"span"},{"href":"#id-38","text":"1.2. 
","element":"a"},{"text":"Let ","element":"span"},{"style":{"height":13.36},"width":51,"height":33.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-31.png","element":"img","alt":" πN","inline":true,"padRight":true},{"text":"be the probability measure corresponding to (","element":"span"},{"style":{"height":16.56},"width":122.04,"height":41.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-32.png","element":"img","alt":"µN, hN","inline":true},{"text":"). Each ","element":"span"},{"style":{"height":13.36},"width":51,"height":33.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-33.png","element":"img","alt":" πN","inline":true,"padRight":true},{"text":"takes values in the set of probability measures ","element":"span"},{"style":{"height":19.2},"width":123.36,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-34.png","element":"img","alt":" M�DE","inline":true},{"text":"([0","element":"span"},{"text":", T ","element":"span"},{"text":"])","element":"span"},{"style":{"height":19.2},"width":18,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-35.png","element":"img","alt":"�","inline":true},{"text":". Relative compactness, proven in Section ","element":"span"},{"text":"3, ","element":"span"},{"text":"implies that every subsequence ","element":"span"},{"style":{"height":13.36},"width":64.44,"height":33.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-36.png","element":"img","alt":" πNk","inline":true,"padRight":true},{"text":"has a further sub-sequence ","element":"span"},{"style":{"height":13.36},"width":89.32,"height":33.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-37.png","element":"img","alt":"πNkm","inline":true,"padRight":true},{"text":"which weakly converges. 
Section ","element":"span"},{"text":"4 ","element":"span"},{"text":"proves that any limit point ","element":"span"},{"style":{"height":6.8},"width":23,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-38.png","element":"img","alt":" π","inline":true,"padRight":true},{"text":"of ","element":"span"},{"style":{"height":13.36},"width":89.32,"height":33.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-39.png","element":"img","alt":" πNkm","inline":true,"padRight":true},{"text":"will satisfy the evolution equation ","element":"span"},{"href":"#id-6","text":"(1.3)","element":"a"},{"text":". Equation ","element":"span"},{"href":"#id-6","text":"(1.3) ","element":"a"},{"text":"is a finite-dimensional, linear equation and therefore has a unique solution. Therefore, by Prokhorov’s Theorem, ","element":"span"},{"style":{"height":13.36},"width":51,"height":33.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-40.png","element":"img","alt":" πN","inline":true,"padRight":true},{"text":"weakly converges to ","element":"span"},{"style":{"height":6.8},"width":23,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-41.png","element":"img","alt":" π","inline":true},{"text":", where ","element":"span"},{"style":{"height":6.8},"width":23,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-42.png","element":"img","alt":" π","inline":true,"padRight":true},{"text":"is the distribution of (","element":"span"},{"style":{"height":14},"width":64.76,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-43.png","element":"img","alt":"µ, h","inline":true},{"text":"), the unique solution of ","element":"span"},{"href":"#id-6","text":"(1.3)","element":"a"},{"text":". 
That is, (","element":"span"},{"style":{"height":16.75},"width":122.04,"height":41.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-44.png","element":"img","alt":"µN, hN","inline":true},{"text":") converges in distribution to (","element":"span"},{"style":{"height":14},"width":64.76,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/14-45.png","element":"img","alt":"µ, h","inline":true},{"text":").","element":"span"}]]},{"heading":"6 Proof of Corollary 1.4","paragraphs":[[{"text":"This section proves that under reasonable hyperparameter choices, the matrix ","element":"span"},{"text":"A ","element":"span"},{"text":"in the limit equation will be positive definite.","element":"span"}],[{"text":"Proof. ","element":"span"},{"text":"We first show that ","element":"span"},{"text":"A ","element":"span"},{"text":"is equivalent to the covariance matrix of the random variables ","element":"span"},{"text":"U ","element":"span"},{"text":"= ","element":"span"},{"style":{"height":38.4},"width":420.68,"height":96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/15-0.png","element":"img","alt":"�U(x(1)), . . . 
, U(x(M))�","inline":true},{"text":",","element":"span"}],[{"text":"which are defined as","element":"span"}],[{"style":{"width":"100%"},"width":1875,"height":333,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/15-1.png","element":"img"}],[{"text":"To prove that ","element":"span"},{"text":"A ","element":"span"},{"text":"is positive definite, we need to show that ","element":"span"},{"style":{"height":12.4},"width":139.48,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/15-2.png","element":"img","alt":" z⊤Az >","inline":true,"padRight":true},{"text":"0 for every non-zero ","element":"span"},{"style":{"height":14.16},"width":129.92,"height":35.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/15-3.png","element":"img","alt":" z ∈ RM","inline":true},{"text":".","element":"span"}],[{"style":{"width":"79%"},"width":1485,"height":344,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/15-4.png","element":"img"}],[{"text":"The functions ","element":"span"},{"style":{"height":18.35},"width":174,"height":45.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/15-5.png","element":"img","alt":" σ(x(i) · W","inline":true},{"text":") are linearly independent since the ","element":"span"},{"style":{"height":14.35},"width":58.08,"height":35.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/15-6.png","element":"img","alt":" x(i)","inline":true,"padRight":true},{"text":"are in distinct directions (see Remark 3.1 of ","element":"span"},{"text":"[1]","element":"span"},{"text":"). 
Therefore, for each non-zero ","element":"span"},{"text":"z","element":"span"},{"text":", there exists a point ","element":"span"},{"style":{"height":10.96},"width":45.28,"height":27.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/15-7.png","element":"img","alt":" w∗","inline":true,"padRight":true},{"text":"such that","element":"span"}],[{"style":{"width":"96%"},"width":1807,"height":403,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/15-8.png","element":"img"}],[{"text":"for some ","element":"span"},{"style":{"height":12.4},"width":63.16,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/15-9.png","element":"img","alt":" η >","inline":true,"padRight":true},{"text":"0 such that for (","element":"span"},{"style":{"height":16},"width":160.12,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/15-10.png","element":"img","alt":"c, w) ∈ B","inline":true}],[{"style":{"width":"44%"},"width":838,"height":117,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/15-11.png","element":"img"}],[{"text":"Then,","element":"span"}],[{"style":{"width":"100%"},"width":1882,"height":503,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1907.04108/images/15-12.png","element":"img"}],[{"text":"and ","element":"span"},{"text":"A ","element":"span"},{"text":"is positive definite, concluding the proof of the Corollary.","element":"span"}]]},{"heading":"References","paragraphs":[[{"id":"id-0","text":"[1] Yoshifusa Ito. Nonlinearity creates linear independence. Advances in Computational Mathematics, 5: ","element":"span"},{"text":"189-203, 1996.","element":"span"}],[{"text":"[2] X. Glorot and Y. Bengio. Understanding the difficulty of training deep feedforward neural networks. Proceedings of the thirteenth international conference on artificial intelligence and statistics, pp. 249-256. 2010.","element":"span"}],[{"id":"id-2","text":"[3] S. Du, J. Lee, H. Li, L. 
Wang, and X. Zhai. Gradient Descent Finds Global Minima of Deep Neural ","element":"span"},{"text":"Networks. Proceedings of the 36th International Conference on Machine Learning, Long Beach, California, PMLR 97, 2019.","element":"span"}],[{"id":"id-3","text":"[4] S. Du, X. Zhai, B. Poczos, and A. Singh. Gradient Descent Provably Optimizes Over-Parameterized ","element":"span"},{"text":"Neural Networks. ICLR, 2019.","element":"span"}],[{"id":"id-18","text":"[5] D. Zou, Y. Cao, D. Zhou, and Q. Gu. Stochastic Gradient Descent Optimizes Over-parameterized Deep ","element":"span"},{"text":"ReLU Networks. arXiv: 1811.08888, 2018.","element":"span"}],[{"text":"[6] A. Jacot, F. Gabriel, and C. Hongler. Neural Tangent Kernel: Convergence and Generalization in Neural Networks. ","element":"span"},{"text":"32nd Conference on Neural Information Processing Systems (NeurIPS 2018), Montreal, Canada.","element":"span"}],[{"id":"id-15","text":"[7] S. Ethier and T. Kurtz. Markov Processes: Characterization and Convergence. 1986, Wiley, New York, ","element":"span"},{"text":"MR0838085.","element":"span"}],[{"text":"[8] K. Hornik, M. Stinchcombe, and H. White. Multilayer feedforward networks are universal approximators. Neural Networks, 2(5), 359-366, 1989.","element":"span"}],[{"id":"id-16","text":"[9] K. Hornik. Approximation capabilities of multilayer feedforward networks. Neural Networks, 4(2), ","element":"span"},{"text":"251-257, 1991.","element":"span"}],[{"id":"id-17","text":"[10] C. Kuan and K. Hornik. ","element":"span"},{"text":"Convergence of learning algorithms with constant learning rates. ","element":"span"},{"text":"IEEE Transactions on Neural Networks, 2(5), 484-489, 1991.","element":"span"}],[{"id":"id-10","text":"[11] S. Mei, A. Montanari, and P. Nguyen. 
A mean field view of the landscape of two-layer neural networks. ","element":"span"},{"text":"Proceedings of the National Academy of Sciences, 115 (33) E7665-E7671, 2018.","element":"span"}],[{"id":"id-11","text":"[12] J. Sirignano and K. Spiliopoulos. Mean Field Analysis of Neural Networks: A Law of Large Numbers. ","element":"span"},{"text":"SIAM Journal on Applied Mathematics, Vol. 80, Issue 2, pp. 725–752, 2020.","element":"span"}],[{"id":"id-12","text":"[13] J. Sirignano and K. Spiliopoulos. Mean Field Analysis of Neural Networks: A Central Limit Theorem. ","element":"span"},{"text":"Stochastic Processes and their Applications, Volume 130, Issue 3, March 2020, pp. 1820-1852, 2020.","element":"span"}],[{"id":"id-14","text":"[14] J. Sirignano and K. Spiliopoulos. ","element":"span"},{"text":"Mean Field Analysis of Deep Neural Networks. ","element":"span"},{"text":"Mathematics of Operations Research, 2021, to appear.","element":"span"}],[{"id":"id-13","text":"[15] G. M. Rotskoff and E. Vanden-Eijnden. Neural Networks as Interacting Particle Systems: Asymptotic ","element":"span"},{"text":"Convexity of the Loss Landscape and Universal Scaling of the Approximation Error. ","element":"span"},{"href":"http://arxiv.org/abs/1805.00915","text":"arXiv:1805.00915, ","element":"a"},{"text":"2018.","element":"span"}]]}],"_version":"3.3.2"},"paperNode":"$1b:props:children:props:children:0:props:product"}]]]}]}]