1b:["$","$L29",null,{"isWhiteLabelled":false,"children":["$","$Lb",null,{"pt":{"compact":0,"expanded":3},"children":[["$","$L2a",null,{"noStar":true,"publisher":true,"task":true,"params":true,"size":"xl","product":{"id":"eyJwYXBlcklEIjoiMjAwMy4wNzk1MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","updated":"2023-02-23T17:02:04.000Z","paperID":"2003.07953","published":"2020-03-17T21:39:11.000Z","authors":"[\"Shounak Chattopadhyay\",\"Antik Chakraborty\",\"David B. Dunson\"]","title":"Nearest Neighbor Dirichlet Mixtures","scoreTrending":null,"summary":"$2b","lastCheckedForCode":"2023-02-24T06:08:12.467Z","links":[{"id":"eyJ1cmwiOiJodHRwczovL3BhcGVyc3dpdGhjb2RlLmNvbS9wYXBlci9uZWFyZXN0LW5laWdoYm9yLWRpcmljaGxldC1wcm9jZXNzIn0=","type":"pwc","url":"https://paperswithcode.com/paper/nearest-neighbor-dirichlet-process","data":"{\"date\":\"2024-09-04T20:15:30.035Z\"}"},{"id":"eyJ1cmwiOiJodHRwczovL2dpdGh1Yi5jb20vc2hvdW5ha2NoYXR0b3BhZGh5YXkvbm4tZHAifQ==","type":"code","url":"https://github.com/shounakchattopadhyay/nn-dp","data":null},{"id":"eyJ1cmwiOiJodHRwczovL2dpdGh1Yi5jb20vc2hvdW5ha2NoYXR0b3BhZGh5YXkvbm4tZG0ifQ==","type":"code","url":"https://github.com/shounakchattopadhyay/nn-dm","data":null},{"id":"eyJ1cmwiOiJodHRwczovL3BhcGVyc3dpdGhjb2RlLmNvbS9tZXRob2QvY29udm9sdXRpb24ifQ==","type":"method","url":"https://paperswithcode.com/method/convolution","data":null}],"reposConnection":{"edges":[{"official":null,"node":{"id":"eyJyZXBvSUQiOiIyMTU2NjczOTgiLCJzb3VyY2UiOiJnaXRodWIifQ==","source":"github","repoID":"215667398","url":"https://github.com/shounakch/NN-DM","title":"NN-DM","language":"c++","stars":1,"forks":1,"framework":null,"scoreTrending":null,"updated":null,"created":null,"downloads":null,"likes":null,"owner":[{"username":"shounakch","avatar":"https://avatars.githubusercontent.com/u/42551357?v=4"}]}}]},"models":[],"tags":[{"id":"eyJuYW1lIjoiZGVuc2l0eSBlc3RpbWF0aW9uIiwidHlwZSI6InRhc2sifQ==","name":"density estimation","description":"In density estimation, the input is a 
dataset and the output is a model that estimates the probability density function of the underlying population. This task is often used in anomaly detection, where data points in low probability regions are considered anomalies.","scoreTrending":null,"count":{"stars":2296,"papers":1183,"models":1528},"__typename":"Tag"},{"id":"eyJuYW1lIjoidW5jZXJ0YWludHkgcXVhbnRpZmljYXRpb24iLCJ0eXBlIjoidGFzayJ9","name":"uncertainty quantification","description":"Uncertainty quantification in machine learning involves inputting data into a model and outputting predictions with associated measures of uncertainty. This task is crucial in fields like medical imaging, geophysics, and engineering where making decisions based on predictions from small datasets is common, and understanding the level of confidence in these predictions can significantly impact the outcomes.","scoreTrending":0.16381172005331637,"count":{"stars":2523,"papers":1317,"models":641},"__typename":"Tag"}],"summaries":[],"emailsConnection":{"edges":[{"author":"david b 
dunson","node":{"id":"eyJhZGRyZXNzIjoiZHVuc29uQGR1a2UuZWR1In0=","address":"dunson@duke.edu","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[{"avatar":"https://avatars.githubusercontent.com/u/22959677?v=4","username":"david-dunson"}],"scholar":[{"thirdPartyID":"KwEOawwAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiJlYjViYTA3OC1kNjJhLTQ5NjItOTNmNC1jNGU5M2JjYTJkNmEifQ==","name":"david b 
dunson","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMTMxMS40NjY5IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1311.4669"},{"id":"eyJwYXBlcklEIjoiMTQwMy4yNjYwIiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1403.2660"},{"id":"eyJwYXBlcklEIjoiMTUwNi4wMzE2NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1506.03164"},{"id":"eyJwYXBlcklEIjoiMTIxMC4yMDIyIiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1210.2022"},{"id":"eyJwYXBlcklEIjoiMTQxMC42NjA0IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1410.6604"},{"id":"eyJwYXBlcklEIjoiMTUwMS4wNTM0OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1501.05349"},{"id":"eyJwYXBlcklEIjoiMTMxMi4xMDk5IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1312.1099"},{"id":"eyJwYXBlcklEIjoiMTQwMS4zNjMyIiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1401.3632"},{"id":"eyJwYXBlcklEIjoiMTYwMy4wNTMyNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1603.05324"},{"id":"eyJwYXBlcklEIjoiMTkwNC4xMTEzMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1904.11131"},{"id":"eyJwYXBlcklEIjoiMTgwMS4wMTA2MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1801.01061"},{"id":"eyJwYXBlcklEIjoiMjAxMC4wODkwOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.08908"},{"id":"eyJwYXBlcklEIjoiMjAwOC4wNzExMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2008.07110"},{"id":"eyJwYXBlcklEIjoiMTgwNS4wODEwMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1805.08102"},{"id":"eyJwYXBlcklEIjoiMjAwMy4wNzk1MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2003.07953"},{"id":"eyJwYXBlcklEIjoiMTUwNi4wMzc2OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1506.03768"},{"id":"eyJwYXBlcklEIjoiMTkwMS4wMDE3MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1901.00172"},{"id":"eyJwYXBlcklEIjoiMTkxMS4wMjcyOCIsInB
1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1911.02728"},{"id":"eyJwYXBlcklEIjoiMjAxMC4xNDA1NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.14056"}]}]}},{"author":"shounak chattopadhyay","node":{"id":"eyJhZGRyZXNzIjoic2hvdW5hay5jaGF0dG9wYWRoeWF5QGR1a2UuZWR1In0=","address":"shounak.chattopadhyay@duke.edu","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}}],"github":[],"scholar":[{"thirdPartyID":"NRFqmXgAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiIxNDdhZjYwYS1jMjRkLTRjMmQtYjJhYi01NWYwMGNhNzA1YTYifQ==","name":"shounak chattopadhyay","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMjAwMy4wNzk1MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2003.07953"}]}]}},{"author":"antik chakraborty","node":{"id":"eyJhZGRyZXNzIjoiYW50aWsuY2hha3JhYm9ydHlAZHVrZS5lZHUifQ==","address":"antik.chakraborty@duke.edu","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}}],"github":[{"avatar":"https://avatars1.githubusercontent.com/u/17835119?v=4","username":"antik015"}],"scholar":[{"thirdPartyID":"dut8vUwAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiI5NGM1OTkyMi1iMTBmLTRhZWQtODAzNi05ZDVjNTQzYjM1OGUifQ==","name":"Antik Chakraborty","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMjAwMy4wNzk1MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2003.07953"},{"id":"eyJwYXBlcklEIjoiMTkxMi4xMTYwMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1912.11600"}]}]}},{"author":"antik 
chakraborty","node":{"id":"eyJhZGRyZXNzIjoiYW50aWswMTVAcHVyZHVlLmVkdSJ9","address":"antik015@purdue.edu","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[],"scholar":[],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiI5MTliZWNmYi0yY2Y4LTRhMjctYjhhNS01MTBiODc1NWE2YjgifQ==","name":"antik chakraborty","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMjAwMy4wNzk1MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2003.07953"},{"id":"eyJwYXBlcklEIjoiMjQwNC4xNzc2MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2404.17763"}]}]}}]},"__typename":"paper","authorArray":["Shounak Chattopadhyay","Antik Chakraborty","David B. Dunson"]}}],["$","$L18",null,{"container":true,"columns":100,"spacing":{"compact":0,"expanded":2,"large":3},"children":[["$","$L18",null,{"size":{"compact":100,"expanded":100,"large":68},"children":[["$","$7",null,{"children":["$","$L2c",null,{"publisher":"arxiv","paperID":"2003.07953","product":{"paper":"$1b:props:children:props:children:0:props:product","models":"$1b:props:children:props:children:0:props:product:models"},"isWhiteLabelled":false}]}],["$","$7",null,{"children":["$","$L2d",null,{"article":"$L2e","model":"$undefined"}]}]]}],["$","$L18",null,{"size":"grow","children":["$","$L2f",null,{}]}]]}],["$","$7",null,{"children":null}],[["$","audio",null,{"id":"tts"}],["$","$L30",null,{"paperID":"2003.07953","publisher":"arxiv","paperJSON":{"title":"Nearest Neighbor Dirichlet Mixtures","paperID":"2003.07953","avgLineHeight":17.28,"imgScale":4,"sections":[{"heading":"Abstract","paragraphs":[[{"text":"$31","element":"span"}],[{"style":{"width":"100%"},"width":1872,"height":141,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/0-0.png","element":"img"}]]},{"heading":"1 Introduction","paragraphs":[[{"text":"Bayesian nonparametric methods provide a useful alternative 
to black box machine learning algorithms, having potential advantages in terms of characterizing uncertainty in inferences and predictions. However, computation can be slow and unwieldy to implement. Hence, it is important to develop simpler and faster Bayesian nonparametric approaches, and ","element":"span"},{"text":"hybrid ","element":"span"},{"text":"methods that borrow the best of both worlds. For example, if one could use the Bayesian machinery for uncertainty quantification and reduction of mean square errors through shrinkage, while incorporating algorithmic aspects of machine learning approaches, one may be able to engineer a highly effective hybrid. The focus of this article is on proposing such an approach for density estimation, motivated by the successes and limitations of nearest neighbor algorithms and Bayesian mixture models.","element":"span"}],[{"text":"Nearest neighbor algorithms are popular due to a combination of simplicity and performance. Given a set of ","element":"span"},{"text":"n ","element":"span"},{"text":"observations ","element":"span"},{"style":{"height":21.71},"width":528.72,"height":54.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/1-0.png","element":"img","alt":" X (n) = (X1, . . . 
, Xn) in Rp","inline":true},{"text":", the density at ","element":"span"},{"text":"x ","element":"span"},{"text":"is estimated as ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":20},"width":633.16,"height":50,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/1-1.png","element":"img","alt":"fknn(x) = k/(nVpRpk), where k","inline":true,"padRight":true},{"text":"is the number of neighbors of ","element":"span"},{"style":{"height":21.71},"width":524.44,"height":54.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/1-2.png","element":"img","alt":" x in X (n), Rk = Rk(x) is","inline":true,"padRight":true},{"text":"the distance of ","element":"span"},{"text":"x ","element":"span"},{"text":"from its ","element":"span"},{"text":"k","element":"span"},{"text":"th nearest neighbor in ","element":"span"},{"style":{"height":21.79},"width":255.6,"height":54.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/1-3.png","element":"img","alt":" X (n) and Vp","inline":true,"padRight":true},{"text":"is the volume of the ","element":"span"},{"text":"p","element":"span"},{"text":"-dimensional unit ball (","element":"span"},{"href":"#id-0","referenceIndex":26,"text":"Loftsgaarden and Quesenberry","element":"a"},{"text":", ","element":"span"},{"href":"#id-0","referenceIndex":26,"text":"1965","element":"a"},{"text":"; ","element":"span"},{"href":"#id-1","referenceIndex":30,"text":"Mack and Rosenblatt","element":"a"},{"text":", ","element":"span"},{"href":"#id-1","referenceIndex":30,"text":"1979","element":"a"},{"text":"). 
Refer to ","element":"span"},{"href":"#id-2","referenceIndex":3,"text":"Biau and Devroye ","element":"a"},{"text":"(","element":"span"},{"href":"#id-2","referenceIndex":3,"text":"2015","element":"a"},{"text":") for an overview of related estimators and corresponding theory.","element":"span"}],[{"text":"Nearest neighbor density estimators are a type of locally adaptive kernel density estimators. The literature on such methods identifies two broad classes: ","element":"span"},{"text":"balloon estimators ","element":"span"},{"text":"and ","element":"span"},{"text":"sample smoothing estimators","element":"span"},{"text":"; see ","element":"span"},{"href":"#id-3","referenceIndex":40,"text":"Scott ","element":"a"},{"text":"(","element":"span"},{"href":"#id-3","referenceIndex":40,"text":"2015","element":"a"},{"text":"); ","element":"span"},{"href":"#id-4","referenceIndex":44,"text":"Terrell and Scott ","element":"a"},{"text":"(","element":"span"},{"href":"#id-4","referenceIndex":44,"text":"1992","element":"a"},{"text":") for an overview. ","element":"span"},{"text":"Balloon estimators ","element":"span"},{"text":"characterize the density at a query point ","element":"span"},{"text":"x ","element":"span"},{"text":"using a bandwidth function ","element":"span"},{"text":"h","element":"span"},{"text":"(","element":"span"},{"text":"x","element":"span"},{"text":"); classical examples include the naive ","element":"span"},{"text":"k","element":"span"},{"text":"-nearest neighbor density estimator (","element":"span"},{"href":"#id-0","referenceIndex":26,"text":"Loftsgaarden and Quesenberry","element":"a"},{"text":", ","element":"span"},{"href":"#id-0","referenceIndex":26,"text":"1965","element":"a"},{"text":") and its modification in ","element":"span"},{"href":"#id-1","referenceIndex":30,"text":"Mack and Rosenblatt ","element":"a"},{"text":"(","element":"span"},{"href":"#id-1","referenceIndex":30,"text":"1979","element":"a"},{"text":"). 
More elaborate balloon estimators face challenges in terms of choice of ","element":"span"},{"text":"h","element":"span"},{"text":"(","element":"span"},{"text":"x","element":"span"},{"text":") and obtaining estimators that do not integrate to 1. ","element":"span"},{"text":"Sample smoothing estimators ","element":"span"},{"text":"make use of ","element":"span"},{"text":"n ","element":"span"},{"text":"different bandwidths ","element":"span"},{"style":{"height":19.6},"width":96,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/1-4.png","element":"img","alt":" h(Xi","inline":true},{"text":"), one for each sample point ","element":"span"},{"style":{"height":16.08},"width":50.88,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/1-5.png","element":"img","alt":" Xi","inline":true},{"text":", to estimate the density at a query point ","element":"span"},{"text":"x ","element":"span"},{"text":"globally. By construction, sample smoothing estimators are ","element":"span"},{"text":"bona fide ","element":"span"},{"text":"density functions integrating to 1. To fit either the balloon or the sample smoothing estimator, one may compute an initial pilot density estimator employing a constant bandwidth and then use this pilot to estimate the bandwidth function (","element":"span"},{"href":"#id-5","referenceIndex":6,"text":"Breiman et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-5","referenceIndex":6,"text":"1977","element":"a"},{"text":"; ","element":"span"},{"href":"#id-6","referenceIndex":1,"text":"Abramson","element":"a"},{"text":", ","element":"span"},{"href":"#id-6","referenceIndex":1,"text":"1982","element":"a"},{"text":"). 
Another example of a locally adaptive density estimator is the local likelihood density estimator (","element":"span"},{"href":"#id-7","referenceIndex":25,"text":"Loader","element":"a"},{"text":", ","element":"span"},{"href":"#id-7","referenceIndex":25,"text":"1996","element":"a"},{"text":", ","element":"span"},{"href":"#id-8","referenceIndex":24,"text":"2006","element":"a"},{"text":"; ","element":"span"},{"href":"#id-9","referenceIndex":18,"text":"Hjort and Jones","element":"a"},{"text":", ","element":"span"},{"href":"#id-9","referenceIndex":18,"text":"1996","element":"a"},{"text":"), which fits a polynomial model in the neighborhood of a query point ","element":"span"},{"text":"x ","element":"span"},{"text":"to estimate the density at ","element":"span"},{"text":"x","element":"span"},{"text":", estimating the parameters of the local polynomial by maximizing a penalized local log-likelihood function. The above methods produce a point estimate of the density without uncertainty quantification (UQ).","element":"span"}],[{"text":"Alternatively, there is a Bayesian literature on locally adaptive kernel methods, which","element":"span"}],[{"text":"express the unknown density as:","element":"span"}],[{"id":"id-19","style":{"width":"76%"},"width":1439,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/2-0.png","element":"img"}],[{"text":"which is a mixture of ","element":"span"},{"text":"m ","element":"span"},{"text":"components, with the ","element":"span"},{"text":"h","element":"span"},{"text":"th having probability weight ","element":"span"},{"style":{"height":11.28},"width":46.4,"height":28.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/2-1.png","element":"img","alt":" πh","inline":true,"padRight":true},{"text":"and kernel parameters ","element":"span"},{"style":{"height":16.48},"width":41.6,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/2-2.png","element":"img","alt":" 
θh","inline":true},{"text":"; by allowing the location and bandwidth to vary across components, local adaptivity is obtained. A Bayesian specification is completed with prior ","element":"span"},{"style":{"height":16.08},"width":47.24,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/2-3.png","element":"img","alt":" P0","inline":true,"padRight":true},{"text":"for the kernel parameters and ","element":"span"},{"style":{"height":17.2},"width":53.96,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/2-4.png","element":"img","alt":" Q0","inline":true,"padRight":true},{"text":"for the weights. ","element":"span"},{"text":"In practice, it is common to rely on an over-fitted mixture model (","element":"span"},{"href":"#id-10","referenceIndex":39,"text":"Rousseau and Mengersen","element":"a"},{"text":", ","element":"span"},{"href":"#id-10","referenceIndex":39,"text":"2011","element":"a"},{"text":"), which chooses ","element":"span"},{"text":"m ","element":"span"},{"text":"as a pre-specified finite upper bound on the number of components, and lets","element":"span"}],[{"id":"id-20","style":{"width":"73%"},"width":1379,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/2-5.png","element":"img"}],[{"text":"By choosing ","element":"span"},{"style":{"height":9.2},"width":30,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/2-6.png","element":"img","alt":" α","inline":true,"padRight":true},{"text":"close to zero, this prior favors effective deletion of redundant components. Also, augmenting with component index ","element":"span"},{"style":{"height":19.2},"width":670.24,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/2-7.png","element":"img","alt":" ci ∈ {1, . . . , m}, for i = 1, . . . 
, n","inline":true},{"text":", a simple Gibbs sampler can be used for posterior computation, alternating between sampling (i) ","element":"span"},{"style":{"height":16.48},"width":186.73,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/2-8.png","element":"img","alt":" ci from a","inline":true,"padRight":true},{"text":"multinomial conditional posterior, for ","element":"span"},{"style":{"height":21.89},"width":1069.48,"height":54.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/2-9.png","element":"img","alt":" i = 1, . . . , n; (ii) θh | − ∼ P0(θh) �i:ci=h K(Xi; θh);","inline":true,"padRight":true},{"text":"and (iii) ","element":"span"},{"style":{"height":19.2},"width":155.56,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/2-10.png","element":"img","alt":" π | − ∼","inline":true,"padRight":true},{"text":"Dirichlet(","element":"span"},{"style":{"height":20.58},"width":1306.61,"height":51.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/2-11.png","element":"img","alt":"α + n1, . . . , α + nm), with nh = �ni=1 I(ci = h) for h = 1, . . . , m.","inline":true}],[{"text":"Relative to frequentist locally adaptive methods, Bayesian approaches are appealing in automatically providing a characterization of uncertainty in estimation, while having excellent practical performance for a broad variety of density shapes and dimensions. However, implementation typically relies on Markov chain Monte Carlo (MCMC), with the Gibbs sampler sketched above providing an example of a common algorithm used in practice. 
Unfortunately, current MCMC algorithms for posterior sampling in mixture models tend to face issues with ","element":"span"},{"text":"slow mixing","element":"span"},{"text":", meaning the sampler can take a very large number of iterations to adequately explore different posterior modes and obtain sufficiently accurate posterior summaries.","element":"span"}],[{"text":"MCMC inefficiency has motivated a literature on faster approaches, including sequential approximations (","element":"span"},{"href":"#id-11","referenceIndex":49,"text":"Wang and Dunson","element":"a"},{"text":", ","element":"span"},{"href":"#id-11","referenceIndex":49,"text":"2011","element":"a"},{"text":"; ","element":"span"},{"href":"#id-12","referenceIndex":52,"text":"Zhang et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-12","referenceIndex":52,"text":"2014","element":"a"},{"text":") and variational Bayes (","element":"span"},{"href":"#id-13","referenceIndex":4,"text":"Blei and Jordan","element":"a"},{"text":", ","element":"span"},{"href":"#id-13","referenceIndex":4,"text":"2006","element":"a"},{"text":"). ","element":"span"},{"text":"These methods are order dependent, tend to converge to local modes, and/or lack theory support. ","element":"span"},{"href":"#id-14","referenceIndex":34,"text":"Newton and Zhang ","element":"a"},{"text":"(","element":"span"},{"href":"#id-14","referenceIndex":34,"text":"1999","element":"a"},{"text":"); ","element":"span"},{"href":"#id-15","referenceIndex":33,"text":"Newton ","element":"a"},{"text":"(","element":"span"},{"href":"#id-15","referenceIndex":33,"text":"2002","element":"a"},{"text":") instead rely on predictive recursion. Such estimators are fast to compute and have theory support, but are also order dependent and do not provide a characterization of uncertainty. 
Alternatively, one can use a Polya tree as a conjugate prior (","element":"span"},{"href":"#id-16","referenceIndex":22,"text":"Lavine","element":"a"},{"text":", ","element":"span"},{"href":"#id-16","referenceIndex":22,"text":"1992","element":"a"},{"text":", ","element":"span"},{"href":"#id-17","referenceIndex":23,"text":"1994","element":"a"},{"text":"), and there is a rich literature on related multiscale and recursive partitioning approaches, such as the optional Polya tree (","element":"span"},{"href":"#id-18","referenceIndex":51,"text":"Wong and Ma","element":"a"},{"text":", ","element":"span"},{"href":"#id-18","referenceIndex":51,"text":"2010","element":"a"},{"text":"). However, Polya trees have disadvantages in terms of sensitivity to a base partition and a tendency to favor spiky/erratic densities. These disadvantages are","element":"span"}],[{"text":"inherited by most of the computationally fast modifications.","element":"span"}],[{"text":"This article develops an alternative to current locally adaptive density estimators, obtaining the practical advantages of Bayesian approaches in terms of uncertainty quantification and a tendency to have relatively good performance for a wide variety of true densities, but without the computational disadvantage due to the use of MCMC. This is accomplished with a ","element":"span"},{"text":"Nearest Neighbor-Dirichlet Mixture ","element":"span"},{"text":"(NN-DM) model. The basic idea is to rely on fast nearest neighbor search algorithms to group the data into local neighborhoods, and then condition on these neighborhoods in defining a Bayesian mixture model-based approach. Section ","element":"span"},{"text":"2 ","element":"span"},{"text":"outlines the NN-DM approach and describes implementation details for Gaussian kernels. Section ","element":"span"},{"text":"3 ","element":"span"},{"text":"provides some theory support for NN-DM. 
Section ","element":"span"},{"text":"4 ","element":"span"},{"text":"contains simulation experiments comparing NN-DM with a rich variety of competitors in univariate and multivariate examples, including an assessment of UQ performance. Section ","element":"span"},{"text":"5 ","element":"span"},{"text":"contains a real data application, and Section ","element":"span"},{"text":"6 ","element":"span"},{"text":"a discussion.","element":"span"}]]},{"heading":"2 Methodology","paragraphs":[[{"text":"2.1 ","element":"span"},{"text":"Nearest Neighbor Dirichlet Mixture Framework","element":"span"}],[{"text":"Let ","element":"span"},{"style":{"height":19.6},"width":152.36,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/3-0.png","element":"img","alt":" d(x1, x2","inline":true},{"text":") denote a distance metric between data points ","element":"span"},{"style":{"height":17.2},"width":622.92,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/3-1.png","element":"img","alt":" x1, x2 ∈ X . For X = Rp, the","inline":true,"padRight":true},{"text":"Euclidean distance is typically chosen. For each ","element":"span"},{"style":{"height":20.48},"width":522.6,"height":51.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/3-2.png","element":"img","alt":" i ∈ {1, 2, . . . , n}, let Xi[j]","inline":true,"padRight":true},{"text":"denote the ","element":"span"},{"text":"j","element":"span"},{"text":"th nearest neighbor to ","element":"span"},{"style":{"height":16.08},"width":50.88,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/3-3.png","element":"img","alt":" Xi","inline":true,"padRight":true},{"text":"in the data ","element":"span"},{"style":{"height":21.71},"width":414.12,"height":54.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/3-4.png","element":"img","alt":" X (n) = (X1, . . . 
, Xn","inline":true},{"text":"), such that ","element":"span"},{"style":{"height":20.67},"width":421.48,"height":51.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/3-5.png","element":"img","alt":" d(Xi, Xi[1]) ≤ . . . ≤","inline":true},{"style":{"height":20.48},"width":205.8,"height":51.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/3-6.png","element":"img","alt":"d(Xi, Xi[n]","inline":true},{"text":"), with ties broken by increasing order of indices. The indices on the ","element":"span"},{"text":"k ","element":"span"},{"text":"nearest neighbors to ","element":"span"},{"style":{"height":16.08},"width":50.88,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/3-7.png","element":"img","alt":" Xi","inline":true,"padRight":true},{"text":"are denoted as ","element":"span"},{"style":{"height":21.07},"width":705.12,"height":52.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/3-8.png","element":"img","alt":" Ni = {j : d(Xi, Xj) ≤ d(Xi, Xi[k])}","inline":true},{"text":", where by convention we define ","element":"span"},{"style":{"height":19.28},"width":214.56,"height":48.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/3-9.png","element":"img","alt":" Xi[1] = Xi","inline":true},{"text":". Denote the set of data points in the ","element":"span"},{"text":"i","element":"span"},{"text":"-th neighborhood by ","element":"span"},{"style":{"height":19.68},"width":230.92,"height":49.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/3-10.png","element":"img","alt":" Si = {Xj :","inline":true},{"style":{"height":19.6},"width":157.44,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/3-11.png","element":"img","alt":"j ∈ Ni}","inline":true},{"text":". 
In implementing the proposed method, we typically let the number of neighbors ","element":"span"},{"text":"k ","element":"span"},{"text":"vary as a function of ","element":"span"},{"text":"n","element":"span"},{"text":". When necessary we use the notation ","element":"span"},{"style":{"height":16.48},"width":45.48,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/3-12.png","element":"img","alt":" kn","inline":true,"padRight":true},{"text":"to express this dependence. However, we routinely drop the ","element":"span"},{"text":"n ","element":"span"},{"text":"subscript for notational simplicity.","element":"span"}],[{"text":"Fixing ","element":"span"},{"style":{"height":14},"width":123.96,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/3-13.png","element":"img","alt":" x ∈ X","inline":true,"padRight":true},{"text":", we model the density of the data within the ","element":"span"},{"text":"i","element":"span"},{"text":"-th neighborhood using","element":"span"}],[{"id":"id-24","style":{"width":"64%"},"width":1204,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/3-14.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":16.48},"width":33.6,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/3-15.png","element":"img","alt":" θi","inline":true,"padRight":true},{"text":"are parameters specific to neighborhood ","element":"span"},{"text":"i ","element":"span"},{"text":"that are given a global prior distribution ","element":"span"},{"style":{"height":16.08},"width":47.24,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/3-16.png","element":"img","alt":"P0","inline":true},{"text":". 
To combine the ","element":"span"},{"style":{"height":19.6},"width":82.2,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/3-17.png","element":"img","alt":" fi(x","inline":true},{"text":")s into a single global ","element":"span"},{"text":"f","element":"span"},{"text":"(","element":"span"},{"text":"x","element":"span"},{"text":"), similarly to equations (","element":"span"},{"href":"#id-19","text":"1","element":"a"},{"text":")-(","element":"span"},{"href":"#id-20","text":"2","element":"a"},{"text":"), we let","element":"span"}],[{"id":"id-21","style":{"width":"87%"},"width":1638,"height":130,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/3-18.png","element":"img"}],[{"text":"The key difference relative to standard Bayesian mixture model (","element":"span"},{"href":"#id-19","text":"1","element":"a"},{"text":") is that in (","element":"span"},{"href":"#id-21","text":"4","element":"a"},{"text":") we include one component for each data sample and assume that only the data in the ","element":"span"},{"text":"k","element":"span"},{"text":"-nearest neighborhood of sample ","element":"span"},{"text":"i ","element":"span"},{"text":"will inform about ","element":"span"},{"style":{"height":16.48},"width":33.6,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/4-0.png","element":"img","alt":" θi","inline":true},{"text":". 
In contrast, (","element":"span"},{"href":"#id-19","text":"1","element":"a"},{"text":") lacks any sample dependence, and we infer allocation of samples to mixture components in a posterior inference phase.","element":"span"}],[{"text":"Given the restriction that only data in the ","element":"span"},{"text":"i","element":"span"},{"text":"-th neighborhood ","element":"span"},{"style":{"height":16.48},"width":40.8,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/4-1.png","element":"img","alt":" Si","inline":true,"padRight":true},{"text":"inform about ","element":"span"},{"style":{"height":17.2},"width":134.76,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/4-2.png","element":"img","alt":" θi, the","inline":true,"padRight":true},{"text":"pseudo-posterior distribution of ","element":"span"},{"style":{"height":16.48},"width":33.6,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/4-3.png","element":"img","alt":" θi","inline":true,"padRight":true},{"text":"given data ","element":"span"},{"style":{"height":17.2},"width":360.28,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/4-4.png","element":"img","alt":" Si and prior P0 is","inline":true}],[{"id":"id-25","style":{"width":"67%"},"width":1264,"height":109,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/4-5.png","element":"img"}],[{"text":"This pseudo-posterior is in a simple analytic form if ","element":"span"},{"style":{"height":16.08},"width":47.24,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/4-6.png","element":"img","alt":" P0","inline":true,"padRight":true},{"text":"is conjugate to ","element":"span"},{"style":{"height":19.6},"width":124.72,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/4-7.png","element":"img","alt":" K(x; θ","inline":true},{"text":"). 
The prior ","element":"span"},{"style":{"height":16.08},"width":47.24,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/4-8.png","element":"img","alt":"P0","inline":true,"padRight":true},{"text":"can involve unknown parameters and borrows information across neighborhoods; this reduces the large variance problem common to nearest neighbor estimators.","element":"span"}],[{"text":"Since the neighborhoods are overlapping, the conditional posterior for ","element":"span"},{"href":"#id-21","style":{"height":19.6},"width":300.76,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/4-9.png","element":"img","alt":" π under (4) is","inline":true,"padRight":true},{"text":"not exactly Dirichlet. ","element":"span"},{"text":"However, one can define the number of unique points in the ","element":"span"},{"text":"i","element":"span"},{"text":"-th neighborhood ","element":"span"},{"style":{"height":16.48},"width":40.8,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/4-10.png","element":"img","alt":" Si","inline":true,"padRight":true},{"text":"similar in spirit to the number of points in the ","element":"span"},{"text":"h","element":"span"},{"text":"-th cluster in mixture models of the form (","element":"span"},{"href":"#id-19","text":"1","element":"a"},{"text":"). By convention, we define the point ","element":"span"},{"style":{"height":16.08},"width":50.88,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/4-11.png","element":"img","alt":" Xi","inline":true,"padRight":true},{"text":"that generated its neighborhood ","element":"span"},{"style":{"height":16.48},"width":40.8,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/4-12.png","element":"img","alt":" Si","inline":true,"padRight":true},{"text":"to be a member of that neighborhood. 
For any other data point ","element":"span"},{"style":{"height":18.48},"width":53.88,"height":46.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/4-13.png","element":"img","alt":" Xj","inline":true,"padRight":true},{"text":"to be a unique member of the neighborhood generated by ","element":"span"},{"style":{"height":18},"width":264.64,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/4-14.png","element":"img","alt":" Xi for j ≠ i","inline":true},{"text":", we require ","element":"span"},{"style":{"height":19.68},"width":609.16,"height":49.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/4-15.png","element":"img","alt":" Xj ∈ Si but Xj ∉ Su for all","inline":true},{"style":{"height":19.68},"width":925.56,"height":49.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/4-16.png","element":"img","alt":"u = 1, . . . , n such that u ∉ {j, i}. That is, Xj","inline":true,"padRight":true},{"text":"lies in the neighborhood generated by ","element":"span"},{"style":{"height":16.48},"width":138,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/4-17.png","element":"img","alt":" Xi but","inline":true,"padRight":true},{"text":"does not lie in the neighborhood of any other ","element":"span"},{"style":{"height":16.08},"width":58.88,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/4-18.png","element":"img","alt":" Xu","inline":true},{"text":". In Section ","element":"span"},{"href":"#id-22","text":"3.2","element":"a"},{"text":", we show that the number of unique points defined as above approaches 1 as ","element":"span"},{"style":{"height":10.4},"width":163.2,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/4-19.png","element":"img","alt":" n → ∞","inline":true},{"text":". 
This motivates the following Dirichlet pseudo-posterior update for the neighborhood weights ","element":"span"},{"style":{"height":8.8},"width":41.32,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/4-20.png","element":"img","alt":" π:","inline":true}],[{"id":"id-23","style":{"width":"70%"},"width":1319,"height":56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/4-21.png","element":"img"}],[{"text":"This distribution is equivalent to the conditional posterior on the kernel weights in the Dirichlet mixture of equations (","element":"span"},{"href":"#id-19","text":"1","element":"a"},{"text":")-(","element":"span"},{"href":"#id-20","text":"2","element":"a"},{"text":"), but we use ","element":"span"},{"text":"n ","element":"span"},{"text":"components and fix the effective number of samples allocated to each component at one. The Dirichlet distribution in (","element":"span"},{"href":"#id-23","text":"6","element":"a"},{"text":") is centered on (1","element":"span"},{"text":"/n, . . . , ","element":"span"},{"text":"1","element":"span"},{"text":"/n","element":"span"},{"text":") with concentration parameter ","element":"span"},{"style":{"height":9.2},"width":30,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/4-22.png","element":"img","alt":" α","inline":true,"padRight":true},{"text":"+ 1. 
In practice, ","element":"span"},{"style":{"height":9.2},"width":30,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/4-23.png","element":"img","alt":" α","inline":true,"padRight":true},{"text":"is typically chosen to be close to zero, as motivated in Section 1, and we let ","element":"span"},{"style":{"height":16.48},"width":134.28,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/4-24.png","element":"img","alt":" k = kn","inline":true,"padRight":true},{"text":"increase slowly with the sample size.","element":"span"}],[{"text":"Based on equations (","element":"span"},{"href":"#id-24","text":"3","element":"a"},{"text":")-(","element":"span"},{"href":"#id-23","text":"6","element":"a"},{"text":"), our nearest neighbor-Dirichlet mixture produces a pseudo-posterior distribution for the unknown density ","element":"span"},{"text":"f","element":"span"},{"text":"(","element":"span"},{"text":"x","element":"span"},{"text":") through simple distributions for the parameters characterizing the density within each neighborhood and for the weights. To generate independent Monte Carlo samples from the pseudo-posterior for ","element":"span"},{"text":"f","element":"span"},{"text":", one can simply draw samples from (","element":"span"},{"href":"#id-25","text":"5","element":"a"},{"text":") and (","element":"span"},{"href":"#id-23","text":"6","element":"a"},{"text":") independently and plug these samples into the expression for ","element":"span"},{"text":"f","element":"span"},{"text":"(","element":"span"},{"text":"x","element":"span"},{"text":") in (","element":"span"},{"href":"#id-21","text":"4","element":"a"},{"text":"). Although this is not exactly a coherent fully Bayesian posterior distribution, we claim that it can be used as a practical alternative to such a posterior in practice. 
This claim is backed up by theoretical arguments, simulation studies and a real data application in the sequel.","element":"span"}],[{"id":"id-34","text":"2.2 ","element":"span"},{"text":"Illustration with Gaussian Kernels","element":"span"}],[{"text":"Suppose we have independent and identically distributed (iid) observations ","element":"span"},{"style":{"height":17.1},"width":288.36,"height":42.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-0.png","element":"img","alt":" X (n) from the","inline":true,"padRight":true},{"text":"density ","element":"span"},{"style":{"height":17.6},"width":801.24,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-1.png","element":"img","alt":" f, where Xi ∈ Rp for i = 1, . . . , n and f","inline":true,"padRight":true},{"text":"is an unknown density function with respect to the Lebesgue measure on ","element":"span"},{"style":{"height":21.25},"width":480.24,"height":53.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-2.png","element":"img","alt":" Rp for p ≥ 1. Let Rp×p+","inline":true,"padRight":true},{"text":"denote the set of all real-valued ","element":"span"},{"style":{"height":13.2},"width":110,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-3.png","element":"img","alt":" p × p","inline":true,"padRight":true},{"text":"positive definite matrices. Fix ","element":"span"},{"style":{"height":14},"width":138,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-4.png","element":"img","alt":" x ∈ Rp","inline":true},{"text":". 
We will illustrate the method for a general ","element":"span"},{"style":{"height":17.2},"width":200.24,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-5.png","element":"img","alt":" p ≥ 1 and","inline":true,"padRight":true},{"text":"note key changes for the special case ","element":"span"},{"text":"p ","element":"span"},{"text":"= 1. We proceed by setting ","element":"span"},{"style":{"height":19.6},"width":124.72,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-6.png","element":"img","alt":" K(x; θ","inline":true},{"text":") to be the multivariate Gaussian density ","element":"span"},{"style":{"height":21.79},"width":1349.64,"height":54.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-7.png","element":"img","alt":" φp(x; η, Σ) = (2π)−p/2|Σ|−1/2 exp {−(x − η)TΣ−1(x − η)/2}, where","inline":true},{"style":{"height":21.25},"width":699.6,"height":53.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-8.png","element":"img","alt":"θ = (η, Σ), η ∈ Rp and Σ ∈ Rp×p+ ","inline":true,"padRight":true},{"text":". 
We first compute the neighborhoods ","element":"span"},{"style":{"height":17.68},"width":51.36,"height":44.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-9.png","element":"img","alt":" Ni ","inline":true,"padRight":true},{"text":"corresponding ","element":"span"},{"text":"to ","element":"span"},{"style":{"height":16.08},"width":50.88,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-10.png","element":"img","alt":" Xi","inline":true,"padRight":true},{"text":"as in Section ","element":"span"},{"text":"2.1 ","element":"span"},{"text":"and place a normal-inverse Wishart prior on ","element":"span"},{"style":{"height":19.6},"width":225.6,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-11.png","element":"img","alt":" θi = (ηi, Σi","inline":true},{"text":"), given by (","element":"span"},{"style":{"height":19.68},"width":573.32,"height":49.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-12.png","element":"img","alt":"ηi, Σi) ∼ NIWp(µ0, ν0, γ0, Ψ0","inline":true},{"text":") independently for ","element":"span"},{"style":{"height":19.6},"width":879.6,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-13.png","element":"img","alt":" i = 1, . . . , n. 
That is, ηi | Σi ∼ N(µ0, Σi/ν0)","inline":true,"padRight":true},{"text":"and Σ","element":"span"},{"style":{"height":21.1},"width":1368.72,"height":52.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-14.png","element":"img","alt":"i ∼ IWp(γ0, Ψ0) with µ0 ∈ Rp, ν0 > 0, γ0 > p − 1 and Ψ0 ∈ Rp×p+ ","inline":true,"padRight":true},{"text":"; for details about ","element":"span"},{"text":"parametrization see Section ","element":"span"},{"text":"J ","element":"span"},{"text":"of the Appendix.","element":"span"}],[{"text":"For ","element":"span"},{"text":"p ","element":"span"},{"text":"= 1, we have a univariate Gaussian density ","element":"span"},{"style":{"height":20.51},"width":196.52,"height":51.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-15.png","element":"img","alt":" φ(x; ηi, σ2i ","inline":true,"padRight":true},{"text":") in neighborhood ","element":"span"},{"text":"i ","element":"span"},{"text":"with ","element":"span"},{"text":"normal-inverse gamma priors (","element":"span"},{"style":{"height":20.51},"width":674.84,"height":51.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-16.png","element":"img","alt":"ηi, σ2i ) ∼ NIG(µ0, ν0, γ0/2, γ0δ20/","inline":true},{"text":"2) independently for ","element":"span"},{"text":"i ","element":"span"},{"text":"= ","element":"span"},{"text":"1","element":"span"},{"style":{"height":20.7},"width":1848.52,"height":51.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-17.png","element":"img","alt":", . . . , n, with µ0 ∈ R and ν0, γ0, δ20 > 0. That is, ηi | σ2i ∼ N(µ0, σ2i /ν0) and σ2i ∼","inline":true,"padRight":true},{"text":"IG(","element":"span"},{"style":{"height":20.51},"width":477.2,"height":51.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-18.png","element":"img","alt":"γ0/2, γ0δ20/2). 
When p","inline":true,"padRight":true},{"text":"= 1, the IW","element":"span"},{"style":{"height":19.68},"width":155.24,"height":49.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-19.png","element":"img","alt":"p(γ0, Ψ0","inline":true},{"text":") density simplifies to an IG(","element":"span"},{"style":{"height":20.51},"width":260.88,"height":51.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-20.png","element":"img","alt":"γ0/2, γ0δ20/2)","inline":true,"padRight":true},{"text":"density with ","element":"span"},{"style":{"height":20.11},"width":239.08,"height":50.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-21.png","element":"img","alt":" δ20 = Ψ0/γ0.","inline":true}],[{"text":"Monte Carlo samples from the pseudo-posterior of ","element":"span"},{"text":"f","element":"span"},{"text":"(","element":"span"},{"text":"x","element":"span"},{"text":") given the data ","element":"span"},{"style":{"height":16.91},"width":255.24,"height":42.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-22.png","element":"img","alt":" X (n) can be","inline":true,"padRight":true},{"text":"obtained using Algorithm ","element":"span"},{"href":"#id-26","text":"1","element":"a"},{"text":". The corresponding steps for the univariate case are provided in Section ","element":"span"},{"text":"I ","element":"span"},{"text":"of the Appendix. 
Although the pseudo-posterior distribution of ","element":"span"},{"text":"f","element":"span"},{"text":"(","element":"span"},{"text":"x","element":"span"},{"text":") lacks an analytic form, we can obtain a simple form for its pseudo-posterior mean by integrating over the pseudo-posterior distribution of (","element":"span"},{"style":{"height":19.6},"width":271.24,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-23.png","element":"img","alt":"θi)ni=1 and π.","inline":true,"padRight":true},{"text":"Recall the definitions of ","element":"span"},{"style":{"height":18},"width":139.28,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-24.png","element":"img","alt":" µi and","inline":true,"padRight":true},{"text":"Ψ","element":"span"},{"style":{"height":8.4},"width":12,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-25.png","element":"img","alt":"i","inline":true,"padRight":true},{"text":"from Step 2 of Algorithm ","element":"span"},{"href":"#id-26","text":"1 ","element":"a"},{"text":"and define Λ","element":"span"},{"style":{"height":20.51},"width":943.56,"height":51.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-26.png","element":"img","alt":"i = {νn(γn − p + 1)}−1(νn + 1) Ψi. 
Then the","inline":true,"padRight":true},{"text":"pseudo-posterior mean of ","element":"span"},{"text":"f","element":"span"},{"text":"(","element":"span"},{"text":"x","element":"span"},{"text":") is given by","element":"span"}],[{"id":"id-29","style":{"width":"66%"},"width":1252,"height":131,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-27.png","element":"img"}],[{"text":"Here ","element":"span"},{"style":{"height":22.38},"width":1759.88,"height":55.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-28.png","element":"img","alt":" tγ(x; µ, Λ) = [Γ{(γ + p)/2}/Γ(γ/2)] (γπ)−p/2 |Λ|−1/2[1 + (x − µ)T(γΛ)−1(x − µ)]−(γ+p)/2","inline":true,"padRight":true},{"text":"for ","element":"span"},{"style":{"height":17.2},"width":312.08,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-29.png","element":"img","alt":" x ∈ Rp is the p","inline":true},{"text":"-dimensional Student’s t-density with degrees of freedom ","element":"span"},{"style":{"height":14.4},"width":78.28,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-30.png","element":"img","alt":" γ >","inline":true,"padRight":true},{"text":"0, location ","element":"span"},{"style":{"height":17.6},"width":161.04,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-31.png","element":"img","alt":"µ ∈ Rp ","inline":true,"padRight":true},{"text":"and scale matrix Λ ","element":"span"},{"style":{"height":21.25},"width":166.12,"height":53.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-32.png","element":"img","alt":" ∈ Rp×p+ .","inline":true,"padRight":true},{"text":"For the univariate case, we have ","element":"span"},{"style":{"height":18},"width":337.04,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/5-33.png","element":"img","alt":" γn and νn as 
in","inline":true}],[{"id":"id-26","style":{"width":"101%"},"width":1891,"height":974,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/6-0.png","element":"img"}],[{"text":"Algorithm ","element":"span"},{"href":"#id-26","text":"1","element":"a"},{"text":". The pseudo-posterior mean of ","element":"span"},{"text":"f","element":"span"},{"text":"(","element":"span"},{"text":"x","element":"span"},{"text":") is given by","element":"span"}],[{"id":"id-30","style":{"width":"66%"},"width":1253,"height":130,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/6-1.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":24.67},"width":1734.76,"height":61.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/6-2.png","element":"img","alt":" µi = ν−1n (ν0µ0 + k ¯Xi), ¯Xi = k−1 ∑j∈Ni Xj, λi = δi{(νn + 1)/νn}1/2, δ2i = γ−1n {γ0δ20 +","inline":true},{"style":{"height":23.97},"width":954.28,"height":59.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/6-3.png","element":"img","alt":"∑j∈Ni(Xj− ¯Xi)2+kν0ν−1n (µ0− ¯Xi)2}. Here tγn(·","inline":true},{"text":") represents the univariate Student’s t-density ","element":"span"},{"text":"with ","element":"span"},{"style":{"height":13.2},"width":45.48,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/6-4.png","element":"img","alt":" γn","inline":true,"padRight":true},{"text":"degrees of freedom.","element":"span"}],[{"id":"id-57","text":"2.3 ","element":"span"},{"text":"Hyperparameter Choice","element":"span"}],[{"text":"The hyperparameters in the prior for the neighborhood-specific parameters need to be chosen carefully – we found results to be sensitive to ","element":"span"},{"style":{"height":18},"width":197,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/6-5.png","element":"img","alt":" γ0 and Ψ0","inline":true},{"text":". 
If non-informative values are chosen for these key hyperparameters, we tend to inherit typical problems of nearest neighbor estimators including lack of smoothness and high variance. Suppose Σ ","element":"span"},{"style":{"height":19.68},"width":387.44,"height":49.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/6-6.png","element":"img","alt":" ∼ IWp(γ0, Ψ0) and","inline":true,"padRight":true},{"text":"for ","element":"span"},{"style":{"height":18.88},"width":640.92,"height":47.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/6-7.png","element":"img","alt":" i, j = 1, . . . , p, let Σij and Ψ0, ij","inline":true,"padRight":true},{"text":"denote the ","element":"span"},{"text":"i, j","element":"span"},{"text":"th entry of Σ and Ψ","element":"span"},{"style":{"height":8.4},"width":17,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/6-8.png","element":"img","alt":"0","inline":true},{"text":", respectively. Then Σ","element":"span"},{"style":{"height":19.68},"width":855.92,"height":49.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/6-9.png","element":"img","alt":"jj ∼ IG(γ∗/2, Ψ0, jj/2) where γ∗ = γ0 − p","inline":true,"padRight":true},{"text":"+ 1. 
Thus borrowing from the univariate case, we set Ψ","element":"span"},{"style":{"height":20.59},"width":428.28,"height":51.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/6-10.png","element":"img","alt":"0, jj = γ∗δ20 and Ψ0, ij ","inline":true,"padRight":true},{"text":"= 0 for all ","element":"span"},{"style":{"height":18},"width":107.84,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/6-11.png","element":"img","alt":" i ≠ j","inline":true},{"text":", which implies that Ψ","element":"span"},{"style":{"height":20.59},"width":420.84,"height":51.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/6-12.png","element":"img","alt":"0 = (γ∗δ20) Ip and we","inline":true,"padRight":true},{"text":"use leave-one-out cross-validation to select the optimum ","element":"span"},{"style":{"height":20.11},"width":234.32,"height":50.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/6-13.png","element":"img","alt":" δ20. With p","inline":true,"padRight":true},{"text":"dimensional data, we ","element":"span"},{"text":"recommend fixing ","element":"span"},{"style":{"height":13.2},"width":135.44,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/6-14.png","element":"img","alt":" γ0 = p","inline":true,"padRight":true},{"text":"which implies a multivariate Cauchy prior predictive density. 
We choose the leave-one-out log-likelihood as the criterion function for cross-validation, which is closely related to minimizing the Kullback-Leibler divergence between the true and estimated density (","element":"span"},{"href":"#id-27","referenceIndex":17,"text":"Hall","element":"a"},{"text":", ","element":"span"},{"href":"#id-27","referenceIndex":17,"text":"1987","element":"a"},{"text":"; ","element":"span"},{"href":"#id-28","referenceIndex":5,"text":"Bowman","element":"a"},{"text":", ","element":"span"},{"href":"#id-28","referenceIndex":5,"text":"1984","element":"a"},{"text":"). The explicit expression for the pseudo-posterior mean in equations (","element":"span"},{"href":"#id-29","text":"7","element":"a"},{"text":") and (","element":"span"},{"href":"#id-30","text":"8","element":"a"},{"text":") makes cross-validation computationally efficient. The description of a fast implementation is provided in Section ","element":"span"},{"text":"H ","element":"span"},{"text":"of the Appendix.","element":"span"}],[{"text":"The proposed method has substantially faster runtime if one bypasses cross-validation and uses a default choice of hyperparameters. In particular, we found the default values ","element":"span"},{"style":{"height":18.88},"width":826.8,"height":47.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/7-0.png","element":"img","alt":"µ0 = 0p, ν0 = 0.001, γ0 = p and Ψ0 = Ip","inline":true,"padRight":true},{"text":"to work well across a number of simulation cases, especially when the true density is smooth. Although using cross-validation to estimate ","element":"span"},{"style":{"height":20.11},"width":39.56,"height":50.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/7-1.png","element":"img","alt":" δ20","inline":true,"padRight":true},{"text":"can lead to improved performance when the underlying density is spiky, cross-validation provides little to no gains for smooth true densities. 
Furthermore, with low sample size and increasing number of dimensions, we found this gap to diminish rapidly.","element":"span"}],[{"text":"The other key tuning parameter for NN-DM is the number of nearest neighbors ","element":"span"},{"style":{"height":16.48},"width":149.32,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/7-2.png","element":"img","alt":" k = kn.","inline":true,"padRight":true},{"text":"The pseudo-posterior mean in (","element":"span"},{"href":"#id-29","text":"7","element":"a"},{"text":") reduces to a single ","element":"span"},{"style":{"height":19.28},"width":485.92,"height":48.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/7-3.png","element":"img","alt":" tγn−p+1 kernel if kn = n","inline":true},{"text":". In contrast, ","element":"span"},{"style":{"height":16.48},"width":45.48,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/7-4.png","element":"img","alt":"kn","inline":true,"padRight":true},{"text":"= 1 provides a sample smoothing kernel density estimate with a specific bandwidth function (","element":"span"},{"href":"#id-4","referenceIndex":44,"text":"Terrell and Scott","element":"a"},{"text":", ","element":"span"},{"href":"#id-4","referenceIndex":44,"text":"1992","element":"a"},{"text":"). Therefore, the choice of ","element":"span"},{"text":"k ","element":"span"},{"text":"can impact the smoothness of the density estimate. To assess the sensitivity of the NN-DM estimate to the choice of ","element":"span"},{"text":"k","element":"span"},{"text":", we investigate how the out-of-sample log-likelihood of a test set changes with respect to ","element":"span"},{"text":"k ","element":"span"},{"text":"in Section ","element":"span"},{"href":"#id-31","text":"4.7","element":"a"},{"text":". These simulations suggest that the proposed method is quite robust to the exact choice of ","element":"span"},{"text":"k","element":"span"},{"text":". 
In practice with finite samples and small dimensions, we recommend a default choice of ","element":"span"},{"style":{"height":21.31},"width":489,"height":53.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/7-5.png","element":"img","alt":" kn = ⌊n1/3⌋ + 1 and kn","inline":true,"padRight":true},{"text":"= 10 for univariate and multivariate cases, respectively. These values led to good performance across a wide variety of simulation cases as described in Section ","element":"span"},{"text":"4","element":"span"},{"text":".","element":"span"}]]},{"heading":"3 Theory","paragraphs":[[{"text":"3.1 ","element":"span"},{"text":"Asymptotic Properties","element":"span"}],[{"text":"There is a rich literature on asymptotic properties of the posterior measure for an unknown density under Bayesian models, providing a frequentist justification for Bayesian density estimation; refer, for example to ","element":"span"},{"href":"#id-32","referenceIndex":11,"text":"Ghosal et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-32","referenceIndex":11,"text":"1999","element":"a"},{"text":"), ","element":"span"},{"href":"#id-33","referenceIndex":13,"text":"Ghosal et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-33","referenceIndex":13,"text":"2007","element":"a"},{"text":"). Unfortunately, the tools developed in this literature rely critically on the mathematical properties of fully Bayes posteriors, providing theoretical guarantees for a computationally intractable exact posterior distribution under a Bayesian model. 
Our focus is instead on providing frequentist asymptotic guarantees for our computationally efficient NN-DM approach, with this task made much more complex by the dependence across neighborhoods induced by the use of a nearest neighbor procedure.","element":"span"}],[{"text":"We first focus on proving pointwise consistency of the pseudo-posterior of ","element":"span"},{"text":"f","element":"span"},{"text":"(","element":"span"},{"text":"x","element":"span"},{"text":") induced by (","element":"span"},{"href":"#id-24","text":"3","element":"a"},{"text":")-(","element":"span"},{"href":"#id-23","text":"6","element":"a"},{"text":") for each ","element":"span"},{"style":{"height":19.2},"width":218.16,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/7-6.png","element":"img","alt":" x ∈ [0, 1]p","inline":true},{"text":", using Gaussian kernels as in Section ","element":"span"},{"href":"#id-34","text":"2.2","element":"a"},{"text":". ","element":"span"},{"text":"We separately study the mean and variance of the NN-DM pseudo-posterior distribution, first showing that the pseudo-posterior mean in (","element":"span"},{"href":"#id-29","text":"7","element":"a"},{"text":") is pointwise consistent and then that the pseudo-posterior variance vanishes asymptotically. The key idea behind our proof is to show that the pseudo-posterior mean is asymptotically close to a kernel density estimator with suitably chosen bandwidth for fixed ","element":"span"},{"style":{"height":17.2},"width":311.52,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/8-0.png","element":"img","alt":" p and kn → ∞","inline":true,"padRight":true},{"text":"at a desired rate. The proof then follows from standard arguments leading to consistency of kernel density estimators. 
The NN-DM pseudo-posterior mean mimics a kernel density estimator only in the asymptotic regime; in finite sample simulation studies (refer to Section ","element":"span"},{"text":"4","element":"span"},{"text":"), NN-DM has much better performance. The detailed proofs of all results in this section are in the Appendix.","element":"span"}],[{"text":"Consider independent and identically distributed data ","element":"span"},{"style":{"height":16.51},"width":87.88,"height":41.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/8-1.png","element":"img","alt":" X (n) ","inline":true,"padRight":true},{"text":"from a fixed unknown density ","element":"span"},{"style":{"height":17.6},"width":40.04,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/8-2.png","element":"img","alt":"f0","inline":true,"padRight":true},{"text":"with respect to the Lebesgue measure on ","element":"span"},{"style":{"height":13.2},"width":52.56,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/8-3.png","element":"img","alt":" Rp ","inline":true,"padRight":true},{"text":"equipped with the Euclidean metric, inducing the measure ","element":"span"},{"style":{"height":21.79},"width":1617.6,"height":54.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/8-4.png","element":"img","alt":" Pf0 on B(Rp). 
We use E{f(x) | X (n)}, var{f(x) | X (n)} and pr{f(x) ∈ B | X (n)}","inline":true,"padRight":true},{"text":"to denote the mean of ","element":"span"},{"text":"f","element":"span"},{"text":"(","element":"span"},{"text":"x","element":"span"},{"text":"), variance of ","element":"span"},{"text":"f","element":"span"},{"text":"(","element":"span"},{"text":"x","element":"span"},{"text":") and probability of the event ","element":"span"},{"style":{"height":19.6},"width":318,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/8-5.png","element":"img","alt":" {f(x) ∈ B} for","inline":true},{"style":{"height":19.6},"width":199.92,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/8-6.png","element":"img","alt":"B ∈ B(Rp","inline":true},{"text":"), respectively, under the pseudo-posterior distribution of ","element":"span"},{"text":"f","element":"span"},{"text":"(","element":"span"},{"text":"x","element":"span"},{"text":") implied by equations (","element":"span"},{"href":"#id-24","text":"3","element":"a"},{"text":")-(","element":"span"},{"href":"#id-23","text":"6","element":"a"},{"text":"). We make the following regularity assumptions on ","element":"span"},{"style":{"height":17.6},"width":55.24,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/8-7.png","element":"img","alt":" f0:","inline":true}],[{"id":"id-36","style":{"width":"82%"},"width":1537,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/8-8.png","element":"img"}],[{"id":"id-37","text":"Assumption 3.2 ","element":"span"},{"text":"(Bounded gradient)","element":"span"},{"style":{"height":17.6},"width":75.56,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/8-9.png","element":"img","alt":". 
f0","inline":true,"padRight":true},{"text":"is continuous on ","element":"span"},{"text":"[0","element":"span"},{"style":{"height":19.6},"width":600,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/8-10.png","element":"img","alt":", 1]p with ||∇f0(x)||2 ≤ L for","inline":true,"padRight":true},{"text":"all ","element":"span"},{"style":{"height":19.2},"width":197.52,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/8-11.png","element":"img","alt":" x ∈ [0, 1]p ","inline":true,"padRight":true},{"text":"and some finite ","element":"span"},{"text":"L > ","element":"span"},{"text":"0","element":"span"},{"text":".","element":"span"}],[{"id":"id-39","text":"Assumption 3.3 ","element":"span"},{"text":"(Bounded sup-norm)","element":"span"},{"style":{"height":19.6},"width":395.08,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/8-12.png","element":"img","alt":". || log(f0)||∞ < ∞.","inline":true}],[{"text":"Our asymptotic analysis relies on analyzing the behaviour of the pseudo-posterior updates within each nearest neighborhood. We leverage on key results from ","element":"span"},{"href":"#id-2","referenceIndex":3,"text":"Biau and Devroye ","element":"a"},{"text":"(","element":"span"},{"href":"#id-2","referenceIndex":3,"text":"2015","element":"a"},{"text":"); ","element":"span"},{"href":"#id-35","referenceIndex":10,"text":"Evans et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-35","referenceIndex":10,"text":"2002","element":"a"},{"text":") which are based on the assumption that the true density has compact support as in Assumption ","element":"span"},{"href":"#id-36","text":"3.1","element":"a"},{"text":". ","element":"span"},{"text":"Assumption ","element":"span"},{"href":"#id-37","text":"3.2 ","element":"a"},{"text":"ensures that the kernel density estimator has finite expectation. 
Versions of this assumption are common in the kernel density literature; for example, refer to ","element":"span"},{"href":"#id-38","referenceIndex":46,"text":"Tsybakov ","element":"a"},{"text":"(","element":"span"},{"href":"#id-38","referenceIndex":46,"text":"2004","element":"a"},{"text":"). Assumptions ","element":"span"},{"href":"#id-36","text":"3.1 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-39","text":"3.3 ","element":"a"},{"text":"imply the existence of 0 ","element":"span"},{"style":{"height":13.6},"width":273.6,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/8-13.png","element":"img","alt":" < a1, a2 < ∞","inline":true,"padRight":true},{"text":"such that 0 ","element":"span"},{"style":{"height":19.6},"width":1042.36,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/8-14.png","element":"img","alt":" < a1 < f0(x) < a2 < ∞ for all x ∈ [0, 1]p, which is","inline":true,"padRight":true},{"text":"referred to as a positive density condition by ","element":"span"},{"href":"#id-40","referenceIndex":9,"text":"Evans ","element":"a"},{"text":"(","element":"span"},{"href":"#id-40","referenceIndex":9,"text":"2008","element":"a"},{"text":"); ","element":"span"},{"href":"#id-35","referenceIndex":10,"text":"Evans et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-35","referenceIndex":10,"text":"2002","element":"a"},{"text":"). This is used to establish consistency of the proposed method, justify the choice of the pseudo-posterior distribution of the weights, and obtain our tuning algorithm for the Dirichlet prior concentration parameter ","element":"span"},{"style":{"height":9.2},"width":30,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/8-15.png","element":"img","alt":" α","inline":true},{"text":". 
These assumptions are standard in the literature studying frequentist asymptotic properties of nearest neighbor and Bayesian density estimators.","element":"span"}],[{"text":"For ","element":"span"},{"text":"i ","element":"span"},{"text":"= 1","element":"span"},{"text":", . . . , n","element":"span"},{"text":", recall the definitions of ","element":"span"},{"href":"#id-29","style":{"height":19.6},"width":393.16,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/8-16.png","element":"img","alt":" µi and Λi from (7):","inline":true}],[{"style":{"width":"48%"},"width":911,"height":111,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/8-17.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":20.4},"width":790.08,"height":51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/8-18.png","element":"img","alt":" νn = ν0 + kn, γn = γ0 + kn, and ¯Xi, Ψi","inline":true,"padRight":true},{"text":"are as in Algorithm ","element":"span"},{"href":"#id-26","text":"1","element":"a"},{"text":". Define the bandwidth","element":"span"}],[{"text":"matrix ","element":"span"},{"style":{"height":20.78},"width":345,"height":51.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/9-0.png","element":"img","alt":" Hn = h2nIp where","inline":true}],[{"id":"id-42","style":{"width":"65%"},"width":1221,"height":103,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/9-1.png","element":"img"}],[{"text":"We have suppressed the dependence of ","element":"span"},{"style":{"height":18},"width":324.64,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/9-2.png","element":"img","alt":" µi and Λi on n","inline":true,"padRight":true},{"text":"for notational convenience. 
","element":"span"},{"text":"It is immediate that ","element":"span"},{"style":{"height":20.11},"width":937.2,"height":50.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/9-3.png","element":"img","alt":" h2n → 0 if kn → ∞ as n → ∞. Fix x ∈ [0, 1]p","inline":true},{"text":". To prove consistency of the ","element":"span"},{"text":"pseudo-posterior mean, we first show that ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":20.49},"width":995.76,"height":51.24,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/9-4.png","element":"img","alt":"fn(x) and fK(x) = (1/n) ∑ni=1 tγn−p+1(x; Xi, Hn)","inline":true,"padRight":true},{"text":"are asymptotically close, that is we show that ","element":"span"},{"style":{"height":26.85},"width":897.08,"height":67.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/9-5.png","element":"img","alt":" EPf0( | ˆfn(x) − fK(x)| ) → 0 as n → ∞. To","inline":true,"padRight":true},{"text":"obtain this result, we approximate ","element":"span"},{"style":{"height":18},"width":479.88,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/9-6.png","element":"img","alt":" µi by Xi and Λi by Hn","inline":true,"padRight":true},{"text":"using successive applications of the mean value theorem. 
Finally, we exploit the convergence of ","element":"span"},{"style":{"height":19.6},"width":100.92,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/9-7.png","element":"img","alt":" fK(x","inline":true},{"text":") to the true value ","element":"span"},{"style":{"height":19.6},"width":104.88,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/9-8.png","element":"img","alt":" f0(x)","inline":true,"padRight":true},{"text":"to obtain the consistency of ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":19.6},"width":90.84,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/9-9.png","element":"img","alt":"fn(x","inline":true},{"text":"). The proof of convergence of ","element":"span"},{"style":{"height":19.6},"width":275.16,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/9-10.png","element":"img","alt":" fK(x) to f0(x","inline":true},{"text":") is provided in Section ","element":"span"},{"text":"G ","element":"span"},{"text":"of the Appendix. The precise statement regarding the consistency of the pseudo-posterior mean is given in the following theorem. Let ","element":"span"},{"style":{"height":14},"width":101.12,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/9-11.png","element":"img","alt":" a ∧ b","inline":true,"padRight":true},{"text":"denote the minimum of ","element":"span"},{"text":"a ","element":"span"},{"text":"and ","element":"span"},{"text":"b","element":"span"},{"text":".","element":"span"}],[{"id":"id-41","style":{"width":"100%"},"width":1872,"height":51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/9-12.png","element":"img"}],[{"text":"that ","element":"span"},{"style":{"height":24.48},"width":1547.64,"height":61.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/9-13.png","element":"img","alt":" kn → ∞ as n → ∞, and ν0 = o{n−2/pk(2/p)+1n }. 
Then, ˆfn(x) → f0(x) in Pf0","inline":true},{"text":"-probability as ","element":"span"},{"style":{"height":10.4},"width":164.24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/9-14.png","element":"img","alt":" n → ∞.","inline":true}],[{"text":"We now look at the pseudo-posterior variance of ","element":"span"},{"text":"f","element":"span"},{"text":"(","element":"span"},{"text":"x","element":"span"},{"text":"). We let","element":"span"}],[{"style":{"width":"1%"},"width":20,"height":5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/9-15.png","element":"img"}],[{"id":"id-43","style":{"height":45.52},"width":496.8,"height":113.8,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/9-16.png","element":"img","alt":"Rn = Γ{(γn − p + 2)/2}Γ{(γn − p + 1)/2}","inline":true}],[{"text":"For ","element":"span"},{"style":{"height":17.2},"width":777,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/9-17.png","element":"img","alt":" i = 1, . . . , n, let Bi = DnΛi and define","inline":true}],[{"id":"id-44","style":{"width":"67%"},"width":1269,"height":130,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/9-18.png","element":"img"}],[{"text":"As ","element":"span"},{"style":{"height":19.2},"width":527.96,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/9-19.png","element":"img","alt":" n → ∞, we have Dn → 1/","inline":true},{"text":"2. Analogous steps to the ones used in the proof of Theorem ","element":"span"},{"href":"#id-41","text":"3.4 ","element":"a"},{"text":"can be used to imply that ","element":"span"},{"style":{"height":19.68},"width":447.48,"height":49.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/9-20.png","element":"img","alt":"�fvar(x) → f0(x) in Pf0","inline":true},{"text":"-probability. 
Also, as ","element":"span"},{"style":{"height":22.8},"width":444.96,"height":57,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/9-21.png","element":"img","alt":" n → ∞, k(p−1)/2n Rn →","inline":true,"padRight":true},{"text":"(4","element":"span"},{"style":{"height":21.71},"width":123.56,"height":54.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/9-22.png","element":"img","alt":"π)−1/2 ","inline":true,"padRight":true},{"text":"using Stirling’s approximation. We now provide an upper bound on the pseudo-posterior variance of ","element":"span"},{"text":"f","element":"span"},{"text":"(","element":"span"},{"text":"x","element":"span"},{"text":") which shows convergence of the pseudo-posterior variance to 0.","element":"span"}],[{"style":{"height":16.48},"width":467.4,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/9-23.png","element":"img","alt":"Theorem 3.5. Let Hn","inline":true,"padRight":true},{"text":"be the bandwidth matrix defined in ","element":"span"},{"text":"(","element":"span"},{"href":"#id-42","text":"9","element":"a"},{"text":")","element":"span"},{"text":". Let ","element":"span"},{"href":"#id-43","style":{"height":19.6},"width":504.6,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/9-24.png","element":"img","alt":" Rn, Dn be as in (10) and","inline":true},{"href":"#id-44","style":{"height":19.6},"width":349.2,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/9-25.png","element":"img","alt":"�fvar be as in (11)","inline":true},{"text":". 
Under Assumptions ","element":"span"},{"href":"#id-36","text":"3.1","element":"a"},{"text":"-","element":"span"},{"href":"#id-39","text":"3.3 ","element":"a"},{"text":"with ","element":"span"},{"style":{"height":17.2},"width":239.24,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/9-26.png","element":"img","alt":" x, kn and ν0","inline":true,"padRight":true},{"text":"as in Theorem ","element":"span"},{"href":"#id-41","text":"3.4","element":"a"},{"text":", we have","element":"span"}],[{"id":"id-45","style":{"width":"99%"},"width":1860,"height":244,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/9-27.png","element":"img"}],[{"text":"Refer to Sections ","element":"span"},{"text":"B ","element":"span"},{"text":"and ","element":"span"},{"text":"C ","element":"span"},{"text":"in the Appendix for proofs of Theorems 3.4 and 3.5, respectively. Pointwise pseudo-posterior consistency follows directly from Theorems ","element":"span"},{"href":"#id-41","text":"3.4 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-45","text":"3.5 ","element":"a"},{"text":"as shown below.","element":"span"}],[{"id":"id-51","style":{"height":17.6},"width":448.52,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/10-0.png","element":"img","alt":"Theorem 3.6. Let f0","inline":true,"padRight":true},{"text":"satisfy Assumptions ","element":"span"},{"href":"#id-36","text":"3.1","element":"a"},{"text":"-","element":"span"},{"href":"#id-39","text":"3.3 ","element":"a"},{"text":"with ","element":"span"},{"style":{"height":17.2},"width":239.72,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/10-1.png","element":"img","alt":" x, kn and ν0","inline":true,"padRight":true},{"text":"as in Theorem ","element":"span"},{"href":"#id-41","text":"3.4","element":"a"},{"text":". 
Fix ","element":"span"},{"style":{"height":13.6},"width":105.08,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/10-2.png","element":"img","alt":"ε > 0","inline":true,"padRight":true},{"text":"and define the ","element":"span"},{"style":{"height":8.8},"width":19,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/10-3.png","element":"img","alt":" ε","inline":true},{"text":"-ball around ","element":"span"},{"style":{"height":19.6},"width":1165.96,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/10-4.png","element":"img","alt":" f0(x) by Uε = {y∗ : |y∗ − f0(x)| ≤ ε}. Let pr{f(x) ∈ Ucε |","inline":true},{"style":{"height":21.31},"width":114.24,"height":53.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/10-5.png","element":"img","alt":"X (n)}","inline":true,"padRight":true},{"text":"denote the probability of the set ","element":"span"},{"style":{"height":18},"width":51.48,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/10-6.png","element":"img","alt":" Ucε ","inline":true,"padRight":true},{"text":"under the pseudo-posterior distribution of ","element":"span"},{"text":"f","element":"span"},{"text":"(","element":"span"},{"text":"x","element":"span"},{"text":") ","element":"span"},{"text":"as ","element":"span"},{"text":"induced by equations ","element":"span"},{"text":"(","element":"span"},{"href":"#id-24","text":"3","element":"a"},{"text":")","element":"span"},{"text":"-","element":"span"},{"text":"(","element":"span"},{"href":"#id-23","text":"6","element":"a"},{"text":")","element":"span"},{"text":". 
Then ","element":"span"},{"text":"pr","element":"span"},{"style":{"height":21.79},"width":614.04,"height":54.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/10-7.png","element":"img","alt":"{f(x) ∈ Ucε | X (n)} → 0 in Pf0","inline":true},{"text":"-probability as ","element":"span"},{"style":{"height":10.4},"width":164.24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/10-8.png","element":"img","alt":" n → ∞.","inline":true}],[{"style":{"height":17.6},"width":299.56,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/10-9.png","element":"img","alt":"Proof. Fix ε >","inline":true,"padRight":true},{"text":"0 and consider the ","element":"span"},{"style":{"height":19.6},"width":694.56,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/10-10.png","element":"img","alt":" ε-ball Uε = {y∗ : |y∗ − f0(x)| ≤ ε}","inline":true},{"text":". Then by Chebyshev’s inequality, we have pr","element":"span"},{"style":{"height":23.79},"width":1423.28,"height":59.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/10-11.png","element":"img","alt":"{f(x) ∈ Ucε | X (n)} ≤ [{ ˆfn(x) − f0(x)}2 + var{f(x) | X (n)}]/ε2 −→ 0 in","inline":true},{"style":{"height":18.48},"width":61.56,"height":46.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/10-12.png","element":"img","alt":"Pf0","inline":true},{"text":"-probability as ","element":"span"},{"style":{"height":10.4},"width":150.72,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/10-13.png","element":"img","alt":" n → ∞","inline":true},{"text":", using Theorems ","element":"span"},{"href":"#id-41","text":"3.4 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-45","text":"3.5","element":"a"},{"text":".","element":"span"}],[{"text":"Our next result focuses on the limiting distribution of 
","element":"span"},{"text":"g","element":"span"},{"text":"(","element":"span"},{"text":"x","element":"span"},{"text":"), corresponding to a simplified form of the NN-DM which sets the weights ","element":"span"},{"style":{"height":8.8},"width":27,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/10-14.png","element":"img","alt":" π","inline":true,"padRight":true},{"text":"to their pseudo-posterior mean,","element":"span"}],[{"id":"id-47","style":{"width":"63%"},"width":1183,"height":131,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/10-15.png","element":"img"}],[{"text":"where we focus on the univariate case with Gaussian kernels. From Section ","element":"span"},{"text":"I ","element":"span"},{"text":"of the Appendix, the pseudo-posterior distribution of (","element":"span"},{"style":{"height":20.7},"width":453.28,"height":51.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/10-16.png","element":"img","alt":"ηi, σ2i ) for i = 1, . . . , n","inline":true,"padRight":true},{"text":"is given by NIG(","element":"span"},{"style":{"height":13.2},"width":121.48,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/10-17.png","element":"img","alt":"µi, νn,","inline":true},{"style":{"height":20.51},"width":606.6,"height":51.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/10-18.png","element":"img","alt":"γn/2, γnδ2i /2), where µi, νn, γn","inline":true,"padRight":true},{"text":"are as before and","element":"span"}],[{"style":{"width":"51%"},"width":973,"height":129,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/10-19.png","element":"img"}],[{"text":"We show that the limiting distribution of ","element":"span"},{"text":"g","element":"span"},{"text":"(","element":"span"},{"text":"x","element":"span"},{"text":") is a Gaussian distribution with appropriate centering and scaling. 
This allows interpretation of 100(1 ","element":"span"},{"style":{"height":17.2},"width":72.16,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/10-20.png","element":"img","alt":" −β","inline":true},{"text":")% pseudo-credible intervals as 100(1","element":"span"},{"style":{"height":17.2},"width":69.76,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/10-21.png","element":"img","alt":"−β","inline":true},{"text":")% frequentist confidence intervals on average for large ","element":"span"},{"text":"n","element":"span"},{"text":". Use of simplified versions of the original estimator to derive asymptotic distribution results is standard in the Bayesian nonparametric literature; refer to Chapter 12 of ","element":"span"},{"href":"#id-46","referenceIndex":12,"text":"Ghosal and Van der Vaart ","element":"a"},{"text":"(","element":"span"},{"href":"#id-46","referenceIndex":12,"text":"2017","element":"a"},{"text":"). Let ","element":"span"},{"style":{"height":24.19},"width":64.84,"height":60.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/10-22.png","element":"img","alt":" f (l)0","inline":true,"padRight":true},{"text":"denote the ","element":"span"},{"text":"l","element":"span"},{"text":"-th derivative of ","element":"span"},{"style":{"height":17.6},"width":244.36,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/10-23.png","element":"img","alt":" f0 for l ≥ 1.","inline":true}],[{"id":"id-48","text":"Theorem 3.7. ","element":"span"},{"text":"For a fixed ","element":"span"},{"style":{"height":19.6},"width":355.44,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/10-24.png","element":"img","alt":" x ∈ [0, 1], let g(x)","inline":true,"padRight":true},{"text":"denote the simplified NN-DM estimator in ","element":"span"},{"text":"(","element":"span"},{"href":"#id-47","text":"13","element":"a"},{"text":")","element":"span"},{"text":". 
Suppose ","element":"span"},{"style":{"height":17.6},"width":40.04,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/10-25.png","element":"img","alt":" f0","inline":true,"padRight":true},{"text":"satisfies Assumptions ","element":"span"},{"href":"#id-36","text":"3.1","element":"a"},{"text":"-","element":"span"},{"href":"#id-39","text":"3.3 ","element":"a"},{"text":"and also satisfies ","element":"span"},{"style":{"height":24.59},"width":643.24,"height":61.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/10-26.png","element":"img","alt":" |f (4)0 (x)| ≤ C0 for all x ∈ [0, 1]","inline":true,"padRight":true},{"text":"for some finite ","element":"span"},{"style":{"height":21.71},"width":1558.24,"height":54.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/10-27.png","element":"img","alt":" C0 > 0. Let kn satisfy kn = o(n2/7) such that n−2/9kn → ∞, and h2n be as in","inline":true,"padRight":true},{"text":"(","element":"span"},{"href":"#id-42","text":"9","element":"a"},{"text":") ","element":"span"},{"text":"satisfying ","element":"span"},{"style":{"height":19.92},"width":772.72,"height":49.8,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/10-28.png","element":"img","alt":" h2n → 0, as n → ∞. 
For t ∈ R, define","inline":true}],[{"style":{"width":"71%"},"width":1332,"height":147,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/10-29.png","element":"img"}],[{"style":{"width":"93%"},"width":1743,"height":247,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-0.png","element":"img"}],[{"id":"id-22","text":"For a proof of Theorem ","element":"span"},{"href":"#id-48","text":"3.7","element":"a"},{"text":", we refer the reader to Section ","element":"span"},{"text":"D ","element":"span"},{"text":"of the Appendix.","element":"span"}],[{"text":"3.2 ","element":"span"},{"text":"Pseudo-Posterior Distribution of Weights","element":"span"}],[{"text":"We investigate the rationale behind the pseudo-posterior update of the weight vector ","element":"span"},{"style":{"height":12},"width":41.32,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-1.png","element":"img","alt":" π,","inline":true,"padRight":true},{"text":"which has a symmetric prior distribution ","element":"span"},{"style":{"height":8.8},"width":85,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-2.png","element":"img","alt":" π ∼","inline":true,"padRight":true},{"text":"Dirichlet(","element":"span"},{"style":{"height":12.4},"width":164.88,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-3.png","element":"img","alt":"α, . . . , α","inline":true},{"text":") as motivated in Section ","element":"span"},{"text":"1","element":"span"},{"text":". 
As discussed in Section ","element":"span"},{"text":"1","element":"span"},{"text":", the conditional update for the weights ","element":"span"},{"style":{"height":8.8},"width":27,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-4.png","element":"img","alt":" π","inline":true,"padRight":true},{"text":"in a finite Bayesian mixture model with ","element":"span"},{"text":"m ","element":"span"},{"text":"components given the cluster allocation indices ","element":"span"},{"style":{"height":19.2},"width":234.72,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-5.png","element":"img","alt":" {c1, . . . , cn}","inline":true,"padRight":true},{"text":"is obtained by Dirichlet(","element":"span"},{"style":{"height":19.6},"width":615.6,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-6.png","element":"img","alt":"α + n1, . . . , α + nm), where α","inline":true,"padRight":true},{"text":"is the prior concentration parameter and ","element":"span"},{"style":{"height":11.68},"width":105.16,"height":29.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-7.png","element":"img","alt":" nh =","inline":true},{"style":{"height":20.58},"width":275.64,"height":51.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-8.png","element":"img","alt":"∑ni=1 I(ci = h","inline":true},{"text":") is the number of data points allocated to the ","element":"span"},{"text":"h","element":"span"},{"text":"-th cluster. This is not true in ","element":"span"},{"text":"our case as the ","element":"span"},{"style":{"height":16.48},"width":45.48,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-9.png","element":"img","alt":" kn","inline":true},{"text":"-nearest neighborhoods have considerable overlap between them. 
Instead, we consider the number of unique data points in each of these neighborhoods.","element":"span"}],[{"text":"Define the ","element":"span"},{"style":{"height":16.48},"width":45.48,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-10.png","element":"img","alt":" kn","inline":true},{"text":"-nearest neighborhood of ","element":"span"},{"style":{"height":16.08},"width":50.88,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-11.png","element":"img","alt":" Xi","inline":true,"padRight":true},{"text":"to be the set ","element":"span"},{"style":{"height":20.67},"width":746.88,"height":51.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-12.png","element":"img","alt":" Si = {Xj : d(Xi, Xj) ≤ d(Xi, Xi[kn])}","inline":true,"padRight":true},{"text":"where ","element":"span"},{"style":{"height":19.68},"width":284.52,"height":49.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-13.png","element":"img","alt":" Xi[kn] is the kn","inline":true},{"text":"-th nearest neighbor of ","element":"span"},{"style":{"height":16.08},"width":50.88,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-14.png","element":"img","alt":" Xi","inline":true,"padRight":true},{"text":"in the data ","element":"span"},{"style":{"height":16.7},"width":87.88,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-15.png","element":"img","alt":" X (n)","inline":true},{"text":", following the notation in Section ","element":"span"},{"text":"2.1","element":"span"},{"text":". 
We assume ","element":"span"},{"style":{"height":19.6},"width":90.28,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-16.png","element":"img","alt":" d(·, ·","inline":true},{"text":") is the Euclidean metric from here on, and let ","element":"span"},{"style":{"height":20.67},"width":405.64,"height":51.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-17.png","element":"img","alt":" Ri = d(Xi, Xi[kn]) =","inline":true},{"style":{"height":20.67},"width":290.6,"height":51.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-18.png","element":"img","alt":"||Xi − Xi[kn]||2","inline":true,"padRight":true},{"text":"denote the distance of ","element":"span"},{"style":{"height":16.48},"width":289.32,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-19.png","element":"img","alt":" Xi from its kn","inline":true},{"text":"-th nearest neighbor in ","element":"span"},{"style":{"height":16.51},"width":102.76,"height":41.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-20.png","element":"img","alt":" X (n).","inline":true}],[{"text":"Let ","element":"span"},{"style":{"height":16.08},"width":49.44,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-21.png","element":"img","alt":" Ni","inline":true,"padRight":true},{"text":"denote the number of unique members in ","element":"span"},{"style":{"height":16.48},"width":40.8,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-22.png","element":"img","alt":" Si","inline":true,"padRight":true},{"text":"as defined in Section ","element":"span"},{"text":"2.1","element":"span"},{"text":". 
Then, we can express ","element":"span"},{"style":{"height":16.08},"width":108.76,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-23.png","element":"img","alt":" Ni as","inline":true}],[{"style":{"width":"72%"},"width":1359,"height":134,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-24.png","element":"img"}],[{"text":"where ","element":"span"},{"text":"I","element":"span"},{"text":"(","element":"span"},{"text":"A","element":"span"},{"text":") is the indicator function of the set ","element":"span"},{"style":{"height":23.55},"width":733.8,"height":58.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-25.png","element":"img","alt":" A. Under X1, . . . , Xn iid ∼ f0, we have","inline":true}],[{"style":{"width":"78%"},"width":1477,"height":144,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-26.png","element":"img"}],[{"text":"by symmetry. Furthermore, ","element":"span"},{"style":{"height":16.08},"width":49.44,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-27.png","element":"img","alt":" Ni","inline":true,"padRight":true},{"text":"are identically distributed for ","element":"span"},{"text":"i ","element":"span"},{"text":"= 1","element":"span"},{"text":", . . . , n","element":"span"},{"text":". We now state a result which provides a motivation for our choice of the pseudo-posterior update of ","element":"span"},{"style":{"height":13.6},"width":135.6,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-28.png","element":"img","alt":" π. 
For","inline":true,"padRight":true},{"text":"two sequences of real numbers (","element":"span"},{"style":{"height":19.6},"width":229.32,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-29.png","element":"img","alt":"an) and (bn","inline":true},{"text":"), we write ","element":"span"},{"style":{"height":19.2},"width":756.72,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-30.png","element":"img","alt":" an ∼ bn if |an/bn| → c0 as n → ∞ for","inline":true,"padRight":true},{"text":"some constant ","element":"span"},{"style":{"height":15.68},"width":138.76,"height":39.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-31.png","element":"img","alt":" c0 > 0.","inline":true}],[{"id":"id-49","style":{"height":23.75},"width":1013.96,"height":59.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-32.png","element":"img","alt":"Theorem 3.8. Suppose X1, . . . , Xn iid ∼ f0 with f0","inline":true,"padRight":true},{"text":"satisfying Assumptions ","element":"span"},{"href":"#id-36","text":"3.1","element":"a"},{"text":"-","element":"span"},{"href":"#id-39","text":"3.3","element":"a"},{"text":". Furthermore, suppose that ","element":"span"},{"style":{"height":20.51},"width":837.32,"height":51.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/11-33.png","element":"img","alt":" kn ∼ n^(i0−ε) for some ε ∈ (0, i0), where i0","inline":true,"padRight":true},{"text":"is as defined in Theorem ","element":"span"},{"href":"#id-41","text":"3.4","element":"a"},{"text":".","element":"span"}],[{"style":{"width":"99%"},"width":1860,"height":191,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/12-0.png","element":"img"}],[{"text":"Proof of Theorem ","element":"span"},{"href":"#id-49","text":"3.8 ","element":"a"},{"text":"is in Section ","element":"span"},{"text":"E ","element":"span"},{"text":"of the Appendix. 
The above theorem suggests we asymptotically have only one unique member per neighborhood ","element":"span"},{"style":{"height":16.48},"width":40.8,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/12-1.png","element":"img","alt":" Si","inline":true},{"text":", namely the point ","element":"span"},{"style":{"height":16.48},"width":150.48,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/12-2.png","element":"img","alt":" Xi that","inline":true,"padRight":true},{"text":"itself generated this neighborhood. This result motivates our choice of the pseudo-posterior update of the weight vector ","element":"span"},{"style":{"height":8.8},"width":41.32,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/12-3.png","element":"img","alt":" π.","inline":true,"padRight":true},{"text":"We illustrate uncertainty quantification of the proposed method in finite samples in Section ","element":"span"},{"href":"#id-50","text":"4.4 ","element":"a"},{"text":"with this choice of pseudo-posterior update of the weight vector ","element":"span"},{"style":{"height":8.8},"width":41.32,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/12-4.png","element":"img","alt":" π.","inline":true}],[{"id":"id-78","text":"3.3 ","element":"span"},{"text":"Choice of Dirichlet Prior Parameter","element":"span"}],[{"text":"Although Theorem ","element":"span"},{"href":"#id-51","text":"3.6 ","element":"a"},{"text":"implies consistency of the pseudo-posterior for fixed ","element":"span"},{"style":{"height":17.6},"width":276.52,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/12-5.png","element":"img","alt":" x and any α,","inline":true,"padRight":true},{"text":"the choice of ","element":"span"},{"style":{"height":9.2},"width":30,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/12-6.png","element":"img","alt":" 
α","inline":true,"padRight":true},{"text":"impacts frequentist coverage of the pseudo-posterior credible intervals as it directly influences the pseudo-posterior variance of ","element":"span"},{"text":"f","element":"span"},{"text":"(","element":"span"},{"text":"x","element":"span"},{"text":") through Theorem ","element":"span"},{"href":"#id-45","text":"3.5","element":"a"},{"text":". We now describe a data-dependent method to choose ","element":"span"},{"style":{"height":9.2},"width":30,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/12-7.png","element":"img","alt":" α","inline":true,"padRight":true},{"text":"using Bernstein-von Mises results for linear functionals of Bayesian density estimators in the univariate setup (","element":"span"},{"href":"#id-52","referenceIndex":38,"text":"Rivoirard and Rousseau","element":"a"},{"text":", ","element":"span"},{"href":"#id-52","referenceIndex":38,"text":"2012","element":"a"},{"text":"). ","element":"span"},{"text":"The key idea we adopt is to consider a linear functional ","element":"span"},{"style":{"height":22.99},"width":88.28,"height":57.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/12-8.png","element":"img","alt":" P( ˜f","inline":true},{"text":") of the density ","element":"span"},{"text":"˜","element":"span"},{"text":"f ","element":"span"},{"text":"such as its mean, obtain its Bernstein-von Mises limit distribution using the result in ","element":"span"},{"href":"#id-52","referenceIndex":38,"text":"Rivoirard and Rousseau ","element":"a"},{"text":"(","element":"span"},{"href":"#id-52","referenceIndex":38,"text":"2012","element":"a"},{"text":"), and equate the variance of this limit distribution with the pseudo-posterior variance of ","element":"span"},{"text":"P","element":"span"},{"text":"(","element":"span"},{"text":"f","element":"span"},{"text":") when ","element":"span"},{"text":"f ","element":"span"},{"href":"#id-52","referenceIndex":38,"text":"has the 
NN-DM form","element":"a"},{"text":"u","element":"span"},{"href":"#id-52","referenceIndex":38,"text":"lation","element":"a"},{"text":".","element":"span"}],[{"text":"Let ˜","element":"span"},{"style":{"height":17.6},"width":185,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/12-9.png","element":"img","alt":"p and f0","inline":true,"padRight":true},{"text":"satisfy conditions in ","element":"span"},{"href":"#id-52","referenceIndex":38,"text":"Rivoirard and Rousseau ","element":"a"},{"text":"(","element":"span"},{"href":"#id-52","referenceIndex":38,"text":"2012","element":"a"},{"text":"). ","element":"span"},{"text":"Define the linear functional ","element":"span"},{"style":{"height":24.03},"width":463.32,"height":60.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/12-10.png","element":"img","alt":" P( ˜f) = ∫ ˜p(u) ˜f(u) du","inline":true,"padRight":true},{"text":"for a generic density ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":17.6},"width":706.44,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/12-11.png","element":"img","alt":"f on R. Suppose X1, . . . , Xn are","inline":true,"padRight":true},{"text":"independent and identically distributed data from a density ","element":"span"},{"style":{"height":17.6},"width":40.04,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/12-12.png","element":"img","alt":" f0","inline":true,"padRight":true},{"text":"supported on [0","element":"span"},{"text":", ","element":"span"},{"text":"1]. 
Define ","element":"span"},{"style":{"height":21.6},"width":792.84,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/12-13.png","element":"img","alt":"q(u) = ˜p(u)−∫ ˜p(v)f0(v) dv and let Fn","inline":true,"padRight":true},{"text":"be the empirical distribution function of ","element":"span"},{"style":{"height":16.8},"width":237.16,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/12-14.png","element":"img","alt":" X1, . . . , Xn.","inline":true,"padRight":true},{"text":"Then under a suitable prior distribution on ","element":"span"},{"text":"˜","element":"span"},{"text":"f","element":"span"},{"text":", ","element":"span"},{"href":"#id-52","referenceIndex":38,"text":"Rivoirard and Rousseau ","element":"a"},{"text":"(","element":"span"},{"href":"#id-52","referenceIndex":38,"text":"2012","element":"a"},{"text":") show that the posterior distribution of ","element":"span"},{"style":{"height":22.99},"width":421.92,"height":57.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/12-15.png","element":"img","alt":" n1/2{P( ˜f) − P(Fn)}","inline":true,"padRight":true},{"text":"is asymptotically Gaussian with mean 0 and variance Ω","element":"span"},{"style":{"height":21.6},"width":1069.48,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/12-16.png","element":"img","alt":"0 =∫q2(x)f0(x) dx, where P(Fn) = n−1 ∑ni=1 ˜p(Xi).","inline":true}],[{"text":"As a specific example, consider the case when ˜","element":"span"},{"text":"p","element":"span"},{"text":"(","element":"span"},{"text":"u","element":"span"},{"text":") = ","element":"span"},{"text":"u","element":"span"},{"text":", corresponding to the linear functional given by ","element":"span"},{"style":{"height":24.03},"width":387.96,"height":60.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/12-17.png","element":"img","alt":" P( ˜f) =∫u ˜f(u) du","inline":true},{"text":". 
Following ","element":"span"},{"href":"#id-52","referenceIndex":38,"text":"Rivoirard and Rousseau ","element":"a"},{"text":"(","element":"span"},{"href":"#id-52","referenceIndex":38,"text":"2012","element":"a"},{"text":"), the limiting variance of the posterior distribution of ","element":"span"},{"style":{"height":23.18},"width":420.96,"height":57.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/12-18.png","element":"img","alt":" n1/2{P( ˜f) − P(Fn)}","inline":true,"padRight":true},{"text":"under a suitable prior on ","element":"span"},{"text":"˜","element":"span"},{"text":"f ","element":"span"},{"text":"is given by Ω","element":"span"},{"style":{"height":22.75},"width":1156.92,"height":56.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/12-19.png","element":"img","alt":"0 =∫(u − mf0)2f0(u) du = σ2f0 where mf0 =∫uf0(u) du","inline":true,"padRight":true},{"text":"is the population mean and ","element":"span"},{"style":{"height":22.7},"width":57.72,"height":56.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/12-20.png","element":"img","alt":" σ2f0 ","inline":true,"padRight":true},{"text":"is the population variance of the density ","element":"span"},{"style":{"height":17.6},"width":40.04,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/12-21.png","element":"img","alt":" f0","inline":true},{"text":". This provides the asymptotic variance ","element":"span"},{"text":"of ","element":"span"},{"style":{"height":24.03},"width":397.56,"height":60.08,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/12-22.png","element":"img","alt":" P( ˜f) = ∫u ˜f(u) du","inline":true,"padRight":true},{"text":"for an appropriate Bayesian density estimator ","element":"span"},{"text":"˜","element":"span"},{"text":"f","element":"span"},{"text":". 
Our strategy for finding a value of ","element":"span"},{"style":{"height":9.2},"width":30,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/12-23.png","element":"img","alt":" α","inline":true,"padRight":true},{"text":"involves equating the pseudo-posterior variance of ","element":"span"},{"style":{"height":23.91},"width":370.68,"height":59.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/12-24.png","element":"img","alt":" n1/2P(f) with σ2f0","inline":true,"padRight":true},{"text":"when ","element":"span"},{"text":"f ","element":"span"},{"text":"has the NN-DM formulation, as ","element":"span"},{"style":{"height":10.4},"width":153.6,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/12-25.png","element":"img","alt":" n → ∞","inline":true},{"text":". This is done to ensure that ","element":"span"},{"style":{"height":21.71},"width":267.64,"height":54.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/12-26.png","element":"img","alt":" n1/2P(f) has","inline":true,"padRight":true},{"text":"the same limiting variance as ","element":"span"},{"style":{"height":23.18},"width":375.48,"height":57.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/12-27.png","element":"img","alt":" n1/2P( ˜f), where f","inline":true,"padRight":true},{"text":"is the proposed estimator and ","element":"span"},{"text":"˜","element":"span"},{"text":"f ","element":"span"},{"text":"is a valid Bayesian density estimator according to ","element":"span"},{"href":"#id-52","referenceIndex":38,"text":"Rivoirard and Rousseau ","element":"a"},{"text":"(","element":"span"},{"href":"#id-52","referenceIndex":38,"text":"2012","element":"a"},{"text":"). 
When ","element":"span"},{"text":"f ","element":"span"},{"text":"has the NN-DM formulation with Gaussian kernels, ","element":"span"},{"style":{"height":21.6},"width":636.96,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/13-0.png","element":"img","alt":" P(f) =∫uf(u) du = ∑ni=1 πiηi","inline":true},{"text":", following from ","element":"span"},{"text":"Section ","element":"span"},{"href":"#id-34","text":"2.2","element":"a"},{"text":", with the variance of ","element":"span"},{"style":{"height":21.9},"width":162.84,"height":54.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/13-1.png","element":"img","alt":" n1/2P(f","inline":true},{"text":") provided below.","element":"span"}],[{"id":"id-53","style":{"height":17.6},"width":557.48,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/13-2.png","element":"img","alt":"Theorem 3.9. Suppose f0","inline":true,"padRight":true},{"text":"satisfies Assumptions ","element":"span"},{"href":"#id-36","text":"3.1","element":"a"},{"text":"-","element":"span"},{"href":"#id-39","text":"3.3 ","element":"a"},{"text":"with ","element":"span"},{"text":"p ","element":"span"},{"text":"= 1","element":"span"},{"text":". Let ","element":"span"},{"text":"f ","element":"span"},{"text":"have the NN-DM formulation, and ","element":"span"},{"style":{"height":17.2},"width":108.2,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/13-3.png","element":"img","alt":" kn, ν0","inline":true,"padRight":true},{"text":"be chosen as in Theorem ","element":"span"},{"href":"#id-41","text":"3.4","element":"a"},{"text":". 
Let ","element":"span"},{"text":"Θ =","element":"span"},{"style":{"height":21.6},"width":473.76,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/13-4.png","element":"img","alt":"∫uf(u) du = ∑ni=1 πiηi","inline":true,"padRight":true},{"text":"as in Section ","element":"span"},{"href":"#id-34","text":"2.2","element":"a"},{"text":". ","element":"span"},{"text":"For ","element":"span"},{"style":{"height":20.51},"width":1363.76,"height":51.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/13-5.png","element":"img","alt":" γn > 2, define vi = {(νn + 1)(γn − 2)}−1γnλ2i for i = 1, . . . , n,","inline":true,"padRight":true},{"text":"¯","element":"span"},{"style":{"height":22.91},"width":1531.36,"height":57.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/13-6.png","element":"img","alt":"v = (1/n) ∑ni=1 vi, ¯µ = (1/n) ∑ni=1 µi and S2µ = (1/n) ∑ni=1(µi − ¯µ)2. Then","inline":true}],[{"id":"id-54","style":{"width":"96%"},"width":1797,"height":423,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/13-7.png","element":"img"}],[{"text":"The proof of Theorem ","element":"span"},{"href":"#id-53","text":"3.9 ","element":"a"},{"text":"and derivation of (","element":"span"},{"href":"#id-54","text":"17","element":"a"},{"text":") are in Section ","element":"span"},{"href":"#id-55","text":"F ","element":"a"},{"text":"of the Appendix. 
","element":"span"},{"href":"#id-52","referenceIndex":38,"text":"Du","element":"a"},{"text":"e to the lack of a multivariate analogue of the result discussed in ","element":"span"},{"href":"#id-52","referenceIndex":38,"text":"Rivoirard and Rousseau","element":"a"}],[{"text":"(","element":"span"},{"href":"#id-52","referenceIndex":38,"text":"2012","element":"a"},{"text":"), a choice of ","element":"span"},{"style":{"height":9.2},"width":30,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/13-8.png","element":"img","alt":" α","inline":true,"padRight":true},{"text":"when data are multivariate is not immediate. We observe that the univariate choice of ","element":"span"},{"style":{"height":9.2},"width":30,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/13-9.png","element":"img","alt":" α","inline":true,"padRight":true},{"text":"may be written as ","element":"span"},{"style":{"height":22.7},"width":492.16,"height":56.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/13-10.png","element":"img","alt":" α ≈ h2n/(σ2f0νn) when n","inline":true,"padRight":true},{"text":"is large with ","element":"span"},{"href":"#id-42","style":{"height":20.51},"width":280.76,"height":51.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/13-11.png","element":"img","alt":" h2n in (9). 
To","inline":true,"padRight":true},{"text":"that end, let ","element":"span"},{"style":{"height":20.78},"width":208.56,"height":51.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/13-12.png","element":"img","alt":" Hn = h2nIp ","inline":true,"padRight":true},{"text":"be the multivariate bandwidth matrix as defined in Section ","element":"span"},{"text":"3.1 ","element":"span"},{"text":"and ","element":"span"},{"text":"Σ","element":"span"},{"style":{"height":11.2},"width":31.32,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/13-13.png","element":"img","alt":"f0","inline":true,"padRight":true},{"text":"be the unknown population covariance matrix of ","element":"span"},{"style":{"height":17.6},"width":40.04,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/13-14.png","element":"img","alt":" f0","inline":true},{"text":". Then, one may potentially extend the findings of (","element":"span"},{"href":"#id-54","text":"17","element":"a"},{"text":") to a multivariate extension given by","element":"span"}],[{"id":"id-56","style":{"width":"56%"},"width":1064,"height":116,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/13-15.png","element":"img"}],[{"text":"It is immediate that the choice of ","element":"span"},{"style":{"height":9.2},"width":30,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/13-16.png","element":"img","alt":" α","inline":true,"padRight":true},{"text":"in the multivariate case as in (","element":"span"},{"href":"#id-56","text":"18","element":"a"},{"text":") reduces to the choice in (","element":"span"},{"href":"#id-54","text":"17","element":"a"},{"text":") for ","element":"span"},{"style":{"height":19.92},"width":314.12,"height":49.8,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/13-17.png","element":"img","alt":" p = 1. 
Once δ20 ","inline":true,"padRight":true},{"text":"is estimated according to Section ","element":"span"},{"href":"#id-57","text":"2.3 ","element":"a"},{"text":"and the underlying ","element":"span"},{"text":"population variance is estimated, one can use (","element":"span"},{"href":"#id-54","text":"17","element":"a"},{"text":") or its multivariate analogue (","element":"span"},{"href":"#id-56","text":"18","element":"a"},{"text":") to select an appropriate value of ","element":"span"},{"style":{"height":9.2},"width":43.24,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/13-18.png","element":"img","alt":" α.","inline":true}]]},{"heading":"4 Simulation Experiments","paragraphs":[[{"text":"4.1 ","element":"span"},{"text":"Preliminaries","element":"span"}],[{"text":"In this section, we compare the performance of the proposed density estimator with several other standard density estimators through several numerical experiments. We evaluate performance based on the expected ","element":"span"},{"style":{"height":16.48},"width":50.12,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/13-19.png","element":"img","alt":" L1","inline":true,"padRight":true},{"text":"distance (","element":"span"},{"href":"#id-58","referenceIndex":7,"text":"Devroye and Gyorfi","element":"a"},{"text":", ","element":"span"},{"href":"#id-58","referenceIndex":7,"text":"1985","element":"a"},{"text":"). 
For the pair (","element":"span"},{"style":{"height":23.98},"width":323.72,"height":59.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/14-0.png","element":"img","alt":"f0, ˆf), where f0","inline":true,"padRight":true},{"text":"is the true data generating density and ","element":"span"},{"text":"ˆ","element":"span"},{"text":"f ","element":"span"},{"text":"is an estimator, the expected ","element":"span"},{"style":{"height":16.48},"width":50.12,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/14-1.png","element":"img","alt":"L1","inline":true,"padRight":true},{"text":"distance is defined as ","element":"span"},{"style":{"height":26.65},"width":769.92,"height":66.64,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/14-2.png","element":"img","alt":" L1(f0, ˆf) = EPf0{∫|f0(x) − ˆf(x)| dx}","inline":true},{"text":". Given a sample size ","element":"span"},{"text":"n","element":"span"},{"text":", we compute an estimate ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":23.99},"width":417.56,"height":59.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/14-3.png","element":"img","alt":"L1(f0, ˆf) of L1(f0, ˆf","inline":true},{"text":") in two steps. First, we sample ","element":"span"},{"style":{"height":17.6},"width":332.36,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/14-4.png","element":"img","alt":" X1, . . . 
, Xn ∼ f0","inline":true,"padRight":true},{"text":"and obtain ","element":"span"},{"text":"ˆ","element":"span"},{"text":"f ","element":"span"},{"text":"based on this sample, and then further sample ","element":"span"},{"style":{"height":11.68},"width":39.84,"height":29.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/14-5.png","element":"img","alt":" nt","inline":true,"padRight":true},{"text":"independent test points ","element":"span"},{"style":{"height":18.02},"width":435.08,"height":45.04,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/14-6.png","element":"img","alt":"Xn+1, . . . , Xn+nt ∼ f0","inline":true,"padRight":true},{"text":"and compute","element":"span"}],[{"style":{"width":"29%"},"width":548,"height":146,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/14-7.png","element":"img"}],[{"text":"In the second step, to approximate the expectation with respect to ","element":"span"},{"style":{"height":18.48},"width":61.56,"height":46.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/14-8.png","element":"img","alt":" Pf0","inline":true},{"text":", the first step is repeated ","element":"span"},{"text":"R ","element":"span"},{"text":"times. Letting ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":16.08},"width":46.68,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/14-9.png","element":"img","alt":"Lr","inline":true,"padRight":true},{"text":"denote the estimate for the ","element":"span"},{"text":"r","element":"span"},{"text":"th replicate, we compute the final estimate as ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":24.16},"width":540.12,"height":60.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/14-10.png","element":"img","alt":"L1(f0, ˆf) = (1/R) ∑Rr=1 ˆLr","inline":true},{"text":". 
Then, it follows that ˆ","element":"span"},{"style":{"height":23.79},"width":496.6,"height":59.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/14-11.png","element":"img","alt":"L1(f0, ˆf) → L1(f0, ˆf) as","inline":true},{"style":{"height":16.8},"width":223.68,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/14-12.png","element":"img","alt":"nt, R −→ ∞","inline":true},{"text":", by the law of large numbers. In our experiments, we set ","element":"span"},{"style":{"height":16.48},"width":446.92,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/14-13.png","element":"img","alt":" nt = 500 and R = 20.","inline":true,"padRight":true},{"text":"We let 0","element":"span"},{"style":{"height":18.88},"width":165.36,"height":47.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/14-14.png","element":"img","alt":"p and 1p","inline":true,"padRight":true},{"text":"denote the vector with all entries equal to 0 and the vector with all entries equal to 1 in ","element":"span"},{"style":{"height":13.2},"width":52.56,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/14-15.png","element":"img","alt":" Rp","inline":true},{"text":", respectively, for ","element":"span"},{"style":{"height":16.4},"width":123.4,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/14-16.png","element":"img","alt":" p ≥ 1.","inline":true}],[{"text":"All simulations were carried out using the R programming language (","element":"span"},{"href":"#id-59","referenceIndex":37,"text":"R Core Team","element":"a"},{"text":", ","element":"span"},{"href":"#id-59","referenceIndex":37,"text":"2018","element":"a"},{"text":"). 
For Dirichlet process mixture models, we collect 2","element":"span"},{"text":", ","element":"span"},{"text":"000 Markov chain Monte Carlo (MCMC) samples after discarding a burn-in of 3","element":"span"},{"text":", ","element":"span"},{"text":"000 samples using the ","element":"span"},{"text":"dirichletprocess ","element":"span"},{"text":"package (","element":"span"},{"href":"#id-60","referenceIndex":19,"text":"J. Ross and Markwick","element":"a"},{"text":", ","element":"span"},{"href":"#id-60","referenceIndex":19,"text":"2019","element":"a"},{"text":"). The default implementation of the Dirichlet process mixture model in ","element":"span"},{"text":"p ","element":"span"},{"text":"dimensions in the ","element":"span"},{"text":"dirichletprocess ","element":"span"},{"text":"package uses multivariate Gaussian kernels and has the base measure as NIW","element":"span"},{"style":{"height":19.68},"width":224.88,"height":49.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/14-17.png","element":"img","alt":"p(0p, p, p, Ip","inline":true},{"text":") with the Dirichlet concentration parameter having the Gamma(2","element":"span"},{"text":", ","element":"span"},{"text":"4) prior (","element":"span"},{"href":"#id-61","referenceIndex":50,"text":"West","element":"a"},{"text":", ","element":"span"},{"href":"#id-61","referenceIndex":50,"text":"1992","element":"a"},{"text":"). For the nearest neighbor-Dirichlet mixture, 1","element":"span"},{"text":", ","element":"span"},{"text":"000 Monte Carlo samples are taken. 
For the kernel density estimator, we select the bandwidth by the default plug-in method ","element":"span"},{"text":"hpi ","element":"span"},{"text":"for univariate cases and ","element":"span"},{"text":"Hpi ","element":"span"},{"text":"for multivariate cases (","element":"span"},{"href":"#id-62","referenceIndex":41,"text":"Sheather and Jones","element":"a"},{"text":", ","element":"span"},{"href":"#id-62","referenceIndex":41,"text":"1991","element":"a"},{"text":"; ","element":"span"},{"href":"#id-63","referenceIndex":48,"text":"Wand and Jones","element":"a"},{"text":", ","element":"span"},{"href":"#id-63","referenceIndex":48,"text":"1994","element":"a"},{"text":") using the package ","element":"span"},{"text":"ks ","element":"span"},{"text":"(","element":"span"},{"href":"#id-64","referenceIndex":8,"text":"Duong","element":"a"},{"text":", ","element":"span"},{"href":"#id-64","referenceIndex":8,"text":"2020","element":"a"},{"text":"). We additionally consider the k-nearest neighbor estimator studied in ","element":"span"},{"href":"#id-1","referenceIndex":30,"text":"Mack and Rosenblatt ","element":"a"},{"text":"(","element":"span"},{"href":"#id-1","referenceIndex":30,"text":"1979","element":"a"},{"text":"), setting the number of neighbors ","element":"span"},{"style":{"height":16.91},"width":171.08,"height":42.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/14-18.png","element":"img","alt":" k = n1/2","inline":true},{"text":", and the variational Bayes (VB) approximation to Dirichlet process mixture models (","element":"span"},{"href":"#id-13","referenceIndex":4,"text":"Blei and Jordan","element":"a"},{"text":", ","element":"span"},{"href":"#id-13","referenceIndex":4,"text":"2006","element":"a"},{"text":"). 
We also compare with the optional Polya tree (OPT) (","element":"span"},{"href":"#id-18","referenceIndex":51,"text":"Wong and Ma","element":"a"},{"text":", ","element":"span"},{"href":"#id-18","referenceIndex":51,"text":"2010","element":"a"},{"text":") using the package ","element":"span"},{"text":"PTT","element":"span"},{"text":". For univariate cases, we consider the recursive predictive density estimator (RD) from ","element":"span"},{"href":"#id-65","referenceIndex":16,"text":"Hahn et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-65","referenceIndex":16,"text":"2018","element":"a"},{"text":"), Polya tree mixtures (PTM) using the package ","element":"span"},{"text":"DPpackage ","element":"span"},{"text":"(","element":"span"},{"href":"#id-66","referenceIndex":20,"text":"Jara et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-66","referenceIndex":20,"text":"2011","element":"a"},{"text":"), and the sample smoothing kernel density estimator (A-KDE) using the package ","element":"span"},{"text":"quantreg","element":"span"},{"text":". Lastly, we also compare with the local likelihood density estimator (LLDE) using the package ","element":"span"},{"text":"locfit ","element":"span"},{"text":"for both univariate and multivariate cases. Dirichlet process mixture model hyperparameter values are kept the same in both the MCMC and variational Bayes implementations, with the number of components of the variational family set to 10 for all cases. 
We denote the nearest neighbor-Dirichlet mixture, Dirichlet process mixture (DPM) implemented with MCMC, kernel density estimator, variational Bayes approximation to the DPM, and k-nearest neigh-","element":"span"}],[{"id":"id-68","style":{"width":"95%"},"width":1789,"height":657,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/15-0.png","element":"img"}],[{"text":"Figure 1: Box plots of ","element":"figcaption","subtype":"caption"},{"text":"ˆ","element":"figcaption","subtype":"caption"},{"style":{"height":23.79},"width":166.04,"height":59.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/15-1.png","element":"img","alt":"L1(f0, ˆf","inline":true},{"text":") for the 10 different choices of the true density ","element":"figcaption","subtype":"caption"},{"style":{"height":17.6},"width":40.04,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/15-2.png","element":"img","alt":" f0","inline":true,"padRight":true},{"text":"and different estimators ","element":"figcaption","subtype":"caption"},{"text":"ˆ","element":"figcaption","subtype":"caption"},{"text":"f ","element":"figcaption","subtype":"caption"},{"text":"for univariate data. 
The box plots for KDE and RD exclude the heavy-tailed cases CA, IE, and SP.","element":"figcaption","subtype":"caption"}],[{"text":"bor density estimator by NN-DM, DP-MC, KDE, DP-VB and KNN, respectively, in tables and figures.","element":"span"}],[{"id":"id-76","text":"4.2 ","element":"span"},{"text":"Univariate Cases","element":"span"}],[{"text":"We set ","element":"span"},{"style":{"height":21.31},"width":907.56,"height":53.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/15-3.png","element":"img","alt":" n = 200, 500 with kn = ⌊n1/3⌋+1 where ⌊n0⌋","inline":true,"padRight":true},{"text":"denotes the greatest integer less than or equal to ","element":"span"},{"style":{"height":11.68},"width":44.84,"height":29.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/15-4.png","element":"img","alt":" n0","inline":true},{"text":". We consider 10 choices of ","element":"span"},{"style":{"height":19.6},"width":662.64,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/15-5.png","element":"img","alt":" f0 from the R package benchden (","inline":true},{"href":"#id-67","referenceIndex":31,"text":"Mildenberger and Weinert","element":"a"},{"text":", ","element":"span"},{"href":"#id-67","referenceIndex":31,"text":"2012","element":"a"},{"text":"); the specific choices are Cauchy (CA), claw (CW), double exponential (DE), Gaussian (GS), inverse exponential (IE), lognormal (LN), logistic (LO), skewed bimodal (SB), symmetric Pareto (SP), and sawtooth (ST) with default choices of the corresponding parameters. 
The prior hyperparameter choices for the proposed method are ","element":"span"},{"style":{"height":17.2},"width":548.2,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/15-6.png","element":"img","alt":" µ0 = 0, ν0 = 0.001, γ0 = 1;","inline":true},{"style":{"height":20.11},"width":39.56,"height":50.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/15-7.png","element":"img","alt":"δ20 ","inline":true,"padRight":true},{"text":"is chosen via the cross-validation method of Section ","element":"span"},{"href":"#id-57","text":"2.3","element":"a"},{"text":". Detailed numerical results are ","element":"span"},{"text":"deferred to Table ","element":"span"},{"text":"3 ","element":"span"},{"text":"in the Appendix. Instead, in Figure ","element":"span"},{"href":"#id-68","text":"1","element":"a"},{"text":", we provide a visual summary of the performance of each method under consideration by forming a box plot of the estimated ","element":"span"},{"style":{"height":16.48},"width":50.12,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/15-8.png","element":"img","alt":"L1","inline":true,"padRight":true},{"text":"errors of the methods across all the data generating densities. Methods with lower median as indicated by the solid line of the box plot, and smaller overall spread are preferable as they provide higher accuracy and also maintain such accuracy across a collection of true density cases. Results of KNN are omitted in Figure ","element":"span"},{"href":"#id-68","text":"1 ","element":"a"},{"text":"due to much higher values compared to other methods. 
For the KDE and RD estimator, the plot and the table exclude the results for the heavy-tailed densities CA, IE and SP due to very high ","element":"span"},{"style":{"height":16.48},"width":197.8,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/15-9.png","element":"img","alt":" L1 errors.","inline":true}],[{"text":"Overall, a major advantage of the proposed method is its versatility among the considered methods. The Bayesian nonparametric methods DP-MC, DP-VB, PTM, OPT, and RD are often close to NN-DM in terms of their performance when the true densities are smooth and do not display locally spiky behavior. However, the NN-DM performs better than other methods in densities where such local behavior is present and performs very close to the best estimator for either the smooth heavy-tailed or thin-tailed densities. The KDE and RD perform well when data are generated from a smooth underlying density. However, there are some cases where the error for KDE and RD is very high. For instance, when ","element":"span"},{"text":"n ","element":"span"},{"text":"= 500 and ","element":"span"},{"style":{"height":17.6},"width":40.04,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/16-0.png","element":"img","alt":"f0","inline":true,"padRight":true},{"text":"is the standard Cauchy (CA) density, the estimated ","element":"span"},{"style":{"height":16.48},"width":50.12,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/16-1.png","element":"img","alt":" L1","inline":true,"padRight":true},{"text":"error for the KDE is 38501.85 and the algorithm for the RD estimate did not converge. Both the KDE and RD also perform poorly in very spiky multi-modal densities such as the ST. 
Compared to the LLDE and the A-KDE, the NN-DM displays similar performance in heavy-tailed and smooth densities when ","element":"span"},{"text":"n ","element":"span"},{"text":"= 200, with the NN-DM performing better for the spiky densities. However, when ","element":"span"},{"text":"n ","element":"span"},{"text":"= 500, the NN-DM shows significant improvements over the LLDE and the A-KDE for spiky densities such as the CW and the ST.","element":"span"}],[{"text":"In Figure ","element":"span"},{"href":"#id-69","text":"2","element":"a"},{"text":", we show the performance of the NN-DM estimator ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":17.6},"width":44.04,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/16-2.png","element":"img","alt":"fn","inline":true,"padRight":true},{"text":"(with hyperparameters chosen as described earlier) relative to the posterior mean under a DP-MC with default or hand-tuned hyperparameters, when 500 data points are generated from the sawtooth (ST) density. The Dirichlet process mixture with default hyperparameters is unable to detect the multiple spikes, merging adjacent modes to form larger clusters, perhaps due to inadequate mixing of the Markov chain Monte Carlo sampler or to the Gaussian kernels used in the mixture. As a result, we had to hand-tune the hyperparameters for the Dirichlet process mixture to obtain comparable performance with the NN-DM (without hand-tuning). 
We obtained the best results when changing the hyperparameters of the base measure of the DP-MC to NIG(0","element":"span"},{"text":", ","element":"span"},{"text":"0","element":"span"},{"text":".","element":"span"},{"text":"01","element":"span"},{"text":", ","element":"span"},{"text":"1","element":"span"},{"text":", ","element":"span"},{"text":"1) while keeping the prior on ","element":"span"},{"style":{"height":9.2},"width":30,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/16-3.png","element":"img","alt":" α","inline":true,"padRight":true},{"text":"the same as before. This illustrates the deficiency of the DP-MC in estimating densities with spiky local behavior unless we hand-tune the hyperparameters, which requires knowledge of the true density. We also compare the performance of the two methods with a smoother test density in Figure ","element":"span"},{"href":"#id-70","text":"3","element":"a"},{"text":", where the data are generated from a skewed bimodal (SB) distribution. Both the estimates are comparable, but the nearest neighbor-Dirichlet mixture provides better uncertainty quantification. Similar results are obtained for ","element":"span"},{"text":"n ","element":"span"},{"text":"= 1000, and hence are omitted.","element":"span"}],[{"id":"id-77","text":"4.3 ","element":"span"},{"text":"Multivariate Cases","element":"span"}],[{"text":"For the multivariate cases, we consider ","element":"span"},{"text":"n ","element":"span"},{"text":"= 200 and 1000. 
The number of neighbors is set to ","element":"span"},{"text":"k ","element":"span"},{"text":"= 10 and the dimension ","element":"span"},{"text":"p ","element":"span"},{"text":"is chosen from ","element":"span"},{"text":"{","element":"span"},{"text":"2","element":"span"},{"text":", ","element":"span"},{"text":"3","element":"span"},{"text":", ","element":"span"},{"text":"4","element":"span"},{"text":", ","element":"span"},{"text":"6","element":"span"},{"text":"}","element":"span"},{"text":". Recall the definition of ","element":"span"},{"style":{"height":19.68},"width":213.36,"height":49.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/16-4.png","element":"img","alt":" φp(x; µ, Σ)","inline":true,"padRight":true},{"text":"from Section ","element":"span"},{"href":"#id-34","text":"2.2 ","element":"a"},{"text":"and let Φ(","element":"span"},{"text":"x","element":"span"},{"text":") be the cumulative distribution function of the standard Gaussian density. Let ","element":"span"},{"style":{"height":21.6},"width":1224.36,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/16-5.png","element":"img","alt":" S0 = ρ 1p1Tp + (1 − ρ) Ip with ρ = 0.8. Let x = (x1, . . . , xp)T","inline":true},{"text":". 
We consider ","element":"span"},{"text":"the following cases.","element":"span"}],[{"text":"(1) ","element":"span"},{"style":{"height":19.68},"width":1793.8,"height":49.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/16-6.png","element":"img","alt":" Mixture of Gaussians (MG): f0(x) = 0.4 φp(x; m1, S0) + 0.6 φp(x; m2, S0), where m1 =","inline":true},{"style":{"height":18.08},"width":442.6,"height":45.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/16-7.png","element":"img","alt":"−2 × 1p, m2 = 2 × 1p.","inline":true}],[{"id":"id-69","style":{"width":"91%"},"width":1708,"height":1154,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/17-0.png","element":"img"}],[{"text":"Figure 2: Plot comparing density estimates for the NN-DM and DP-MC for ","element":"figcaption","subtype":"caption"},{"text":"n ","element":"figcaption","subtype":"caption"},{"text":"= 500 samples generated from the sawtooth (ST) density. ","element":"figcaption","subtype":"caption"},{"text":"Shaded regions correspond to 95% (pseudo) posterior credible intervals. The true density is displayed using dotted lines. The top panel shows the performance of DP-MC with default hyperparameters on the left and with hand-tuned hyperparameters on the right. The bottom panel shows the performance of the NN-DM.","element":"figcaption","subtype":"caption"}],[{"id":"id-70","style":{"width":"91%"},"width":1706,"height":540,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/17-1.png","element":"img"}],[{"text":"Figure 3: Similar to Figure ","element":"figcaption","subtype":"caption"},{"href":"#id-69","text":"2","element":"a","subtype":"caption"},{"text":", with data of sample size ","element":"figcaption","subtype":"caption"},{"text":"n ","element":"figcaption","subtype":"caption"},{"text":"= 500 generated from the skewed bimodal (SB) density. 
Left panel shows the NN-DM fit and the right panel shows the DP-MC fit.","element":"figcaption","subtype":"caption"}],[{"text":"(2) ","element":"span"},{"href":"#id-71","referenceIndex":2,"style":{"height":20.78},"width":1797.2,"height":51.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/18-0.png","element":"img","alt":" Skew normal (SN): f0(x) = 2φp(x; m0, S0)Φ{sT0W −1(x − m0)} (Azzalini, 2005), where W","inline":true,"padRight":true},{"text":"is the diagonal matrix with diagonal entries ","element":"span"},{"style":{"height":20.59},"width":538.64,"height":51.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/18-1.png","element":"img","alt":" W 2ii = S0, ii for i = 1, . . . , p","inline":true},{"text":". We choose ","element":"span"},{"style":{"height":18.08},"width":164.4,"height":45.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/18-2.png","element":"img","alt":" m0 = 0p","inline":true,"padRight":true},{"text":"and the skewness parameter vector ","element":"span"},{"style":{"height":18.08},"width":277.96,"height":45.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/18-3.png","element":"img","alt":" s0 = 0.5 × 1p.","inline":true,"padRight":true},{"text":"(3) ","element":"span"},{"style":{"height":19.6},"width":1061,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/18-4.png","element":"img","alt":" Multivariate t-distribution (T): f0(x) = td0(x; m∗, S0","inline":true},{"text":") is the density of the ","element":"span"},{"text":"p","element":"span"},{"text":"-dimensional multivariate Student’s t-distribution as in Section ","element":"span"},{"href":"#id-34","text":"2.2","element":"a"},{"text":". 
We set ","element":"span"},{"style":{"height":18.88},"width":438.28,"height":47.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/18-5.png","element":"img","alt":" d0 = 10 and m∗ = 1p.","inline":true,"padRight":true},{"text":"(4) ","element":"span"},{"text":"Mixture of multivariate skew t-distributions (MST)","element":"span"},{"text":": We consider a two component mixture of multivariate skew t-distribution (","element":"span"},{"href":"#id-71","referenceIndex":2,"text":"Azzalini","element":"a"},{"text":", ","element":"span"},{"href":"#id-71","referenceIndex":2,"text":"2005","element":"a"},{"text":") given by ","element":"span"},{"style":{"height":19.6},"width":623.56,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/18-6.png","element":"img","alt":" f0(x) = 0.25 td0(x; m1, S0, s0)+","inline":true,"padRight":true},{"text":"0","element":"span"},{"style":{"height":19.6},"width":767.92,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/18-7.png","element":"img","alt":".75 td0(x; m2, S0, s0). Here, td(· ; µ, S, s","inline":true},{"text":") is the skew t-density with parameters ","element":"span"},{"style":{"height":18},"width":283.28,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/18-8.png","element":"img","alt":" d, µ, S, s, with","inline":true},{"style":{"height":17.2},"width":103.4,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/18-9.png","element":"img","alt":"d0, s0","inline":true,"padRight":true},{"text":"defined as before and ","element":"span"},{"style":{"height":12.4},"width":138.92,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/18-10.png","element":"img","alt":" m1, m2","inline":true,"padRight":true},{"text":"the same as in the first case. 
(5) ","element":"span"},{"style":{"height":21.55},"width":1660.36,"height":53.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/18-11.png","element":"img","alt":" Multivariate Cauchy (MVC): f0(x) ∝ {1 + (x − µ∗)TS−10 (x − µ∗)} where µ∗ = 0p.","inline":true,"padRight":true},{"text":"(6) ","element":"span"},{"style":{"height":22.78},"width":1791.12,"height":56.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/18-12.png","element":"img","alt":" Multivariate Gamma (MVG): f0(x) ∝ cΦ(F1(x1), . . . , Fp(xp) | S0) ∏pj=1 fj(xj; γj1, γj2)","inline":true,"padRight":true},{"text":"where ","element":"span"},{"style":{"height":18.88},"width":194.04,"height":47.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/18-13.png","element":"img","alt":" fj and Fj","inline":true,"padRight":true},{"text":"denote the density and distribution function of the univariate gamma distribution with shape parameter ","element":"span"},{"style":{"height":14.08},"width":57.32,"height":35.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/18-14.png","element":"img","alt":" γj1","inline":true,"padRight":true},{"text":"and rate parameter ","element":"span"},{"style":{"height":14.08},"width":57.32,"height":35.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/18-15.png","element":"img","alt":" γj2","inline":true},{"text":", respectively, for ","element":"span"},{"text":"j ","element":"span"},{"text":"= 1","element":"span"},{"text":", . . . 
, p ","element":"span"},{"text":"and ","element":"span"},{"style":{"height":19.6},"width":104.2,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/18-16.png","element":"img","alt":"cΦ(· |","inline":true,"padRight":true},{"text":"Γ) is as described in ","element":"span"},{"href":"#id-72","referenceIndex":43,"text":"Song ","element":"a"},{"text":"(","element":"span"},{"href":"#id-72","referenceIndex":43,"text":"2000","element":"a"},{"text":"). This is a Gaussian copula based construction of the multivariate gamma distribution. We set ","element":"span"},{"style":{"height":18.88},"width":603.89,"height":47.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/18-17.png","element":"img","alt":" γj1 = γj2 = 1 for j = 1, . . . , p.","inline":true}],[{"text":"The hyperparameters for the nearest neighbor-Dirichlet mixture are chosen as ","element":"span"},{"style":{"height":13.2},"width":110.44,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/18-18.png","element":"img","alt":" µ0 =","inline":true,"padRight":true},{"text":"0","element":"span"},{"style":{"height":20.59},"width":1126.8,"height":51.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/18-19.png","element":"img","alt":"p, ν0 = 0.001, γ0 = p and Ψ0 = {(γ0 − p + 1)δ20}Ip = δ20 Ip","inline":true},{"text":", where the optimal ","element":"span"},{"style":{"height":20.11},"width":39.56,"height":50.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/18-20.png","element":"img","alt":" δ20 ","inline":true,"padRight":true},{"text":"is chosen via ","element":"span"},{"text":"cross-validation as described in Section ","element":"span"},{"href":"#id-57","text":"2.3","element":"a"},{"text":". 
Default hyperparameters as described in Section ","element":"span"},{"text":"4.1 ","element":"span"},{"text":"are chosen for the MCMC and VB implementations of the DPM.","element":"span"}],[{"text":"Similar to the univariate case, we defer the numerical results to Table ","element":"span"},{"href":"#id-73","text":"4 ","element":"a"},{"text":"in the Appendix and in Figure ","element":"span"},{"href":"#id-74","text":"4 ","element":"a"},{"text":"display a visual summary consisting of box plot of ","element":"span"},{"style":{"height":16.48},"width":50.12,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/18-21.png","element":"img","alt":" L1","inline":true,"padRight":true},{"text":"errors over the densities considered. The proposed method is very robust against a wide selection of true distributions, with its ","element":"span"},{"style":{"height":16.48},"width":50.12,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/18-22.png","element":"img","alt":" L1","inline":true,"padRight":true},{"text":"error scaling nicely with the dimension. The KDE shows a noticeably sharp decline in performance - when the dimension is changed from 2 to 6, the average increase in ","element":"span"},{"style":{"height":16.48},"width":50.12,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/18-23.png","element":"img","alt":" L1","inline":true,"padRight":true},{"text":"error is by factors of about 5 and 7 for sample sizes 200 and 1000, respectively. This is possibly due to lack of adaptive density estimation in higher dimensions using a single bandwidth matrix, since data in ","element":"span"},{"style":{"height":13.2},"width":52.56,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/18-24.png","element":"img","alt":" Rp ","inline":true,"padRight":true},{"text":"become increasingly sparse with increasing ","element":"span"},{"text":"p","element":"span"},{"text":". 
As in the univariate case, we had to exclude the MVC density for the KDE due to the algorithm not converging. The performances of NN-DM, DP-MC, and DP-VB are quite competitive across densities, with NN-DM faring better than the DP-VB when estimating densities such as the MVC and the MVG. Furthermore, the NN-DM is hit the least significantly by the curse of dimensionality out of the three. This is particularly prominent when ","element":"span"},{"text":"n ","element":"span"},{"text":"= 200 and ","element":"span"},{"text":"p ","element":"span"},{"text":"= 6 for the DP-MC when the true density is either MG or MST, and for the DP-VB when the true density is MVC. It is also important to keep in mind that the NN-DM provides similar results compared to the DP-MC while being at least an order of magnitude faster, as illustrated in Section ","element":"span"},{"href":"#id-75","text":"4.6","element":"a"},{"text":". The performance of the OPT is hit quite significantly as the number of dimensions increases, along with the algorithm not converging for ","element":"span"},{"text":"p ","element":"span"},{"text":"= 6. 
The","element":"span"}],[{"id":"id-74","style":{"width":"78%"},"width":1467,"height":1811,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/19-0.png","element":"img"}],[{"text":"Figure 4: Box plots of ","element":"figcaption","subtype":"caption"},{"text":"ˆ","element":"figcaption","subtype":"caption"},{"style":{"height":23.79},"width":166.04,"height":59.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/19-1.png","element":"img","alt":"L1(f0, ˆf","inline":true},{"text":") for the 6 different choices of the true density ","element":"figcaption","subtype":"caption"},{"style":{"height":17.6},"width":40.04,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/19-2.png","element":"img","alt":" f0","inline":true,"padRight":true},{"text":"and different estimators ","element":"figcaption","subtype":"caption"},{"text":"ˆ","element":"figcaption","subtype":"caption"},{"text":"f ","element":"figcaption","subtype":"caption"},{"text":"for multivariate data. The box plots for KDE and LLDE exclude the MVC density. The box plots for ","element":"figcaption","subtype":"caption"},{"text":"p ","element":"figcaption","subtype":"caption"},{"text":"= 6 exclude results from OPT.","element":"figcaption","subtype":"caption"}],[{"text":"LLDE provides competitive results with the NN-DM in lower dimensions. However, in higher dimensions, the LLDE often does not converge, indicating lack of stability of the algorithm. We reported the average of the replicates for which the algorithm did converge. The results suggest that the performance of the LLDE is also affected quite drastically with increasing dimensions. 
When compared across all data generating cases considering the variation in densities, dimensions and sample sizes, the proposed method is seen to be more versatile than its competitors.","element":"span"}],[{"id":"id-50","text":"4.4 ","element":"span"},{"text":"Accuracy of Uncertainty Quantification","element":"span"}],[{"text":"In this section, we assess frequentist coverage of 95% pseudo-posterior credible intervals for the NN-DM and compare with coverage based on the 95% posterior credible intervals obtained from DP-MC and DP-VB. ","element":"span"},{"href":"#id-46","referenceIndex":12,"text":"Ghosal and Van der Vaart ","element":"a"},{"text":"(","element":"span"},{"href":"#id-46","referenceIndex":12,"text":"2017","element":"a"},{"text":") recommend investigating the frequentist coverage of Bayesian credible intervals. We do not include frequentist coverage for Polya tree mixtures (PTMs) and the optional Polya tree (OPT) due to the lack of available code. We consider the cases ","element":"span"},{"style":{"height":19.2},"width":215.04,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/20-0.png","element":"img","alt":" p ∈ {1, 2}","inline":true,"padRight":true},{"text":"in our experiments with sample size ","element":"span"},{"text":"n ","element":"span"},{"text":"= 500. For each choice of density ","element":"span"},{"style":{"height":17.6},"width":256.8,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/20-1.png","element":"img","alt":" f0, we fix nt","inline":true,"padRight":true},{"text":"= 200 test points ","element":"span"},{"style":{"height":19.2},"width":432.96,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/20-2.png","element":"img","alt":" Xt = {Xt1, . . . 
, Xtnt}","inline":true,"padRight":true},{"text":"generated from the density ","element":"span"},{"style":{"height":17.6},"width":40.04,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/20-3.png","element":"img","alt":" f0","inline":true},{"text":". With these fixed test points, we generate ","element":"span"},{"text":"n ","element":"span"},{"text":"= 500 data points in our sample for ","element":"span"},{"style":{"height":16.08},"width":83.72,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/20-4.png","element":"img","alt":" Rcov","inline":true,"padRight":true},{"text":"= 200 times and check the coverage of posterior/pseudo-posterior credible intervals obtained from the three methods. We implement the DP-MC with base measure NIW","element":"span"},{"style":{"height":19.68},"width":284.88,"height":49.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/20-5.png","element":"img","alt":"p(0p, 0.01, p, Ip","inline":true},{"text":") and a Gamma(2","element":"span"},{"text":", ","element":"span"},{"text":"4) prior on the concentration parameter as in ","element":"span"},{"href":"#id-61","referenceIndex":50,"text":"West ","element":"a"},{"text":"(","element":"span"},{"href":"#id-61","referenceIndex":50,"text":"1992","element":"a"},{"text":"). These choices of hyperparameters were seen to give better frequentist coverage results than using the default values used in Sections ","element":"span"},{"href":"#id-76","text":"4.2 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-77","text":"4.3","element":"a"},{"text":". Same choices of hyperparameters are maintained for DP-VB. 
For the NN-DM we take ","element":"span"},{"text":"k ","element":"span"},{"text":"= 8 in the univariate case, ","element":"span"},{"text":"k ","element":"span"},{"text":"= 5 in the bivariate case, ","element":"span"},{"style":{"height":9.2},"width":30,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/20-6.png","element":"img","alt":" α","inline":true,"padRight":true},{"text":"as in Section ","element":"span"},{"href":"#id-78","text":"3.3 ","element":"a"},{"text":"and other hyperparameters chosen as before. In Table ","element":"span"},{"href":"#id-79","text":"1 ","element":"a"},{"text":"and in Table ","element":"span"},{"href":"#id-80","text":"2","element":"a"},{"text":", we report the average coverage probability and average length of the (pseudo) credible intervals across all the points in the test data ","element":"span"},{"style":{"height":16.08},"width":46.08,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/20-7.png","element":"img","alt":" Xt","inline":true,"padRight":true},{"text":"for the univariate and bivariate cases, respectively.","element":"span"}],[{"text":"For univariate densities, both the DP-MC and DP-VB display severe under-coverage. In most of the cases, the DP-VB and NN-DM have similar width of (pseudo) credible intervals but the DP-VB displays dramatically lower coverage than the NN-DM. The under-coverage displayed by the DP-MC may be due to MCMC mixing issues. The NN-DM shows near nominal coverage in the smooth Gaussian (GS) and lognormal (LN) densities, while also attaining near nominal coverage in the skewed bimodal (SB), claw (CW) and sawtooth (ST) densities which are multi-modal. The shortcomings of DP-MC and DP-VB are especially noticeable when dealing with spiky densities such as the claw or sawtooth. 
For bivariate cases considered in Table ","element":"span"},{"href":"#id-80","text":"2 ","element":"a"},{"text":"we see a similar trend; the NN-DM method provides uniformly better uncertainty quantification across all the densities considered. It is clear that in terms","element":"span"}],[{"id":"id-79","style":{"width":"73%"},"width":1376,"height":686,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/21-0.png","element":"img"}],[{"text":"Table 1: Comparison of the frequentist coverage of 95% (pseudo) posterior credible intervals of the nearest neighbor-Dirichlet mixture and the MCMC and variational implementations of the Dirichlet process mixture for univariate data. Average lengths of the intervals are also provided for each case within parentheses. Number of replications and sample size are ","element":"figcaption","subtype":"caption"},{"style":{"height":16.48},"width":402.44,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/21-1.png","element":"img","alt":"Rcov = 200 and ncov","inline":true,"padRight":true},{"text":"= 500, respectively.","element":"figcaption","subtype":"caption"}],[{"id":"id-80","style":{"width":"85%"},"width":1596,"height":324,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/21-2.png","element":"img"}],[{"text":"Table 2: Comparison of the frequentist coverage of 95% (pseudo) posterior credible intervals of the nearest neighbor-Dirichlet mixture and the MCMC and variational implementations of the Dirichlet process mixture for bivariate data. ","element":"figcaption","subtype":"caption"},{"text":"Average lengths of the intervals are also provided for each case within parentheses. 
Number of replications and sample size are ","element":"figcaption","subtype":"caption"},{"style":{"height":16.48},"width":402.44,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/21-3.png","element":"img","alt":"Rcov = 200 and ncov","inline":true,"padRight":true},{"text":"= 500, respectively.","element":"figcaption","subtype":"caption"}],[{"text":"of frequentist uncertainty quantification, the NN-DM displays vastly superior coverage to the DP-MC and the DP-VB without inflating the interval width.","element":"span"}],[{"text":"4.5 ","element":"span"},{"text":"Comparison for high dimensional data","element":"span"}],[{"text":"In addition to the above experiments, we performed a simulation experiment for high-dimensional data. Specifically, we set ","element":"span"},{"text":"p ","element":"span"},{"text":"= 50 and consider the same set of true densities in Section ","element":"span"},{"href":"#id-77","text":"4.3","element":"a"},{"text":". We compared results from the proposed NN-DM method and the DP-VB. Due to severe computational time, we did not consider the DP-MC in this scenario. 
We","element":"span"}],[{"id":"id-82","style":{"width":"47%"},"width":881,"height":664,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/22-0.png","element":"img"}],[{"text":"Figure 5: Out-of-sample log-likelihood of NN-DM and DP-VB on a test set of 500 points for 6 different multivariate densities considered in Section ","element":"figcaption","subtype":"caption"},{"href":"#id-77","text":"4.3","element":"a","subtype":"caption"},{"text":", for ","element":"figcaption","subtype":"caption"},{"text":"n ","element":"figcaption","subtype":"caption"},{"text":"= 1000 and ","element":"figcaption","subtype":"caption"},{"text":"p ","element":"figcaption","subtype":"caption"},{"text":"= 50.","element":"figcaption","subtype":"caption"}],[{"text":"also tried optional Polya trees (","element":"span"},{"href":"#id-18","referenceIndex":51,"text":"Wong and Ma","element":"a"},{"text":", ","element":"span"},{"href":"#id-18","referenceIndex":51,"text":"2010","element":"a"},{"text":") using the ","element":"span"},{"text":"PTT ","element":"span"},{"text":"package; however, the current implementation of the method breaks down in this high-dimensional setup. Due to numerical instability in computing the ","element":"span"},{"style":{"height":16.48},"width":50.12,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/22-1.png","element":"img","alt":" L1","inline":true,"padRight":true},{"text":"error in higher dimensions, we evaluate the methods in terms of their out-of-sample log-likelihood (OOSLL) instead (","element":"span"},{"href":"#id-81","referenceIndex":14,"text":"Gneiting and Raftery","element":"a"},{"text":", ","element":"span"},{"href":"#id-81","referenceIndex":14,"text":"2007","element":"a"},{"text":"), on a test set of 500 data points. We report the average OOSLL over 30 replications in Figure ","element":"span"},{"href":"#id-82","text":"5","element":"a"},{"text":". 
The results indicate that both methods perform very similarly in terms of out-of-sample fit to the data, with the NN-DM outperforming the DP-VB when the true density is MVC. We also observed that for this experiment, the NN-DM methods with default choice of hyperparameters and with cross-validated choice of Ψ","element":"span"},{"style":{"height":8.4},"width":17,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/22-2.png","element":"img","alt":"0","inline":true,"padRight":true},{"text":"have almost identical performance. For the NN-DM, we set ","element":"span"},{"text":"k ","element":"span"},{"text":"= 12 after carrying out a sensitivity analysis on ","element":"span"},{"text":"k ","element":"span"},{"text":"by considering ","element":"span"},{"text":"k ","element":"span"},{"text":"= 5","element":"span"},{"text":", ","element":"span"},{"text":"7","element":"span"},{"text":", ","element":"span"},{"text":"10","element":"span"},{"text":", ","element":"span"},{"text":"15","element":"span"},{"text":", ","element":"span"},{"text":"and 20. The best results for the NN-DM were obtained for ","element":"span"},{"style":{"height":19.2},"width":298.56,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/22-3.png","element":"img","alt":" k ∈ {7, 10, 12}","inline":true,"padRight":true},{"text":"with negligible difference in out-of-sample log-likelihoods between these three choices, with ","element":"span"},{"text":"k ","element":"span"},{"text":"= 12 performing the best.","element":"span"}],[{"id":"id-75","text":"4.6 ","element":"span"},{"text":"Runtime Comparison","element":"span"}],[{"text":"The results in the previous subsection suggest that the DP-VB has dramatic under-coverage with respect to the nominal frequentist coverage. Hence, the VB approach is not useful for uncertainty quantification. 
","element":"span"},{"text":"For this reason, we focus our comparison of runtimes on the NN-DM and the DP-MC, noting that VB algorithms are faster than either of these approaches.","element":"span"}],[{"text":"With ","element":"span"},{"text":"n ","element":"span"},{"text":"data points in ","element":"span"},{"text":"p ","element":"span"},{"text":"dimensions, the initial nearest neighbor allocation into ","element":"span"},{"text":"n ","element":"span"},{"text":"neighborhoods can be carried out in ","element":"span"},{"text":"O","element":"span"},{"text":"(","element":"span"},{"text":"n ","element":"span"},{"text":"log ","element":"span"},{"text":"n","element":"span"},{"text":") steps (","element":"span"},{"href":"#id-83","referenceIndex":47,"text":"Vaidya","element":"a"},{"text":", ","element":"span"},{"href":"#id-83","referenceIndex":47,"text":"1986","element":"a"},{"text":"; ","element":"span"},{"href":"#id-84","referenceIndex":29,"text":"Ma and Li","element":"a"},{"text":", ","element":"span"},{"href":"#id-84","referenceIndex":29,"text":"2019","element":"a"},{"text":"). Once the neighborhoods are determined with ","element":"span"},{"style":{"height":16.48},"width":45.48,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/22-4.png","element":"img","alt":" kn","inline":true,"padRight":true},{"text":"points in each neighborhood, obtaining the neighborhood specific empirical means and covariance matrices has ","element":"span"},{"style":{"height":20.7},"width":599.28,"height":51.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/23-0.png","element":"img","alt":" O(nknp + nknp2) = O(nknp2)","inline":true,"padRight":true},{"text":"complexity. 
Obtaining the pseudo-posterior mean (","element":"span"},{"href":"#id-29","text":"7","element":"a"},{"text":") then requires inversion of ","element":"span"},{"style":{"height":17.2},"width":248.24,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/23-1.png","element":"img","alt":" n such p × p","inline":true,"padRight":true},{"text":"matrices to evaluate the multivariate t-density, with a runtime of ","element":"span"},{"style":{"height":20.71},"width":125.96,"height":51.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/23-2.png","element":"img","alt":" O(np3","inline":true},{"text":"). Therefore, the total runtime to obtain the pseudo-posterior mean is of the order ","element":"span"},{"style":{"height":20.51},"width":536.52,"height":51.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/23-3.png","element":"img","alt":" O(nknp2 + np3). When we","inline":true,"padRight":true},{"text":"are interested in uncertainty quantification, we require Monte Carlo samples of the NN-DM, which are independently drawn from its pseudo-posterior. This involves sampling the Dirichlet weights, the neighborhood specific unknown mean and covariance matrix parameters of the Gaussian kernel, and evaluating a Gaussian density for each neighborhood, as outlined in Algorithm ","element":"span"},{"href":"#id-26","text":"1","element":"a"},{"text":". To obtain ","element":"span"},{"text":"M ","element":"span"},{"text":"Monte Carlo samples, the combined complexity of this step is thus ","element":"span"},{"style":{"height":20.51},"width":573.32,"height":51.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/23-4.png","element":"img","alt":" O(Mn + Mnp3) = O(Mnp3","inline":true},{"text":"). 
Overall the runtime complexity to obtain NN-DM samples is therefore ","element":"span"},{"style":{"height":20.51},"width":453.8,"height":51.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/23-5.png","element":"img","alt":" O(Mnp3+nknp2+np3","inline":true},{"text":"). For high dimensional scenarios, this runtime can be greatly improved by using a low rank matrix factorization of both the neighborhood specific empirical covariance matrix and the sampled covariance matrix parameter to make matrix inversion more efficient (","element":"span"},{"href":"#id-85","referenceIndex":15,"text":"Golub and van Loan","element":"a"},{"text":", ","element":"span"},{"href":"#id-85","referenceIndex":15,"text":"1996","element":"a"},{"text":"). We now provide a detailed simulation study of runtimes of the proposed method.","element":"span"}],[{"text":"In our experiments, we focus on ","element":"span"},{"text":"p ","element":"span"},{"text":"= 1 and ","element":"span"},{"text":"p ","element":"span"},{"text":"= 4. The runtime for NN-DM consists of the time to estimate ","element":"span"},{"style":{"height":20.11},"width":39.56,"height":50.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/23-6.png","element":"img","alt":" δ20 ","inline":true,"padRight":true},{"text":"by cross-validation as in Section ","element":"span"},{"href":"#id-57","text":"2.3 ","element":"a"},{"text":"and then drawing samples from its ","element":"span"},{"text":"pseudo-posterior. For both dimensions, the sample size is varied from ","element":"span"},{"text":"n ","element":"span"},{"text":"= 200 to ","element":"span"},{"text":"n ","element":"span"},{"text":"= 1500 in increments of 100. 
Data are generated from the standard Gaussian density (GS) for ","element":"span"},{"text":"p ","element":"span"},{"text":"= 1 and from a mixture of skew t-distributions with the parameters as described for the case MST in Section ","element":"span"},{"href":"#id-77","text":"4.3 ","element":"a"},{"text":"for ","element":"span"},{"text":"p ","element":"span"},{"text":"= 4. For ","element":"span"},{"text":"p ","element":"span"},{"text":"= 1, we evaluate the two methods at 500 test points, while for ","element":"span"},{"text":"p ","element":"span"},{"text":"= 4 we evaluate the methods at 200 test points. The hyperparameters are kept the same as in Sections ","element":"span"},{"href":"#id-76","text":"4.2 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-77","text":"4.3","element":"a"},{"text":". We took 1000 Monte Carlo samples for the NN-DM and 2500 MCMC samples for the DP-MC with a burn-in of 1500 samples. The simulations were carried out on an i7-8700K processor with 16 gigabytes of memory.","element":"span"}],[{"text":"In the top panel of Figure ","element":"span"},{"href":"#id-86","text":"6","element":"a"},{"text":", we plot the average of the logarithm of the run times of each approach for 10 independent replications. The corresponding average ","element":"span"},{"style":{"height":16.48},"width":50.12,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/23-7.png","element":"img","alt":" L1","inline":true,"padRight":true},{"text":"error of the two methods is also included in the bottom panel of Figure ","element":"span"},{"href":"#id-86","text":"6","element":"a"},{"text":". ","element":"span"},{"text":"The NN-DM is at least an order of magnitude faster than DP-MC. 
The time saved becomes more pronounced in the multivariate case, where for sample size 1500 the NN-DM is ","element":"span"},{"style":{"height":7.2},"width":37,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/23-8.png","element":"img","alt":" ∼","inline":true,"padRight":true},{"text":"15 times faster. The gain in computing time does not come at the cost of accuracy as can be seen from the bottom panel; the proposed method maintains the same order of ","element":"span"},{"style":{"height":16.48},"width":50.12,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/23-9.png","element":"img","alt":" L1","inline":true,"padRight":true},{"text":"error as the DP-MC in the univariate case and often outperforms the DP-MC in the multivariate case. We did not implement the Monte Carlo sampler for the proposed algorithm in parallel, but such a modification would substantially improve runtime. Bypassing cross-validation and choosing default hyperparameters instead as outlined in Section ","element":"span"},{"href":"#id-57","text":"2.3","element":"a"},{"text":", NN-DM took 7","element":"span"},{"text":".","element":"span"},{"text":"7 seconds and 28","element":"span"},{"text":".","element":"span"},{"text":"4 seconds when ","element":"span"},{"text":"p ","element":"span"},{"text":"= 1 and ","element":"span"},{"text":"p ","element":"span"},{"text":"= 4, respectively, with sample size ","element":"span"},{"text":"n ","element":"span"},{"text":"= 1500. In the same","element":"span"}],[{"id":"id-86","style":{"width":"77%"},"width":1453,"height":895,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/24-0.png","element":"img"}],[{"text":"Figure 6: ","element":"figcaption","subtype":"caption"},{"text":"Runtime comparison of DP-MC and NN-DM in univariate case and for 4-dimensional data. 
","element":"figcaption","subtype":"caption"},{"text":"Top panel shows runtimes in log scale whereas bottom panel shows corresponding ","element":"figcaption","subtype":"caption"},{"style":{"height":16.48},"width":50.12,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/24-1.png","element":"img","alt":" L1","inline":true,"padRight":true},{"text":"error. Sample size ","element":"figcaption","subtype":"caption"},{"text":"n ","element":"figcaption","subtype":"caption"},{"text":"is varied from 200 to 1500 in increments of 100.","element":"figcaption","subtype":"caption"}],[{"text":"scenario, DP-MC took 291","element":"span"},{"text":".","element":"span"},{"text":"3 seconds and 1504","element":"span"},{"text":".","element":"span"},{"text":"4 seconds for ","element":"span"},{"text":"p ","element":"span"},{"text":"= 1 and ","element":"span"},{"text":"p ","element":"span"},{"text":"= 4, respectively. Thus the NN-DM with default hyperparameters is about 38 times faster when ","element":"span"},{"text":"p ","element":"span"},{"text":"= 1 and about 53 times faster when ","element":"span"},{"text":"p ","element":"span"},{"text":"= 4.","element":"span"}],[{"id":"id-31","text":"4.7 ","element":"span"},{"text":"Sensitivity to the choice of ","element":"span"},{"text":"k","element":"span"}],[{"text":"In this subsection, we investigate the role of ","element":"span"},{"style":{"height":16.48},"width":157.48,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/24-2.png","element":"img","alt":" kn = k","inline":true,"padRight":true},{"text":"in finite samples for the proposed method. We consider ","element":"span"},{"text":"n ","element":"span"},{"text":"= 200 samples from the SP density in the univariate case and the MG density in the bivariate case. 
In each case, we fix a test set of ","element":"span"},{"style":{"height":11.68},"width":39.84,"height":29.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/24-3.png","element":"img","alt":" nt","inline":true,"padRight":true},{"text":"= 500 points, and evaluate the out-of-sample log-likelihood (OOSLL) of the test points for 20 different integer values of ","element":"span"},{"text":"k ","element":"span"},{"text":"ranging from 2 to 50. Finally, we report results averaged from 10 independent replicates of this setup. We note that for each considered value of ","element":"span"},{"text":"k","element":"span"},{"text":", the parameter ","element":"span"},{"style":{"height":20.11},"width":132.76,"height":50.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/24-4.png","element":"img","alt":" δ20 was","inline":true,"padRight":true},{"text":"estimated using leave-one-out cross-validation. Figure ","element":"span"},{"href":"#id-87","text":"7 ","element":"a"},{"text":"shows how the OOSLL averaged over replicates changes as a function of ","element":"span"},{"text":"k ","element":"span"},{"text":"for each density considered. The original OOSLL values of the test data points were scaled by the number of test points ","element":"span"},{"style":{"height":11.68},"width":39.85,"height":29.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/24-5.png","element":"img","alt":" nt","inline":true,"padRight":true},{"text":"= 500 for better representability.","element":"span"}],[{"text":"For the univariate SP density, the optimal value of ","element":"span"},{"text":"k ","element":"span"},{"text":"which maximizes the average OOSLL is ","element":"span"},{"style":{"height":14},"width":25,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/24-6.png","element":"img","alt":"k̂","inline":true,"padRight":true},{"text":"= 9. 
This is close to the choice of ","element":"span"},{"text":"k ","element":"span"},{"text":"= 6 as taken in Section ","element":"span"},{"href":"#id-76","text":"4.2","element":"a"},{"text":". For the bivariate MG density, we observe that the choice of ","element":"span"},{"text":"k ","element":"span"},{"text":"maximizing the OOSLL is ","element":"span"},{"style":{"height":14},"width":25.48,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/24-7.png","element":"img","alt":"k̂","inline":true,"padRight":true},{"text":"= 12, which is also close to the choice of ","element":"span"},{"text":"k ","element":"span"},{"text":"= 10 as taken in Section ","element":"span"},{"href":"#id-77","text":"4.3","element":"a"},{"text":". For both the univariate and the bivariate","element":"span"}],[{"id":"id-87","style":{"width":"91%"},"width":1713,"height":541,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/25-0.png","element":"img"}],[{"text":"Figure 7: Average out-of-sample log-likelihood of 500 test points for the NN-DM as a function of ","element":"figcaption","subtype":"caption"},{"text":"k ","element":"figcaption","subtype":"caption"},{"text":"for one-dimensional and two-dimensional data. ","element":"figcaption","subtype":"caption"},{"text":"Number of samples and number of replications are ","element":"figcaption","subtype":"caption"},{"text":"n ","element":"figcaption","subtype":"caption"},{"text":"= 200 and ","element":"figcaption","subtype":"caption"},{"text":"R ","element":"figcaption","subtype":"caption"},{"text":"= 10, respectively.","element":"figcaption","subtype":"caption"}],[{"text":"case, the out-of-sample log-likelihood of the test set shows little variation with changing ","element":"span"},{"text":"k","element":"span"},{"text":". 
This indicates that the estimates obtained from the proposed method are quite robust to the particular choice of ","element":"span"},{"text":"k","element":"span"},{"text":".","element":"span"}]]},{"heading":"5 Application","paragraphs":[[{"text":"We apply the proposed density estimator to binary classification. ","element":"span"},{"text":"Consider data ","element":"span"},{"text":"D ","element":"span"},{"text":"= ","element":"span"},{"style":{"height":19.6},"width":959.12,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/25-1.png","element":"img","alt":"{(Xi, Yi) : i = 1, . . . , n}, where Xi ∈ Rp are p","inline":true},{"text":"-dimensional feature vectors and ","element":"span"},{"style":{"height":19.2},"width":224.16,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/25-2.png","element":"img","alt":" Yi ∈ {0, 1}","inline":true,"padRight":true},{"text":"are binary class labels. To predict the probability that ","element":"span"},{"style":{"height":12.8},"width":40.04,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/25-3.png","element":"img","alt":" y0","inline":true,"padRight":true},{"text":"= 1 for a test point ","element":"span"},{"style":{"height":12.4},"width":214.92,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/25-4.png","element":"img","alt":" x0, we use","inline":true}],[{"text":"Bayes rule:","element":"span"}],[{"style":{"width":"79%"},"width":1491,"height":120,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/25-5.png","element":"img"}],[{"text":"where ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":19.68},"width":102.44,"height":49.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/25-6.png","element":"img","alt":"fj(x0","inline":true},{"text":") is the feature density at 
","element":"span"},{"style":{"height":19.6},"width":532.16,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/25-7.png","element":"img","alt":" x0 in class j and pr(y0 = j","inline":true},{"text":") is the marginal probability of class ","element":"span"},{"text":"j","element":"span"},{"text":", for ","element":"span"},{"text":"j ","element":"span"},{"text":"= 0","element":"span"},{"text":", ","element":"span"},{"text":"1. Based on ","element":"span"},{"style":{"height":11.68},"width":39.84,"height":29.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/25-8.png","element":"img","alt":" nt","inline":true,"padRight":true},{"text":"test data, we let ","element":"span"},{"style":{"height":20.38},"width":714.8,"height":50.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/25-9.png","element":"img","alt":" �pr(y0 = 1) = (1/nt) �nti=1 Yi, with","inline":true},{"style":{"height":19.6},"width":474.92,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/25-10.png","element":"img","alt":"�pr(y0 = 0) = 1 − �pr(y0","inline":true,"padRight":true},{"text":"= 1), and use either the NN-DM pseudo-posterior mean ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":19.6},"width":192.36,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/25-11.png","element":"img","alt":"fn(·), the","inline":true,"padRight":true},{"text":"DP-MC posterior mean ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":19.6},"width":105.16,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/25-12.png","element":"img","alt":"fDP(·","inline":true},{"text":"), or the DP-VB posterior mean ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":19.6},"width":105.64,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/25-13.png","element":"img","alt":"fVB(·","inline":true},{"text":") for 
estimating the within class densities. We omit the kernel density estimator as to the best of our knowledge, no routine ","element":"span"},{"text":"R ","element":"span"},{"text":"implementation is available for data having more than 6 dimensions. We compare the resulting classification performances in terms of sensitivity, specificity and probabilistic calibration.","element":"span"}],[{"text":"The high time resolution universe survey data (","element":"span"},{"href":"#id-88","referenceIndex":21,"text":"Keith et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-88","referenceIndex":21,"text":"2010","element":"a"},{"text":") contain information on sampled pulsar stars. Pulsar stars are a type of neutron stars and their radio emissions are detectable from the Earth. These stars have gained considerable interest from the scientific community due to their several applications (","element":"span"},{"href":"#id-89","referenceIndex":27,"text":"Lorimer and Kramer","element":"a"},{"text":", ","element":"span"},{"href":"#id-89","referenceIndex":27,"text":"2012","element":"a"},{"text":"). ","element":"span"},{"text":"The data are","element":"span"}],[{"id":"id-91","style":{"width":"92%"},"width":1739,"height":604,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/26-0.png","element":"img"}],[{"text":"Figure 8: Sensitivity and specificity of the NN-DM, DP-MC, and DP-VB for the universe survey data.","element":"figcaption","subtype":"caption"}],[{"text":"publicly available from the University of California at Irvine machine learning repository. Stars are classified into pulsar and non-pulsar groups according to 8 attributes (","element":"span"},{"href":"#id-90","referenceIndex":28,"text":"Lyon","element":"a"},{"text":", ","element":"span"},{"href":"#id-90","referenceIndex":28,"text":"2016","element":"a"},{"text":"). There are a total of 17898 instances of stars, among which 1639 are classified as pulsar stars. 
We create a test data set of 200 stars, among which 23 are pulsar stars. The training size is then varied from 300 to 1800 in increments of 300, each time adding 300 training points by randomly sampling from the entire data leaving out the initial test set. In Figure ","element":"span"},{"href":"#id-91","text":"8","element":"a"},{"text":", we plot the sensitivity and specificity of the three methods in consideration. All the methods exhibit similar sensitivity across various training sizes; the DP-MC has marginally better specificity for training sizes 1200 and 1500, while the NN-DM has better specificity for training sizes 300 and 600. Both the NN-DM and the DP-MC exhibit higher specificity and sensitivity than the DP-VB across all training sample sizes considered. ","element":"span"},{"text":"We also compare the methods using the Brier score, a proper scoring rule (","element":"span"},{"href":"#id-81","referenceIndex":14,"text":"Gneiting and Raftery","element":"a"},{"text":",","element":"span"}],[{"href":"#id-81","referenceIndex":14,"text":"2007","element":"a"},{"text":") for probabilistic classification. 
Suppose for ","element":"span"},{"style":{"height":11.68},"width":39.84,"height":29.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/26-1.png","element":"img","alt":" nt","inline":true,"padRight":true},{"text":"test points and the ","element":"span"},{"text":"i","element":"span"},{"text":"th Monte Carlo sample, ","element":"span"},{"style":{"height":12.4},"width":35.52,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/26-2.png","element":"img","alt":" pi","inline":true,"padRight":true},{"text":"denotes the sampled ","element":"span"},{"style":{"height":12.48},"width":88.36,"height":31.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/26-3.png","element":"img","alt":" nt ×","inline":true,"padRight":true},{"text":"1 probability vector for a generic method. We compute the normalized Brier score for the ","element":"span"},{"text":"i","element":"span"},{"text":"th sample as (1","element":"span"},{"style":{"height":20.51},"width":523.2,"height":51.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/26-4.png","element":"img","alt":"/nt) ||pi − Yt||22, where Yt","inline":true,"padRight":true},{"text":"is the vector of ","element":"span"},{"text":"class labels in the test set. Then with ","element":"span"},{"text":"T ","element":"span"},{"text":"samples of ","element":"span"},{"style":{"height":16.8},"width":311.44,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/26-5.png","element":"img","alt":" pi, i = 1, . . . , T","inline":true},{"text":", we compute the mean Brier score for the three methods considered. The mean Brier score for each training size is shown in the right panel of Figure ","element":"span"},{"href":"#id-92","text":"9","element":"a"},{"text":", which naturally shows a declining trend with increasing training size. 
There is little to choose between the three classifiers in terms of mean Brier score; the proposed method fares equally well in terms of calibration of estimated test set probabilities with the MCMC implementation of the Dirichlet process. In the left panel of Figure ","element":"span"},{"href":"#id-92","text":"9","element":"a"},{"text":", the receiver operating characteristic curve of the methods is shown for 1800 training samples. The area under the curve (AUC) for the NN-DM, the DP-MC and the DP-VB are 0","element":"span"},{"text":".","element":"span"},{"text":"96, 0","element":"span"},{"text":".","element":"span"},{"text":"95 and 0","element":"span"},{"text":".","element":"span"},{"text":"96, respectively. For 1800 training samples, the computation time for the proposed method is about 13 minutes while for the DP-MC it is approximately 5 hours.","element":"span"}],[{"id":"id-92","style":{"width":"92%"},"width":1725,"height":609,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/27-0.png","element":"img"}],[{"text":"Figure 9: Left plot shows the receiver operating characteristic curve of the NN-DM, DP-MC, and DP-VB with 1800 training samples. Area under the curve is abbreviated as AUC. Right plot shows normalized Brier scores for the methods with varying training sample size.","element":"figcaption","subtype":"caption"}],[{"id":"id-93","style":{"width":"90%"},"width":1701,"height":609,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/27-1.png","element":"img"}],[{"text":"Figure 10: Left and right plot show the out-of-sample log-likelihoods of NN-DM, DP-MC, and DP-VB for the two different star types.","element":"figcaption","subtype":"caption"}],[{"text":"Hence, the proposed method is much faster, even without exploiting parallel computation. We also fitted the proposed method using the training set of all 17698 points; DP-MC was too slow in this case. 
The sensitivity and specificity of the proposed method increased to 0","element":"span"},{"text":".","element":"span"},{"text":"99 and 0","element":"span"},{"text":".","element":"span"},{"text":"91, respectively. We additionally evaluated the methods in terms of the out-of-sample log-likelihood. The results are displayed in Figure ","element":"span"},{"href":"#id-93","text":"10","element":"a"},{"text":". While the methods perform comparably in terms of their classification performance, NN-DM achieves a better fit overall, especially for the significantly less prevalent pulsar star type.","element":"span"}]]},{"heading":"6 Discussion","paragraphs":[[{"text":"The proposed nearest neighbor-Dirichlet mixture provides a useful alternative to Bayesian density estimation based on Dirichlet mixtures with much faster computational speed and stability in avoiding MCMC. MCMC can have very poor performance in mixture models and other multimodal cases, due to difficulty in mixing, and hence can lead to posterior inferences that are unreliable. There is a recent literature attempting to scale up MCMC-based analyses in model-based clustering contexts including for Dirichlet process mixtures; refer, for example to ","element":"span"},{"href":"#id-94","referenceIndex":42,"text":"Song et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-94","referenceIndex":42,"text":"2020","element":"a"},{"text":") and ","element":"span"},{"href":"#id-95","referenceIndex":35,"text":"Ni et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-95","referenceIndex":35,"text":"2020","element":"a"},{"text":"). However, these approaches are complex to implement and are primarily focused on the problem of clustering, while we are instead focused on flexible modeling of unknown densities.","element":"span"}],[{"text":"The main conceptual disadvantage of the proposed approach is the lack of a coherent Bayesian posterior updating rule. 
However, we have shown that nonetheless the resulting pseudo-posterior can have appealing behavior in terms of frequentist asymptotic properties, finite sample performance, and accuracy in uncertainty quantification. ","element":"span"},{"text":"In addition, it is important to keep in mind that Bayesian kernel mixtures have key disadvantages that are difficult to remove within a fully coherent Bayesian modeling framework. These include a strong sensitivity to the choice of kernel and prior on the weights on these kernels; refer, for example to ","element":"span"},{"href":"#id-96","referenceIndex":32,"text":"Miller and Dunson ","element":"a"},{"text":"(","element":"span"},{"href":"#id-96","referenceIndex":32,"text":"2019","element":"a"},{"text":").","element":"span"}],[{"text":"There are several important next steps. The first is to develop fast and robust algorithms for using the nearest neighbor-Dirichlet mixture not just for density estimation but also as a component of more complex hierarchical models. For example, one may want to model the residual density in regression nonparametrically or treat a random effects distribution as unknown. In such settings, one can potentially update other parameters within a Bayesian model using Markov chain Monte Carlo, while using algorithms related to those proposed in this article to update the nonparametric part conditionally on these other parameters.","element":"span"}]]},{"heading":"7 Acknowledgements","paragraphs":[[{"text":"R ","element":"span"},{"text":"package ","element":"span"},{"text":"NNDM ","element":"span"},{"text":"available at ","element":"span"},{"href":"https://github.com/shounakchattopadhyay/NN-DM","text":"https://github.com/shounakchattopadhyay/NN-DM ","element":"a"},{"text":"was used for the numerical experiments. 
This research was partially supported by grants R01ES027498 and R01ES028804 of the United States National Institutes of Health and grant N00014-16-1-2147 of the Office of Naval Research.","element":"span"}]]},{"heading":"Appendix A Prerequisites","paragraphs":[[{"text":"We first introduce some notation with accompanying technical details which will be used hereafter. We define the Frobenius norm of the ","element":"span"},{"style":{"height":17.2},"width":312.44,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-0.png","element":"img","alt":" p × p matrix A","inline":true,"padRight":true},{"text":"with real-valued entries by ","element":"span"},{"style":{"height":21.71},"width":465.8,"height":54.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-1.png","element":"img","alt":"||A||F = {tr(ATA)}1/2 ","inline":true,"padRight":true},{"text":"and denote its determinant by ","element":"span"},{"text":"|","element":"span"},{"text":"A","element":"span"},{"text":"|","element":"span"},{"text":". We observe that, for a vector ","element":"span"},{"style":{"height":21.71},"width":1131.56,"height":54.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-2.png","element":"img","alt":"v ∈ Rp, one has ||vvT||F = ||v||22 where ||a||2 = (aTa)1/2","inline":true,"padRight":true},{"text":"is the Euclidean norm of ","element":"span"},{"text":"a","element":"span"},{"text":". 
For two ","element":"span"},{"text":"symmetric ","element":"span"},{"style":{"height":17.2},"width":497.28,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-3.png","element":"img","alt":" p × p matrices A and B","inline":true},{"text":", we say that ","element":"span"},{"style":{"height":16.4},"width":338.88,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-4.png","element":"img","alt":" A ≥ B if A − B","inline":true,"padRight":true},{"text":"is positive semi-definite, that is ","element":"span"},{"style":{"height":19.68},"width":1353.84,"height":49.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-5.png","element":"img","alt":" xT(A − B)x ≥ 0 for all x ∈ Rp, x ̸= 0p where 0p = (0, . . . , 0)T ∈ Rp","inline":true},{"text":". For a symmetric matrix ","element":"span"},{"style":{"height":16.48},"width":52.04,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-6.png","element":"img","alt":" A∗","inline":true},{"text":", let the eigenvalues of ","element":"span"},{"style":{"height":16.48},"width":52.04,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-7.png","element":"img","alt":" A∗","inline":true,"padRight":true},{"text":"be denoted by ","element":"span"},{"style":{"height":19.68},"width":347.24,"height":49.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-8.png","element":"img","alt":" e1(A∗), . . . , ep(A∗","inline":true},{"text":"), arranged such that ","element":"span"},{"style":{"height":19.68},"width":698.4,"height":49.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-9.png","element":"img","alt":"e1(A∗) ≥ . . . ≥ ep(A∗). 
If A ≥ B","inline":true},{"text":", then it follows by the min-max theorem (","element":"span"},{"href":"#id-97","referenceIndex":45,"text":"Teschl","element":"a"},{"text":", ","element":"span"},{"href":"#id-97","referenceIndex":45,"text":"2009","element":"a"},{"text":") that, for each ","element":"span"},{"style":{"height":19.68},"width":732.48,"height":49.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-10.png","element":"img","alt":" j = 1, . . . , p, we have ej(A) ≥ ej(B","inline":true},{"text":"). In particular, we have ","element":"span"},{"style":{"height":19.2},"width":306.8,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-11.png","element":"img","alt":" |A| ≥ |B| and","inline":true},{"style":{"height":19.2},"width":328.36,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-12.png","element":"img","alt":"||A||F ≥ ||B||F.","inline":true}],[{"text":"Now consider independent and identically distributed data ","element":"span"},{"style":{"height":17.6},"width":328.04,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-13.png","element":"img","alt":" X1, . . . , Xn ∼ f0","inline":true,"padRight":true},{"text":"supported on the interval [0","element":"span"},{"style":{"height":19.2},"width":75.6,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-14.png","element":"img","alt":", 1]p ","inline":true,"padRight":true},{"text":"and satisfying Assumptions ","element":"span"},{"href":"#id-36","text":"3.1","element":"a"},{"text":"-","element":"span"},{"href":"#id-39","text":"3.3 ","element":"a"},{"text":"as mentioned in Section ","element":"span"},{"text":"3.1","element":"span"},{"text":". 
Let ","element":"span"},{"style":{"height":16.71},"width":140.2,"height":41.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-15.png","element":"img","alt":" X (n) =","inline":true,"padRight":true},{"text":"(","element":"span"},{"style":{"height":16.8},"width":222.6,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-16.png","element":"img","alt":"X1, . . . , Xn","inline":true},{"text":") and suppose ","element":"span"},{"style":{"height":17.6},"width":40.04,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-17.png","element":"img","alt":" f0","inline":true,"padRight":true},{"text":"induces the measure ","element":"span"},{"style":{"height":18.48},"width":61.56,"height":46.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-18.png","element":"img","alt":" Pf0","inline":true,"padRight":true},{"text":"on the Borel ","element":"span"},{"style":{"height":14},"width":258.96,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-19.png","element":"img","alt":" σ-field on Rp","inline":true},{"text":", denoted by ","element":"span"},{"style":{"height":19.6},"width":103.44,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-20.png","element":"img","alt":"B(Rp","inline":true},{"text":"). We form the ","element":"span"},{"text":"k","element":"span"},{"text":"-nearest neighborhood of ","element":"span"},{"style":{"height":16.08},"width":50.88,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-21.png","element":"img","alt":" Xi","inline":true,"padRight":true},{"text":"using the Euclidean norm for ","element":"span"},{"text":"i ","element":"span"},{"text":"= 1","element":"span"},{"text":", . . . , n","element":"span"},{"text":". 
For a generic ","element":"span"},{"style":{"height":17.2},"width":373.48,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-22.png","element":"img","alt":" Xi, let Qi be its k","inline":true},{"text":"-th nearest neighbor in ","element":"span"},{"style":{"height":20.51},"width":724.56,"height":51.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-23.png","element":"img","alt":" X −i = (X1, . . . , Xi−1, Xi+1, . . . , Xn)","inline":true,"padRight":true},{"text":"and let ","element":"span"},{"style":{"height":16.08},"width":47.52,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-24.png","element":"img","alt":" Ri","inline":true,"padRight":true},{"text":"be the distance between ","element":"span"},{"style":{"height":19.2},"width":732.68,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-25.png","element":"img","alt":" Xi and Qi, given by Ri = ||Xi−Qi||2","inline":true},{"text":". 
Define the ball ","element":"span"},{"style":{"height":16.08},"width":99.4,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-26.png","element":"img","alt":" Bi =","inline":true},{"style":{"height":19.2},"width":675.84,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-27.png","element":"img","alt":"{y ∈ [0, 1]p : 0 < ||y−Xi||2 < Ri}","inline":true,"padRight":true},{"text":"and the probability ","element":"span"},{"style":{"height":23.81},"width":494.52,"height":59.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-28.png","element":"img","alt":" G(Xi, Ri) =∫Bi f0(u) du","inline":true,"padRight":true},{"text":"of the ball ","element":"span"},{"style":{"height":16.08},"width":61.96,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-29.png","element":"img","alt":" Bi.","inline":true,"padRight":true},{"text":"Let ","element":"span"},{"style":{"height":24.58},"width":570.92,"height":61.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-30.png","element":"img","alt":" Y (i)1 = Xi and Y (i)2 , . . . , Y (i)k−1 ","inline":true,"padRight":true},{"text":"denote the sample points which fall in ","element":"span"},{"style":{"height":16.08},"width":47.52,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-31.png","element":"img","alt":" Bi","inline":true},{"text":". 
Then, we define ","element":"span"},{"text":"the neighborhood specific empirical mean and covariance matrix as ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":27.36},"width":522.72,"height":68.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-32.png","element":"img","alt":"Xi = k−1{∑k−1j=1 Y (i)j +Qi}","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":27.36},"width":1217.28,"height":68.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-33.png","element":"img","alt":" Si = k−1{∑k−1j=1(Y (i)j − ¯Xi)(Y (i)j − ¯Xi)T+(Qi− ¯Xi)(Qi− ¯Xi)T}","inline":true},{"text":", respectively. Note that the random vector (","element":"span"},{"style":{"height":24.77},"width":343.2,"height":61.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-34.png","element":"img","alt":"Y (i)2 , . . . , Y (i)k−1, Qi","inline":true},{"text":") is identically distributed for ","element":"span"},{"style":{"height":16.8},"width":585.48,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-35.png","element":"img","alt":" i = 1, . . . , n since X1, . . . , Xn","inline":true,"padRight":true},{"text":"are independent and identically distributed. Thus we only consider the case ","element":"span"},{"text":"i ","element":"span"},{"text":"= 1 from here on. For sake of brevity, denote by ","element":"span"},{"style":{"height":23.39},"width":1152.92,"height":58.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-36.png","element":"img","alt":" Yu = Y (1)u for u = 2, . . . , k − 1 and by Q = Q1. 
We also","inline":true,"padRight":true},{"text":"let ","element":"span"},{"text":"k ","element":"span"},{"text":"depend on ","element":"span"},{"text":"n ","element":"span"},{"text":"and express this dependence as ","element":"span"},{"style":{"height":16.48},"width":45.48,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-37.png","element":"img","alt":" kn","inline":true,"padRight":true},{"text":"when required. However, we routinely drop this dependence for notational simplicity.","element":"span"}],[{"text":"Conditional on ","element":"span"},{"style":{"height":19.2},"width":647.08,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-38.png","element":"img","alt":" X1 = x1 ∈ [0, 1]p and R1 = r1 >","inline":true,"padRight":true},{"text":"0, following ","element":"span"},{"href":"#id-1","referenceIndex":30,"text":"Mack and Rosenblatt ","element":"a"},{"text":"(","element":"span"},{"href":"#id-1","referenceIndex":30,"text":"1979","element":"a"},{"text":") the conditional joint density of ","element":"span"},{"style":{"height":17.2},"width":433.24,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-39.png","element":"img","alt":" Y2, . . . 
, Yk−1 and Q is","inline":true}],[{"style":{"width":"95%"},"width":1791,"height":146,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-40.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":19.6},"width":732.92,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-41.png","element":"img","alt":" G′(x1, r1) = ∂G(x1, r1)/∂r1 and I(A","inline":true},{"text":") denotes the indicator function of the event ","element":"span"},{"style":{"height":14.4},"width":80.96,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-42.png","element":"img","alt":" A ∈","inline":true},{"style":{"height":19.6},"width":103.44,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-43.png","element":"img","alt":"B(Rp","inline":true},{"text":"). Thus conditional on ","element":"span"},{"style":{"height":16.48},"width":218.6,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-44.png","element":"img","alt":" X1 and R1","inline":true},{"text":", the random variables ","element":"span"},{"style":{"height":16.8},"width":240.2,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/29-45.png","element":"img","alt":" Y2, . . . , Yk−1","inline":true,"padRight":true},{"text":"are independent and identically distributed, and independent of ","element":"span"},{"text":"Q","element":"span"},{"text":". 
Also, ","element":"span"},{"href":"#id-1","referenceIndex":30,"text":"Mack and Rosenblatt ","element":"a"},{"text":"(","element":"span"},{"href":"#id-1","referenceIndex":30,"text":"1979","element":"a"},{"text":") states that under Assumptions ","element":"span"},{"href":"#id-36","text":"3.1","element":"a"},{"text":"-","element":"span"},{"href":"#id-39","text":"3.3 ","element":"a"},{"text":"which imply ","element":"span"},{"style":{"height":17.6},"width":40.04,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/30-0.png","element":"img","alt":" f0","inline":true,"padRight":true},{"text":"is bounded and continuous on [0","element":"span"},{"style":{"height":19.2},"width":164.04,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/30-1.png","element":"img","alt":", 1]p, we","inline":true,"padRight":true},{"text":"have","element":"span"}],[{"style":{"width":"66%"},"width":1246,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/30-2.png","element":"img"}],[{"text":"as ","element":"span"},{"style":{"height":21.79},"width":1163.24,"height":54.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/30-3.png","element":"img","alt":" n → ∞ and r1 → 0, where Cp = [Γ{(p + 2)/2}]−1πp/2 ","inline":true,"padRight":true},{"text":"is the volume of the unit ball in ","element":"span"},{"style":{"height":13.2},"width":52.56,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/30-4.png","element":"img","alt":" Rp","inline":true},{"text":". Let the function ","element":"span"},{"style":{"height":19.6},"width":504.2,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/30-5.png","element":"img","alt":" ρ(x1, r1) = rκ11 where κ1","inline":true,"padRight":true},{"text":"is a non-negative integer. 
This function can be identified with ","element":"span"},{"style":{"height":19.6},"width":59.08,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/30-6.png","element":"img","alt":" φ(·","inline":true},{"text":") in equation (11) of ","element":"span"},{"href":"#id-1","referenceIndex":30,"text":"Mack and Rosenblatt ","element":"a"},{"text":"(","element":"span"},{"href":"#id-1","referenceIndex":30,"text":"1979","element":"a"},{"text":"). In the following propositions we will require the expected values of ","element":"span"},{"style":{"height":19.6},"width":147.08,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/30-7.png","element":"img","alt":" ρ(x1, r1","inline":true},{"text":") for different choices of ","element":"span"},{"style":{"height":16.48},"width":228.72,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/30-8.png","element":"img","alt":" κ1. To that","inline":true,"padRight":true},{"text":"end, we shall repeatedly make use of the equation (12) from ","element":"span"},{"href":"#id-1","referenceIndex":30,"text":"Mack and Rosenblatt ","element":"a"},{"text":"(","element":"span"},{"href":"#id-1","referenceIndex":30,"text":"1979","element":"a"},{"text":") adapted to our setting:","element":"span"}],[{"id":"id-98","style":{"width":"88%"},"width":1654,"height":216,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/30-9.png","element":"img"}]]},{"heading":"B Proof of Theorem 3.4","paragraphs":[[{"text":"Suppose ","element":"span"},{"style":{"height":16.8},"width":222.6,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/30-10.png","element":"img","alt":" X1, . . . 
, Xn","inline":true,"padRight":true},{"text":"are independent and identically distributed random variables generated from the density ","element":"span"},{"style":{"height":17.6},"width":40.04,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/30-11.png","element":"img","alt":" f0","inline":true,"padRight":true},{"text":"supported on [0","element":"span"},{"style":{"height":19.2},"width":75.6,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/30-12.png","element":"img","alt":", 1]p ","inline":true,"padRight":true},{"text":"satisfying Assumptions ","element":"span"},{"href":"#id-36","text":"3.1","element":"a"},{"text":"-","element":"span"},{"href":"#id-39","text":"3.3","element":"a"},{"text":". For ","element":"span"},{"text":"i ","element":"span"},{"text":"= 1","element":"span"},{"text":", . . . , n","element":"span"},{"text":", recall the definitions of ","element":"span"},{"style":{"height":18},"width":407.04,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/30-13.png","element":"img","alt":" µi = µi and Λi = Λi","inline":true,"padRight":true},{"text":"from equation (","element":"span"},{"href":"#id-29","text":"7","element":"a"},{"text":"):","element":"span"}],[{"style":{"width":"48%"},"width":909,"height":112,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/30-14.png","element":"img"}],[{"text":"We want to show that ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":20.69},"width":1082.04,"height":51.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/30-15.png","element":"img","alt":"fn(x) = (1/n) ∑ni=1 tγn−p+1(x; µi, Λi) → f0(x) in Pf0","inline":true},{"text":"-probability as ","element":"span"},{"style":{"height":23.79},"width":764.76,"height":59.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/30-16.png","element":"img","alt":"n → ∞ for any x ∈ [0, 1]p, where 
ˆfn(x","inline":true},{"text":") is as described in (","element":"span"},{"href":"#id-29","text":"7","element":"a"},{"text":"). We first prove two propositions involving successive mean value theorem type approximations to ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":19.6},"width":90.84,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/30-17.png","element":"img","alt":"fn(x","inline":true},{"text":"), which will imply the final result. We now state the two propositions, with accompanying proofs, before stating the final theorem.","element":"span"}],[{"id":"id-102","style":{"width":"100%"},"width":1873,"height":189,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/30-18.png","element":"img"}],[{"text":"Proof. ","element":"span"},{"text":"Since the (Λ","element":"span"},{"style":{"height":19.6},"width":86.6,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/30-19.png","element":"img","alt":"i)ni=1 ","inline":true,"padRight":true},{"text":"are identically distributed and (","element":"span"},{"style":{"height":19.6},"width":114.92,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/30-20.png","element":"img","alt":"µi)ni=1 ","inline":true,"padRight":true},{"text":"are identically distributed, ","element":"span"},{"text":"we have ","element":"span"},{"style":{"height":26.85},"width":1693.12,"height":67.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/30-21.png","element":"img","alt":" EPf0( | ˆfn(x) − fA(x)| ) ≤ EPf0{ |tγn−p+1(x; µ1, Λ1) − tγn−p+1(x; X1, Λ1)| }. 
The mul-","inline":true,"padRight":true},{"text":"tivariate mean value theorem now implies that","element":"span"}],[{"id":"id-99","style":{"width":"97%"},"width":1823,"height":87,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/30-22.png","element":"img"}],[{"style":{"width":"99%"},"width":1868,"height":326,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/31-0.png","element":"img"}],[{"text":"If we let ","element":"span"},{"style":{"height":20.59},"width":1681.48,"height":51.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/31-1.png","element":"img","alt":" Hn = H = {νn(γn − p + 1)}−1(νn + 1)Ψ0 = h2Ip where h2 = h2n = {νn(γn − p +","inline":true,"padRight":true},{"text":"1)","element":"span"},{"style":{"height":20.51},"width":541.16,"height":51.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/31-2.png","element":"img","alt":"}−1{(νn +1)(γ0−p+1)} δ20 ","inline":true,"padRight":true},{"text":"following the choice of Ψ","element":"span"},{"style":{"height":8.4},"width":17,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/31-3.png","element":"img","alt":"0","inline":true,"padRight":true},{"text":"from Section ","element":"span"},{"href":"#id-57","text":"2.3","element":"a"},{"text":", then it is clear that ","element":"span"},{"text":"Λ","element":"span"},{"style":{"height":16.08},"width":128.4,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/31-4.png","element":"img","alt":"1 ≥ H","inline":true},{"text":". 
Therefore, we have ","element":"span"},{"style":{"height":24.59},"width":929.8,"height":61.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/31-5.png","element":"img","alt":" ||Λ−1/21 (X1 − µ1)||2 ≤ ||H−1/2||F ||X1 − µ1||2.","inline":true,"padRight":true},{"text":"Straightforward calculations show that ","element":"span"},{"style":{"height":21.71},"width":1391.04,"height":54.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/31-6.png","element":"img","alt":" ||H−1/2 ||F = h−1p1/2 and ||X1 − µ1||2 ≤ R1 + {ν−1n (1 + ||µ0||2 )ν0}","inline":true,"padRight":true},{"text":"where ","element":"span"},{"style":{"height":20.67},"width":400.04,"height":51.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/31-7.png","element":"img","alt":" R1 = ||X1 −X1[k]||2","inline":true},{"text":". Using Theorem 2","element":"span"},{"text":".","element":"span"},{"text":"4 from ","element":"span"},{"href":"#id-2","referenceIndex":3,"text":"Biau and Devroye ","element":"a"},{"text":"(","element":"span"},{"href":"#id-2","referenceIndex":3,"text":"2015","element":"a"},{"text":") for ","element":"span"},{"style":{"height":17.2},"width":200.24,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/31-8.png","element":"img","alt":" p ≥ 2 and","inline":true,"padRight":true},{"text":"(","element":"span"},{"href":"#id-98","text":"22","element":"a"},{"text":") for ","element":"span"},{"text":"p ","element":"span"},{"text":"= 1, one gets","element":"span"}],[{"id":"id-100","style":{"width":"62%"},"width":1170,"height":123,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/31-9.png","element":"img"}],[{"text":"for an appropriate constant ","element":"span"},{"style":{"height":18.88},"width":95.08,"height":47.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/31-10.png","element":"img","alt":" dp >","inline":true,"padRight":true},{"text":"0. 
Thus, we have ","element":"span"},{"style":{"height":24.57},"width":808.08,"height":61.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/31-11.png","element":"img","alt":" EPf0(R1) ≤ {EPf0(R21)}1/2 ≤ dp(k/n)1/p","inline":true}],[{"text":"for sufficiently large ","element":"span"},{"text":"n","element":"span"},{"text":". This implies that","element":"span"}],[{"id":"id-117","style":{"width":"72%"},"width":1358,"height":127,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/31-12.png","element":"img"}],[{"text":"We also have ","element":"span"},{"style":{"height":21.31},"width":519.12,"height":53.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/31-13.png","element":"img","alt":" |Λ1|−1/2 ≤ |H|−1/2 = h−p","inline":true},{"text":". Finally, simple calculations yield that","element":"span"}],[{"style":{"width":"32%"},"width":603,"height":50,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/31-14.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":18.48},"width":158.44,"height":46.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/31-15.png","element":"img","alt":" L1,n,p >","inline":true,"padRight":true},{"text":"0 satisfies ","element":"span"},{"style":{"height":21.79},"width":676.32,"height":54.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/31-16.png","element":"img","alt":" L1,n,p → (2π)−p/2e−1/2 as n → ∞","inline":true},{"text":". 
Plugging all these back in (","element":"span"},{"href":"#id-99","text":"23","element":"a"},{"text":"), we obtain a finite constant ","element":"span"},{"style":{"height":18.48},"width":157.48,"height":46.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/31-17.png","element":"img","alt":" L2,n,p >","inline":true,"padRight":true},{"text":"0 such that","element":"span"}],[{"style":{"width":"89%"},"width":1684,"height":69,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/31-18.png","element":"img"}],[{"text":"which goes to 0 as ","element":"span"},{"style":{"height":10.4},"width":150.24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/31-19.png","element":"img","alt":" n → ∞","inline":true},{"text":", completing the proof.","element":"span"}],[{"text":"We now provide the second mean value theorem type approximation which approximates the random bandwidth matrix Λ","element":"span"},{"style":{"height":19.6},"width":885.64,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/31-20.png","element":"img","alt":"i in fA(x) by H = Hn for each i = 1, . . . , n.","inline":true}],[{"id":"id-103","style":{"height":20.69},"width":1643.76,"height":51.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/31-21.png","element":"img","alt":"Proposition B.2. Fix x ∈ [0, 1]p. Let fK(x) = (1/n) ∑ni=1 tγn−p+1(x; Xi, H)","inline":true},{"text":". Also, let ","element":"span"},{"style":{"height":21.71},"width":1212.48,"height":54.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/31-22.png","element":"img","alt":"k = o(ni2) with i2 = 4/(p + 2)2 and ν0 = o{n−2/pk(2/p)+1}","inline":true},{"text":". 
Then, we have ","element":"span"},{"style":{"height":22.46},"width":293.8,"height":56.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/31-23.png","element":"img","alt":" EPf0( |fA(x) −","inline":true},{"style":{"height":19.6},"width":496.88,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/31-24.png","element":"img","alt":"fK(x)| ) → 0 as n → ∞.","inline":true}],[{"text":"Proof. ","element":"span"},{"text":"Using the identically distributed properties of (Λ","element":"span"},{"style":{"height":19.6},"width":328.04,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/31-25.png","element":"img","alt":"i)ni=1 and (Xi)ni=1","inline":true},{"text":", we obtain ","element":"span"},{"style":{"height":22.46},"width":280.84,"height":56.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/31-26.png","element":"img","alt":" EPf0( |fA(x)−","inline":true},{"style":{"height":22.46},"width":1119.4,"height":56.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/31-27.png","element":"img","alt":"fK(x)| ) ≤ EPf0( |tγn−p+1(x; X1, Λ1)−tγn−p+1(x; X1, H)|","inline":true,"padRight":true},{"text":"). 
Using the multivariate mean value","element":"span"}],[{"text":"theorem, we obtain that","element":"span"}],[{"id":"id-101","style":{"width":"93%"},"width":1747,"height":58,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/32-0.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":20.08},"width":692.76,"height":50.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/32-1.png","element":"img","alt":" M1 = [∂{tγn−p+1(x; X1, Σ)}/∂Σ]Σ0","inline":true,"padRight":true},{"text":"for some Σ","element":"span"},{"style":{"height":17.2},"width":205.16,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/32-2.png","element":"img","alt":"0, with Σ0","inline":true,"padRight":true},{"text":"in the convex hull of Λ","element":"span"},{"style":{"height":16.48},"width":110,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/32-3.png","element":"img","alt":"1 and","inline":true},{"style":{"height":16.48},"width":354.96,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/32-4.png","element":"img","alt":"H. Since Λ1 ≥ H","inline":true},{"text":", we immediately have Σ","element":"span"},{"style":{"height":16.08},"width":125.04,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/32-5.png","element":"img","alt":"0 ≥ H","inline":true,"padRight":true},{"text":"as well. 
Using the definitions of Λ","element":"span"},{"style":{"height":17.2},"width":180.04,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/32-6.png","element":"img","alt":"1 and H,","inline":true,"padRight":true},{"text":"we have","element":"span"}],[{"style":{"width":"55%"},"width":1039,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/32-7.png","element":"img"}],[{"style":{"height":45.71},"width":602.16,"height":114.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/32-8.png","element":"img","alt":"||Λ1 − H||F ≤ (νn + 1)νn(γn − p + 1)","inline":true}],[{"text":"Since ","element":"span"},{"style":{"height":23.97},"width":1748.2,"height":59.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/32-9.png","element":"img","alt":" || ∑j∈N1(Xj − ¯X1)(Xj − ¯X1)T||F ≤ ∑j∈N1 ||(Xj − ¯X1)(Xj − ¯X1)T||F = ∑j∈N1 ||Xj −","inline":true,"padRight":true},{"text":"¯","element":"span"},{"style":{"height":23.28},"width":506.6,"height":58.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/32-10.png","element":"img","alt":"X1||22 ≤ ∑j∈N1 R21 = kR21","inline":true},{"text":", we get for sufficiently large ","element":"span"},{"text":"n ","element":"span"},{"text":"the following:","element":"span"}],[{"id":"id-118","style":{"width":"74%"},"width":1392,"height":269,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/32-11.png","element":"img"}],[{"text":"using (","element":"span"},{"href":"#id-100","text":"24","element":"a"},{"text":") and ","element":"span"},{"style":{"height":23.36},"width":456.16,"height":58.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/32-12.png","element":"img","alt":" ν0 = o{n−2/pk(2/p)+1}","inline":true},{"text":". 
Taking partial derivatives of log","element":"span"},{"style":{"height":20.08},"width":387.84,"height":50.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/32-13.png","element":"img","alt":"{tγn−p+1(x; X1, Σ)}","inline":true,"padRight":true},{"text":"with respect to Σ evaluated at Σ","element":"span"},{"style":{"height":8.4},"width":17,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/32-14.png","element":"img","alt":"0","inline":true,"padRight":true},{"text":"and taking Frobenius norm of both sides, we obtain","element":"span"}],[{"style":{"width":"43%"},"width":811,"height":61,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/32-15.png","element":"img"}],[{"text":"for sufficiently large ","element":"span"},{"text":"n","element":"span"},{"text":". We now observe that","element":"span"}],[{"style":{"width":"78%"},"width":1472,"height":58,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/32-16.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":21.79},"width":1125.16,"height":54.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/32-17.png","element":"img","alt":" cp,β = (πβ)−p/2{Γ(β/2)}−1Γ{(β + p)/2} for p ≥ 1, β >","inline":true,"padRight":true},{"text":"0. Note that ","element":"span"},{"style":{"height":21.79},"width":314.12,"height":54.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/32-18.png","element":"img","alt":" cp,β → (2π)−p/2","inline":true}],[{"text":"as ","element":"span"},{"style":{"height":17.6},"width":405.16,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/32-19.png","element":"img","alt":" β → ∞ for any p ≥","inline":true,"padRight":true},{"text":"1. 
This immediately implies that ","element":"span"},{"style":{"height":22.19},"width":699.12,"height":55.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/32-20.png","element":"img","alt":" ||M1||F ≤ h−(p+2)cp,γn−p+1(γn + 1)","inline":true,"padRight":true},{"text":"for sufficiently large ","element":"span"},{"text":"n","element":"span"},{"text":". Plugging all these back in equation (","element":"span"},{"href":"#id-101","text":"27","element":"a"},{"text":"), we obtain for sufficiently large ","element":"span"},{"style":{"height":18.88},"width":373,"height":47.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/32-21.png","element":"img","alt":" n, a finite L3,n,p >","inline":true,"padRight":true},{"text":"0 such that","element":"span"}],[{"style":{"width":"87%"},"width":1646,"height":69,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/32-22.png","element":"img"}],[{"text":"which goes to 0 as ","element":"span"},{"style":{"height":10.4},"width":150.24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/32-23.png","element":"img","alt":" n → ∞","inline":true},{"text":", proving the proposition.","element":"span"}],[{"text":"We now prove Theorem ","element":"span"},{"href":"#id-41","text":"3.4","element":"a"},{"text":".","element":"span"}],[{"style":{"width":"99%"},"width":1863,"height":60,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/33-0.png","element":"img"}],[{"text":"triangle inequality. 
Using Propositions ","element":"span"},{"href":"#id-102","text":"B.1 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-103","text":"B.2","element":"a"},{"text":", we obtain that ","element":"span"},{"style":{"height":26.65},"width":504.48,"height":66.64,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/33-1.png","element":"img","alt":" EPf0( | ˆfn(x)−fK(x)| ) →","inline":true,"padRight":true},{"text":"0 as ","element":"span"},{"style":{"height":10.4},"width":150.24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/33-2.png","element":"img","alt":" n → ∞","inline":true},{"text":". From Section ","element":"span"},{"text":"G ","element":"span"},{"text":"of the Appendix, we obtain ","element":"span"},{"style":{"height":19.68},"width":429.24,"height":49.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/33-3.png","element":"img","alt":" fK(x) → f0(x) in Pf0","inline":true},{"text":"-probability. This immediately implies that given the conditions on ","element":"span"},{"style":{"height":17.2},"width":87.08,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/33-4.png","element":"img","alt":" k, ν0","inline":true,"padRight":true},{"text":"and for any ","element":"span"},{"style":{"height":19.2},"width":388.68,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/33-5.png","element":"img","alt":" x ∈ [0, 1]p, we have","inline":true,"padRight":true},{"text":"ˆ","element":"span"},{"style":{"height":19.68},"width":419.64,"height":49.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/33-6.png","element":"img","alt":"fn(x) → f0(x) in Pf0","inline":true},{"text":"-probability.","element":"span"}]]},{"heading":"C Proof of Theorem 3.5","paragraphs":[[{"style":{"height":19.68},"width":1183.2,"height":49.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/33-7.png","element":"img","alt":"Proof. 
Fix x ∈ [0, 1]p. For i = 1, . . . , n, let zi = φp(x ; ηi, Σi","inline":true},{"text":") and suppose ","element":"span"},{"style":{"height":21.71},"width":397.96,"height":54.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/33-8.png","element":"img","alt":" z(n) = (z1, . . . , zn)T.","inline":true,"padRight":true},{"text":"Then, we have ","element":"span"},{"style":{"height":22.08},"width":1246.12,"height":55.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/33-9.png","element":"img","alt":" f(x) = ∑ni=1 πizi = z(n)Tπ(n) where π(n) = (π1, . . . , πn)T ∼","inline":true,"padRight":true},{"text":"Dirichlet(","element":"span"},{"style":{"height":12.8},"width":81.16,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/33-10.png","element":"img","alt":"α +","inline":true,"padRight":true},{"text":"1","element":"span"},{"style":{"height":12.4},"width":135.12,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/33-11.png","element":"img","alt":", . . . , α","inline":true,"padRight":true},{"text":"+ 1) given ","element":"span"},{"style":{"height":16.51},"width":87.88,"height":41.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/33-12.png","element":"img","alt":" X (n)","inline":true},{"text":". We begin with the identity","element":"span"}],[{"id":"id-105","style":{"width":"97%"},"width":1826,"height":56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/33-13.png","element":"img"}],[{"text":"For ","element":"span"},{"text":"i ","element":"span"},{"text":"= 1","element":"span"},{"text":", . . . 
, n","element":"span"},{"text":", we have","element":"span"}],[{"id":"id-104","style":{"width":"73%"},"width":1368,"height":59,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/33-14.png","element":"img"}],[{"text":"where","element":"span"}],[{"style":{"width":"99%"},"width":1871,"height":1068,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/33-15.png","element":"img"}],[{"text":"where the first equality is obtained using ","element":"span"},{"style":{"height":21.71},"width":1007.4,"height":54.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/33-16.png","element":"img","alt":" E(πi | X (n)) = 1/n for each i = 1, . . . , n and the","inline":true,"padRight":true},{"text":"last equality is obtained using equation (","element":"span"},{"href":"#id-104","text":"32","element":"a"},{"text":"). For ","element":"span"},{"style":{"height":19.2},"width":823.56,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/33-17.png","element":"img","alt":" i = 1, . . . , n, since |Λi| ≥ |Hn|, we have","inline":true}],[{"id":"id-107","style":{"width":"99%"},"width":1863,"height":250,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/34-0.png","element":"img"}],[{"text":"We now analyze the second term on the right hand side of (","element":"span"},{"href":"#id-105","text":"31","element":"a"},{"text":"). 
Recall that ","element":"span"},{"style":{"height":16.91},"width":130.36,"height":42.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/34-1.png","element":"img","alt":" π(n) is","inline":true,"padRight":true},{"text":"conditionally independent of ","element":"span"},{"style":{"height":16.91},"width":70.12,"height":42.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/34-2.png","element":"img","alt":" z(n) ","inline":true,"padRight":true},{"text":"given the data ","element":"span"},{"style":{"height":16.51},"width":87.88,"height":41.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/34-3.png","element":"img","alt":" X (n) ","inline":true,"padRight":true},{"text":"following from the nearest neighbor-Dirichlet mixture framework as discussed in Section ","element":"span"},{"text":"2.1","element":"span"},{"text":". Let Σ","element":"span"},{"style":{"height":5.6},"width":20,"height":14,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/34-4.png","element":"img","alt":"π","inline":true,"padRight":true},{"text":"denote the pseudo-posterior covariance matrix of ","element":"span"},{"style":{"height":20.11},"width":591.88,"height":50.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/34-5.png","element":"img","alt":" π(n). Given X (n), since π(n) ∼","inline":true,"padRight":true},{"text":"Dirichlet(","element":"span"},{"style":{"height":16.4},"width":226.32,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/34-6.png","element":"img","alt":"α+1, . . . 
, α","inline":true},{"text":"+1), standard results yield that Σ","element":"span"},{"style":{"height":20.71},"width":1633.48,"height":51.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/34-7.png","element":"img","alt":"π = Vn{(1−Cn)In+CnJn}, where Vn = (n−1)/[n2{n(α+1)+1}], Cn = −1/(n−1),","inline":true},{"style":{"height":16.48},"width":301.12,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/34-8.png","element":"img","alt":"In is the n × n","inline":true,"padRight":true},{"text":"identity matrix, and ","element":"span"},{"style":{"height":19.6},"width":788.04,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/34-9.png","element":"img","alt":" Jn = 1n1Tn where 1n = (1, . . . , 1)T ∈ Rn","inline":true},{"text":". Then, we have","element":"span"}],[{"id":"id-106","style":{"width":"79%"},"width":1494,"height":56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/34-10.png","element":"img"}],[{"text":"Using the expression for Σ","element":"span"},{"style":{"height":5.6},"width":20,"height":14,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/34-11.png","element":"img","alt":"π","inline":true,"padRight":true},{"text":"along with equation (","element":"span"},{"href":"#id-106","text":"34","element":"a"},{"text":"), we obtain,","element":"span"}],[{"style":{"width":"90%"},"width":1685,"height":144,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/34-12.png","element":"img"}],[{"text":"where ¯","element":"span"},{"style":{"height":20.38},"width":354.72,"height":50.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/34-13.png","element":"img","alt":"z = (1/n) ∑ni=1 zi","inline":true},{"text":". 
We now have","element":"span"}],[{"style":{"width":"93%"},"width":1758,"height":449,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/34-14.png","element":"img"}],[{"text":"where the last inequality is obtained using equation (","element":"span"},{"href":"#id-104","text":"32","element":"a"},{"text":"). Using ","element":"span"},{"style":{"height":19.2},"width":584.8,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/34-15.png","element":"img","alt":" |Bi| ≥ Dpn|H| for i = 1, . . . , n","inline":true,"padRight":true},{"text":"as before, we have","element":"span"}],[{"id":"id-108","style":{"width":"80%"},"width":1506,"height":126,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/34-16.png","element":"img"}],[{"text":"Combining equations (","element":"span"},{"href":"#id-107","text":"33","element":"a"},{"text":") and (","element":"span"},{"href":"#id-108","text":"36","element":"a"},{"text":") and putting the results back in equation (","element":"span"},{"href":"#id-105","text":"31","element":"a"},{"text":"), we have the inequality. ","element":"span"},{"text":"If we let ","element":"span"},{"style":{"height":10.4},"width":171.36,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/34-17.png","element":"img","alt":" n → ∞","inline":true,"padRight":true},{"text":"we immediately obtain that var","element":"span"},{"style":{"height":21.9},"width":469.52,"height":54.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/34-18.png","element":"img","alt":"{f(x) | X (n)} → 0 in","inline":true},{"style":{"height":18.48},"width":61.56,"height":46.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/34-19.png","element":"img","alt":"Pf0","inline":true},{"text":"-probability.","element":"span"}]]},{"heading":"D Proof of Theorem 3.7","paragraphs":[[{"text":"Proof. 
","element":"span"},{"text":"We have iid data ","element":"span"},{"style":{"height":24.94},"width":520.52,"height":62.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/35-0.png","element":"img","alt":" X (n) = (X1, . . . , Xn)iid∼ f0","inline":true,"padRight":true},{"text":"satisfying Assumptions ","element":"span"},{"href":"#id-36","text":"3.1","element":"a"},{"text":"-","element":"span"},{"href":"#id-39","text":"3.3 ","element":"a"},{"text":"for ","element":"span"},{"text":"p ","element":"span"},{"text":"= 1. The simplified NN-DM density estimator is given by","element":"span"}],[{"style":{"width":"26%"},"width":499,"height":131,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/35-1.png","element":"img"}],[{"text":"where (","element":"span"},{"style":{"height":21.9},"width":787.64,"height":54.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/35-2.png","element":"img","alt":"ηi, σ2i ) | X (n) ∼ NIG(µi, νn, γn/2, γnδ2i /","inline":true},{"text":"2). The pseudo-posterior distribution of ","element":"span"},{"text":"g","element":"span"},{"text":"(","element":"span"},{"text":"x","element":"span"},{"text":") ","element":"span"},{"text":"given ","element":"span"},{"style":{"height":16.51},"width":87.88,"height":41.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/35-3.png","element":"img","alt":" X (n) ","inline":true,"padRight":true},{"text":"is induced through the pseudo-posterior distributions of (","element":"span"},{"style":{"height":20.51},"width":178.28,"height":51.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/35-4.png","element":"img","alt":"ηi, σ2i )ni=1","inline":true},{"text":". 
The pseudo- ","element":"span"},{"text":"posterior mean is of the form ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":20.69},"width":1244.68,"height":51.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/35-5.png","element":"img","alt":"fn(x) = (1/n) �ni=1 tγn {(x − µi)/λi} /λi, where λi = {(νn +","inline":true,"padRight":true},{"text":"1)","element":"span"},{"style":{"height":21.71},"width":1148.24,"height":54.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/35-6.png","element":"img","alt":"/νn}1/2δi. Let hn = (νnγn)−1/2(νn + 1)1/2(γ0δ20)1/2. Then","inline":true}],[{"id":"id-109","style":{"width":"67%"},"width":1272,"height":68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/35-7.png","element":"img"}],[{"text":"for ","element":"span"},{"style":{"height":21.71},"width":748.8,"height":54.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/35-8.png","element":"img","alt":" kn = o(n2/7) and kn → ∞ as n → ∞","inline":true,"padRight":true},{"text":"from Section ","element":"span"},{"text":"B ","element":"span"},{"text":"of the Appendix, where","element":"span"}],[{"style":{"width":"34%"},"width":652,"height":131,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/35-9.png","element":"img"}],[{"text":"We want to investigate the asymptotic distribution of ","element":"span"},{"style":{"height":19.6},"width":337.44,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/35-10.png","element":"img","alt":" g(x) as n → ∞","inline":true},{"text":". 
For that, let us start with the asymptotic distribution of ","element":"span"},{"style":{"height":19.6},"width":101.4,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/35-11.png","element":"img","alt":" fK(x","inline":true},{"text":"), which can be expressed as ","element":"span"},{"style":{"height":19.6},"width":180.04,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/35-12.png","element":"img","alt":" fK(x) =","inline":true},{"style":{"height":21.18},"width":996.96,"height":52.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/35-13.png","element":"img","alt":"n−1 �ni=1 uin, where uin = h−1n tγn{(x − Xi)/hn}","inline":true},{"text":". Using Lyapunov’s central limit theorem ","element":"span"},{"text":"and denoting convergence in distribution under ","element":"span"},{"style":{"height":17.6},"width":355.56,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/35-14.png","element":"img","alt":" f0 by d0, we have","inline":true}],[{"style":{"width":"99%"},"width":1867,"height":348,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/35-15.png","element":"img"}],[{"text":"for some ","element":"span"},{"style":{"height":20.51},"width":1680.04,"height":51.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/35-16.png","element":"img","alt":" r > 2, where ρin = E|uin − E(uin)|r and τ 2in = E{uin − E(uin)}2 for i = 1, . . . 
, n.","inline":true,"padRight":true},{"text":"By standard calculations, we have","element":"span"}],[{"style":{"width":"37%"},"width":701,"height":115,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/35-17.png","element":"img"}],[{"text":"For ","element":"span"},{"text":"r ","element":"span"},{"text":"= 3,","element":"span"}],[{"style":{"width":"38%"},"width":729,"height":108,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/35-18.png","element":"img"}],[{"text":"It is straightforward to see that ","element":"span"},{"style":{"height":22.77},"width":576.56,"height":56.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/35-19.png","element":"img","alt":"∫trγn(u)du/∫tγn(u)du = O","inline":true},{"text":"(1) for any ","element":"span"},{"style":{"height":17.2},"width":337.8,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/35-20.png","element":"img","alt":" r ≥ 1. So, the","inline":true,"padRight":true},{"text":"Lyapunov’s condition is satisfied as the ratio in this case satisfies ","element":"span"},{"style":{"height":21.9},"width":525.12,"height":54.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/36-0.png","element":"img","alt":" O{(nhn)−1/6} and nhn →","inline":true},{"style":{"height":8.8},"width":48,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/36-1.png","element":"img","alt":"∞","inline":true},{"text":". Additionally, ","element":"span"},{"style":{"height":21.6},"width":656.64,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/36-2.png","element":"img","alt":" |τ 2in − {f0(x)/hn}∫φ2(u) du| →","inline":true,"padRight":true},{"text":"0. 
So by a combination of Lyapunov’s central limit theorem and Slutsky’s theorem, we have","element":"span"}],[{"style":{"width":"76%"},"width":1439,"height":115,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/36-3.png","element":"img"}],[{"text":"since ","element":"span"},{"style":{"height":22.56},"width":462.92,"height":56.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/36-4.png","element":"img","alt":"∫φ2(u) du = (2π1/2)−1","inline":true},{"text":". From the calculations in Section ","element":"span"},{"text":"G ","element":"span"},{"text":"of the Appendix, we can expand the Taylor series to two more terms to obtain","element":"span"}],[{"id":"id-110","style":{"width":"99%"},"width":1871,"height":932,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/36-5.png","element":"img"}],[{"text":"since ","element":"span"},{"style":{"height":23.98},"width":456.12,"height":59.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/36-6.png","element":"img","alt":" E{g(x) | X (n)} = ˆfn(x","inline":true},{"text":"). The pseudo-posterior variance of ","element":"span"},{"text":"g","element":"span"},{"text":"(","element":"span"},{"text":"x","element":"span"},{"text":") is given by","element":"span"}],[{"style":{"width":"83%"},"width":1563,"height":734,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/36-7.png","element":"img"}],[{"text":"with ","element":"span"},{"style":{"height":21.71},"width":743.52,"height":54.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/36-8.png","element":"img","alt":" ud = Γ{(d + 1)/2}/{(dπ)1/2Γ(d/2)}","inline":true,"padRight":true},{"text":"being the normalizing constant of the Student’s t-density with degrees of freedom ","element":"span"},{"text":"d > ","element":"span"},{"text":"0. 
Using Stirling’s approximation, ∆","element":"span"},{"style":{"height":15.68},"width":352.84,"height":39.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/37-0.png","element":"img","alt":"n → 0 as n → ∞.","inline":true,"padRight":true},{"text":"This immediately implies","element":"span"}],[{"style":{"width":"36%"},"width":687,"height":58,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/37-1.png","element":"img"}],[{"text":"where","element":"span"}],[{"style":{"width":"36%"},"width":689,"height":119,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/37-2.png","element":"img"}],[{"text":"Using the techniques of Section ","element":"span"},{"text":"B ","element":"span"},{"text":"of the Appendix, it can be shown that ","element":"span"},{"style":{"height":22.47},"width":423.12,"height":56.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/37-3.png","element":"img","alt":" EPf0{vg(x)} → f0(x)","inline":true,"padRight":true},{"text":"as ","element":"span"},{"style":{"height":10.4},"width":150.24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/37-4.png","element":"img","alt":" n → ∞","inline":true},{"text":". Therefore, we have","element":"span"}],[{"style":{"width":"73%"},"width":1372,"height":324,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/37-5.png","element":"img"}],[{"text":"as ","element":"span"},{"style":{"height":10.4},"width":150.24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/37-6.png","element":"img","alt":" n → ∞","inline":true},{"text":". 
A simple application of Chebychev’s inequality implies (","element":"span"},{"style":{"height":23.79},"width":509.72,"height":59.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/37-7.png","element":"img","alt":"nhn)1/2|g(x)− ˆfn(x)| → 0","inline":true,"padRight":true},{"text":"in ","element":"span"},{"style":{"height":18.48},"width":61.56,"height":46.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/37-8.png","element":"img","alt":" Pf0","inline":true},{"text":"-probability as ","element":"span"},{"style":{"height":10.4},"width":189.64,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/37-9.png","element":"img","alt":" n → ∞.","inline":true,"padRight":true},{"text":"Combining this with (","element":"span"},{"href":"#id-109","text":"37","element":"a"},{"text":") and (","element":"span"},{"href":"#id-110","text":"40","element":"a"},{"text":") and using Slutsky’s theorem, we obtain the desired result. This result means that pseudo-credible intervals can be considered frequentist confidence intervals on average asymptotically.","element":"span"}]]},{"heading":"E Proof of Theorem 3.8","paragraphs":[[{"text":"E.1 ","element":"span"},{"text":"A property of the ","element":"span"},{"style":{"height":19.26},"width":53.28,"height":48.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/37-10.png","element":"img","alt":" kn","inline":true},{"text":"-nearest neighbor distance","element":"span"}],[{"text":"Suppose ","element":"span"},{"style":{"height":23.55},"width":508.52,"height":58.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/37-11.png","element":"img","alt":" X1, . . . 
, Xniid∼ f0 with f0","inline":true,"padRight":true},{"text":"a density on ","element":"span"},{"style":{"height":13.2},"width":52.56,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/37-12.png","element":"img","alt":" Rp ","inline":true,"padRight":true},{"text":"satisfying Assumptions ","element":"span"},{"href":"#id-36","text":"3.1","element":"a"},{"text":"-","element":"span"},{"href":"#id-39","text":"3.3","element":"a"},{"text":". Denote the induced probability measure ","element":"span"},{"style":{"height":18.88},"width":191.24,"height":47.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/37-13.png","element":"img","alt":" Pf0 by P0","inline":true,"padRight":true},{"text":"for sake of convenience. We define the smoothed ","element":"span"},{"text":"k","element":"span"},{"text":"-nearest neighborhood of ","element":"span"},{"style":{"height":20.67},"width":1332.76,"height":51.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/37-14.png","element":"img","alt":" Xi as Bi = {x ∈ Rp : ||Xi−x|| ≤ Ri}, where Ri = ||Xi−Xi[kn]||2 is","inline":true,"padRight":true},{"text":"the Euclidean distance between ","element":"span"},{"style":{"height":16.48},"width":270.6,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/37-15.png","element":"img","alt":" Xi and the kn","inline":true},{"text":"-nearest neighbor of ","element":"span"},{"style":{"height":16.08},"width":50.88,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/37-16.png","element":"img","alt":" Xi","inline":true,"padRight":true},{"text":"amongst the data points ","element":"span"},{"style":{"height":17.68},"width":872.8,"height":44.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/37-17.png","element":"img","alt":"X1, . . . , Xi−1, Xi+1, . . . , Xn for i = 1, . . . , n","inline":true},{"text":". 
It is immediate that ","element":"span"},{"style":{"height":16.8},"width":216.36,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/37-18.png","element":"img","alt":" R1, . . . , Rn","inline":true,"padRight":true},{"text":"are identically distributed from symmetry. ","element":"span"},{"text":"Suppose ","element":"span"},{"style":{"height":21.91},"width":315.6,"height":54.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/37-19.png","element":"img","alt":" rn = (kn/n)1/p ","inline":true,"padRight":true},{"text":"and define the quasi-neighborhood ˜","element":"span"},{"style":{"height":19.6},"width":716.16,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/37-20.png","element":"img","alt":"Bi(r) = {x ∈ Rp : ||Xi − x|| ≤ r}","inline":true},{"text":", where the random variables ","element":"span"},{"style":{"height":16.08},"width":47.52,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/37-21.png","element":"img","alt":" Ri","inline":true,"padRight":true},{"text":"have been replaced by","element":"span"}],[{"style":{"width":"71%"},"width":1336,"height":190,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/37-22.png","element":"img"}],[{"text":"The positive density condition on ","element":"span"},{"style":{"height":17.6},"width":40.04,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/37-23.png","element":"img","alt":" f0","inline":true,"padRight":true},{"text":"obtained from Assumptions ","element":"span"},{"href":"#id-36","text":"3.1 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-39","text":"3.3 ","element":"a"},{"text":"(","element":"span"},{"href":"#id-35","referenceIndex":10,"text":"Evans et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-35","referenceIndex":10,"text":"2002","element":"a"},{"text":"; 
","element":"span"},{"href":"#id-40","referenceIndex":9,"text":"Evans","element":"a"},{"text":", ","element":"span"},{"href":"#id-40","referenceIndex":9,"text":"2008","element":"a"},{"text":") ensures the existence of ","element":"span"},{"style":{"height":18},"width":300.52,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/37-24.png","element":"img","alt":" A > 1 and ρ >","inline":true,"padRight":true},{"text":"0 such that for all 0 ","element":"span"},{"style":{"height":18},"width":251.12,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/37-25.png","element":"img","alt":" ≤ r ≤ ρ and","inline":true}],[{"text":"for all ","element":"span"},{"style":{"height":19.2},"width":210.76,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/38-0.png","element":"img","alt":" x ∈ [0, 1]p,","inline":true}],[{"id":"id-112","style":{"width":"59%"},"width":1116,"height":93,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/38-1.png","element":"img"}],[{"text":"We first state a Lemma proving some important properties of ","element":"span"},{"style":{"height":16.08},"width":52.52,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/38-2.png","element":"img","alt":" R1","inline":true},{"text":". 
Recall that two sequences (","element":"span"},{"style":{"height":19.6},"width":242.76,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/38-3.png","element":"img","alt":"an) and (bn","inline":true},{"text":") are said to be asymptotically equivalent if ","element":"span"},{"style":{"height":19.2},"width":653.32,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/38-4.png","element":"img","alt":" |an/bn| → c0 for some c0 > 0,","inline":true,"padRight":true},{"text":"denoted by ","element":"span"},{"style":{"height":16.48},"width":166.12,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/38-5.png","element":"img","alt":" an ∼ bn.","inline":true}],[{"id":"id-111","style":{"width":"100%"},"width":1872,"height":1312,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/38-6.png","element":"img"}],[{"text":"Proof. ","element":"span"},{"text":"(i) Note that ","element":"span"},{"style":{"height":16.8},"width":130.56,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/38-7.png","element":"img","alt":" cn ≤ ρ","inline":true,"padRight":true},{"text":"for sufficiently large ","element":"span"},{"text":"n","element":"span"},{"text":". From Lemma 4.1 of ","element":"span"},{"href":"#id-35","referenceIndex":10,"text":"Evans et al. 
","element":"a"},{"text":"(","element":"span"},{"href":"#id-35","referenceIndex":10,"text":"2002","element":"a"},{"text":") we have","element":"span"}],[{"style":{"width":"71%"},"width":1340,"height":451,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/38-8.png","element":"img"}],[{"style":{"width":"88%"},"width":1660,"height":253,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/39-0.png","element":"img"}],[{"text":"(ii) For ","element":"span"},{"style":{"height":21.5},"width":710.88,"height":53.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/39-1.png","element":"img","alt":" n > n0, we have pn = O{n−(1+Θn)}","inline":true,"padRight":true},{"text":"for a sequence Θ","element":"span"},{"style":{"height":17.2},"width":281.8,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/39-2.png","element":"img","alt":"n → ∞, Θn >","inline":true,"padRight":true},{"text":"0. This ensures that ","element":"span"},{"style":{"height":22.51},"width":358.12,"height":56.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/39-3.png","element":"img","alt":"�∞n=n0+1 pn < ∞.","inline":true}],[{"text":"(iii) Since ","element":"span"},{"style":{"height":20.97},"width":285.6,"height":52.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/39-4.png","element":"img","alt":"�∞n=1 pn < ∞","inline":true},{"text":", a direct application of the first Borel-Cantelli lemma proves the ","element":"span"},{"text":"statement.","element":"span"}],[{"style":{"width":"98%"},"width":1846,"height":546,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/39-5.png","element":"img"}],[{"text":"We now use the above Lemma to prove Theorem ","element":"span"},{"href":"#id-49","text":"3.8","element":"a"},{"text":". 
The key idea is to leverage the fact that ","element":"span"},{"style":{"height":17.2},"width":542.56,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/39-6.png","element":"img","alt":" Ri > cn for all i = 1, . . . , n","inline":true,"padRight":true},{"text":"with probability 1 for all but finite ","element":"span"},{"text":"n","element":"span"},{"text":".","element":"span"}],[{"text":"E.2 ","element":"span"},{"text":"Number of unique points in each neighborhood","element":"span"}],[{"text":"We now prove Theorem ","element":"span"},{"href":"#id-49","text":"3.8","element":"a"},{"text":".","element":"span"}],[{"style":{"width":"1%"},"width":25,"height":3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/39-7.png","element":"img"}],[{"text":"Proof. ","element":"span"},{"text":"Using (iii) from Lemma ","element":"span"},{"href":"#id-111","text":"E.1","element":"a"},{"text":", for ","element":"span"},{"text":"i ","element":"span"},{"text":"= 1","element":"span"},{"text":", . . . , n","element":"span"},{"text":", we have an integer ","element":"span"},{"style":{"height":16.08},"width":49.44,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/39-8.png","element":"img","alt":"�Ni","inline":true,"padRight":true},{"text":"such that for all ","element":"span"},{"style":{"height":19.6},"width":394.92,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/39-9.png","element":"img","alt":"n ≥ �Ni, P0(Ri > cn","inline":true},{"text":") = 1. However, since ","element":"span"},{"style":{"height":16.8},"width":215.88,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/39-10.png","element":"img","alt":" R1, . . . 
, Rn","inline":true,"padRight":true},{"text":"are identically distributed, ","element":"span"},{"style":{"height":16.08},"width":226.6,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/39-11.png","element":"img","alt":"�N1 = . . . =","inline":true}],[{"style":{"height":16.08},"width":178.8,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/39-12.png","element":"img","alt":"Nn = �N","inline":true},{"text":", say. Thus, for all ","element":"span"},{"style":{"height":19.6},"width":701.64,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/39-13.png","element":"img","alt":" i = 1, . . . , n, we have P0(Ri > cn","inline":true},{"text":") = 1 for all ","element":"span"},{"style":{"height":16.4},"width":285.4,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/39-14.png","element":"img","alt":" n ≥ �N. This","inline":true,"padRight":true},{"text":"immediately implies that ","element":"span"},{"style":{"height":20.58},"width":1342.6,"height":51.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/39-15.png","element":"img","alt":" P0 [�ni=1{Ri > cn}] = 1 − P0 [�ni=1{Ri ≤ cn}] ≥ 1 − �ni=1 P0[Ri ≤","inline":true}],[{"style":{"height":11.68},"width":41.16,"height":29.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/40-0.png","element":"img","alt":"cn","inline":true},{"text":"] = 1 for all ","element":"span"},{"style":{"height":16},"width":133.68,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/40-1.png","element":"img","alt":" n ≥ �N","inline":true},{"text":", which shows ","element":"span"},{"style":{"height":20.58},"width":369.12,"height":51.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/40-2.png","element":"img","alt":" P0 [�ni=1{Ri > cn}","inline":true},{"text":"] = 1. 
Therefore, we have","element":"span"}],[{"style":{"width":"53%"},"width":1008,"height":586,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/40-3.png","element":"img"}],[{"text":"where ","element":"span"},{"href":"#id-112","style":{"height":24.94},"width":1735.44,"height":62.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/40-4.png","element":"img","alt":" θn(x) = 1 − ωx(cn), since X1, . . . , Xn iid∼ f0. Using (41), we have θn(x) ≤ 1 − (cpn/A)","inline":true,"padRight":true},{"text":"for all ","element":"span"},{"style":{"height":19.2},"width":197.04,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/40-5.png","element":"img","alt":" x ∈ [0, 1]p","inline":true},{"text":". Given the conditions on ","element":"span"},{"text":"k","element":"span"},{"text":", it follows that as ","element":"span"},{"style":{"height":13.6},"width":163.24,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/40-6.png","element":"img","alt":" n → ∞,","inline":true}],[{"style":{"width":"49%"},"width":934,"height":118,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/40-7.png","element":"img"}],[{"text":"for all ","element":"span"},{"style":{"height":19.6},"width":949.48,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/40-8.png","element":"img","alt":" x ∈ [0, 1]p, where ξ = 1 − (1 + ε − i0)(1 + δ) >","inline":true,"padRight":true},{"text":"0. Therefore, we have","element":"span"}],[{"style":{"width":"45%"},"width":851,"height":451,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/40-9.png","element":"img"}],[{"id":"id-55","text":"as ","element":"span"},{"style":{"height":10.4},"width":150.24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/40-10.png","element":"img","alt":" n → ∞","inline":true},{"text":". 
This proves the result.","element":"span"}]]},{"heading":"F Proof of Theorem 3.9 and Choice of α","paragraphs":[[{"text":"F.1 ","element":"span"},{"text":"Proof of Theorem ","element":"span"},{"href":"#id-53","text":"3.9","element":"a"}],[{"style":{"height":21.71},"width":636.84,"height":54.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/40-11.png","element":"img","alt":"Proof. Let η(n) = (η1, . . . , ηn)T","inline":true},{"text":". Then, we have Θ = ","element":"span"},{"style":{"height":22.08},"width":745,"height":55.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/40-12.png","element":"img","alt":"�ni=1 πiηi = η(n)T π(n) where π(n) =","inline":true,"padRight":true},{"text":"(","element":"span"},{"style":{"height":21.71},"width":692.68,"height":54.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/40-13.png","element":"img","alt":"π1, . . . , πn)T satisfies π(n) | X (n) ∼","inline":true,"padRight":true},{"text":"Dirichlet(","element":"span"},{"style":{"height":21.71},"width":950.28,"height":54.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/40-14.png","element":"img","alt":"α + 1, . . . , α + 1) and η(n) = (η1, . . . , ηn)T. We","inline":true,"padRight":true},{"text":"start out by observing that","element":"span"}],[{"id":"id-116","style":{"width":"90%"},"width":1698,"height":56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/40-15.png","element":"img"}],[{"text":"Let ","element":"span"},{"text":"n ","element":"span"},{"text":"be sufficiently large so that ","element":"span"},{"style":{"height":18},"width":1170.48,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/41-0.png","element":"img","alt":" k > 2 − γ0, since k → ∞ as n → ∞. For i = 1, . . . 
, n, let","inline":true}],[{"id":"id-114","style":{"width":"99%"},"width":1869,"height":279,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/41-1.png","element":"img"}],[{"text":"Since the pseudo-posterior covariance matrix of ","element":"span"},{"style":{"height":21.71},"width":858.6,"height":54.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/41-2.png","element":"img","alt":" π(n) is Σπ = Vn{(1 − Cn)In + CnJn}, we","inline":true,"padRight":true},{"text":"obtain","element":"span"}],[{"id":"id-113","style":{"width":"80%"},"width":1502,"height":294,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/41-3.png","element":"img"}],[{"text":"where ¯","element":"span"},{"style":{"height":22.27},"width":1730.48,"height":55.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/41-4.png","element":"img","alt":"η = (1/n) �ni=1 ηi. Now, for i = 1, . . . , n, we have E(η2i | X (n)) = µ2i + vi, and","inline":true},{"style":{"height":21.71},"width":525.8,"height":54.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/41-5.png","element":"img","alt":"E(¯η2 | X (n)) = (¯v/n) + ¯µ2","inline":true},{"text":". 
Putting these back in equation (","element":"span"},{"href":"#id-113","text":"44","element":"a"},{"text":") we get that","element":"span"}],[{"id":"id-115","style":{"width":"93%"},"width":1755,"height":144,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/41-6.png","element":"img"}],[{"text":"Combining the results of equations (","element":"span"},{"href":"#id-114","text":"43","element":"a"},{"text":") and (","element":"span"},{"href":"#id-115","text":"45","element":"a"},{"text":"), putting them back in equation (","element":"span"},{"href":"#id-116","text":"42","element":"a"},{"text":"), and multiplying both sides by ","element":"span"},{"text":"n","element":"span"},{"text":", we get the result.","element":"span"}],[{"text":"F.2 ","element":"span"},{"text":"Choice of ","element":"span"},{"style":{"height":10.8},"width":35,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/41-7.png","element":"img","alt":" α","inline":true}],[{"text":"Suppose ","element":"span"},{"style":{"height":22.7},"width":57.72,"height":56.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/41-8.png","element":"img","alt":" σ2f0 ","inline":true,"padRight":true},{"text":"is the variance of the underlying true density ","element":"span"},{"style":{"height":17.6},"width":40.04,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/41-9.png","element":"img","alt":" f0","inline":true,"padRight":true},{"text":"satisfying Assumptions ","element":"span"},{"href":"#id-36","text":"3.1","element":"a"},{"text":"-","element":"span"},{"href":"#id-39","text":"3.3","element":"a"},{"text":". ","element":"span"},{"text":"Let ","element":"span"},{"style":{"height":23.6},"width":1201.16,"height":59,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/41-10.png","element":"img","alt":" S2µ = (1/n) ∑ni=1(µi − ¯µ)2 and S2 = (1/n) ∑ni=1(Xi − ¯X)2","inline":true},{"text":". 
We start out by observing ","element":"span"},{"text":"that","element":"span"}],[{"style":{"width":"78%"},"width":1463,"height":304,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/41-11.png","element":"img"}],[{"text":"by the triangle inequality and the fact that ","element":"span"},{"style":{"height":19.6},"width":966.12,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/41-12.png","element":"img","alt":" |µi|, |Xi| ≤ 1 for i = 1, . . . , n. Since (µi)ni=1 are","inline":true,"padRight":true},{"text":"identically distributed and (","element":"span"},{"style":{"height":19.6},"width":125.48,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/41-13.png","element":"img","alt":"Xi)ni=1 ","inline":true,"padRight":true},{"text":"are identically distributed, we have","element":"span"}],[{"style":{"width":"62%"},"width":1163,"height":62,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/41-14.png","element":"img"}],[{"text":"But ","element":"span"},{"style":{"height":24.25},"width":903.88,"height":60.64,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-0.png","element":"img","alt":" EPf0( |¯µ − ¯X| ) ≤ (1/n) ∑ni=1 EPf0( |µi − Xi|","inline":true,"padRight":true},{"text":") by the triangle inequality, from which it ","element":"span"},{"text":"follows that ","element":"span"},{"style":{"height":24.06},"width":985.92,"height":60.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-1.png","element":"img","alt":" EPf0( |¯µ − ¯X| ) ≤ EPf0( |µ1 − X1| ) since (µi − Xi","inline":true},{"text":") are identically distributed for ","element":"span"},{"text":"i ","element":"span"},{"text":"= 1","element":"span"},{"text":", . . . , n","element":"span"},{"text":". 
Thus, we have","element":"span"}],[{"style":{"width":"69%"},"width":1309,"height":193,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-2.png","element":"img"}],[{"text":"using (","element":"span"},{"href":"#id-117","text":"25","element":"a"},{"text":"). Since ","element":"span"},{"style":{"height":22.7},"width":320.28,"height":56.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-3.png","element":"img","alt":" S2 → σ2f0 in Pf0","inline":true},{"text":"-probability as ","element":"span"},{"style":{"height":10.4},"width":151.2,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-4.png","element":"img","alt":" n → ∞","inline":true,"padRight":true},{"text":"by the weak law of large numbers, ","element":"span"},{"text":"we get that ","element":"span"},{"style":{"height":22.91},"width":327.48,"height":57.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-5.png","element":"img","alt":" S2µ → σ2f0 in Pf0","inline":true},{"text":"-probability as ","element":"span"},{"style":{"height":10.4},"width":156,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-6.png","element":"img","alt":" n → ∞","inline":true,"padRight":true},{"text":"as well. 
Equating the pseudo-posterior ","element":"span"},{"text":"variance of ","element":"span"},{"style":{"height":16.91},"width":78.44,"height":42.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-7.png","element":"img","alt":" n1/2","inline":true},{"text":"Θ from Theorem ","element":"span"},{"href":"#id-53","text":"3.9 ","element":"a"},{"text":"with ","element":"span"},{"style":{"height":22.7},"width":57.72,"height":56.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-8.png","element":"img","alt":" σ2f0","inline":true},{"text":", we get after some rearranging,","element":"span"}],[{"style":{"width":"67%"},"width":1266,"height":129,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-9.png","element":"img"}],[{"text":"As ","element":"span"},{"style":{"height":10.4},"width":166.08,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-10.png","element":"img","alt":" n → ∞","inline":true},{"text":", since each ","element":"span"},{"style":{"height":20.59},"width":650.04,"height":51.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-11.png","element":"img","alt":" λi satisfies λ2i − h2 → 0 in Pf0","inline":true},{"text":"-probability for ","element":"span"},{"text":"i ","element":"span"},{"text":"= 1","element":"span"},{"text":", . . . 
, n ","element":"span"},{"text":"using ","element":"span"},{"text":"equation (","element":"span"},{"href":"#id-118","text":"29","element":"a"},{"text":") with ","element":"span"},{"style":{"height":16.4},"width":170.84,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-12.png","element":"img","alt":" p = 1, ¯v","inline":true,"padRight":true},{"text":"is well approximated by (","element":"span"},{"style":{"height":20.51},"width":257,"height":51.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-13.png","element":"img","alt":"γnνn)−1(γ0δ20","inline":true},{"text":") in the sense that ","element":"span"},{"style":{"height":19.2},"width":97,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-14.png","element":"img","alt":" {¯v −","inline":true,"padRight":true},{"text":"(","element":"span"},{"style":{"height":20.59},"width":550.2,"height":51.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-15.png","element":"img","alt":"γnνn)−1(γ0δ20)} → 0 in Pf0","inline":true},{"text":"-probability as ","element":"span"},{"style":{"height":10.4},"width":162.72,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-16.png","element":"img","alt":" n → ∞","inline":true},{"text":". In particular, we have ¯","element":"span"},{"style":{"height":18.48},"width":292.96,"height":46.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-17.png","element":"img","alt":"v → 0 in Pf0-","inline":true,"padRight":true},{"text":"probability. 
Combining these with the fact that ","element":"span"},{"style":{"height":22.91},"width":322.2,"height":57.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-18.png","element":"img","alt":" S2µ → σ2f0 in Pf0","inline":true},{"text":"-probability as ","element":"span"},{"style":{"height":13.6},"width":234.6,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-19.png","element":"img","alt":" n → ∞, we","inline":true,"padRight":true},{"text":"obtain,","element":"span"}],[{"id":"id-119","style":{"width":"61%"},"width":1149,"height":110,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-20.png","element":"img"}],[{"text":"Since 1","element":"span"},{"text":"/n ","element":"span"},{"text":"can be asymptotically neglected in comparison to ¯","element":"span"},{"style":{"height":22.7},"width":105.72,"height":56.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-21.png","element":"img","alt":"v/σ2f0 ","inline":true,"padRight":true},{"text":"owing to the fact that ","element":"span"},{"href":"#id-119","style":{"height":20.51},"width":707.96,"height":51.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-22.png","element":"img","alt":"k2/n → 0 as n → ∞ for p = 1, (48","inline":true},{"text":") implies the choice of ","element":"span"},{"style":{"height":9.2},"width":30,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-23.png","element":"img","alt":" α","inline":true,"padRight":true},{"text":"as described in equation (","element":"span"},{"href":"#id-54","text":"17","element":"a"},{"text":").","element":"span"}]]},{"heading":"G Proof of Consistency of fK(x)","paragraphs":[[{"text":"Define the standard multivariate t-density with ","element":"span"},{"text":"d > ","element":"span"},{"text":"0 degrees of freedom to be 
","element":"span"},{"style":{"height":19.6},"width":163.72,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-24.png","element":"img","alt":" gd(x) =","inline":true},{"style":{"height":19.68},"width":541.32,"height":49.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-25.png","element":"img","alt":"td(x; 0p, Ip). Since H = Hn","inline":true,"padRight":true},{"text":"as defined in Section ","element":"span"},{"text":"3.1 ","element":"span"},{"text":"is diagonal, it immediately follows that ","element":"span"},{"style":{"height":20.99},"width":888.96,"height":52.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-26.png","element":"img","alt":"tγn−p+1(x; x′, H) = h−pgγn−p+1{h−1(x − x′)}","inline":true},{"text":". The following lemma proves the consistency of any such generic kernel density estimator with t kernel depending on ","element":"span"},{"text":"n","element":"span"},{"text":", say","element":"span"}],[{"style":{"width":"40%"},"width":749,"height":131,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-27.png","element":"img"}],[{"text":"where the bandwidth ","element":"span"},{"style":{"height":18.4},"width":1035.84,"height":46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-28.png","element":"img","alt":" w = wn satisfies wn → 0 and nwpn → ∞ as n → ∞","inline":true},{"text":", with independent ","element":"span"},{"text":"and identically distributed data ","element":"span"},{"style":{"height":17.6},"width":328.04,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-29.png","element":"img","alt":" X1, . . . 
, Xn ∼ f0","inline":true,"padRight":true},{"text":"satisfying Assumptions ","element":"span"},{"href":"#id-36","text":"3.1","element":"a"},{"text":"-","element":"span"},{"href":"#id-39","text":"3.3","element":"a"},{"text":".","element":"span"}],[{"id":"id-121","style":{"height":17.2},"width":649.8,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-30.png","element":"img","alt":"Lemma G.1. Suppose w = wn","inline":true,"padRight":true},{"text":"is a sequence satisfying ","element":"span"},{"style":{"height":14},"width":707.12,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-31.png","element":"img","alt":" w −→ 0 and nwp −→ ∞ as n −→ ∞.","inline":true,"padRight":true},{"text":"Let ","element":"span"},{"style":{"height":20.99},"width":1559.16,"height":52.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/42-32.png","element":"img","alt":" fK(x) = (nwp)−1 �ni=1 gγn−p+1{w−1(x − Xi)}. Then fK(x) → f0(x) in Pf0","inline":true},{"text":"-probability","element":"span"}],[{"style":{"width":"20%"},"width":391,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/43-0.png","element":"img"}],[{"text":"Proof. ","element":"span"},{"text":"It is enough to show that ","element":"span"},{"style":{"height":22.47},"width":1183.72,"height":56.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/43-1.png","element":"img","alt":" EPf0{fK(x)} −→ f0(x) and varPf0{fK(x)} −→ 0 as n −→ ∞.","inline":true,"padRight":true},{"text":"Let us start first with ","element":"span"},{"style":{"height":22.47},"width":464.04,"height":56.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/43-2.png","element":"img","alt":" EPf0{fK(x)}. 
We have","inline":true}],[{"style":{"width":"90%"},"width":1698,"height":749,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/43-3.png","element":"img"}],[{"text":"using the mean value theorem and Pólya’s theorem (","element":"span"},{"href":"#id-120","referenceIndex":36,"text":"Pólya","element":"a"},{"text":", ","element":"span"},{"href":"#id-120","referenceIndex":36,"text":"1920","element":"a"},{"text":") along with Assumption ","element":"span"},{"href":"#id-37","text":"3.2 ","element":"a"},{"text":"to bound ","element":"span"},{"style":{"height":19.6},"width":406.56,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/43-4.png","element":"img","alt":" ∇f0(·). As n → ∞","inline":true},{"text":", this implies that ","element":"span"},{"style":{"height":22.47},"width":787.48,"height":56.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/43-5.png","element":"img","alt":" EPf0{fK(x)} → f0(x) since w → 0 as","inline":true},{"style":{"height":10.4},"width":163.24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/43-6.png","element":"img","alt":"n → ∞.","inline":true}],[{"text":"The variance may be dealt with in a similar manner. 
Following the same steps as before we get","element":"span"}],[{"style":{"width":"94%"},"width":1764,"height":510,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/43-7.png","element":"img"}],[{"text":"which shows that the variance goes to 0 as ","element":"span"},{"style":{"height":16.4},"width":733.48,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/43-8.png","element":"img","alt":" n → ∞, since nwp → ∞ as n → ∞.","inline":true}],[{"text":"For the nearest neighbor-Dirichlet mixture, recall ","element":"span"},{"style":{"height":20.49},"width":778.32,"height":51.24,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/43-9.png","element":"img","alt":" fK(x) = (1/n) �ni=1 tγn−p+1(x; Xi, Hn)","inline":true,"padRight":true},{"text":"from Section ","element":"span"},{"text":"3.1 ","element":"span"},{"text":"of the main document, where ","element":"span"},{"style":{"height":20.59},"width":923.56,"height":51.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/43-10.png","element":"img","alt":" Hn = h2nIp and h2n = {νn(γn −p+1)}−1{(νn +","inline":true,"padRight":true},{"text":"1)(","element":"span"},{"style":{"height":20.51},"width":264.68,"height":51.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/43-11.png","element":"img","alt":"γ0−p+1)}δ20","inline":true},{"text":". Here, the bandwidth ","element":"span"},{"style":{"height":18.4},"width":1081.92,"height":46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/43-12.png","element":"img","alt":" hn satisfies hn → 0 and nhpn → ∞ as n → ∞. 
Lemma","inline":true,"padRight":true},{"href":"#id-121","text":"G.1 ","element":"a"},{"text":"then shows that ","element":"span"},{"style":{"height":19.6},"width":100.92,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/43-13.png","element":"img","alt":" fK(x","inline":true},{"text":") converges to ","element":"span"},{"style":{"height":19.68},"width":236.76,"height":49.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/43-14.png","element":"img","alt":" f0(x) in Pf0","inline":true},{"text":"-probability as ","element":"span"},{"style":{"height":10.4},"width":163.24,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/43-15.png","element":"img","alt":" n → ∞.","inline":true}]]},{"heading":"H Cross-validation","paragraphs":[[{"text":"H.1 ","element":"span"},{"text":"Algorithm for leave-one-out cross-validation","element":"span"}],[{"text":"Consider independent and identically distributed data ","element":"span"},{"style":{"height":17.6},"width":739.64,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/44-0.png","element":"img","alt":" X1, . . . , Xn ∈ Rp ∼ f with f having","inline":true,"padRight":true},{"text":"the nearest neighbor-Dirichlet mixture formulation. 
In the univariate setting, the prior for each of the neighborhood specific parameters is ","element":"span"},{"style":{"height":20.51},"width":850.6,"height":51.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/44-1.png","element":"img","alt":" θi = (ηi, σ2i ) ∼ NIG(µ0, ν0, γ0/2, γ0δ20/2).","inline":true,"padRight":true},{"text":"The equivalent prior in the general multivariate setting following Sections ","element":"span"},{"href":"#id-34","text":"2.2 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-57","text":"2.3 ","element":"a"},{"text":"is (","element":"span"},{"style":{"height":20.59},"width":1854.12,"height":51.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/44-2.png","element":"img","alt":"ηi, Σi) ∼ NIWp(µ0, ν0, γ0, Ψ0) where Ψ0 = (γ∗δ20) Ip with γ∗ = γ0 − p + 1. We use the","inline":true,"padRight":true},{"text":"pseudo-posterior mean in equations (","element":"span"},{"href":"#id-29","text":"7","element":"a"},{"text":") and (","element":"span"},{"href":"#id-30","text":"8","element":"a"},{"text":") to compute leave-one-out log-likelihoods ","element":"span"},{"style":{"height":20.51},"width":89.96,"height":51.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/44-3.png","element":"img","alt":"L(δ20","inline":true},{"text":") for different choices of the hyperparameter ","element":"span"},{"style":{"height":23.95},"width":811.64,"height":59.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/44-4.png","element":"img","alt":" δ20, choosing δ20,CV = arg supδ20L(δ20) to","inline":true,"padRight":true},{"text":"maximize this criterion. 
The details of the computation of ","element":"span"},{"style":{"height":20.7},"width":89.96,"height":51.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/44-5.png","element":"img","alt":" L(δ20","inline":true},{"text":") for a fixed ","element":"span"},{"style":{"height":20.11},"width":39.56,"height":50.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/44-6.png","element":"img","alt":" δ20 ","inline":true,"padRight":true},{"text":"are provided ","element":"span"},{"text":"in Algorithm ","element":"span"},{"href":"#id-122","text":"2","element":"a"},{"text":".","element":"span"}],[{"id":"id-122","style":{"width":"102%"},"width":1927,"height":945,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/44-7.png","element":"img"}],[{"text":"H.2 ","element":"span"},{"text":"Fast Implementation of cross-validation","element":"span"}],[{"text":"In Algorithm ","element":"span"},{"href":"#id-122","text":"2","element":"a"},{"text":", the nearest neighborhood specification for each ","element":"span"},{"style":{"height":15.51},"width":79.68,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/44-8.png","element":"img","alt":" X −i ","inline":true,"padRight":true},{"text":"is different for ","element":"span"},{"text":"i ","element":"span"},{"text":"= 1","element":"span"},{"text":", . . . , n","element":"span"},{"text":". ","element":"span"},{"text":"However, we bypass this computation by initially forming a neighborhood of size (","element":"span"},{"text":"k ","element":"span"},{"text":"+ 1) for each data point using the entire data and storing the respective neighborhood means and covariance matrices. 
Suppose for ","element":"span"},{"style":{"height":16.08},"width":50.88,"height":40.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/44-9.png","element":"img","alt":" Xi","inline":true},{"text":", the indices of the (","element":"span"},{"text":"k ","element":"span"},{"text":"+ 1)-nearest neighbors are given by ","element":"span"},{"style":{"height":21.07},"width":1358.56,"height":52.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/44-10.png","element":"img","alt":" Ni = {j ∈ {1, . . . , n} : ||Xi − Xj||2 ≤ ||Xi − Xi[k+1]||2}, ar-","inline":true,"padRight":true},{"text":"ranged in increasing order according to their distance from ","element":"span"},{"style":{"height":19.68},"width":593.64,"height":49.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/44-11.png","element":"img","alt":" Xi with Xi[1] = Xi. Define","inline":true,"padRight":true},{"text":"the neighborhood mean ","element":"span"},{"style":{"height":22.37},"width":586.92,"height":55.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/45-0.png","element":"img","alt":" mi = {1/(k + 1)} ∑j∈Ni Xi[j]","inline":true,"padRight":true},{"text":"and the neighborhood covariance ","element":"span"},{"text":"matrix ","element":"span"},{"style":{"height":23.28},"width":939.36,"height":58.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/45-1.png","element":"img","alt":" Si = (k +1)−1{∑j∈Ni(Xi[j] −mi)(Xi[j]−mi)T}","inline":true},{"text":". 
Then, to form a ","element":"span"},{"text":"k","element":"span"},{"text":"-nearest neighborhood ","element":"span"},{"text":"for the new data ","element":"span"},{"style":{"height":15.51},"width":79.68,"height":38.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/45-2.png","element":"img","alt":" X −i","inline":true},{"text":", a single pass over the initial neighborhoods ","element":"span"},{"style":{"height":17.68},"width":51.36,"height":44.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/45-3.png","element":"img","alt":" Ni","inline":true,"padRight":true},{"text":"is sufficient to update the new neighborhood means and covariance matrices. Below, we describe the update for the neighborhood means ","element":"span"},{"style":{"height":26.59},"width":104.68,"height":66.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/45-4.png","element":"img","alt":" m(−i)j","inline":true,"padRight":true},{"text":"and covariance matrices ","element":"span"},{"style":{"height":26.59},"width":695.08,"height":66.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/45-5.png","element":"img","alt":" S(−i)j for j = 1, . . . , n and j ̸= i,","inline":true,"padRight":true},{"text":"considering the data ","element":"span"},{"style":{"height":19.71},"width":850.12,"height":49.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/45-6.png","element":"img","alt":" X −i. For j = 1, . . . 
, n and j ≠ i, we have,","inline":true}],[{"style":{"width":"89%"},"width":1677,"height":361,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/45-7.png","element":"img"}]]},{"heading":"I Algorithm with Gaussian Kernels for Univariate Data","paragraphs":[[{"text":"Suppose we have independent and identically distributed observations ","element":"span"},{"style":{"height":21.71},"width":415.92,"height":54.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/45-8.png","element":"img","alt":" X (n) = (X1, . . . , Xn)","inline":true,"padRight":true},{"text":"from the density ","element":"span"},{"style":{"height":18},"width":615.52,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/45-9.png","element":"img","alt":" f, where Xi ∈ ℜ, i = 1, . . . , n","inline":true},{"text":". In the nearest neighbor-Dirichlet mixture framework for univariate data with the Gaussian kernel ","element":"span"},{"style":{"height":20.51},"width":180.2,"height":51.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/45-10.png","element":"img","alt":" φ(· ; η, σ2","inline":true},{"text":"), neighborhood specific parameters ","element":"span"},{"style":{"height":20.51},"width":985.12,"height":51.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/45-11.png","element":"img","alt":" θi = (ηi, σ2i ) ∼ NIG(µ0, ν0, γ0/2, γ0δ20/2) a priori","inline":true,"padRight":true},{"text":"independently for ","element":"span"},{"text":"i ","element":"span"},{"text":"= 1","element":"span"},{"text":", . . . , n","element":"span"},{"text":". 
","element":"span"},{"text":"Monte Carlo samples for the estimated density at any point ","element":"span"},{"text":"x ","element":"span"},{"text":"can be generated following the steps of Algorithm ","element":"span"},{"href":"#id-123","text":"3","element":"a"},{"text":".","element":"span"}],[{"id":"id-123","style":{"width":"103%"},"width":1939,"height":1030,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/45-12.png","element":"img"}]]},{"heading":"J Inverse Wishart Parametrization","paragraphs":[[{"text":"The parametrization of the inverse Wishart density defined on the set of all ","element":"span"},{"style":{"height":16.4},"width":295,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/46-0.png","element":"img","alt":" p × p matrices","inline":true,"padRight":true},{"text":"with real entries used in this article is given as follows. Suppose ","element":"span"},{"style":{"height":14.4},"width":161.32,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/46-1.png","element":"img","alt":" γ > p −","inline":true,"padRight":true},{"text":"1 and Ψ is a ","element":"span"},{"style":{"height":13.2},"width":108.08,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/46-2.png","element":"img","alt":" p × p","inline":true,"padRight":true},{"text":"positive definite matrix. 
If Σ ","element":"span"},{"style":{"height":19.68},"width":193,"height":49.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/46-3.png","element":"img","alt":" ∼ IWp(γ,","inline":true,"padRight":true},{"text":"Ψ), then Σ has the following density function:","element":"span"}],[{"style":{"width":"84%"},"width":1582,"height":287,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/46-4.png","element":"img"}],[{"text":"where Γ","element":"span"},{"style":{"height":19.68},"width":49.96,"height":49.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/46-5.png","element":"img","alt":"p(·","inline":true},{"text":") is the multivariate gamma function defined by","element":"span"}],[{"style":{"width":"37%"},"width":710,"height":138,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/46-6.png","element":"img"}],[{"text":"for ","element":"span"},{"style":{"height":19.6},"width":243.32,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/46-7.png","element":"img","alt":" a ≥ (p−1)/","inline":true},{"text":"2 and the function etr (","element":"span"},{"text":"A","element":"span"},{"text":") = exp ","element":"span"},{"text":"{","element":"span"},{"text":"tr(","element":"span"},{"text":"A","element":"span"},{"text":")","element":"span"},{"text":"} ","element":"span"},{"text":"for a square matrix ","element":"span"},{"text":"A","element":"span"},{"text":". 
When ","element":"span"},{"text":"p ","element":"span"},{"text":"= 1, the IW","element":"span"},{"style":{"height":19.68},"width":77.32,"height":49.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/46-8.png","element":"img","alt":"p(γ,","inline":true,"padRight":true},{"text":"Ψ) density is the same as the IG(","element":"span"},{"style":{"height":20.11},"width":185.72,"height":50.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/46-9.png","element":"img","alt":"γ/2, γδ2/","inline":true},{"text":"2) density, where ","element":"span"},{"style":{"height":20.11},"width":204.04,"height":50.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/46-10.png","element":"img","alt":" δ2 = Ψ/γ.","inline":true}]]},{"heading":"K L1 Error Tables in Sections 4.2 and 4.3","paragraphs":[[{"style":{"width":"80%"},"width":1504,"height":1647,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/47-0.png","element":"img"}],[{"text":"Table 3: Comparison of the methods in terms of ","element":"figcaption","subtype":"caption"},{"style":{"height":16.48},"width":50.12,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/47-1.png","element":"img","alt":" L1","inline":true,"padRight":true},{"text":"error in the univariate case. Number of test points and replications considered are ","element":"figcaption","subtype":"caption"},{"style":{"height":16.48},"width":318.24,"height":41.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/47-2.png","element":"img","alt":" nt = 500 and R","inline":true,"padRight":true},{"text":"= 20, respectively.","element":"figcaption","subtype":"caption"}],[{"id":"id-73","style":{"width":"105%"},"width":2590,"height":1144,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/48-0.png","element":"img"}]]},{"heading":"References","paragraphs":[[{"id":"id-6","text":"Abramson, I. S. (1982). 
On bandwidth variation in kernel estimates-a square root law. ","element":"span"},{"text":"The Annals of Statistics","element":"span"},{"text":", pages 1217–1223.","element":"span"}],[{"id":"id-71","text":"Azzalini, A. (2005). The skew-normal distribution and related multivariate families. ","element":"span"},{"text":"Scandinavian Journal of Statistics","element":"span"},{"text":", 32(2):159–188.","element":"span"}],[{"id":"id-2","text":"Biau, G. and Devroye, L. (2015). ","element":"span"},{"text":"Lectures on the Nearest Neighbor Method","element":"span"},{"text":". Springer.","element":"span"}],[{"id":"id-13","text":"Blei, D. M. and Jordan, M. I. (2006). Variational inference for Dirichlet process mixtures. ","element":"span"},{"text":"Bayesian Analysis","element":"span"},{"text":", 1(1):121–143.","element":"span"}],[{"id":"id-28","text":"Bowman, A. W. (1984). An alternative method of cross-validation for the smoothing of ","element":"span"},{"text":"density estimates. ","element":"span"},{"text":"Biometrika","element":"span"},{"text":", 71(2):353–360.","element":"span"}],[{"id":"id-5","text":"Breiman, L., Meisel, W., and Purcell, E. (1977). Variable kernel estimates of multivariate ","element":"span"},{"text":"densities. ","element":"span"},{"text":"Technometrics","element":"span"},{"text":", 19(2):135–144.","element":"span"}],[{"id":"id-58","text":"Devroye, L. and Gyorfi, L. (1985). ","element":"span"},{"style":{"height":17.6},"width":1143.88,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/49-0.png","element":"img","alt":" Nonparametric Density Estimation: the L1 view. Wiley","inline":true,"padRight":true},{"text":"Series in Probability and Statistics.","element":"span"}],[{"id":"id-64","text":"Duong, T. (2020). ","element":"span"},{"text":"ks: Kernel Smoothing","element":"span"},{"text":". R package version 1.11.7.","element":"span"}],[{"id":"id-40","text":"Evans, D. (2008). 
A law of large numbers for nearest neighbour statistics. ","element":"span"},{"text":"Proceedings of the Royal Society A: Mathematical, Physical and Engineering Sciences","element":"span"},{"text":", 464(2100):3175–3192.","element":"span"}],[{"id":"id-35","text":"Evans, D., Jones, A. J., and Schmidt, W. M. (2002). Asymptotic moments of near–neighbour ","element":"span"},{"text":"distance distributions. ","element":"span"},{"text":"Proceedings of the Royal Society of London. Series A: Mathematical, Physical and Engineering Sciences","element":"span"},{"text":", 458(2028):2839–2849.","element":"span"}],[{"id":"id-32","text":"Ghosal, S., Ghosh, J. K., Ramamoorthi, R., et al. (1999). Posterior consistency of Dirichlet ","element":"span"},{"text":"mixtures in density estimation. ","element":"span"},{"text":"The Annals of Statistics","element":"span"},{"text":", 27(1):143–158.","element":"span"}],[{"id":"id-46","text":"Ghosal, S. and Van der Vaart, A. (2017). ","element":"span"},{"text":"Fundamentals of nonparametric Bayesian inference","element":"span"},{"text":", volume 44. Cambridge University Press.","element":"span"}],[{"id":"id-33","text":"Ghosal, S., Van Der Vaart, A., et al. (2007). Posterior convergence rates of Dirichlet mixtures ","element":"span"},{"text":"at smooth densities. ","element":"span"},{"text":"The Annals of Statistics","element":"span"},{"text":", 35(2):697–723.","element":"span"}],[{"id":"id-81","text":"Gneiting, T. and Raftery, A. E. (2007). Strictly proper scoring rules, prediction, and esti- ","element":"span"},{"text":"mation. ","element":"span"},{"text":"Journal of the American Statistical Association","element":"span"},{"text":", 102(477):359–378.","element":"span"}],[{"id":"id-85","text":"Golub, G. H. and van Loan, C. F. (1996). ","element":"span"},{"text":"Matrix Computations","element":"span"},{"text":". Johns Hopkins University Press, 3rd edition.","element":"span"}],[{"id":"id-65","text":"Hahn, P. R., Martin, R., and Walker, S. G. (2018). 
On recursive Bayesian predictive distri- ","element":"span"},{"text":"butions. ","element":"span"},{"text":"Journal of the American Statistical Association","element":"span"},{"text":", 113(523):1085–1093.","element":"span"}],[{"id":"id-27","text":"Hall, P. (1987). On Kullback-Leibler loss and density estimation. ","element":"span"},{"text":"The Annals of Statistics","element":"span"},{"text":", 15(4):1491–1519.","element":"span"}],[{"id":"id-9","text":"Hjort, N. L. and Jones, M. C. (1996). Locally parametric nonparametric density estimation. ","element":"span"},{"text":"The Annals of Statistics","element":"span"},{"text":", pages 1619–1647.","element":"span"}],[{"id":"id-60","text":"J. Ross, G. and Markwick, D. (2019). ","element":"span"},{"text":"dirichletprocess: Build Dirichlet Process Objects for Bayesian Modelling","element":"span"},{"text":". R package version 0.3.1.","element":"span"}],[{"id":"id-66","text":"Jara, A., Hanson, T., Quintana, F., Müller, P., and Rosner, G. (2011). DPpackage: Bayesian ","element":"span"},{"text":"semi- and nonparametric modeling in R. ","element":"span"},{"text":"Journal of Statistical Software","element":"span"},{"text":", 40(5):1–30.","element":"span"}],[{"id":"id-88","text":"Keith, M., Jameson, A., Van Straten, W., Bailes, M., Johnston, S., Kramer, M., Possenti, ","element":"span"},{"text":"A., Bates, S., Bhat, N., Burgay, M., et al. (2010). The High Time Resolution Universe Pulsar Survey I, System configuration and initial discoveries. ","element":"span"},{"text":"Monthly Notices of the Royal Astronomical Society","element":"span"},{"text":", 409(2):619–627.","element":"span"}],[{"id":"id-16","text":"Lavine, M. (1992). Some aspects of Polya tree distributions for statistical modelling. ","element":"span"},{"text":"The Annals of Statistics","element":"span"},{"text":", 20(3):1222–1235.","element":"span"}],[{"id":"id-17","text":"Lavine, M. (1994). More aspects of Polya tree distributions for statistical modelling. 
","element":"span"},{"text":"The Annals of Statistics","element":"span"},{"text":", 22(3):1161–1176.","element":"span"}],[{"id":"id-8","text":"Loader, C. (2006). ","element":"span"},{"text":"Local regression and likelihood","element":"span"},{"text":". Springer Science & Business Media.","element":"span"}],[{"id":"id-7","text":"Loader, C. R. (1996). ","element":"span"},{"text":"Local likelihood density estimation. ","element":"span"},{"text":"The Annals of Statistics","element":"span"},{"text":", 24(4):1602–1618.","element":"span"}],[{"id":"id-0","text":"Loftsgaarden, D. O. and Quesenberry, C. P. (1965). A nonparametric estimate of a multi- ","element":"span"},{"text":"variate density function. ","element":"span"},{"text":"The Annals of Mathematical Statistics","element":"span"},{"text":", 36(3):1049–1051.","element":"span"}],[{"id":"id-89","text":"Lorimer, D. R. and Kramer, M. (2012). ","element":"span"},{"text":"Handbook of Pulsar Astronomy","element":"span"},{"text":".","element":"span"}],[{"id":"id-90","text":"Lyon, R. J. (2016). ","element":"span"},{"text":"Why are pulsars hard to find? ","element":"span"},{"text":"PhD thesis, The University of Manchester (United Kingdom).","element":"span"}],[{"id":"id-84","text":"Ma, H. and Li, J. (2019). A true O(","element":"span"},{"text":"n ","element":"span"},{"text":"log ","element":"span"},{"text":"n","element":"span"},{"text":") algorithm for the all-k-nearest-neighbors problem. In ","element":"span"},{"text":"International Conference on Combinatorial Optimization and Applications","element":"span"},{"text":", pages 362– 374. Springer.","element":"span"}],[{"id":"id-1","text":"Mack, Y. and Rosenblatt, M. (1979). Multivariate k-nearest neighbor density estimates. ","element":"span"},{"text":"Journal of Multivariate Analysis","element":"span"},{"text":", 9(1):1–15.","element":"span"}],[{"id":"id-67","text":"Mildenberger, T. and Weinert, H. (2012). 
The benchden package: Benchmark densities for ","element":"span"},{"text":"nonparametric density estimation. ","element":"span"},{"text":"Journal of Statistical Software","element":"span"},{"text":", 46(14):1–14.","element":"span"}],[{"id":"id-96","text":"Miller, J. W. and Dunson, D. B. (2019). Robust Bayesian inference via coarsening. ","element":"span"},{"text":"Journal of the American Statistical Association","element":"span"},{"text":", 114(527):1113–1125.","element":"span"}],[{"id":"id-15","text":"Newton, M. A. (2002). On a nonparametric recursive estimator of the mixing distribution. ","element":"span"},{"style":{"height":17.6},"width":1067.48,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2003.07953/images/51-0.png","element":"img","alt":"Sankhyā: The Indian Journal of Statistics, Series A","inline":true},{"text":", 64(2):306–322.","element":"span"}],[{"id":"id-14","text":"Newton, M. A. and Zhang, Y. (1999). A recursive algorithm for nonparametric analysis with ","element":"span"},{"text":"missing data. ","element":"span"},{"text":"Biometrika","element":"span"},{"text":", 86(1):15–26.","element":"span"}],[{"id":"id-95","text":"Ni, Y., Ji, Y., and Müller, P. (2020). Consensus Monte Carlo for random subsets using shared ","element":"span"},{"text":"anchors. ","element":"span"},{"text":"Journal of Computational and Graphical Statistics","element":"span"},{"text":", 29(4):1–12.","element":"span"}],[{"id":"id-120","text":"Pólya, G. (1920). ","element":"span"},{"text":"Über den zentralen Grenzwertsatz der Wahrscheinlichkeitsrechnung und das Momentenproblem. ","element":"span"},{"text":"Mathematische Zeitschrift","element":"span"},{"text":", 8(3-4):171–181.","element":"span"}],[{"id":"id-59","text":"R Core Team (2018). ","element":"span"},{"text":"R: A Language and Environment for Statistical Computing","element":"span"},{"text":". R Foundation for Statistical Computing, Vienna, Austria.","element":"span"}],[{"id":"id-52","text":"Rivoirard, V. 
and Rousseau, J. (2012). Bernstein–von Mises theorem for linear functionals ","element":"span"},{"text":"of the density. ","element":"span"},{"text":"The Annals of Statistics","element":"span"},{"text":", 40(3):1489–1523.","element":"span"}],[{"id":"id-10","text":"Rousseau, J. and Mengersen, K. (2011). Asymptotic behaviour of the posterior distribution ","element":"span"},{"text":"in overfitted mixture models. ","element":"span"},{"text":"Journal of the Royal Statistical Society: Series B (Statistical Methodology)","element":"span"},{"text":", 73(5):689–710.","element":"span"}],[{"id":"id-3","text":"Scott, D. W. (2015). ","element":"span"},{"text":"Multivariate Density Estimation: Theory, Practice, and Visualization","element":"span"},{"text":". John Wiley & Sons.","element":"span"}],[{"id":"id-62","text":"Sheather, S. J. and Jones, M. C. (1991). A reliable data-based bandwidth selection method ","element":"span"},{"text":"for kernel density estimation. ","element":"span"},{"text":"Journal of the Royal Statistical Society: Series B (Methodological)","element":"span"},{"text":", 53(3):683–690.","element":"span"}],[{"id":"id-94","text":"Song, H., Wang, Y., and Dunson, D. B. (2020). Distributed Bayesian clustering using finite ","element":"span"},{"text":"mixture of mixtures. ","element":"span"},{"text":"arXiv preprint","element":"span"},{"text":", page arXiv:2003.13936.","element":"span"}],[{"id":"id-72","text":"Song, P. X.-K. (2000). ","element":"span"},{"text":"Multivariate dispersion models generated from Gaussian copula. ","element":"span"},{"text":"Scandinavian Journal of Statistics","element":"span"},{"text":", 27(2):305–320.","element":"span"}],[{"id":"id-4","text":"Terrell, G. R. and Scott, D. W. (1992). Variable kernel density estimation. ","element":"span"},{"text":"The Annals of Statistics","element":"span"},{"text":", pages 1236–1265.","element":"span"}],[{"id":"id-97","text":"Teschl, G. (2009). 
","element":"span"},{"text":"Mathematical methods in quantum mechanics. ","element":"span"},{"text":"Graduate Studies in Mathematics","element":"span"},{"text":", 99:106.","element":"span"}],[{"id":"id-38","text":"Tsybakov, A. B. (2004). Introduction to nonparametric estimation, 2009. ","element":"span"},{"text":"URL https://doi.org/10.1007/b13794. Revised and extended from the 2004 French original","element":"span"},{"text":".","element":"span"}],[{"id":"id-83","text":"Vaidya, P. M. (1986). An optimal algorithm for the all-nearest-neighbors problem. In ","element":"span"},{"text":"27th Annual Symposium on Foundations of Computer Science","element":"span"},{"text":", pages 117–122.","element":"span"}],[{"id":"id-63","text":"Wand, M. P. and Jones, M. C. (1994). Multivariate plug-in bandwidth selection. ","element":"span"},{"text":"Computational Statistics","element":"span"},{"text":", 9(2):97–116.","element":"span"}],[{"id":"id-11","text":"Wang, L. and Dunson, D. B. (2011). Fast Bayesian inference in Dirichlet process mixture ","element":"span"},{"text":"models. ","element":"span"},{"text":"Journal of Computational and Graphical Statistics","element":"span"},{"text":", 20(1):196–216.","element":"span"}],[{"id":"id-61","text":"West, M. (1992). ","element":"span"},{"text":"Hyperparameter estimation in Dirichlet process mixture models","element":"span"},{"text":". Duke University ISDS Discussion Paper #92-A03.","element":"span"}],[{"id":"id-18","text":"Wong, W. H. and Ma, L. (2010). Optional Polya tree and Bayesian inference. ","element":"span"},{"text":"The Annals of Statistics","element":"span"},{"text":", 38(3):1433–1459.","element":"span"}],[{"id":"id-12","text":"Zhang, X., Nott, D. J., Yau, C., and Jasra, A. (2014). A sequential algorithm for fast fitting ","element":"span"},{"text":"of Dirichlet process mixture models. 
","element":"span"},{"text":"Journal of Computational and Graphical Statistics","element":"span"},{"text":", 23(4):1143–1162.","element":"span"}]]}],"_version":"3.3.2"},"paperNode":"$1b:props:children:props:children:0:props:product"}]]]}]}]