1b:["$","$L29",null,{"isWhiteLabelled":false,"children":["$","$Lb",null,{"pt":{"compact":0,"expanded":3},"children":[["$","$L2a",null,{"noStar":true,"publisher":true,"task":true,"params":true,"size":"xl","product":{"id":"eyJwYXBlcklEIjoiMjAwMi4xMjYwNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","updated":"2021-12-17T21:31:08.000Z","paperID":"2002.12606","published":"2020-02-28T09:20:41.000Z","authors":"[\"Benjamin G. Stokell\",\"Rajen D. Shah\",\"Ryan J. Tibshirani\"]","title":"Modelling High-Dimensional Categorical Data Using Nonconvex Fusion Penalties","scoreTrending":null,"summary":"$2b","lastCheckedForCode":"2022-09-04T13:57:21.563Z","links":[{"id":"eyJ1cmwiOiJodHRwczovL3BhcGVyc3dpdGhjb2RlLmNvbS9wYXBlci9tb2RlbGxpbmctaGlnaC1kaW1lbnNpb25hbC1jYXRlZ29yaWNhbC1kYXRhIn0=","type":"pwc","url":"https://paperswithcode.com/paper/modelling-high-dimensional-categorical-data","data":null}],"reposConnection":{"edges":[{"official":null,"node":{"id":"eyJyZXBvSUQiOiIzNjIyNDEzMzQiLCJzb3VyY2UiOiJnaXRodWIifQ==","source":"github","repoID":"362241334","url":"https://github.com/bgs25/scope-experiments","title":"scope-experiments","language":"r","stars":0,"forks":0,"framework":null,"scoreTrending":null,"updated":null,"created":null,"downloads":null,"likes":null,"owner":[{"username":"bgs25","avatar":"https://avatars.githubusercontent.com/u/6967651?v=4"}]}}]},"models":[],"tags":[{"id":"eyJuYW1lIjoiY2x1c3RlcmluZyIsInR5cGUiOiJ0YXNrIn0=","name":"clustering","description":"In clustering, the input is a set of data points, and the output is the grouping of these points into clusters based on similarity. This task is used in market segmentation, social network analysis, and image segmentation where similar items need to be grouped together.","scoreTrending":null,"count":{"stars":5724,"papers":2723,"models":1382},"__typename":"Tag"}],"summaries":[],"emailsConnection":{"edges":[]},"__typename":"paper","authorArray":["Benjamin G. Stokell","Rajen D. Shah","Ryan J. 
Tibshirani"]}}],["$","$L18",null,{"container":true,"columns":100,"spacing":{"compact":0,"expanded":2,"large":3},"children":[["$","$L18",null,{"size":{"compact":100,"expanded":100,"large":68},"children":[["$","$7",null,{"children":["$","$L2c",null,{"publisher":"arxiv","paperID":"2002.12606","product":{"paper":"$1b:props:children:props:children:0:props:product","models":"$1b:props:children:props:children:0:props:product:models"},"isWhiteLabelled":false}]}],["$","$7",null,{"children":["$","$L2d",null,{"article":"$L2e","model":"$undefined"}]}]]}],["$","$L18",null,{"size":"grow","children":["$","$L2f",null,{}]}]]}],["$","$7",null,{"children":null}],[["$","audio",null,{"id":"tts"}],["$","$L30",null,{"paperID":"2002.12606","publisher":"arxiv","paperJSON":{"title":"Modelling High-Dimensional Categorical Data Using Nonconvex Fusion Penalties","paperID":"2002.12606","avgLineHeight":13.55,"imgScale":4,"sections":[{"heading":"Abstract","paragraphs":[[{"text":"$31","element":"span"},{"style":{"fontFamily":"monospace"},"text":"CatReg ","element":"span"},{"text":"implementing SCOPE for linear models and also a version for logistic regression is available on CRAN.","element":"span"}]]},{"heading":"1 Introduction","paragraphs":[[{"text":"Categorical data arise in a number of application areas. For example, electronic health data typically contain records of diagnoses received by patients coded within controlled vocabularies and also prescriptions, both of which give rise to categorical variables with large numbers of levels ","element":"span"},{"href":"#id-0","referenceIndex":18,"text":"[Jensen et al., ","element":"a"},{"href":"#id-0","referenceIndex":18,"text":"2012]","element":"a"},{"text":". 
","element":"span"},{"text":"Vehicle insurance claim data also contain a large number of categorical variables detailing properties of the vehicles and parties involved ","element":"span"},{"href":"#id-1","referenceIndex":16,"text":"[Hu et al., ","element":"a"},{"href":"#id-1","referenceIndex":16,"text":"2018]","element":"a"},{"text":". When performing regression with such data as covariates, it is often helpful, both for improved predictive performance and interpretation of the fit, to fuse the levels of several categories together in the sense that the estimated coefficients corresponding to these levels have exactly the same value.","element":"span"}],[{"text":"To fix ideas, consider the following ANOVA model relating response vector ","element":"span"},{"style":{"height":19.53},"width":383.8,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/0-0.png","element":"img","alt":" Y = (Y1, . . . , Yn)T ∈","inline":true},{"style":{"height":12},"width":52.51,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/0-1.png","element":"img","alt":"Rn ","inline":true,"padRight":true},{"text":"to categorical predictors ","element":"span"},{"style":{"height":18.22},"width":574.22,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/0-2.png","element":"img","alt":" Xij ∈ {1, . . . Kj}, j = 1, . . . 
, p:","inline":true}],[{"id":"id-2","style":{"width":"99%"},"width":1801,"height":171,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/0-3.png","element":"img"}],[{"text":"Here the ","element":"span"},{"style":{"height":10.62},"width":32.35,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/1-0.png","element":"img","alt":" εi","inline":true,"padRight":true},{"text":"are independent zero mean random errors, ","element":"span"},{"style":{"height":18.73},"width":43.29,"height":46.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/1-1.png","element":"img","alt":" µ0 ","inline":true,"padRight":true},{"text":"is a global intercept and ","element":"span"},{"style":{"height":22.45},"width":180.92,"height":56.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/1-2.png","element":"img","alt":" θ0jk is the","inline":true,"padRight":true},{"text":"contribution to the response of the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k","element":"span"},{"text":"th level of the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j","element":"span"},{"text":"th predictor; we will later place restrictions on the parameters to ensure they are identifiable. 
We are interested in the setting where the coefficients corresponding to any given predictor are clustered, so defining","element":"span"}],[{"style":{"width":"61%"},"width":1108,"height":60,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/1-3.png","element":"img"}],[{"text":"we have ","element":"span"},{"style":{"height":17.02},"width":157.92,"height":42.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/1-4.png","element":"img","alt":" sj ≪ Kj","inline":true},{"text":", at least when ","element":"span"},{"style":{"height":17.02},"width":52.06,"height":42.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/1-5.png","element":"img","alt":" Kj","inline":true,"padRight":true},{"text":"is large. Note that our setup can include high-dimensional settings where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p ","element":"span"},{"text":"is large and many of the predictors do not contribute at all to the response: when ","element":"span"},{"style":{"height":13.02},"width":35.45,"height":32.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/1-6.png","element":"img","alt":" sj","inline":true,"padRight":true},{"text":"= 1, the contribution of the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j","element":"span"},{"text":"th predictor is effectively null as it may be absorbed by the intercept term.","element":"span"}],[{"id":"id-21","style":{"fontWeight":"bold"},"text":"1.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Background and motivation","element":"span"}],[{"text":"Early work on collapsing levels together in low-dimensional models of the form ","element":"span"},{"href":"#id-2","text":"(1) ","element":"a"},{"text":"focused on performing a variety of significance tests for whether certain sets of parameters were equal ","element":"span"},{"href":"#id-3","referenceIndex":38,"text":"[Tukey, 
","element":"a"},{"href":"#id-3","referenceIndex":38,"text":"1949, ","element":"a"},{"href":"#id-4","referenceIndex":32,"text":"Scott and Knott, ","element":"a"},{"href":"#id-4","referenceIndex":32,"text":"1974, ","element":"a"},{"href":"#id-5","referenceIndex":6,"text":"Calinski and Corsten, ","element":"a"},{"href":"#id-5","referenceIndex":6,"text":"1985]","element":"a"},{"text":". A more modern and algorithmic method based on these ideas is Delete or merge regressors ","element":"span"},{"href":"#id-6","referenceIndex":25,"text":"[Maj-Kańska et al., ","element":"a"},{"href":"#id-6","referenceIndex":25,"text":"2015]","element":"a"},{"text":", which involves agglomerative clustering based on ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":"-statistics for differences between levels.","element":"span"}],[{"text":"The CART algorithm ","element":"span"},{"href":"#id-7","referenceIndex":5,"text":"[Breiman et al., ","element":"a"},{"href":"#id-7","referenceIndex":5,"text":"1984] ","element":"a"},{"text":"for building decision trees effectively starts with all levels of the variables fused together and greedily selects which levels to split. One potential drawback of these greedy approaches is that in high-dimensional settings where the search space is very large, they may fail to find good groupings of the levels. 
The popular random forest procedure ","element":"span"},{"href":"#id-8","referenceIndex":4,"text":"[Breiman, ","element":"a"},{"href":"#id-8","referenceIndex":4,"text":"2001] ","element":"a"},{"text":"uses randomisation to alleviate the issues with the greedy nature of the splits, but sacrifices interpretability of the fitted model.","element":"span"}],[{"text":"An alternative to greedy approaches in high-dimensional settings is using penalty-based methods such as the Lasso ","element":"span"},{"href":"#id-9","referenceIndex":35,"text":"[Tibshirani, ","element":"a"},{"href":"#id-9","referenceIndex":35,"text":"1996]","element":"a"},{"text":". This can be applied to continuous or binary data and involves optimising an objective for which global minimisation is computationally tractable, thereby avoiding some of the pitfalls of greedy optimisation. In contrast to random forest, the fitted models are sparse and interpretable. ","element":"span"},{"text":"Inspired by the success of the Lasso and related methods for high-dimensional regression, a variety of approaches have proposed estimating ","element":"span"},{"style":{"height":23.19},"width":629.23,"height":57.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/1-7.png","element":"img","alt":" θ0 = (θ0jk)j=1,...,p, k=1,...,Kj and µ0","inline":true,"padRight":true},{"text":"via optimising over (","element":"span"},{"style":{"height":16},"width":70.69,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/1-8.png","element":"img","alt":"µ, θ","inline":true},{"text":") a sum of a least squares ","element":"span"},{"text":"criterion","element":"span"}],[{"id":"id-15","style":{"width":"99%"},"width":1800,"height":353,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/1-9.png","element":"img"}],[{"text":"This is the CAS-ANOVA penalty of ","element":"span"},{"href":"#id-10","referenceIndex":1,"text":"Bondell and Reich 
","element":"a"},{"href":"#id-10","referenceIndex":1,"text":"[2009]","element":"a"},{"text":". The weights ","element":"span"},{"style":{"height":13.24},"width":82.78,"height":33.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/1-10.png","element":"img","alt":" wj,kl","inline":true,"padRight":true},{"text":"can be chosen to balance the effects of having certain levels of categories more prevalent than others in the data. The penalty is an ‘all-pairs’ version of the fused Lasso and closely related to so-called convex clustering ","element":"span"},{"href":"#id-11","referenceIndex":15,"text":"[Hocking et al., ","element":"a"},{"href":"#id-11","referenceIndex":15,"text":"2011, ","element":"a"},{"href":"#id-12","referenceIndex":8,"text":"Chiquet et al., ","element":"a"},{"href":"#id-12","referenceIndex":8,"text":"2017]","element":"a"},{"text":". We note that there are several other approaches besides using penalty functions. For instance, ","element":"span"},{"href":"#id-13","referenceIndex":28,"text":"Pauger and Wagner ","element":"a"},{"href":"#id-13","referenceIndex":28,"text":"[2019] ","element":"a"},{"text":"proposes a Bayesian modelling procedure using sparsity-inducing prior distributions to encourage fusion of levels. See also ","element":"span"},{"href":"#id-14","referenceIndex":39,"text":"Tutz and Gertheiss ","element":"a"},{"href":"#id-14","referenceIndex":39,"text":"[2016] ","element":"a"},{"text":"and references therein for a review of other methods including those based on mixture models and kernels.","element":"span"}],[{"text":"The fact that the optimisation problem resulting from ","element":"span"},{"href":"#id-15","text":"(4) ","element":"a"},{"text":"is convex makes the procedure attractive. However, a drawback is that it may not give a desirable form of shrinkage. 
Indeed, consider the case where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p ","element":"span"},{"text":"= 1, and dropping subscripts for simplicity, all ","element":"span"},{"style":{"height":10.84},"width":59.73,"height":27.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/2-0.png","element":"img","alt":" wkl","inline":true,"padRight":true},{"text":"= 1. This would typically be the case if all levels were equally prevalent. Further suppose for simplicity that the number of levels ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K ","element":"span"},{"text":"is even. Then if the coefficients are clustered into two groups where one contains only a single isolated coefficient, the number of non-zero summands in ","element":"span"},{"href":"#id-15","text":"(4) ","element":"a"},{"text":"is only ","element":"span"},{"style":{"height":12},"width":114.35,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/2-1.png","element":"img","alt":" K−1.","inline":true,"padRight":true},{"text":"This almost doubles to 2(","element":"span"},{"style":{"height":12},"width":77.9,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/2-2.png","element":"img","alt":"K −","inline":true},{"text":"2) when one of the two groups is of size 2. The extreme case where the two groups are of equal size yields (","element":"span"},{"style":{"height":19.13},"width":115.36,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/2-3.png","element":"img","alt":"K/2)2 ","inline":true,"padRight":true},{"text":"non-zero summands. This particular property of all-pairs penalties, which results in them favouring groups of unequal sizes, is illustrated schematically in Figure ","element":"span"},{"href":"#id-16","text":"1. 
","element":"a"},{"text":"We can see the impact of this in the following concrete example.","element":"span"}],[{"style":{"width":"80%"},"width":1445,"height":933,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/2-4.png","element":"img"}],[{"id":"id-16","text":"Figure 1: Illustration of the number of non-zero summands in ","element":"figcaption","subtype":"caption"},{"href":"#id-15","text":"(4) ","element":"a","subtype":"caption"},{"text":"when ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"p ","element":"figcaption","subtype":"caption"},{"text":"= 1, ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"K ","element":"figcaption","subtype":"caption"},{"text":"= 16 and coefficients are clustered into two groups of equal size (right), and where one contains a single coefficient (left) and two coefficients (middle).","element":"figcaption","subtype":"caption"}],[{"text":"Suppose ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K ","element":"span"},{"text":"= 20 levels are clustered into four groups with","element":"span"}],[{"style":{"width":"48%"},"width":879,"height":123,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/2-5.png","element":"img"}],[{"text":"If the coefficient estimates satisfy ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":19.85},"width":875.62,"height":49.62,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/2-6.png","element":"img","alt":"θ1 = · · · = ˆθ4 < ˆθ5 = · · · = ˆθ10 ≤ ˆθk for all k ≥","inline":true,"padRight":true},{"text":"11, so the first two groups have distinct coefficients, then moving any coefficient from the first group towards the second, and so increasing the number of estimated groups, actually ","element":"span"},{"style":{"fontStyle":"italic"},"text":"decreases ","element":"span"},{"text":"the penalty contribution in 
","element":"span"},{"href":"#id-15","text":"(4)","element":"a"},{"text":". Specifically, if the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k","element":"span"},{"text":"th coefficient for some ","element":"span"},{"style":{"height":21.41},"width":622.65,"height":53.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/2-7.png","element":"img","alt":" k ∈ {1, . . . , 4} moves to ˆθk +t for","inline":true},{"style":{"height":21.41},"width":256.09,"height":53.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/2-8.png","element":"img","alt":"t ∈ [0, ˆθ5 − ˆθ4","inline":true},{"text":"] with all other coefficients kept fixed, the penalty contribution decreases by 13","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":". In this case then, CAS-ANOVA will struggle to keep the groups intact, especially smaller ones. We see this in Figure ","element":"span"},{"href":"#id-17","text":"2, ","element":"a"},{"text":"which shows the result of applying CAS-ANOVA to data generated according to ","element":"span"},{"href":"#id-2","text":"(1) ","element":"a"},{"text":"with ","element":"span"},{"style":{"height":19.07},"width":397.09,"height":47.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/2-9.png","element":"img","alt":" p = 1, θ0 as above, n","inline":true,"padRight":true},{"text":"= 20 (so we have a single observation corresponding to each level), and ","element":"span"},{"style":{"height":23.09},"width":218.76,"height":57.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/2-10.png","element":"img","alt":" εii.i.d.∼ N(0,","inline":true,"padRight":true},{"text":"1). 
There is no value of the tuning parameter ","element":"span"},{"style":{"height":12.8},"width":26,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/2-11.png","element":"img","alt":" λ","inline":true,"padRight":true},{"text":"where the true groups are recovered.","element":"span"}],[{"text":"As in the standard regression setting, the bias introduced by all-pairs ","element":"span"},{"style":{"height":15.02},"width":35.18,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/2-12.png","element":"img","alt":" ℓ1","inline":true},{"text":"-type penalties may be reduced by choosing data-adaptive weights analogously to the adaptive Lasso ","element":"span"},{"href":"#id-18","referenceIndex":44,"text":"[Zou, ","element":"a"},{"href":"#id-18","referenceIndex":44,"text":"2006]","element":"a"},{"text":", or replacing the absolute value ","element":"span"},{"style":{"height":18.44},"width":656.72,"height":46.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/2-13.png","element":"img","alt":" |θjk − θjl| by ρ(|θjk − θjl|) where ρ","inline":true,"padRight":true},{"text":"is a concave and non-decreasing penalty function ","element":"span"},{"href":"#id-19","referenceIndex":27,"text":"[Oelker et al., ","element":"a"},{"href":"#id-19","referenceIndex":27,"text":"2015, ","element":"a"},{"href":"#id-20","referenceIndex":24,"text":"Ma and Huang, ","element":"a"},{"href":"#id-20","referenceIndex":24,"text":"2017]","element":"a"},{"text":". However, this does not","element":"span"}],[{"style":{"width":"100%"},"width":1806,"height":567,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/3-0.png","element":"img"}],[{"id":"id-17","text":"Figure 2: Solution paths as the tuning parameter varies in a univariate example where there ","element":"figcaption","subtype":"caption"},{"text":"are four true groups. 
From left to right: CAS-ANOVA, the range penalty and SCOPE with ","element":"figcaption","subtype":"caption"},{"style":{"height":11.6},"width":24,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/3-1.png","element":"img","alt":"γ","inline":true,"padRight":true},{"text":"= 8. The setup is as described in the main text of Section ","element":"figcaption","subtype":"caption"},{"href":"#id-21","text":"1.1, ","element":"a","subtype":"caption"},{"text":"with the different colours corresponding to the different true groups. The tuning parameter varies along the ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"y ","element":"figcaption","subtype":"caption"},{"text":"axis. In this example, only SCOPE identifies the 4 correct groups at any point along its solution path.","element":"figcaption","subtype":"caption"}],[{"text":"address the basic issue of a preference for groups of unequal sizes. Additionally, optimising an objective involving a penalty with ","element":"span"},{"style":{"height":31.6},"width":267.09,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/3-2.png","element":"img","alt":" O(∑_{j=1}^{p} K_j^2)","inline":true},{"text":"summands can be computationally challenging, particularly in the case where ","element":"span"},{"style":{"height":12},"width":23,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/3-3.png","element":"img","alt":" ρ","inline":true,"padRight":true},{"text":"is not convex, both in terms of runtime and memory. To help motivate the new approach we are proposing in this paper, let us consider the setting where the predictors are ordinal rather than nominal, so there is an obvious ordering among the levels. 
In these settings, it is natural to consider a fused Lasso ","element":"span"},{"href":"#id-22","referenceIndex":36,"text":"[Tibshirani et al., ","element":"a"},{"href":"#id-22","referenceIndex":36,"text":"2005] ","element":"a"},{"text":"penalty of the form","element":"span"}],[{"style":{"width":"64%"},"width":1160,"height":131,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/3-4.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":12.62},"width":39.87,"height":31.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/3-5.png","element":"img","alt":" πj","inline":true,"padRight":true},{"text":"is a permutation of ","element":"span"},{"style":{"height":18.22},"width":217.19,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/3-6.png","element":"img","alt":" {1, . . . , Kj}","inline":true,"padRight":true},{"text":"specifying the given order; this is done in ","element":"span"},{"href":"#id-23","referenceIndex":14,"text":"Gertheiss ","element":"a"},{"href":"#id-23","referenceIndex":14,"text":"and Tutz ","element":"a"},{"href":"#id-23","referenceIndex":14,"text":"[2010] ","element":"a"},{"text":"who advocate using it in conjunction with the all-pairs-type CAS-ANOVA penalty for nominal categories.","element":"span"}],[{"text":"If however we treat the nominal variable setting as analogous to having ordinal variables with unknown orderings ","element":"span"},{"style":{"height":12.62},"width":39.88,"height":31.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/3-7.png","element":"img","alt":" πj","inline":true},{"text":", one might initially think of choosing ","element":"span"},{"style":{"height":12.62},"width":39.88,"height":31.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/3-8.png","element":"img","alt":" πj","inline":true,"padRight":true},{"text":"corresponding to the order of the estimates 
","element":"span"},{"style":{"height":24.25},"width":270.19,"height":60.63,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/3-9.png","element":"img","alt":" θj := (θjk)Kjk=1","inline":true},{"text":", such that ","element":"span"},{"style":{"height":19.9},"width":662,"height":49.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/3-10.png","element":"img","alt":" θjπj(k) = θj(k), where θj(k) is the k","inline":true},{"text":"th smallest entry ","element":"span"},{"text":"in ","element":"span"},{"style":{"height":17.02},"width":40.91,"height":42.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/3-11.png","element":"img","alt":" θj","inline":true},{"text":". This however leads to what we refer to as the ‘range’ penalty:","element":"span"}],[{"id":"id-24","style":{"width":"71%"},"width":1290,"height":136,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/3-12.png","element":"img"}],[{"text":"Whilst this shrinks the largest and smallest of the estimated coefficients together, the remaining coefficients lying in the open interval between these are unpenalised and so no grouping of the estimates is encouraged, as we observe in Figure ","element":"span"},{"href":"#id-17","text":"2; ","element":"a"},{"text":"see also ","element":"span"},{"href":"#id-19","referenceIndex":27,"text":"Oelker et al. 
","element":"a"},{"href":"#id-19","referenceIndex":27,"text":"[2015] ","element":"a"},{"text":"for a discussion of this issue in the context of ordinal variables.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"1.2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Our contributions and organisation of the paper","element":"span"}],[{"text":"Given how all-pairs penalties have an intrinsic and undesirable preference for unequal group sizes, and how the fused Lasso applied to ordered coefficients ","element":"span"},{"href":"#id-24","text":"(6) ","element":"a"},{"text":"does not result in grouping of","element":"span"}],[{"text":"the coefficients, we propose the following solution. Our approach is to use the penalty","element":"span"}],[{"style":{"width":"28%"},"width":509,"height":142,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/4-0.png","element":"img"}],[{"text":"for concave (and nonconvex) non-decreasing penalty functions ","element":"span"},{"style":{"height":13.02},"width":37.56,"height":32.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/4-1.png","element":"img","alt":" ρj","inline":true},{"text":", which, for computational reasons we discuss in Section ","element":"span"},{"text":"3, ","element":"span"},{"text":"we base on the minimax concave penalty (MCP) ","element":"span"},{"href":"#id-25","referenceIndex":42,"text":"[Zhang, ","element":"a"},{"href":"#id-25","referenceIndex":42,"text":"2010]","element":"a"},{"text":". 
In Section ","element":"span"},{"text":"2 ","element":"span"},{"text":"we formally introduce our method, which we call SCOPE, standing for ","element":"span"},{"style":{"fontWeight":"bold"},"text":"S","element":"span"},{"text":"parse ","element":"span"},{"style":{"fontWeight":"bold"},"text":"C","element":"span"},{"text":"oncave ","element":"span"},{"style":{"fontWeight":"bold"},"text":"O","element":"span"},{"text":"rdering & ","element":"span"},{"style":{"fontWeight":"bold"},"text":"P","element":"span"},{"text":"enalisation ","element":"span"},{"style":{"fontWeight":"bold"},"text":"E","element":"span"},{"text":"stimator.","element":"span"}],[{"text":"Note that whereas in conventional high-dimensional regression, the use of nonconvex penalties has been primarily motivated by a need to reduce bias in the estimation of large coefficients ","element":"span"},{"href":"#id-26","referenceIndex":11,"text":"[Fan and Li, ","element":"a"},{"href":"#id-26","referenceIndex":11,"text":"2001]","element":"a"},{"text":", here the purpose is very different: in our setting a nonconvex penalty is in fact even necessary for shrinkage to sparse solutions to occur (see Proposition ","element":"span"},{"href":"#id-27","text":"1)","element":"a"},{"text":". Because of these fundamental differences, the rich algorithmic and statistical theory concerning high-dimensional regression with nonconvex penalties (see for example ","element":"span"},{"href":"#id-28","referenceIndex":21,"text":"Loh and Wainwright ","element":"a"},{"href":"#id-28","referenceIndex":21,"text":"[2012, ","element":"a"},{"href":"#id-29","referenceIndex":22,"text":"2015]","element":"a"},{"text":", ","element":"span"},{"href":"#id-30","referenceIndex":40,"text":"Wang et al. ","element":"a"},{"href":"#id-30","referenceIndex":40,"text":"[2014]","element":"a"},{"text":", ","element":"span"},{"href":"#id-31","referenceIndex":12,"text":"Fan et al. 
","element":"a"},{"href":"#id-31","referenceIndex":12,"text":"[2018]","element":"a"},{"text":", ","element":"span"},{"href":"#id-32","referenceIndex":43,"text":"Zhao et al. ","element":"a"},{"href":"#id-32","referenceIndex":43,"text":"[2018] ","element":"a"},{"text":"and references therein) is not directly applicable to our setting.","element":"span"}],[{"text":"In Section ","element":"span"},{"text":"3, ","element":"span"},{"text":"we therefore introduce a new dynamic programming approach that recovers the global minimum of the resulting objective function exactly in the univariate case, i.e. when ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p ","element":"span"},{"text":"= 1. We then build this into a blockwise coordinate descent approach to tackle the multivariate setting.","element":"span"}],[{"text":"In Section ","element":"span"},{"text":"4 ","element":"span"},{"text":"we study the theoretical properties of SCOPE and give sufficient conditions for the estimator to coincide with the least squares solution with oracular knowledge of the level fusions in the univariate case. These conditions involve a minimal separation between unequal coefficients that is, up to constant factors, minimax optimal. ","element":"span"},{"text":"Our result contrasts sharply with Theorem 2 of ","element":"span"},{"href":"#id-20","referenceIndex":24,"text":"Ma and Huang ","element":"a"},{"href":"#id-20","referenceIndex":24,"text":"[2017] ","element":"a"},{"text":"for an all-pairs nonconvex penalty. The latter instead shows the existence of a local optimum that coincides with the oracle least squares solution. 
Whilst in conventional high-dimensional regression settings, it is known that under certain conditions, all local optima have favourable properties ","element":"span"},{"href":"#id-29","referenceIndex":22,"text":"[Loh and Wainwright, ","element":"a"},{"href":"#id-29","referenceIndex":22,"text":"2015]","element":"a"},{"text":", we note that the separation requirements in ","element":"span"},{"href":"#id-20","referenceIndex":24,"text":"Ma and Huang ","element":"a"},{"href":"#id-20","referenceIndex":24,"text":"[2017] ","element":"a"},{"text":"are substantially weaker than those indicated by the minimax lower bound, and so cannot be extended to a particular local optimum determined by the data; see the discussion following Theorem ","element":"span"},{"href":"#id-33","text":"5.","element":"a"}],[{"text":"We use our univariate result to show that the oracle least squares solution is a fixed point of our blockwise coordinate descent algorithm in the multivariate case. In Section ","element":"span"},{"text":"5 ","element":"span"},{"text":"we outline some extensions of our methodology including a scheme for handling settings when there is a hierarchy among the categorical variables. Section ","element":"span"},{"text":"6 ","element":"span"},{"text":"contains numerical experiments that demonstrate the favourable performance of our method compared to a range of competitors on both simulated and real data. We conclude with a discussion in Section ","element":"span"},{"text":"7. ","element":"span"},{"text":"Further details of our algorithm can be found in the Appendix. 
The supplementary material contains additional information on the runtime of our algorithm, and an approximate version suitable for very large-scale settings, all the proofs, and additional information on the experiments in Section ","element":"span"},{"text":"6.","element":"span"}]]},{"heading":"2 SCOPE methodology","paragraphs":[[{"text":"Recall that our goal is to estimate parameters (","element":"span"},{"style":{"height":19.47},"width":107.52,"height":48.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/4-2.png","element":"img","alt":"µ0, θ0","inline":true},{"text":") in model ","element":"span"},{"href":"#id-2","text":"(1)","element":"a"},{"text":". Let us first consolidate some notation. For any ","element":"span"},{"style":{"height":24.25},"width":982.21,"height":60.63,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/4-3.png","element":"img","alt":" θ ∈ RK1 ×· · ·×RKp, we define θj := (θjk)Kjk=1 ∈ RKj","inline":true},{"text":". We will study the ","element":"span"},{"text":"univariate setting where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p ","element":"span"},{"text":"= 1 separately, and so it will be helpful to introduce some simplified notation for this case, dropping any extraneous subscripts. We thus write ","element":"span"},{"style":{"height":15.2},"width":364,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/4-4.png","element":"img","alt":" K ≡ K1, Xi ≡ Xi1","inline":true}],[{"text":"and ","element":"span"},{"style":{"height":12.4},"width":120.3,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-0.png","element":"img","alt":" ρ ≡ ρ1","inline":true},{"text":". 
Additionally, we let ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":14.84},"width":43.33,"height":37.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-1.png","element":"img","alt":"Yk","inline":true,"padRight":true},{"text":"denote the average of the ","element":"span"},{"style":{"height":15.02},"width":296.75,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-2.png","element":"img","alt":" Yi with Xi = k:","inline":true}],[{"id":"id-35","style":{"width":"61%"},"width":1115,"height":123,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-3.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":24.2},"width":374.5,"height":60.51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-4.png","element":"img","alt":" nk = Σni=1 1{Xi=k}.","inline":true,"padRight":true},{"text":"In order to avoid an arbitrary choice of corner point constraint, we instead impose the following to ensure that ","element":"span"},{"style":{"height":15.48},"width":42.91,"height":38.69,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-5.png","element":"img","alt":" θ0 ","inline":true,"padRight":true},{"text":"is identifiable: for all ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j ","element":"span"},{"text":"= 1","element":"span"},{"style":{"fontStyle":"italic"},"text":", . . . 
, p ","element":"span"},{"text":"we have","element":"span"}],[{"id":"id-92","style":{"width":"82%"},"width":1490,"height":136,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-6.png","element":"img"}],[{"text":"Let Θ","element":"span"},{"style":{"height":20.15},"width":526.06,"height":50.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-7.png","element":"img","alt":"j = {θj ∈ RKj : gj(θj) = 0}","inline":true},{"text":", and let Θ = Θ","element":"span"},{"style":{"height":17.42},"width":227.68,"height":43.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-8.png","element":"img","alt":"1 × · · · × Θp","inline":true},{"text":". We will construct estimators by minimising over ","element":"span"},{"style":{"height":16.4},"width":277.56,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-9.png","element":"img","alt":" µ ∈ R and θ ∈","inline":true,"padRight":true},{"text":"Θ an objective function of the form","element":"span"}],[{"style":{"width":"48%"},"width":884,"height":141,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-10.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":12.8},"width":18,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-11.png","element":"img","alt":" ℓ","inline":true,"padRight":true},{"text":"is the least squares loss function ","element":"span"},{"href":"#id-15","text":"(3) ","element":"a"},{"text":"and ","element":"span"},{"style":{"height":19.9},"width":358.03,"height":49.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-12.png","element":"img","alt":" θj(1) ≤ · · · ≤ θj(Kj)","inline":true,"padRight":true},{"text":"are the order statistics of 
","element":"span"},{"style":{"height":17.02},"width":40.91,"height":42.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-13.png","element":"img","alt":"θj","inline":true},{"text":". We allow for different penalty functions ","element":"span"},{"style":{"height":13.02},"width":37.56,"height":32.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-14.png","element":"img","alt":" ρj","inline":true,"padRight":true},{"text":"for each predictor in order to help balance the effects of varying numbers of levels ","element":"span"},{"style":{"height":17.02},"width":52.06,"height":42.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-15.png","element":"img","alt":" Kj","inline":true},{"text":". The identifiability constraint that ","element":"span"},{"style":{"height":12.8},"width":68.93,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-16.png","element":"img","alt":" θ ∈","inline":true,"padRight":true},{"text":"Θ ensures that the estimated intercept ˆ","element":"span"},{"style":{"height":12},"width":26,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-17.png","element":"img","alt":"µ","inline":true,"padRight":true},{"text":":= arg min","element":"span"},{"style":{"height":23},"width":151.8,"height":57.51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-18.png","element":"img","alt":"µ ˜Q(µ, θ","inline":true},{"text":") satisfies ˆ","element":"span"},{"style":{"height":18.8},"width":293.48,"height":47.01,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-19.png","element":"img","alt":"µ = �ni=1 Yi/n.","inline":true}],[{"text":"We note that whilst the form of the identifiability constraint would not have a bearing on the fitted values of unregularised least squares regression, this is not necessarily the case when regularisation is imposed. 
For example, consider the simple univariate setting with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p ","element":"span"},{"text":"= 1 and the corner point constraint ","element":"span"},{"style":{"height":15.02},"width":37.48,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-20.png","element":"img","alt":" θ1","inline":true,"padRight":true},{"text":"= 0. Then the fitted value for an observation with level 1 would simply be the average ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":14.62},"width":42.34,"height":36.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-21.png","element":"img","alt":"Y1","inline":true},{"text":", coinciding with that of unpenalised least squares. However the fitted values with observations with other level ","element":"span"},{"style":{"height":14.8},"width":74.53,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-22.png","element":"img","alt":" k ≥","inline":true,"padRight":true},{"text":"2 would be subject to regularisation and in general be different to ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":14.84},"width":43.34,"height":37.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-23.png","element":"img","alt":"Yk","inline":true},{"text":". This inequitable treatment of the levels is clearly undesirable as they may have been labelled in an arbitrary way. 
Our identifiability constraint treats the levels more symmetrically, but also takes into account the prevalence of levels, so the fitted values corresponding to more prevalent levels effectively undergo less regularisation.","element":"span"}],[{"text":"As the estimated intercept ˆ","element":"span"},{"style":{"height":12},"width":26,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-24.png","element":"img","alt":"µ","inline":true,"padRight":true},{"text":"does not depend on the tuning parameters, we define","element":"span"}],[{"id":"id-34","style":{"width":"90%"},"width":1631,"height":141,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-25.png","element":"img"}],[{"text":"We will take the regularisers ","element":"span"},{"style":{"height":18.22},"width":382.54,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-26.png","element":"img","alt":" ρj : [0, ∞) → [0, ∞","inline":true},{"text":") in ","element":"span"},{"href":"#id-34","text":"(9) ","element":"a"},{"text":"to be concave (and nonconvex); as discussed in the introduction and formalised in Proposition ","element":"span"},{"href":"#id-27","text":"1 ","element":"a"},{"text":"below, a nonconvex penalty is necessary for fusion to occur.","element":"span"}],[{"id":"id-27","style":{"fontWeight":"bold"},"text":"Proposition 1. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Consider the univariate case with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p ","element":"span"},{"text":"= 1","element":"span"},{"style":{"fontStyle":"italic"},"text":". 
Suppose the subaverages ","element":"span"},{"text":"( ","element":"span"},{"text":"¯","element":"span"},{"href":"#id-35","style":{"height":20.45},"width":197.75,"height":51.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-27.png","element":"img","alt":"Yk)Kk=1 (7)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"are all distinct, and that ","element":"span"},{"style":{"height":12.4},"width":122.67,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-28.png","element":"img","alt":" ρ1 ≡ ρ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is convex. Then any minimiser ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":21},"width":587.74,"height":52.51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-29.png","element":"img","alt":"θ of Q has ˆθk ̸= ˆθl for all k ̸= l","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"such that ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":23.76},"width":851.34,"height":59.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-30.png","element":"img","alt":"θ(1) < ¯Yk − ˆµ < ˆθ(K) or ˆθ(1) < ¯Yl − ˆµ < ˆθ(K).","inline":true}],[{"style":{"width":"96%"},"width":1734,"height":47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-31.png","element":"img"}],[{"href":"#id-25","referenceIndex":42,"text":"2010]","element":"a"},{"text":":","element":"span"}],[{"style":{"width":"39%"},"width":717,"height":107,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/5-32.png","element":"img"}],[{"text":"where (","element":"span"},{"style":{"height":24.2},"width":288.4,"height":60.51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/6-0.png","element":"img","alt":"u)+ = u1{u≥0}","inline":true},{"text":". 
This is a piecewise quadratic function with gradient ","element":"span"},{"style":{"height":12.8},"width":26,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/6-1.png","element":"img","alt":" λ","inline":true,"padRight":true},{"text":"at 0 and flat beyond ","element":"span"},{"style":{"height":16},"width":51.05,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/6-2.png","element":"img","alt":" γλ","inline":true},{"text":". For computational reasons which we discuss in Section ","element":"span"},{"text":"3, ","element":"span"},{"text":"the simple piecewise quadratic form of this is particularly helpful. ","element":"span"},{"text":"In the multivariate case we take ","element":"span"},{"style":{"height":14.79},"width":202.69,"height":36.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/6-3.png","element":"img","alt":" ρj = ργ,λj","inline":true,"padRight":true},{"text":"with ","element":"span"},{"style":{"height":20.8},"width":229.22,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/6-4.png","element":"img","alt":" λj = λ√Kj","inline":true},{"text":". This choice of scaling is motivated by requiring that when ","element":"span"},{"style":{"height":15.48},"width":42.91,"height":38.69,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/6-5.png","element":"img","alt":" θ0 ","inline":true,"padRight":true},{"text":"= 0 we also have ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":12},"width":25,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/6-6.png","element":"img","alt":"θ","inline":true,"padRight":true},{"text":"= 0 with high probability; see Lemma ","element":"span"},{"href":"#id-36","text":"10 ","element":"a"},{"text":"in the Supplementary material. 
We discuss the choice of the tuning parameters ","element":"span"},{"style":{"height":16},"width":155.45,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/6-7.png","element":"img","alt":" λ and γ","inline":true,"padRight":true},{"text":"in Section ","element":"span"},{"href":"#id-37","text":"3.3, ","element":"a"},{"text":"but first turn to the problem of optimising ","element":"span"},{"href":"#id-34","text":"(9)","element":"a"},{"text":".","element":"span"}]]},{"heading":"3 Computation","paragraphs":[[{"text":"In this section we include details of how SCOPE is computed. Section ","element":"span"},{"href":"#id-38","text":"3.1 ","element":"a"},{"text":"motivates and describes the dynamic programming algorithm we use to compute the global minimiser of the SCOPE objective, which is highly non-convex. Section ","element":"span"},{"href":"#id-39","text":"3.2 ","element":"a"},{"text":"contains details of how this is used to solve the multivariate objective by embedding it within a blockwise coordinate descent routine. Discussion of practical considerations is contained in Section ","element":"span"},{"href":"#id-37","text":"3.3.","element":"a"}],[{"id":"id-38","style":{"fontWeight":"bold"},"text":"3.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Univariate model","element":"span"}],[{"id":"id-93","style":{"fontWeight":"bold"},"text":"3.1.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Preliminaries","element":"span"}],[{"text":"We now consider the univariate case (","element":"span"},{"style":{"fontStyle":"italic"},"text":"p ","element":"span"},{"text":"= 1) and explain how the solutions are computed. 
In this case, we may rewrite the least squares loss contribution to the objective function in the following way.","element":"span"}],[{"style":{"width":"97%"},"width":1764,"height":342,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/6-8.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":17.6},"width":209.24,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/6-9.png","element":"img","alt":" wk = nk/n","inline":true},{"text":". Thus the optimisation problem ","element":"span"},{"href":"#id-34","text":"(9) ","element":"a"},{"text":"can be written equivalently as","element":"span"}],[{"id":"id-40","style":{"width":"80%"},"width":1462,"height":130,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/6-10.png","element":"img"}],[{"text":"suppressing the dependence of the MCP ","element":"span"},{"style":{"height":12},"width":23,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/6-11.png","element":"img","alt":" ρ","inline":true,"padRight":true},{"text":"on tuning parameters ","element":"span"},{"style":{"height":16},"width":147.54,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/6-12.png","element":"img","alt":" γ and λ","inline":true},{"text":". In fact, it is straightforward to see that the constraint that the solution lies in Θ will be automatically satisfied, so we may replace Θ with ","element":"span"},{"style":{"height":15.13},"width":61.52,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/6-13.png","element":"img","alt":" RK","inline":true},{"text":". Two challenging aspects of the optimisation problem above are the presence of the nonconvex ","element":"span"},{"style":{"height":12},"width":23,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/6-14.png","element":"img","alt":" ρ","inline":true,"padRight":true},{"text":"and the order statistics. 
The latter however are easily dealt with using the result below, which holds more generally whenever ","element":"span"},{"style":{"height":12},"width":23,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/6-15.png","element":"img","alt":" ρ","inline":true,"padRight":true},{"text":"is a concave function.","element":"span"}],[{"id":"id-95","style":{"fontWeight":"bold"},"text":"Proposition 2. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Consider the univariate optimisation ","element":"span"},{"href":"#id-40","text":"(11) ","element":"a"},{"style":{"fontStyle":"italic"},"text":"with ","element":"span"},{"style":{"height":12},"width":23,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/6-16.png","element":"img","alt":" ρ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"any concave function such that a minimiser ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":12},"width":25,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/6-17.png","element":"img","alt":"θ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"exists. 
If for ","element":"span"},{"style":{"height":20.21},"width":650.94,"height":50.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/6-18.png","element":"img","alt":" k, l we have ¯Yk > ¯Yl, then ˆθk ≥ ˆθl.","inline":true}],[{"text":"This observation substantially simplifies the optimisation: after re-indexing such that ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":14.62},"width":90.53,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/6-19.png","element":"img","alt":"Y1 ≤","inline":true,"padRight":true},{"text":"¯","element":"span"},{"style":{"height":17.51},"width":266.87,"height":43.78,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/6-20.png","element":"img","alt":"Y2 ≤ · · · ≤ ¯YK","inline":true},{"text":", we may re-express ","element":"span"},{"href":"#id-40","text":"(11) ","element":"a"},{"text":"as,","element":"span"}],[{"id":"id-97","style":{"width":"82%"},"width":1490,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/6-21.png","element":"img"}],[{"text":"We use the following intermediate functions to structure the algorithm:","element":"span"}],[{"style":{"width":"86%"},"width":1569,"height":304,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/7-0.png","element":"img"}],[{"text":"for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"= 2","element":"span"},{"style":{"fontStyle":"italic"},"text":", . . . , K","element":"span"},{"text":"; here sarg min refers to the smallest minimiser in the case that it is not unique. Invariably however this will be unique, as the following result indicates.","element":"span"}],[{"id":"id-54","style":{"fontWeight":"bold"},"text":"Proposition 3. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"The set of ","element":"span"},{"text":"( ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":20.45},"width":124.6,"height":51.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/7-1.png","element":"img","alt":"Yk)Kk=1 ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"that yields distinct solutions to ","element":"span"},{"href":"#id-40","text":"(11) ","element":"a"},{"style":{"fontStyle":"italic"},"text":"has Lebesgue measure ","element":"span"},{"style":{"fontStyle":"italic"},"text":"zero as a subset of ","element":"span"},{"style":{"height":15.13},"width":77.33,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/7-2.png","element":"img","alt":" RK.","inline":true}],[{"text":"We will thus tacitly assume uniqueness in some of the discussion that follows, though this is not required for our algorithm to return a global minimiser. Observe now that ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":15.1},"width":181.11,"height":37.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/7-3.png","element":"img","alt":"θK is the","inline":true,"padRight":true},{"text":"minimiser of the univariate objective function ","element":"span"},{"style":{"height":16.4},"width":51.36,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/7-4.png","element":"img","alt":" fK","inline":true},{"text":": indeed for ","element":"span"},{"style":{"height":15.6},"width":116.09,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/7-5.png","element":"img","alt":" k ≥ 2,","inline":true}],[{"id":"id-102","style":{"width":"89%"},"width":1619,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/7-6.png","element":"img"}],[{"text":"Furthermore, we have 
","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":21.41},"width":273.75,"height":53.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/7-7.png","element":"img","alt":"θK−1 = bK(ˆθK","inline":true},{"text":"), and more generally ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":21.41},"width":693.13,"height":53.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/7-8.png","element":"img","alt":"θk = bk+1(ˆθk+1) for k = K −1, . . . , 1.","inline":true,"padRight":true},{"text":"Thus provided ","element":"span"},{"style":{"height":16.4},"width":51.36,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/7-9.png","element":"img","alt":" fK","inline":true,"padRight":true},{"text":"can be minimised efficiently (which we shall see is indeed the case), given this and the functions ","element":"span"},{"style":{"height":15.6},"width":183.35,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/7-10.png","element":"img","alt":" b2, . . . , bK","inline":true,"padRight":true},{"text":"we can iteratively compute ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":20.2},"width":303.72,"height":50.51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/7-11.png","element":"img","alt":"θK, ˆθK−1, . . . , ˆθ1","inline":true},{"text":". 
In order to make use of these properties, we must be able to compute ","element":"span"},{"style":{"height":16.4},"width":267.74,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/7-12.png","element":"img","alt":" fK and the bk","inline":true,"padRight":true},{"text":"efficiently; we explain how to do this in the following subsection.","element":"span"}],[{"id":"id-90","style":{"fontWeight":"bold"},"text":"3.1.2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Computation of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"style":{"fontStyle":"italic"},"text":"K ","element":"span"},{"style":{"fontWeight":"bold"},"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"b","element":"span"},{"text":"2","element":"span"},{"style":{"fontStyle":"italic"},"text":", . . . , b","element":"span"},{"style":{"fontStyle":"italic"},"text":"K","element":"span"}],[{"text":"The simple piecewise quadratic form of the MCP-based penalty is crucial to our approach for computing the ","element":"span"},{"style":{"height":16.4},"width":276.86,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/7-13.png","element":"img","alt":" fK and the bk","inline":true},{"text":". Some important consequences of this piecewise quadratic property are summarised in the following lemma.","element":"span"}],[{"id":"id-41","style":{"fontWeight":"bold"},"text":"Lemma 4. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"For each ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k","element":"span"},{"style":{"fontStyle":"italic"},"text":",","element":"span"}],[{"style":{"width":"84%"},"width":1524,"height":134,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/7-14.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"(iii) for each ","element":"span"},{"style":{"height":16.44},"width":187.04,"height":41.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/7-15.png","element":"img","alt":" θk+1 ∈ R","inline":true},{"style":{"fontStyle":"italic"},"text":", if a minimiser ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":20.61},"width":1002.58,"height":51.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/7-16.png","element":"img","alt":"θk = ˜θk(θk+1) of θk �→ fk(θk) + ρ(θk+1 − θk) over","inline":true,"padRight":true},{"text":"(","element":"span"},{"style":{"height":20.61},"width":721.64,"height":51.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/7-17.png","element":"img","alt":"−∞, θk+1] satisfies ˜θk < θk+1, then fk","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"must be differentiable at ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":15.24},"width":53.96,"height":38.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/7-18.png","element":"img","alt":"θk.","inline":true}],[{"text":"Properties (i) and (ii) above permit exact representation of ","element":"span"},{"style":{"height":16.4},"width":184.73,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/7-19.png","element":"img","alt":" fk and bk","inline":true,"padRight":true},{"text":"with finitely many quantities. 
The key task then is to form the collection of intervals and corresponding coefficients of quadratic functions for","element":"span"}],[{"style":{"width":"73%"},"width":1325,"height":72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/7-20.png","element":"img"}],[{"text":"given a similar piecewise quadratic representation of ","element":"span"},{"style":{"height":16.4},"width":39.36,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/7-21.png","element":"img","alt":" fk","inline":true},{"text":"; and also the same for the linear functions composing ","element":"span"},{"style":{"height":15.24},"width":36.73,"height":38.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/7-22.png","element":"img","alt":" bk","inline":true},{"text":". A piecewise quadratic representation of ","element":"span"},{"style":{"height":16.44},"width":83.18,"height":41.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/7-23.png","element":"img","alt":" fk+1","inline":true,"padRight":true},{"text":"would then be straightforward to compute, and we can iterate this process. 
To take advantage of property (iii) above, in computing ","element":"span"},{"style":{"height":17.6},"width":140.56,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/7-24.png","element":"img","alt":" gk(θk+1","inline":true},{"text":") we can separately search for minimisers at stationary points in (","element":"span"},{"style":{"height":17.6},"width":281.27,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/7-25.png","element":"img","alt":"−∞, θk+1) and","inline":true,"padRight":true},{"text":"compare the corresponding function values with ","element":"span"},{"style":{"height":17.6},"width":141.12,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-0.png","element":"img","alt":" fk(θk+1","inline":true},{"text":"); the fact that we need only consider potential minimisers at points of differentiability will simplify things as we shall see below.","element":"span"}],[{"text":"Suppose ","element":"span"},{"style":{"height":18.75},"width":284.68,"height":46.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-1.png","element":"img","alt":" Ik,1, . . . , Ik,m(k)","inline":true,"padRight":true},{"text":"are intervals that partition ","element":"span"},{"text":"R ","element":"span"},{"text":"(closed on the left) and ","element":"span"},{"style":{"height":14.75},"width":285.27,"height":36.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-2.png","element":"img","alt":" qk,1, . . . , qk,m(k)","inline":true,"padRight":true},{"text":"are corresponding quadratic functions such that ","element":"span"},{"style":{"height":18.44},"width":553.13,"height":46.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-3.png","element":"img","alt":" fk(θk) = qk,r(θk) for θk ∈ Ik,r","inline":true},{"text":". 
Let us write","element":"span"}],[{"style":{"width":"34%"},"width":628,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-4.png","element":"img"}],[{"text":"We may then express ","element":"span"},{"style":{"height":18.44},"width":503.83,"height":46.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-5.png","element":"img","alt":" fk as fk(θk) = minr ˜qk,r(θk","inline":true},{"text":"). We can also express the penalty ","element":"span"},{"style":{"height":17.24},"width":204.3,"height":43.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-6.png","element":"img","alt":" ρ = ργ,λ in","inline":true,"padRight":true},{"text":"a similar fashion. Let","element":"span"}],[{"style":{"width":"79%"},"width":1427,"height":122,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-7.png","element":"img"}],[{"text":"Then ","element":"span"},{"style":{"height":17.6},"width":678.03,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-8.png","element":"img","alt":" ρ(x) = mint ˜ρt(x) for x ≥ 0. Let Dk","inline":true,"padRight":true},{"text":"be the set of points at which ","element":"span"},{"style":{"height":16.4},"width":39.36,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-9.png","element":"img","alt":" fk","inline":true,"padRight":true},{"text":"is differentiable. 
We then have, using Lemma ","element":"span"},{"href":"#id-41","text":"4 ","element":"a"},{"text":"(iii) that","element":"span"}],[{"id":"id-88","style":{"width":"86%"},"width":1560,"height":269,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-10.png","element":"img"}],[{"text":"where ","element":"span"},{"text":"˜min denotes the minimum if it exists and ","element":"span"},{"style":{"height":8},"width":44,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-11.png","element":"img","alt":" ∞","inline":true,"padRight":true},{"text":"otherwise. ","element":"span"},{"text":"The fact that in the inner minimisation we are permitted to consider only points in ","element":"span"},{"style":{"height":14.84},"width":54.13,"height":37.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-12.png","element":"img","alt":" Dk","inline":true,"padRight":true},{"text":"simplifies the form of","element":"span"}],[{"id":"id-44","style":{"width":"78%"},"width":1409,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-13.png","element":"img"}],[{"text":"We show in Section ","element":"span"},{"href":"#id-42","text":"A.1 ","element":"a"},{"text":"of the Appendix that this is finite only on an interval and there takes the value of a quadratic function; coefficients for this function and the interval endpoints have closed form expressions that are elementary functions of the coefficients and intervals corresponding to ˜","element":"span"},{"style":{"height":13.24},"width":62.37,"height":33.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-14.png","element":"img","alt":"qk,r","inline":true},{"text":". 
With this, we have an explicit representation of ","element":"span"},{"style":{"height":12},"width":38.81,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-15.png","element":"img","alt":" gk","inline":true,"padRight":true},{"text":"as the minimum of a collection of functions that are quadratic on intervals and ","element":"span"},{"style":{"height":8},"width":44,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-16.png","element":"img","alt":" ∞","inline":true,"padRight":true},{"text":"everywhere else. Let us refer to these intervals (closed on the left) and corresponding quadratic functions as ","element":"span"},{"style":{"height":19.15},"width":660.06,"height":47.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-17.png","element":"img","alt":" Jk,1, . . . , Jk,n(k) and pk,1, . . . , pk,n(k)","inline":true,"padRight":true},{"text":"respectively.","element":"span"}],[{"text":"In order to produce a representation of ","element":"span"},{"style":{"height":16.44},"width":83.18,"height":41.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-18.png","element":"img","alt":" fk+1","inline":true,"padRight":true},{"text":"for use in future iterations, we must express ","element":"span"},{"style":{"height":12},"width":38.81,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-19.png","element":"img","alt":" gk","inline":true,"padRight":true},{"text":"as a collection of quadratics defined on ","element":"span"},{"style":{"fontStyle":"italic"},"text":"disjoint ","element":"span"},{"text":"intervals. 
To this end, define for each ","element":"span"},{"style":{"height":13.2},"width":183.99,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-20.png","element":"img","alt":" x ∈ R the","inline":true},{"style":{"height":18.44},"width":687.5,"height":46.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-21.png","element":"img","alt":"active set at x, A(x) = {r : x ∈ Jk,r}","inline":true},{"text":". Note that the endpoints of the intervals ","element":"span"},{"style":{"height":17.24},"width":67.09,"height":43.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-22.png","element":"img","alt":" Jk,r","inline":true,"padRight":true},{"text":"are the points where the active set changes and it is thus straightforward to determine ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":") at each ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":". Let ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":") be the index such that ","element":"span"},{"style":{"height":19.95},"width":313.8,"height":49.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-23.png","element":"img","alt":" gk(x) = pk,r(x)(x","inline":true},{"text":"). 
For large negative values of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":") will contain a single index and for such ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x ","element":"span"},{"text":"this must be ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":"). Consider also for each ","element":"span"},{"style":{"height":17.6},"width":424.53,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-24.png","element":"img","alt":" r ∈ A(x) \\ {r(x)}, the","inline":true,"padRight":true},{"text":"horizontal coordinate ","element":"span"},{"style":{"height":8.4},"width":40.94,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-25.png","element":"img","alt":" x′ ","inline":true,"padRight":true},{"text":"of the first intersection beyond ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x ","element":"span"},{"text":"(if it exists) between ","element":"span"},{"style":{"height":19.15},"width":292.28,"height":47.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-26.png","element":"img","alt":" pk,r and pk,r(x).","inline":true,"padRight":true},{"text":"We refer to the collection of all such tuples (","element":"span"},{"style":{"height":17.6},"width":642.13,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-27.png","element":"img","alt":"x′, r) as the intersection set at x","inline":true,"padRight":true},{"text":"and denote it by 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"N","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":"). Given ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":"), ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":") can be computed easily. The intersection set ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":") then in turn helps to determine the smallest ","element":"span"},{"style":{"height":17.6},"width":477.11,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-28.png","element":"img","alt":" x′ > x where r(x′) ̸= r(x","inline":true},{"text":") changes, that is the next knot of ","element":"span"},{"style":{"height":12},"width":38.81,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-29.png","element":"img","alt":" gk","inline":true,"padRight":true},{"text":"beyond ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":", as we now explain. Suppose at a point ","element":"span"},{"style":{"height":10.84},"width":70.28,"height":27.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-30.png","element":"img","alt":" xold","inline":true},{"text":", we have computed ","element":"span"},{"style":{"height":17.6},"width":356.11,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-31.png","element":"img","alt":" rold = r(xold). 
We","inline":true,"padRight":true},{"text":"set ","element":"span"},{"style":{"height":10.84},"width":202.46,"height":27.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-32.png","element":"img","alt":" xcur = xold","inline":true,"padRight":true},{"text":"and perform the following.","element":"span"}],[{"text":"1. Given ","element":"span"},{"style":{"height":17.6},"width":461.07,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-33.png","element":"img","alt":" r(xcur), compute N(xcur","inline":true},{"text":") and set (","element":"span"},{"style":{"height":11.2},"width":146.76,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-34.png","element":"img","alt":"xint, rint","inline":true},{"text":") = arg min","element":"span"},{"style":{"height":15.39},"width":258.26,"height":38.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-35.png","element":"img","alt":"(x,r)∈N(xcur) x.","inline":true}],[{"text":"2. If there are no changes in the active set between ","element":"span"},{"style":{"height":15.02},"width":243.68,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-36.png","element":"img","alt":" xcur and xint","inline":true},{"text":", we have found the next knot point at ","element":"span"},{"style":{"height":17.6},"width":421.34,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/8-37.png","element":"img","alt":" xint and rint = r(xint).","inline":true}],[{"text":"3. If instead the active set changes, move ","element":"span"},{"style":{"height":10.62},"width":71.8,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/9-0.png","element":"img","alt":" xcur","inline":true,"padRight":true},{"text":"to the leftmost change point. 
We have that ","element":"span"},{"style":{"height":17.6},"width":539.82,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/9-1.png","element":"img","alt":"r(x) = rold for x ∈ [xold, xcur","inline":true},{"text":"). To determine if ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":") changes at ","element":"span"},{"style":{"height":10.62},"width":71.8,"height":26.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/9-2.png","element":"img","alt":" xcur","inline":true},{"text":", we check if","element":"span"}],[{"style":{"width":"59%"},"width":1073,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/9-3.png","element":"img"}],[{"text":"(ii) some ","element":"span"},{"style":{"height":10.62},"width":78.55,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/9-4.png","element":"img","alt":" rnew","inline":true,"padRight":true},{"text":"enters the active set at ","element":"span"},{"style":{"height":10.62},"width":71.8,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/9-5.png","element":"img","alt":" xcur","inline":true,"padRight":true},{"text":"and ‘beats’ ","element":"span"},{"style":{"height":17.6},"width":611.2,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/9-6.png","element":"img","alt":" rold, so rnew ∈ A(xcur) \\ A(xold)","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":18.44},"width":788.8,"height":46.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/9-7.png","element":"img","alt":" pk,rnew(xcur + ϵ) < pk,rold(xcur + ϵ) for ϵ >","inline":true,"padRight":true},{"text":"0 sufficiently small.","element":"span"}],[{"text":"If either hold 
","element":"span"},{"style":{"height":10.62},"width":71.8,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/9-8.png","element":"img","alt":" xcur","inline":true,"padRight":true},{"text":"is a knot and ","element":"span"},{"style":{"height":17.6},"width":109.67,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/9-9.png","element":"img","alt":" r(xcur","inline":true},{"text":") may be computed via ","element":"span"},{"style":{"height":17.6},"width":109.67,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/9-10.png","element":"img","alt":" r(xcur","inline":true},{"text":") = arg min","element":"span"},{"style":{"height":20.59},"width":349.61,"height":51.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/9-11.png","element":"img","alt":"r∈A(xcur) pk,r(xcur).","inline":true,"padRight":true},{"text":"If neither hold, we conclude that ","element":"span"},{"style":{"height":17.6},"width":252.04,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/9-12.png","element":"img","alt":" r(xcur) = rold","inline":true,"padRight":true},{"text":"and go to step 1 once more.","element":"span"}],[{"text":"Hence we can proceed from one knot of ","element":"span"},{"style":{"height":12},"width":38.81,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/9-13.png","element":"img","alt":" gk","inline":true,"padRight":true},{"text":"to the next by comparing the values and intersections of a small collection of quadratic functions, and thereby form a piecewise quadratic representation of ","element":"span"},{"style":{"height":12},"width":38.82,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/9-14.png","element":"img","alt":" gk","inline":true,"padRight":true},{"text":"in a finite number of steps. 
Figure ","element":"span"},{"href":"#id-43","text":"3 ","element":"a"},{"text":"illustrates the steps outlined above. The pieces of ","element":"span"},{"style":{"height":15.24},"width":36.73,"height":38.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/9-15.png","element":"img","alt":" bk","inline":true,"padRight":true},{"text":"may be computed in a similar fashion.","element":"span"}],[{"text":"We note there are several modifications that can speed up the algorithm: for example, for each ","element":"span"},{"style":{"height":13.24},"width":141,"height":33.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/9-16.png","element":"img","alt":" r, uk,r,2","inline":true,"padRight":true},{"href":"#id-44","text":"(17) ","element":"a"},{"text":"is a constant function where it is finite (see ","element":"span"},{"style":{"height":13.24},"width":66.85,"height":33.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/9-17.png","element":"img","alt":" pk,2","inline":true,"padRight":true},{"text":"in the figure), and these can be dealt with more efficiently. For further details including pseudocode see Section ","element":"span"},{"href":"#id-45","text":"A.2 ","element":"a"},{"text":"of the Appendix.","element":"span"}],[{"style":{"width":"100%"},"width":1806,"height":999,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/9-18.png","element":"img"}],[{"id":"id-43","text":"Figure 3: Illustration of the optimisation problem and our algorithm, to be interpreted with ","element":"figcaption","subtype":"caption"},{"text":"reference to steps 1, 2, 3 in the main text. Shading indicates regions where the active set, displayed at the bottom of the plot, is invariant, and vertical dotted lines signify changes. 
Dotted curves correspond to parts of quadratic functions ","element":"figcaption","subtype":"caption"},{"style":{"height":13.24},"width":59.85,"height":33.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/9-19.png","element":"img","alt":" pk,l","inline":true,"padRight":true},{"text":"lying outside their associated intervals ","element":"figcaption","subtype":"caption"},{"style":{"height":24.13},"width":1628.92,"height":60.33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/9-20.png","element":"img","alt":" Jk,l. At xold, we have r(xold) = 1, A(xold) = {1, 2} and N(xold) = {(x(1)int, 2)}. Since","inline":true,"padRight":true},{"text":"the active set changes between ","element":"figcaption","subtype":"caption"},{"style":{"height":24.13},"width":518.79,"height":60.33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/9-21.png","element":"img","alt":" xold and x(1)int, we move xcur","inline":true,"padRight":true},{"text":"to the first change point ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"P ","element":"figcaption","subtype":"caption"},{"text":"and see ","element":"figcaption","subtype":"caption"},{"text":"neither (i) nor (ii) occur. We therefore return to step 1 and compute ","element":"figcaption","subtype":"caption"},{"style":{"height":17.6},"width":128.59,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/9-22.png","element":"img","alt":" N(xcur","inline":true},{"text":") which additionally contains (","element":"figcaption","subtype":"caption"},{"style":{"height":24.13},"width":82.21,"height":60.33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/9-23.png","element":"img","alt":"x(2)int,","inline":true,"padRight":true},{"text":"2). 
As the active set is unchanged between ","element":"figcaption","subtype":"caption"},{"style":{"height":24.13},"width":247.02,"height":60.33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/9-24.png","element":"img","alt":" xcur and x(2)int","inline":true},{"text":", we have determined ","element":"figcaption","subtype":"caption"},{"text":"the next knot point ","element":"figcaption","subtype":"caption"},{"style":{"height":24.13},"width":68.04,"height":60.33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/9-25.png","element":"img","alt":" x(2)int ","inline":true,"padRight":true},{"text":"and minimising quadratic ","element":"figcaption","subtype":"caption"},{"style":{"height":13.24},"width":80.78,"height":33.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/9-26.png","element":"img","alt":" pk,3.","inline":true}],[{"text":"In summary, our algorithm produces a piecewise quadratic representation of ","element":"span"},{"style":{"height":16.4},"width":258.23,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/9-27.png","element":"img","alt":" fK, which we","inline":true}],[{"text":"can minimise efficiently to obtain ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":15.1},"width":50.48,"height":37.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/10-0.png","element":"img","alt":"θK","inline":true},{"text":". We also have piecewise linear representations of functions ","element":"span"},{"style":{"height":15.6},"width":183.35,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/10-1.png","element":"img","alt":"b2, . . . 
, bK","inline":true,"padRight":true},{"text":"through which we may iteratively obtain ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":21.41},"width":709.54,"height":53.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/10-2.png","element":"img","alt":"θk = bk+1(ˆθk+1) for k = K − 1, . . . , 1.","inline":true}],[{"text":"It seems challenging to obtain meaningful bounds on the number of computations that must be performed at each stage of this process in terms of parameters of the data. However, to give an indication of the scalability of this algorithm, we ran a simple example with 3 true levels and found that with 50 categories the runtime was under 10","element":"span"},{"style":{"height":8.4},"width":43.34,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/10-3.png","element":"img","alt":"−3 ","inline":true,"padRight":true},{"text":"seconds; with 2000 categories it was still well under half a second. More details on computation time can be found in Sections ","element":"span"},{"href":"#id-46","text":"S1.3 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-47","text":"S3.2 ","element":"a"},{"text":"of the Supplementary material. In Section ","element":"span"},{"href":"#id-48","text":"S1.4 ","element":"a"},{"text":"of the Supplementary material, we describe an approximate version of the algorithm that can be used for fast computation in very large-scale settings.","element":"span"}],[{"id":"id-39","style":{"fontWeight":"bold"},"text":"3.2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Multivariate model","element":"span"}],[{"text":"Using our dynamic programming algorithm for the univariate problem, we can attempt to minimise the objective ","element":"span"},{"href":"#id-34","text":"(9) ","element":"a"},{"text":"for the multivariate problem using block coordinate descent. 
This has been shown empirically to be a successful strategy for minimising objectives for high-dimensional regression with nonconvex penalties such as the MCP ","element":"span"},{"href":"#id-49","referenceIndex":2,"text":"[Breheny and Huang, ","element":"a"},{"href":"#id-49","referenceIndex":2,"text":"2011, ","element":"a"},{"href":"#id-50","referenceIndex":26,"text":"Mazumder ","element":"a"},{"href":"#id-50","referenceIndex":26,"text":"et al., ","element":"a"},{"href":"#id-50","referenceIndex":26,"text":"2011, ","element":"a"},{"href":"#id-51","referenceIndex":3,"text":"Breheny and Huang, ","element":"a"},{"href":"#id-51","referenceIndex":3,"text":"2015]","element":"a"},{"text":", and we take this approach here. ","element":"span"},{"text":"Considering the multivariate case, we iteratively minimise the objective ","element":"span"},{"style":{"height":24.25},"width":515.37,"height":60.63,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/10-4.png","element":"img","alt":" Q over θj := (θjk)Kjk=1 ∈ Θj","inline":true,"padRight":true},{"text":"keeping all ","element":"span"},{"text":"other parameters fixed. Then for a given (","element":"span"},{"style":{"height":16},"width":70.42,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/10-5.png","element":"img","alt":"γ, λ","inline":true},{"text":") and initial estimate ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":22.08},"width":117.66,"height":55.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/10-6.png","element":"img","alt":"θ(0) ∈","inline":true,"padRight":true},{"text":"Θ, we repeat the following until a suitable convergence criterion is met:","element":"span"}],[{"text":"1. 
Initialise ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"= 1, and set for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i ","element":"span"},{"text":"= 1","element":"span"},{"style":{"fontStyle":"italic"},"text":", . . . , n","element":"span"}],[{"style":{"width":"39%"},"width":706,"height":133,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/10-7.png","element":"img"}],[{"text":"2. For ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j ","element":"span"},{"text":"= 1","element":"span"},{"style":{"fontStyle":"italic"},"text":", . . . , p","element":"span"},{"text":", compute","element":"span"}],[{"id":"id-53","style":{"width":"91%"},"width":1656,"height":457,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/10-8.png","element":"img"}],[{"text":"3. Increment ","element":"span"},{"style":{"height":13.2},"width":231.66,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/10-9.png","element":"img","alt":" m → m + 1.","inline":true}],[{"text":"We define a blockwise optimum of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Q ","element":"span"},{"text":"to be any ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":12.8},"width":67.03,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/10-10.png","element":"img","alt":"θ ∈","inline":true,"padRight":true},{"text":"Θ, such that for each ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j ","element":"span"},{"text":"= 1","element":"span"},{"style":{"fontStyle":"italic"},"text":", . . . 
, p","element":"span"},{"text":",","element":"span"}],[{"id":"id-56","style":{"width":"73%"},"width":1329,"height":91,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/10-11.png","element":"img"}],[{"text":"This is equivalent to ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":12},"width":25,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/10-12.png","element":"img","alt":"θ","inline":true,"padRight":true},{"text":"being a fixed point of the block coordinate descent algorithm above. Provided ","element":"span"},{"style":{"height":16},"width":185.36,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/10-13.png","element":"img","alt":" γ > 0, Q","inline":true,"padRight":true},{"text":"is continuous in ","element":"span"},{"style":{"height":12},"width":25,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/10-14.png","element":"img","alt":" θ","inline":true},{"text":". 
As a consequence of ","element":"span"},{"href":"#id-52","referenceIndex":37,"text":"Tseng ","element":"a"},{"href":"#id-52","referenceIndex":37,"text":"[2001]","element":"a"},{"text":", Theorem 4.1 (c), provided the minimisers ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":28},"width":82.04,"height":69.99,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/10-15.png","element":"img","alt":"θ(m)j","inline":true,"padRight":true},{"text":"in ","element":"span"},{"href":"#id-53","text":"(19) ","element":"a"},{"text":"are unique for all ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"(which will invariably be the case when the responses are realisations of continuous random variables; see Proposition ","element":"span"},{"href":"#id-54","text":"3)","element":"a"},{"text":", then all limit points of the sequence (","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":25.96},"width":174.48,"height":64.89,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/10-16.png","element":"img","alt":"θ(m))∞m=0 ","inline":true,"padRight":true},{"text":"are blockwise optima.","element":"span"}],[{"id":"id-37","style":{"fontWeight":"bold"},"text":"3.3 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Practicalities","element":"span"}],[{"text":"In practice the block coordinate descent procedure described above must be performed over a grid of (","element":"span"},{"style":{"height":16},"width":70.42,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/11-0.png","element":"img","alt":"γ, λ","inline":true},{"text":") values to facilitate tuning parameter selection by cross-validation. 
In line with analogous recommendations for other penalised regression optimisation procedures ","element":"span"},{"href":"#id-49","referenceIndex":2,"text":"[Breheny and ","element":"a"},{"href":"#id-49","referenceIndex":2,"text":"Huang, ","element":"a"},{"href":"#id-49","referenceIndex":2,"text":"2011, ","element":"a"},{"href":"#id-55","referenceIndex":13,"text":"Friedman et al., ","element":"a"},{"href":"#id-55","referenceIndex":13,"text":"2010]","element":"a"},{"text":", we propose, for each fixed ","element":"span"},{"style":{"height":11.6},"width":24,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/11-1.png","element":"img","alt":" γ","inline":true},{"text":", to iteratively obtain solutions for an exponentially decreasing sequence of ","element":"span"},{"style":{"height":12.8},"width":26,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/11-2.png","element":"img","alt":" λ","inline":true,"padRight":true},{"text":"values, warm starting each application of block coordinate descent at the solution for the previous ","element":"span"},{"style":{"height":12.8},"width":26,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/11-3.png","element":"img","alt":" λ","inline":true},{"text":". 
It is our experience that this scheme speeds up convergence and helps to guide the resulting estimates to statistically favourable local optima, as has been shown theoretically for certain nonconvex settings ","element":"span"},{"href":"#id-30","referenceIndex":40,"text":"[Wang et al., ","element":"a"},{"href":"#id-30","referenceIndex":40,"text":"2014]","element":"a"},{"text":".","element":"span"}],[{"text":"The grid of ","element":"span"},{"style":{"height":11.6},"width":24,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/11-4.png","element":"img","alt":" γ","inline":true,"padRight":true},{"text":"values can be chosen to be fairly coarse as the solutions appear to be less sensitive to this tuning parameter; in fact fixing ","element":"span"},{"style":{"height":17.6},"width":220.46,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/11-5.png","element":"img","alt":" γ ∈ {8, 32}","inline":true,"padRight":true},{"text":"yields competitive performance across a range of settings (see Section ","element":"span"},{"text":"6)","element":"span"},{"text":". ","element":"span"},{"text":"The choice ","element":"span"},{"style":{"height":16},"width":68.12,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/11-6.png","element":"img","alt":" γ ↓","inline":true,"padRight":true},{"text":"0, which mimics the ","element":"span"},{"style":{"height":16.4},"width":207.29,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/11-7.png","element":"img","alt":" ℓ0 penalty,","inline":true,"padRight":true},{"text":"has good statistical properties (see Theorem ","element":"span"},{"href":"#id-33","text":"5 ","element":"a"},{"text":"and following discussion). 
However the global optimum typically has a smaller basin of attraction and can be prohibitively hard to locate, particularly in low signal to noise ratio settings where larger ","element":"span"},{"style":{"height":11.6},"width":24,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/11-8.png","element":"img","alt":" γ","inline":true,"padRight":true},{"text":"tends to dominate.","element":"span"}]]},{"heading":"4 Theory","paragraphs":[[{"text":"In this section, we study the theoretical properties of SCOPE. Recall our model","element":"span"}],[{"id":"id-58","style":{"width":"67%"},"width":1218,"height":141,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/11-9.png","element":"img"}],[{"text":"for ","element":"span"},{"style":{"height":18.67},"width":456.85,"height":46.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/11-10.png","element":"img","alt":" i = 1, . . . , n, where θ0 ∈","inline":true,"padRight":true},{"text":"Θ. We will assume the errors (","element":"span"},{"style":{"height":18.09},"width":105.71,"height":45.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/11-11.png","element":"img","alt":"εi)ni=1 ","inline":true,"padRight":true},{"text":"have mean zero, are independent ","element":"span"},{"text":"and sub-Gaussian with parameter ","element":"span"},{"style":{"height":12.4},"width":121.65,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/11-12.png","element":"img","alt":" σ. 
Let","inline":true}],[{"style":{"width":"55%"},"width":1000,"height":57,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/11-13.png","element":"img"}],[{"text":"and define the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"oracle least squares estimate","element":"span"}],[{"id":"id-57","style":{"width":"78%"},"width":1419,"height":166,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/11-14.png","element":"img"}],[{"text":"This is the least squares estimate of ","element":"span"},{"style":{"height":15.47},"width":42.91,"height":38.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/11-15.png","element":"img","alt":" θ0 ","inline":true,"padRight":true},{"text":"with oracular knowledge of which categorical levels are fused in ","element":"span"},{"style":{"height":15.47},"width":56.84,"height":38.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/11-16.png","element":"img","alt":" θ0.","inline":true}],[{"text":"Note that in the case where the errors have equal variance ","element":"span"},{"style":{"height":15.13},"width":39.72,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/11-17.png","element":"img","alt":" v2","inline":true},{"text":", the expected mean squared prediction error of ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":20.48},"width":205.33,"height":51.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/11-18.png","element":"img","alt":"θ0 satisfies","inline":true}],[{"style":{"width":"82%"},"width":1490,"height":166,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/11-19.png","element":"img"}],[{"text":"with equality when ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":20.08},"width":42.91,"height":50.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/11-20.png","element":"img","alt":"θ0","inline":true,"padRight":true},{"text":"is 
unique.","element":"span"}],[{"text":"Our results below establish conditions under which ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":20.08},"width":42.91,"height":50.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/11-21.png","element":"img","alt":"θ0","inline":true,"padRight":true},{"text":"is a blockwise optimum ","element":"span"},{"href":"#id-56","text":"(20) ","element":"a"},{"text":"of the SCOPE objective function ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Q ","element":"span"},{"href":"#id-34","text":"(9)","element":"a"},{"text":", or in the univariate case when this in fact coincides with SCOPE. The minimum differences between the signals defined for each ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j ","element":"span"},{"text":"by","element":"span"}],[{"style":{"width":"69%"},"width":1256,"height":76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/11-22.png","element":"img"}],[{"style":{"width":"99%"},"width":1805,"height":275,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/12-0.png","element":"img"}],[{"text":"these latter two quantities are the minimum and maximum number of observations corresponding to a set of fused levels in the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j","element":"span"},{"text":"th predictor respectively.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"4.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Univariate model","element":"span"}],[{"text":"We first consider the univariate case, where as usual we will drop the subscript ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j ","element":"span"},{"text":"for simplicity. 
The following result establishes conditions for recovery of the oracle least squares estimate ","element":"span"},{"href":"#id-57","text":"(22)","element":"a"},{"text":".","element":"span"}],[{"id":"id-33","style":{"fontWeight":"bold"},"text":"Theorem 5. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Consider the model ","element":"span"},{"href":"#id-58","text":"(21) ","element":"a"},{"style":{"fontStyle":"italic"},"text":"in the univariate case with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p ","element":"span"},{"text":"= 1","element":"span"},{"style":{"fontStyle":"italic"},"text":". Suppose there exists","element":"span"}],[{"style":{"width":"99%"},"width":1802,"height":50,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/12-1.png","element":"img"}],[{"text":"max","element":"span"},{"style":{"height":17.6},"width":131.92,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/12-2.png","element":"img","alt":"{γ, ηs}","inline":true},{"style":{"fontStyle":"italic"},"text":". 
Suppose further that","element":"span"}],[{"id":"id-59","style":{"width":"66%"},"width":1192,"height":80,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/12-3.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Then with probability at least","element":"span"}],[{"style":{"width":"69%"},"width":1248,"height":109,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/12-4.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"the oracle least squares estimate ","element":"span"},{"text":"ˆ","element":"span"},{"href":"#id-57","style":{"height":24.48},"width":138.06,"height":61.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/12-5.png","element":"img","alt":"θ0 (22)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is the global optimum of ","element":"span"},{"href":"#id-34","text":"(9)","element":"a"},{"style":{"fontStyle":"italic"},"text":", so ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":20.08},"width":141.93,"height":50.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/12-6.png","element":"img","alt":"θ = ˆθ0.","inline":true}],[{"text":"For a choice of the tuning parameters (","element":"span"},{"style":{"height":17.6},"width":457.01,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/12-7.png","element":"img","alt":"γ, λ) with γ ≤ ηs and λ","inline":true,"padRight":true},{"text":"such that equality holds in ","element":"span"},{"href":"#id-59","text":"(24)","element":"a"},{"text":", we have, writing ∆ ","element":"span"},{"style":{"height":24.48},"width":409.42,"height":61.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/12-8.png","element":"img","alt":" ≡ ∆(θ0), that ˆθ = ˆθ0","inline":true,"padRight":true},{"text":"with probability at 
least","element":"span"}],[{"style":{"width":"39%"},"width":717,"height":55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/12-9.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"text":"is an absolute constant. The quantity ","element":"span"},{"style":{"height":12},"width":22,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/12-10.png","element":"img","alt":" η","inline":true,"padRight":true},{"text":"reflects how equal the number of observations in the true fused levels are: in settings where the prevalences of the underlying true levels are roughly equal, we would expect this to be closer to 1.","element":"span"}],[{"text":"Consider now an asymptotic regime where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s ","element":"span"},{"text":"and 1","element":"span"},{"style":{"fontStyle":"italic"},"text":"/","element":"span"},{"text":"∆ are allowed to diverge with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n","element":"span"},{"text":", ","element":"span"},{"style":{"height":17.6},"width":231.09,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/12-11.png","element":"img","alt":"nmin ≍ n/K","inline":true},{"text":", so all levels have roughly the same prevalence, and ","element":"span"},{"style":{"height":12},"width":22,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/12-12.png","element":"img","alt":" η","inline":true,"padRight":true},{"text":"is bounded away from zero, so all true underlying levels also have roughly the same prevalence. 
Then in order for ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":20.08},"width":131.84,"height":50.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/12-13.png","element":"img","alt":"θ = ˆθ0","inline":true,"padRight":true},{"text":"with high probability, we require ∆ ","element":"span"},{"style":{"height":20.8},"width":343.93,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/12-14.png","element":"img","alt":" ≳ σ√K log(K)/n","inline":true},{"text":". This requirement cannot be weakened for any estimator; this fact comes as a consequence of minimax lower bounds on mis-clustering errors in Gaussian mixture models ","element":"span"},{"href":"#id-60","referenceIndex":23,"text":"[Lu and Zhou, ","element":"a"},{"href":"#id-60","referenceIndex":23,"text":"2016, ","element":"a"},{"text":"Theorem 3.3].","element":"span"}],[{"text":"We remark that our result here concerning properties of the global minimiser of our objective is very different from existing results on local minimisers of objectives involving all-pairs-type penalties. For example, in the setting above where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K ","element":"span"},{"text":"= ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n","element":"span"},{"text":", Theorem 2 of ","element":"span"},{"href":"#id-20","referenceIndex":24,"text":"Ma and Huang ","element":"a"},{"href":"#id-20","referenceIndex":24,"text":"[2017] ","element":"a"},{"text":"gives that provided ","element":"span"},{"style":{"height":21.01},"width":962.38,"height":52.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/12-15.png","element":"img","alt":" s = o(n1/3(log n)−1/3) and ∆ ≫ σs3/2n−1/2√log(n","inline":true},{"text":"), there exists a sequence of local minimisers converging to the oracle least-squares estimate with high probability. 
This is significantly weaker than the condition ∆ ","element":"span"},{"style":{"height":20.8},"width":216.31,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/12-16.png","element":"img","alt":" ≳ σ√log(n","inline":true},{"text":") required for any estimator to recover oracle least-squares in this setting, illustrating the substantial difference between results on local and global optima here.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"4.2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Multivariate model","element":"span"}],[{"text":"When the number of variables is ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p > ","element":"span"},{"text":"1, models can become high-dimensional, with ordinary least squares estimation failing to provide a unique solution. We will however assume that the solution for ","element":"span"},{"style":{"height":15.02},"width":185.63,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/13-0.png","element":"img","alt":" θ ∈ Θ0 to","inline":true}],[{"style":{"width":"42%"},"width":769,"height":141,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/13-1.png","element":"img"}],[{"text":"is unique, which occurs if and only if the oracle least squares estimate ","element":"span"},{"href":"#id-57","text":"(22) ","element":"a"},{"text":"is unique. In this case, we note that ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":20.08},"width":190.88,"height":50.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/13-2.png","element":"img","alt":"θ0 = AY","inline":true,"padRight":true},{"text":"for a fixed matrix ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A","element":"span"},{"text":". 
","element":"span"},{"text":"A necessary condition for this is that ","element":"span"},{"style":{"height":20.76},"width":307.04,"height":51.9,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/13-3.png","element":"img","alt":"∑j(sj − 1) < n.","inline":true}],[{"text":"Our result below provides a bound on the probability that the oracle least squares estimate is a blockwise optimum of the SCOPE objective ","element":"span"},{"href":"#id-34","text":"(9) ","element":"a"},{"text":"with ","element":"span"},{"style":{"height":14.79},"width":232.26,"height":36.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/13-4.png","element":"img","alt":" ρj = ργj,λj.","inline":true,"padRight":true},{"text":"This is much more meaningful than an equivalent bound for ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":20.08},"width":42.91,"height":50.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/13-5.png","element":"img","alt":"θ0","inline":true,"padRight":true},{"text":"to be a local optimum as the number of local optima will be enormous. In general though there may be several blockwise optima, and it seems challenging to obtain a result giving conditions under which our blockwise coordinate descent procedure is guaranteed to converge to ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":20.08},"width":42.91,"height":50.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/13-6.png","element":"img","alt":"θ0","inline":true},{"text":". Our empirical results (Section ","element":"span"},{"text":"6) ","element":"span"},{"text":"however show that the fixed points computed in practice tend to give good performance.","element":"span"}],[{"id":"id-91","style":{"fontWeight":"bold"},"text":"Theorem 6. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Consider the model ","element":"span"},{"href":"#id-58","text":"(21) ","element":"a"},{"style":{"fontStyle":"italic"},"text":"and assume ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":20.08},"width":169.74,"height":50.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/13-7.png","element":"img","alt":"θ0 = AY","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":". Suppose that there exists ","element":"span"},{"style":{"height":17.6},"width":168.56,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/13-8.png","element":"img","alt":" η ∈ (0, 1]","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"such that ","element":"span"},{"style":{"height":22.02},"width":1617.78,"height":55.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/13-9.png","element":"img","alt":" η/sj ≤ n0j,min/n ≤ n0j,max/n ≤ 1/ηsj for all j = 1, . . . , p. Let γ∗j = min{γj, ηsj} and","inline":true},{"style":{"height":20.49},"width":347.89,"height":51.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/13-10.png","element":"img","alt":"γ∗j = max{γj, ηsj}","inline":true},{"style":{"fontStyle":"italic"},"text":". 
Further suppose that","element":"span"}],[{"id":"id-61","style":{"width":"66%"},"width":1204,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/13-11.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Then letting ","element":"span"},{"style":{"height":19.53},"width":468.66,"height":48.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/13-12.png","element":"img","alt":" cmin := (maxl(AAT )ll)−1","inline":true},{"style":{"fontStyle":"italic"},"text":", with probability at least","element":"span"}],[{"style":{"width":"77%"},"width":1402,"height":136,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/13-13.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"the oracle least squares estimate ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":20.08},"width":42.91,"height":50.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/13-14.png","element":"img","alt":"θ0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is a blockwise optimum of ","element":"span"},{"href":"#id-34","text":"(9)","element":"a"},{"style":{"fontStyle":"italic"},"text":".","element":"span"}],[{"text":"Now suppose ","element":"span"},{"style":{"height":17.42},"width":309.14,"height":43.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/13-15.png","element":"img","alt":" γj ≤ ηsj and λj","inline":true,"padRight":true},{"text":"are such that equality holds in ","element":"span"},{"href":"#id-61","text":"(26) ","element":"a"},{"text":"for all ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j","element":"span"},{"text":". 
Then writing ","element":"span"},{"style":{"height":22.19},"width":1166.26,"height":55.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/13-16.png","element":"img","alt":"Kmax = maxj Kj, nmin = minj nj,min and ∆min = minj ∆(θ0j","inline":true},{"text":"), we have that ˆ","element":"span"},{"style":{"height":20.08},"width":42.91,"height":50.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/13-17.png","element":"img","alt":"θ0","inline":true,"padRight":true},{"text":"is a blockwise optimum of ","element":"span"},{"href":"#id-34","text":"(9) ","element":"a"},{"text":"with probability at least","element":"span"}],[{"style":{"width":"55%"},"width":1001,"height":55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/13-18.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"text":"is an absolute constant. Consider now an analogous asymptotic regime to that described in the previous section for the univariate case. Specifically assume ","element":"span"},{"style":{"height":17.6},"width":519.1,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/13-19.png","element":"img","alt":" nmin ≍ n/Kmax and cmin ≳","inline":true},{"style":{"height":10.62},"width":82.81,"height":26.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/13-20.png","element":"img","alt":"nmin","inline":true,"padRight":true},{"text":"for simplicity. 
We then see that in order for ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":20.08},"width":42.91,"height":50.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/13-21.png","element":"img","alt":"θ0","inline":true,"padRight":true},{"text":"to be a blockwise optimum with high probability, it is sufficient that ∆","element":"span"},{"style":{"height":20.8},"width":570.5,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/13-22.png","element":"img","alt":"min ≳ σ√Kmax log(Kmaxp)/n.","inline":true}]]},{"heading":"5 Extensions","paragraphs":[[{"text":"In this section, we describe some extensions of our SCOPE methodology.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Continuous covariates. ","element":"span"},{"text":"If some of the covariates are continuous rather than categorical, we can apply any penalty function of choice to these, and perform a regression by optimising the sum of a least squares objective, our SCOPE penalty and these additional penalty functions, using (block) coordinate descent.","element":"span"}],[{"style":{"width":"96%"},"width":1735,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/14-0.png","element":"img"}],[{"style":{"height":15.93},"width":182.66,"height":39.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/14-1.png","element":"img","alt":"Z ∈ Rn×d ","inline":true,"padRight":true},{"text":"be the centred design matrix for these covariates with ","element":"span"},{"style":{"height":17.75},"width":298.03,"height":44.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/14-2.png","element":"img","alt":" ith row Zi ∈ Rd","inline":true},{"text":". 
One can fit a model with SCOPE penalising the categorical covariates, and the Lasso with tuning parameter ","element":"span"},{"style":{"height":10.4},"width":74.19,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/14-3.png","element":"img","alt":"α >","inline":true,"padRight":true},{"text":"0 penalising the continuous covariates, resulting in the following objective over ","element":"span"},{"style":{"height":18.73},"width":217.62,"height":46.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/14-4.png","element":"img","alt":" β ∈ Rd and","inline":true}],[{"style":{"width":"95%"},"width":1719,"height":221,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/14-5.png","element":"img"}],[{"text":"This sort of integration of continuous covariates is less straightforward when attempting to use tree-based methods to handle categorical covariates, for example.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Generalised linear models. ","element":"span"},{"text":"Sometimes a generalised linear model may be appropriate. Although a quadratic loss function is critical for our exact optimisation algorithm described in Section ","element":"span"},{"href":"#id-38","text":"3.1, ","element":"a"},{"text":"we can iterate local quadratic approximations to the loss term in the objective and minimise this. This results in a proximal Newton algorithm and is analogous to the standard approach for solving ","element":"span"},{"style":{"height":15.02},"width":35.18,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/14-6.png","element":"img","alt":" ℓ1","inline":true},{"text":"-penalised generalised linear models ","element":"span"},{"href":"#id-55","referenceIndex":13,"text":"[Friedman et al., ","element":"a"},{"href":"#id-55","referenceIndex":13,"text":"2010, ","element":"a"},{"text":"Section 3]. 
An implementation of this scheme in the case of logistic regression for binary responses is available in the accompanying R package ","element":"span"},{"style":{"fontFamily":"monospace"},"text":"CatReg","element":"span"},{"text":". We remark that when computing logistic regression models with a SCOPE penalty it is advisable to use a larger value of ","element":"span"},{"style":{"height":11.6},"width":24,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/14-7.png","element":"img","alt":" γ","inline":true,"padRight":true},{"text":"than with a continuous response to aid convergence of the proximal Newton step; we recommend a default setting of ","element":"span"},{"style":{"height":11.6},"width":24,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/14-8.png","element":"img","alt":"γ","inline":true,"padRight":true},{"text":"= 100. In Section ","element":"span"},{"href":"#id-62","text":"6.2 ","element":"a"},{"text":"we use the approach described above to perform a logistic regression using SCOPE on US census data.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Hierarchical categories. ","element":"span"},{"text":"Often certain predictors may have levels that are effectively subdivisions of the levels of other predictors. Examples include category of item in e-commerce or geographical data with predictors for continent, countries and district. For simplicity, we will illustrate how such settings may be dealt with by considering a case with two predictors, but this may easily be generalised to more complex hierarchical structures. Suppose there is a partition ","element":"span"},{"style":{"height":17.67},"width":565.42,"height":44.18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/14-9.png","element":"img","alt":" G1 ∪ · · · ∪ GK1 of {1, . . . 
, K2}","inline":true,"padRight":true},{"text":"such that for all ","element":"span"},{"style":{"height":15.6},"width":269.05,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/14-10.png","element":"img","alt":" k = 1, . . . , K1,","inline":true}],[{"style":{"width":"23%"},"width":427,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/14-11.png","element":"img"}],[{"text":"so the levels of the second predictor in ","element":"span"},{"style":{"height":15.24},"width":52.31,"height":38.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/14-12.png","element":"img","alt":" Gk","inline":true,"padRight":true},{"text":"represent subdivisions of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k","element":"span"},{"text":"th level of the first predictor. Let ","element":"span"},{"style":{"height":17.6},"width":455.08,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/14-13.png","element":"img","alt":" K2k := |Gk| and let θ2k","inline":true,"padRight":true},{"text":"refer to the subvector (","element":"span"},{"style":{"height":18.29},"width":658.66,"height":45.73,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/14-14.png","element":"img","alt":"θ2l)l∈Gk for each k = 1, . . . , K1, so","inline":true,"padRight":true},{"text":"components of ","element":"span"},{"style":{"height":14.84},"width":60.84,"height":37.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/14-15.png","element":"img","alt":" θ2k","inline":true,"padRight":true},{"text":"are the coefficients corresponding to the levels in ","element":"span"},{"style":{"height":19.55},"width":490.56,"height":48.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/14-16.png","element":"img","alt":" Gk. 
Also let θ2k(r) denote","inline":true,"padRight":true},{"text":"the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r","element":"span"},{"text":"th order statistic within ","element":"span"},{"style":{"height":14.84},"width":60.85,"height":37.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/14-17.png","element":"img","alt":" θ2k","inline":true},{"text":". It is natural to encourage fusion among levels within ","element":"span"},{"style":{"height":15.24},"width":52.31,"height":38.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/14-18.png","element":"img","alt":" Gk","inline":true,"padRight":true},{"text":"more strongly than for levels in different elements of the partition. To do this we can modify our objective function so the penalty takes the form","element":"span"}],[{"style":{"width":"58%"},"width":1059,"height":133,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/14-19.png","element":"img"}],[{"text":"We furthermore enforce the identifiability constraints that","element":"span"}],[{"style":{"width":"60%"},"width":1094,"height":137,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/14-20.png","element":"img"}],[{"text":"As well as yielding the desired shrinkage properties, an additional advantage of this approach is that the least squares criterion is separable in ","element":"span"},{"style":{"height":16.48},"width":245.15,"height":41.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/15-0.png","element":"img","alt":" θ21, . . . , θ2K1","inline":true,"padRight":true},{"text":"so the blockwise update of ","element":"span"},{"style":{"height":14.62},"width":125.16,"height":36.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/15-1.png","element":"img","alt":" θ2 can","inline":true,"padRight":true},{"text":"be performed in parallel. 
This can lead to a substantial reduction in computation time if ","element":"span"},{"style":{"height":14.62},"width":99.1,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/15-2.png","element":"img","alt":" K2 is","inline":true,"padRight":true},{"text":"large.","element":"span"}]]},{"heading":"6 Numerical experiments","paragraphs":[[{"text":"In this section we explore the empirical properties of SCOPE. We first present results on the performance on simulated data, and then in Sections ","element":"span"},{"href":"#id-62","text":"6.2 ","element":"a"},{"text":"to ","element":"span"},{"href":"#id-63","text":"6.5 ","element":"a"},{"text":"present analyses and experiments on US census data, insurance data and COVID-19 modelling data.","element":"span"}],[{"text":"We denote SCOPE with a specific choice of ","element":"span"},{"style":{"height":16},"width":275.95,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/15-3.png","element":"img","alt":" γ as SCOPE-γ","inline":true},{"text":", and write SCOPE-CV to denote SCOPE with a cross-validated choice of ","element":"span"},{"style":{"height":11.6},"width":24,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/15-4.png","element":"img","alt":" γ","inline":true},{"text":". 
SCOPE solutions are computed using our R ","element":"span"},{"href":"#id-64","referenceIndex":31,"text":"[R ","element":"a"},{"href":"#id-64","referenceIndex":31,"text":"Core Team, ","element":"a"},{"href":"#id-64","referenceIndex":31,"text":"2020] ","element":"a"},{"text":"package ","element":"span"},{"style":{"fontFamily":"monospace"},"text":"CatReg ","element":"span"},{"href":"#id-65","referenceIndex":33,"text":"[Stokell, ","element":"a"},{"href":"#id-65","referenceIndex":33,"text":"2021]","element":"a"},{"text":", using 5-fold cross-validation to select ","element":"span"},{"style":{"height":12.8},"width":149.18,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/15-5.png","element":"img","alt":" λ for all","inline":true,"padRight":true},{"text":"examples except those in Section ","element":"span"},{"href":"#id-63","text":"6.5. ","element":"a"},{"text":"We compare SCOPE to linear or logistic regression where appropriate and a range of existing methods, including CAS-ANOVA ","element":"span"},{"href":"#id-10","referenceIndex":1,"text":"[Bondell and Reich, ","element":"a"},{"href":"#id-10","referenceIndex":1,"text":"2009] ","element":"a"},{"href":"#id-15","text":"(4)","element":"a"},{"text":", and an adaptive version where the weights ","element":"span"},{"style":{"height":13.24},"width":82.78,"height":33.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/15-6.png","element":"img","alt":" wj,kl","inline":true,"padRight":true},{"text":"are multiplied by a factor proportional to the ","element":"span"},{"style":{"height":27.79},"width":482.7,"height":69.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/15-7.png","element":"img","alt":" |ˆθinitjk − ˆθinitjl |−1, where ˆθinit","inline":true,"padRight":true},{"text":"is an initial CAS-ANOVA estimate. 
For these methods the tuning parameter ","element":"span"},{"style":{"height":12.8},"width":26,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/15-8.png","element":"img","alt":" λ","inline":true,"padRight":true},{"text":"was also selected by 5-fold cross-validation. As well as this, we include Delete or merge regressors (DMR) ","element":"span"},{"href":"#id-6","referenceIndex":25,"text":"[Maj-Kańska et al., ","element":"a"},{"href":"#id-6","referenceIndex":25,"text":"2015] ","element":"a"},{"text":"and Bayesian effect fusion (BEF) ","element":"span"},{"href":"#id-13","referenceIndex":28,"text":"[Pauger ","element":"a"},{"href":"#id-13","referenceIndex":28,"text":"and Wagner, ","element":"a"},{"href":"#id-13","referenceIndex":28,"text":"2019] ","element":"a"},{"text":"in some experiments. With the former, models were fitted using ","element":"span"},{"style":{"fontFamily":"monospace"},"text":"DMRnet ","element":"span"},{"href":"#id-66","referenceIndex":30,"text":"[Prochenka-Sołtys and Pokarowski, ","element":"a"},{"href":"#id-66","referenceIndex":30,"text":"2018] ","element":"a"},{"text":"and selected by 5-fold cross-validation where possible; otherwise an information criterion was used. ","element":"span"},{"text":"With BEF, coefficients were modelled with a Gaussian mixture model with posterior mean estimated using 1000 samples using ","element":"span"},{"style":{"fontFamily":"monospace"},"text":"effectFusion ","element":"span"},{"href":"#id-67","referenceIndex":29,"text":"[Pauger et al., ","element":"a"},{"href":"#id-67","referenceIndex":29,"text":"2019]","element":"a"},{"text":". 
We also include comparison to the tree-based approaches CART ","element":"span"},{"href":"#id-7","referenceIndex":5,"text":"[Breiman ","element":"a"},{"href":"#id-7","referenceIndex":5,"text":"et al., ","element":"a"},{"href":"#id-7","referenceIndex":5,"text":"1984] ","element":"a"},{"text":"and random forests (RF) ","element":"span"},{"href":"#id-8","referenceIndex":4,"text":"[Breiman, ","element":"a"},{"href":"#id-8","referenceIndex":4,"text":"2001]","element":"a"},{"text":". Lastly, in some experiments, models were also fitted using the Lasso ","element":"span"},{"href":"#id-9","referenceIndex":35,"text":"[Tibshirani, ","element":"a"},{"href":"#id-9","referenceIndex":35,"text":"1996]","element":"a"},{"text":". CART was implemented using ","element":"span"},{"style":{"fontFamily":"monospace"},"text":"rpart ","element":"span"},{"href":"#id-68","referenceIndex":34,"text":"[Therneau ","element":"a"},{"href":"#id-68","referenceIndex":34,"text":"and Atkinson, ","element":"a"},{"href":"#id-68","referenceIndex":34,"text":"2019] ","element":"a"},{"text":"with pruning according to the one standard error rule. Random forests and Lasso were implemented using the default settings in ","element":"span"},{"style":{"fontFamily":"monospace"},"text":"randomForest ","element":"span"},{"href":"#id-69","referenceIndex":20,"text":"[Liaw and Wiener, ","element":"a"},{"href":"#id-69","referenceIndex":20,"text":"2002] ","element":"a"},{"text":"and ","element":"span"},{"style":{"fontFamily":"monospace"},"text":"glmnet ","element":"span"},{"href":"#id-55","referenceIndex":13,"text":"[Friedman et al., ","element":"a"},{"href":"#id-55","referenceIndex":13,"text":"2010] ","element":"a"},{"text":"packages respectively. 
For full details of the specific versions of these methods and software used in the numerical experiments, see Section ","element":"span"},{"href":"#id-70","text":"S3.1 ","element":"a"},{"text":"of the Supplementary material.","element":"span"}],[{"id":"id-140","style":{"fontWeight":"bold"},"text":"6.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Simulations","element":"span"}],[{"id":"id-83","style":{"width":"99%"},"width":1805,"height":803,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/15-9.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"g ","element":"span"},{"text":"is the true regression function, ˆ","element":"span"},{"style":{"fontStyle":"italic"},"text":"g ","element":"span"},{"text":"an estimate, and the expectation is taken over the covariate vector ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":".","element":"span"}],[{"id":"id-132","style":{"fontWeight":"bold"},"text":"6.1.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Low-dimensional experiments","element":"span"}],[{"text":"Results are presented for three settings with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n ","element":"span"},{"text":"= 500, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p ","element":"span"},{"text":"= 10 given below.","element":"span"}],[{"style":{"width":"85%"},"width":1552,"height":254,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/16-0.png","element":"img"}],[{"text":"3. 
As Setting 1, but with ","element":"span"},{"style":{"height":15.6},"width":148.5,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/16-1.png","element":"img","alt":" ρ = 0.8.","inline":true}],[{"style":{"width":"100%"},"width":1813,"height":584,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/16-2.png","element":"img"}],[{"text":"RF ","element":"span"},{"text":"9","element":"span"},{"style":{"height":12.8},"width":1324.23,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/16-3.png","element":"img","alt":".621(0.5)10.944(0.5)13.217(0.7) 16.344(0.9) 8.947(0.3) 9.747(0.4)11.249(0.6) 13.646(0.8)","inline":true,"padRight":true},{"text":"Setting 3 ","element":"span"},{"style":{"height":12.49},"width":674.76,"height":31.24,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/16-4.png","element":"img","alt":"σ2: 1 6.25 25 100","inline":true,"padRight":true},{"text":"SNR: ","element":"span"},{"text":"7.3 ","element":"span"},{"text":"2.9 ","element":"span"},{"text":"1.5 ","element":"span"},{"text":"0.73","element":"span"}],[{"style":{"width":"61%"},"width":1113,"height":493,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/16-5.png","element":"img"}],[{"id":"id-71","text":"Table 1: Mean squared prediction errors (and standard deviations thereof) of various methods ","element":"figcaption","subtype":"caption"},{"text":"on the settings described.","element":"figcaption","subtype":"caption"}],[{"text":"Each of these experiments were performed with noise variance ","element":"span"},{"style":{"height":15.13},"width":43.5,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/16-6.png","element":"img","alt":" σ2 ","inline":true,"padRight":true},{"text":"= 1, 6.25, 25 and 100. 
Note that the variance of the signal varies across each setting, and signal-to-noise ratio (SNR) for each experiment is displayed in Table ","element":"span"},{"href":"#id-71","text":"1. ","element":"a"},{"text":"Methods included for comparison were SCOPE-8, SCOPE-32, SCOPE-CV, linear regression, vanilla and adaptive CAS-ANOVA, DMR, Bayesian effect fusion, CART and random forests. Also included are the results from the oracle least squares estimator ","element":"span"},{"href":"#id-57","text":"(22)","element":"a"},{"text":".","element":"span"}],[{"text":"Results are shown in Table ","element":"span"},{"href":"#id-71","text":"1 ","element":"a"},{"text":"and further details are given in Section ","element":"span"},{"href":"#id-72","text":"S3.2.1 ","element":"a"},{"text":"of the Supplementary material. Across all experiments, SCOPE with a cross-validated choice of ","element":"span"},{"style":{"height":16},"width":190.43,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/16-7.png","element":"img","alt":" γ exhibits","inline":true,"padRight":true},{"text":"prediction performance at least as good as the optimal approaches, and in all but the lowest noise settings performs better than the other methods that were included. In these exceptions, we see that fixing ","element":"span"},{"style":{"height":11.6},"width":24,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/17-0.png","element":"img","alt":" γ","inline":true,"padRight":true},{"text":"to be a small value (corresponding to high-concavity) provides leading performance.","element":"span"}],[{"text":"In these low noise settings, we see that the methods based on first estimating the clusterings of the levels and then estimating the coefficients without introducing further shrinkage, such as DMR or Bayesian effect Fusion, perform well. However they tend to struggle when the noise is larger. 
In contrast the tree-based methods perform poorly in low noise settings but exhibit competitive performance in high noise settings.","element":"span"}],[{"id":"id-135","style":{"fontWeight":"bold"},"text":"6.1.2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"High-dimensional experiments","element":"span"}],[{"text":"We considered 8 settings as detailed below, each with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n ","element":"span"},{"text":"= 500, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p ","element":"span"},{"text":"= 100 and simulated 500 times.","element":"span"}],[{"style":{"width":"96%"},"width":1747,"height":165,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/17-1.png","element":"img"}],[{"text":"2. As Setting 1, but with ","element":"span"},{"style":{"height":16},"width":148.5,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/17-2.png","element":"img","alt":" ρ = 0.5.","inline":true}],[{"style":{"width":"96%"},"width":1749,"height":508,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/17-3.png","element":"img"}],[{"text":"6. As Setting 5, but with ","element":"span"},{"style":{"height":16},"width":148.5,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/17-4.png","element":"img","alt":" ρ = 0.5.","inline":true}],[{"style":{"width":"96%"},"width":1749,"height":337,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/17-5.png","element":"img"}],[{"text":"Models were fitted using SCOPE-8, SCOPE-32, SCOPE-CV, DMR, CART, Random forests and the Lasso. Table ","element":"span"},{"href":"#id-73","text":"2 ","element":"a"},{"text":"gives the mean squared prediction errors across each of the settings.","element":"span"}],[{"text":"As well as prediction performance, it is interesting to see how the methods perform in terms of variable selection performance. 
With categorical covariates, there are two potential ways of evaluating this. The first is to consider the number of false positives and false negatives across the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p ","element":"span"},{"text":"= 100 categorical variables, defining a variable ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j ","element":"span"},{"text":"to have been selected if ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":17.82},"width":140.88,"height":44.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/17-6.png","element":"img","alt":"θj ̸= 0.","inline":true,"padRight":true},{"text":"These results are shown in Table ","element":"span"},{"href":"#id-74","text":"3. ","element":"a"},{"text":"This definition of a false positive can be considered quite conservative; typically one can find that often the false signal variables have only two levels, each with quite small coefficients. This means that the false positive rate can increase substantially with only a small increase in the dimension of the estimated linear model.","element":"span"}],[{"text":"The second is to see within the signal variables (i.e., the ","element":"span"},{"style":{"height":22.19},"width":292.46,"height":55.47,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/18-0.png","element":"img","alt":" j for which θ0j ̸","inline":true},{"text":"= 0), how closely ","element":"span"},{"text":"the estimated clustering resembles the true structure. To quantify this, we use the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"adjusted Rand index ","element":"span"},{"href":"#id-75","referenceIndex":17,"text":"[Hubert and Arabie, ","element":"a"},{"href":"#id-75","referenceIndex":17,"text":"1985]","element":"a"},{"text":". 
This is the proportion of all pairs of observations that are either (i) in different true clusters and different estimated clusters, or (ii) in the same true cluster and estimated cluster; this is then corrected to ensure that its value is zero when exactly one of the clusterings is ‘all-in-one’. In Table ","element":"span"},{"href":"#id-76","text":"4 ","element":"a"},{"text":"we report the average adjusted Rand index over the true signal variables in each setting.","element":"span"}],[{"style":{"width":"97%"},"width":1761,"height":449,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/18-1.png","element":"img"}],[{"id":"id-73","text":"Table 2: Mean squared prediction errors (and standard deviations thereof) of each of the meth- ","element":"figcaption","subtype":"caption"},{"text":"ods in the 8 high-dimensional settings considered.","element":"figcaption","subtype":"caption"}],[{"style":{"width":"100%"},"width":1819,"height":272,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/18-2.png","element":"img"}],[{"id":"id-74","text":"Table 3: (False positive rate)/(False negative rate) of linear modelling methods considered in ","element":"figcaption","subtype":"caption"},{"text":"the high-dimensional settings.","element":"figcaption","subtype":"caption"}],[{"style":{"width":"63%"},"width":1149,"height":264,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/18-3.png","element":"img"}],[{"id":"id-76","text":"Table 4: Average adjusted Rand index among true signal variables for the high-dimensional ","element":"figcaption","subtype":"caption"},{"text":"settings.","element":"figcaption","subtype":"caption"}],[{"text":"Further details can be found in Section ","element":"span"},{"href":"#id-77","text":"S3.2.2 ","element":"a"},{"text":"of the Supplementary material. 
","element":"span"},{"text":"In particular we include a table with the distribution of cross-validated choices of ","element":"span"},{"style":{"height":11.6},"width":24,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/18-4.png","element":"img","alt":" γ","inline":true,"padRight":true},{"text":"(from a grid ","element":"span"},{"style":{"fontStyle":"italic"},"text":"{","element":"span"},{"text":"4","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"8","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"16","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"32","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"64","element":"span"},{"style":{"fontStyle":"italic"},"text":"}","element":"span"},{"text":") for each experimental setting. Note that a choice of ","element":"span"},{"style":{"height":11.6},"width":24,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/18-5.png","element":"img","alt":" γ","inline":true,"padRight":true},{"text":"= 4 is close to the setting of ","element":"span"},{"style":{"height":11.6},"width":24,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/18-6.png","element":"img","alt":" γ","inline":true,"padRight":true},{"text":"= 3 recommended in ","element":"span"},{"href":"#id-25","referenceIndex":42,"text":"Zhang ","element":"a"},{"href":"#id-25","referenceIndex":42,"text":"[2010]","element":"a"},{"text":", though the problem of categorical covariates is very different in nature than the vanilla variable selection problem considered there. 
Our results there suggest that for SCOPE, a larger value of ","element":"span"},{"style":{"height":11.6},"width":24,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/18-7.png","element":"img","alt":" γ","inline":true,"padRight":true},{"text":"is preferable across a range of settings, which is also visible in the comparison between ","element":"span"},{"style":{"height":16},"width":228.41,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/18-8.png","element":"img","alt":" γ = 8 and γ","inline":true,"padRight":true},{"text":"= 32 in Table ","element":"span"},{"href":"#id-73","text":"2.","element":"a"}],[{"text":"Across all the settings in this study, SCOPE performs better than any of the other methods included. This is regardless of which of the three ","element":"span"},{"style":{"height":11.6},"width":24,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/18-9.png","element":"img","alt":" γ","inline":true,"padRight":true},{"text":"regimes is chosen, although cross-validating ","element":"span"},{"style":{"height":11.6},"width":24,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/18-10.png","element":"img","alt":"γ","inline":true,"padRight":true},{"text":"gives the strongest performance overall. Comparing the results for ","element":"span"},{"style":{"height":16.4},"width":450.78,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/18-11.png","element":"img","alt":" γ = 8 and γ = 32 sug-","inline":true,"padRight":true},{"text":"gests that a larger (low-concavity) choice of ","element":"span"},{"style":{"height":11.6},"width":24,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/18-12.png","element":"img","alt":" γ","inline":true,"padRight":true},{"text":"is preferable for higher-dimensional settings. 
In setting 6, we see from Tables ","element":"span"},{"href":"#id-74","text":"3 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-76","text":"4 ","element":"a"},{"text":"that SCOPE obtains the true underlying groupings of the coefficients and obtains the oracle least-squares estimate in every case, giving these striking results. This is also achieved for some of the experiments in setting 5. In contrast, DMR, which initially applies a group Lasso ","element":"span"},{"href":"#id-78","referenceIndex":41,"text":"[Yuan and Lin, ","element":"a"},{"href":"#id-78","referenceIndex":41,"text":"2006] ","element":"a"},{"text":"to screen the categorical variables and give a low-dimensional model, necessarily misses some signal variables in this first stage and hence struggles here.","element":"span"}],[{"id":"id-62","style":{"fontWeight":"bold"},"text":"6.2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Adult dataset analysis","element":"span"}],[{"text":"The ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Adult dataset","element":"span"},{"text":", available from the UCI Machine Learning Repository ","element":"span"},{"href":"#id-79","referenceIndex":10,"text":"[Dua and Graff, ","element":"a"},{"href":"#id-79","referenceIndex":10,"text":"2019]","element":"a"},{"text":", contains a sample of 45 222 observations based on information from the 1994 US census. The binary response variable is 0 if the individual earns at most ","element":"span"},{"text":"$$","element":"span"},{"text":"50 000 a year, and 1 otherwise. There are 2 continuous and 8 categorical variables; some such as ‘native country’ have large numbers of levels, bringing the total dimension to 93. An advantage of using SCOPE here over black-box predictive tools such as Random forests is the interpretability of the fitted model.","element":"span"}],[{"text":"In Table ","element":"span"},{"href":"#id-80","text":"5, ","element":"a"},{"text":"we show the 25-dimensional fitted model. 
Within the Education category, we see that six distinct levels have been identified. These agree almost exactly with the stratification one would expect, with all school dropouts before 12th grade being grouped together at the lowest level.","element":"span"}],[{"style":{"width":"100%"},"width":1806,"height":591,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/19-0.png","element":"img"}],[{"text":"Figure 4: Prediction performance and fitted model dimension (respectively) of various methods on the Adult dataset: (A) SCOPE-100; (B) SCOPE-250; (C) Logistic regression; (D) CAS-ANOVA; (E) Adaptive CAS-ANOVA; (F) DMR; (G) BEF; (H) CART; (I) RF.","element":"figcaption","subtype":"caption"}],[{"text":"Here we assess performance in the challenging setting when the training set is quite small by randomly selecting 1% (452) of the total observations for training, and using the remainder as a test set. Any observations containing levels not in the training set were removed. Models were fitted with SCOPE-100, SCOPE-250, logistic regression, vanilla and adaptive CAS-ANOVA, DMR, Bayesian effect fusion, CART and random forests.","element":"span"}],[{"text":"We see that both SCOPE-100 and SCOPE-250 are competitive, with CART and Random forests also performing well, though the latter two include interactions in their fits. ","element":"span"},{"text":"CAS-ANOVA also performs fairly well, though the misclassification error is larger than for both versions of SCOPE, and the average fitted model size is larger.","element":"span"}],[{"style":{"width":"100%"},"width":1816,"height":1638,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/20-0.png","element":"img"}],[{"id":"id-80","text":"Table 5: Coefficients of SCOPE model trained on the full dataset. 
Here, ","element":"figcaption","subtype":"caption"},{"style":{"height":16},"width":372,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/20-1.png","element":"img","alt":" γ = 100 and λ was","inline":true,"padRight":true},{"text":"selected by 5-fold cross-validation (with cross-validation error of 16.82%). ","element":"figcaption","subtype":"caption"},{"text":"Countries, aside from those in the UK, are referred to by their (possibly historical) internet top-level domains. *Relation with which the subject lives.","element":"figcaption","subtype":"caption"}],[{"style":{"width":"85%"},"width":1551,"height":447,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/20-2.png","element":"img"}],[{"id":"id-99","text":"Table 6: Results of experiments on the Adult dataset.","element":"figcaption","subtype":"caption"}],[{"style":{"width":"90%"},"width":1640,"height":738,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/21-0.png","element":"img"}],[{"id":"id-81","text":"Figure 5: Misclassification error and dimensions of models fitted on a sample of the ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"Adult dataset ","element":"figcaption","subtype":"caption"},{"text":"when levels have been artificially split ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"m ","element":"figcaption","subtype":"caption"},{"text":"times.","element":"figcaption","subtype":"caption"}],[{"style":{"fontWeight":"bold"},"text":"6.3 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Adult dataset with artificially split levels","element":"span"}],[{"text":"To create a more challenging example, we artificially created additional levels in the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Adult dataset ","element":"span"},{"text":"as follows. 
For each categorical variable we recursively selected a level with probability proportional to its prevalence in the data and then split it into two by appending “-0” or “-1” to the level for each observation independently and with equal probabilities. We repeated this until the total number of levels reached ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"times the original number of levels for that variable for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"= 2","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"3","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"4. ","element":"span"},{"text":"This process simulates for example responses to a survey, where different respondents might answer ‘US’, ‘U.S.’, ‘USA’, ‘U.S.A.’, ‘United States’ or ‘United States of America’ to a question, which would naively all be treated as different answers.","element":"span"}],[{"text":"We used 2.5% (1130) of the observations for training and the remainder for testing and applied SCOPE with ","element":"span"},{"style":{"height":11.6},"width":24,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/21-1.png","element":"img","alt":" γ","inline":true,"padRight":true},{"text":"= 100 and logistic regression. Results were averaged over 250 training and test splits. 
Figure ","element":"span"},{"href":"#id-81","text":"5 ","element":"a"},{"text":"shows that as the number of levels increases, the misclassification error of SCOPE increases only slightly and the fitted model dimension remains almost unchanged, whereas both increase with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"for logistic regression.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"6.4 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Insurance data example","element":"span"}],[{"text":"The Prudential Life Insurance Assessment challenge was a prediction competition run on ","element":"span"},{"href":"#id-82","referenceIndex":19,"text":"Kaggle ","element":"a"},{"href":"#id-82","referenceIndex":19,"text":"[2015]","element":"a"},{"text":". By more accurately predicting risk, the burden of extensive tests and check-ups for life insurance policyholders could potentially be reduced. For this experiment, we use the training set that was provided for entrants of the competition.","element":"span"}],[{"text":"We removed a small number of variables due to excessive missingness, leaving 5 continuous variables and 108 categorical variables, most with 2 or 3 levels but with some in the hundreds (and the largest with 579 levels). Rather than using the response from the original dataset, which is ordinal, to better suit the regression setting we are primarily concerned with in this work, we artificially generated a continuous response. To construct this signal, firstly 10 of the categorical variables were selected at random, with probability proportional to the number of levels. 
For the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j","element":"span"},{"text":"th of these, writing ","element":"span"},{"style":{"height":17.02},"width":52.06,"height":42.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/21-2.png","element":"img","alt":" Kj","inline":true,"padRight":true},{"text":"for the number of levels, we set ","element":"span"},{"style":{"height":21.29},"width":453.04,"height":53.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/21-3.png","element":"img","alt":" sj := ⌊2 + (1/2) log Kj⌋ and","inline":true,"padRight":true},{"text":"assigned each level a coefficient in 1","element":"span"},{"style":{"height":13.02},"width":132.42,"height":32.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/21-4.png","element":"img","alt":", . . . , sj","inline":true,"padRight":true},{"text":"uniformly at random, thus yielding ","element":"span"},{"style":{"height":13.02},"width":35.46,"height":32.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/21-5.png","element":"img","alt":" sj","inline":true,"padRight":true},{"text":"true levels. The coefficients for the 5 continuous covariates were generated as draws from ","element":"span"},{"style":{"height":18.4},"width":270.9,"height":46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/21-6.png","element":"img","alt":" N5(0, I). The","inline":true,"padRight":true},{"text":"response was then scaled to have unit variance, after which standard normal noise was added.","element":"span"}],[{"style":{"width":"92%"},"width":1672,"height":446,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/22-0.png","element":"img"}],[{"id":"id-84","text":"Figure 6: Mean squared prediction error on the example based on the Prudential Life Insurance ","element":"figcaption","subtype":"caption"},{"text":"Assessment dataset. 
Methods used are: (A) SCOPE-8; (B) SCOPE-32; (C) SCOPE-CV; (D) CART; (E) RF; (F) Lasso.","element":"figcaption","subtype":"caption"}],[{"text":"We used 10% (","element":"span"},{"style":{"fontStyle":"italic"},"text":"n ","element":"span"},{"text":"= 5938) of the 59 381 total number of observations for training, and the remainder to compute an estimated MSPE ","element":"span"},{"href":"#id-83","text":"(28) ","element":"a"},{"text":"by taking an average over these observations. We repeated this 1000 times, sampling 10% of the observations and generating the coefficients as above anew in each repetition. The average mean squared prediction errors achieved by the various methods under comparison are given in Figure ","element":"span"},{"href":"#id-84","text":"6. ","element":"a"},{"text":"We see that SCOPE with a cross-validated choice of ","element":"span"},{"style":{"height":11.6},"width":24,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/22-1.png","element":"img","alt":" γ","inline":true,"padRight":true},{"text":"performs best, followed by the Lasso and SCOPE-32.","element":"span"}],[{"id":"id-63","style":{"fontWeight":"bold"},"text":"6.5 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"COVID-19 Forecast Hub example","element":"span"}],[{"text":"As well as the prediction performance experiments in the rest of this section, we include an exploratory data analysis example based on data relating to the ongoing (at time of writing) global COVID-19 pandemic. The ","element":"span"},{"href":"#id-85","referenceIndex":9,"text":"COVID-19 Forecast Hub ","element":"a"},{"href":"#id-85","referenceIndex":9,"text":"[2020] ","element":"a"},{"style":{"height":16},"width":548.45,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/22-2.png","element":"img","alt":" ‘. . . 
serves as a central repos-","inline":true},{"style":{"height":16.4},"width":1497.8,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/22-3.png","element":"img","alt":"itory of forecasts and predictions from over 50 international research groups.’","inline":true,"padRight":true},{"text":"A collection of different research groups publish forecasts every week of case incidence in each US state for some number of weeks into the future.","element":"span"}],[{"text":"In order to understand some of the difficulties of this challenging forecasting problem, we fitted an error decomposition model of the form","element":"span"}],[{"style":{"width":"78%"},"width":1418,"height":108,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/22-4.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"w ","element":"span"},{"text":"is the week that the forecast is for, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"l ","element":"span"},{"text":"is the state, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"indexes the forecasting model, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"is the ‘target’ number of weeks in the future the forecast is for, ","element":"span"},{"style":{"height":13.25},"width":127.28,"height":33.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/22-5.png","element":"img","alt":" ηm,t,w,l","inline":true,"padRight":true},{"text":"is an error term, and cases","element":"span"},{"style":{"height":11.2},"width":44.59,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/22-6.png","element":"img","alt":"w,l","inline":true,"padRight":true},{"text":"and 
est.cases","element":"span"},{"style":{"height":11.2},"width":105.61,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/22-7.png","element":"img","alt":"m,t,w,l","inline":true,"padRight":true},{"text":"are the observed and estimated cases respectively. This decomposition allows an interaction term between time and location, which is important given that the pandemic was known to be more severe at different times for different areas. An interaction between model and forecasting distance was also included in order to capture the effect of some models potentially being more ‘short-sighted’ than others. The inclusion of the +1 on the left-hand side is to avoid numerators or denominators of zero.","element":"span"}],[{"text":"We used data from 6 April 2020 to 19 October 2020, giving a total of 100 264 (","element":"span"},{"style":{"fontStyle":"italic"},"text":"m, t, w, l","element":"span"},{"text":")-tuples. We applied a SCOPE penalty with ","element":"span"},{"style":{"height":18.04},"width":262.99,"height":45.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/22-8.png","element":"img","alt":" γ = 8 to βw,l","inline":true},{"text":", which had 1428 levels. The ","element":"span"},{"style":{"height":13.02},"width":79.29,"height":32.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/22-9.png","element":"img","alt":" αm,t","inline":true,"padRight":true},{"text":"coefficients, which amounted to 170 levels, were left unpenalised. 
The additional tuning parameter ","element":"span"},{"style":{"height":12.8},"width":26,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/22-10.png","element":"img","alt":" λ","inline":true,"padRight":true},{"text":"was selected using the Extended Bayesian Information Criterion ","element":"span"},{"href":"#id-86","referenceIndex":7,"text":"[Chen and Chen, ","element":"a"},{"href":"#id-86","referenceIndex":7,"text":"2008] ","element":"a"},{"text":"rather than cross-validation, as it was more suited to this sort of exploratory analysis on data with a chronological structure.","element":"span"}],[{"text":"The resulting estimates ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":18.04},"width":69.27,"height":45.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/22-11.png","element":"img","alt":"βw,l","inline":true,"padRight":true},{"text":"had 8 levels. We measured the ‘similarity’ of two US states ","element":"span"},{"style":{"height":15.24},"width":161.78,"height":38.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/22-12.png","element":"img","alt":"la and lb","inline":true,"padRight":true},{"text":"over a period of time by computing the proportion of weeks at which their estimates","element":"span"}],[{"style":{"width":"100%"},"width":1806,"height":1640,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/23-0.png","element":"img"}],[{"id":"id-87","text":"Figure 7: Similarity matrix for US states computed based on data relating to the second ‘wave’ ","element":"figcaption","subtype":"caption"},{"text":"of the COVID-19 pandemic in the US, taken to be from 26 June 2020 to 29 August 2020.","element":"figcaption","subtype":"caption"}],[{"text":"ˆ","element":"span"},{"style":{"height":22.25},"width":232.84,"height":55.62,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/23-1.png","element":"img","alt":"βw,la = 
ˆβw,lb","inline":true,"padRight":true},{"text":"coincided. The similarity matrix presented in Figure ","element":"span"},{"href":"#id-87","text":"7 ","element":"a"},{"text":"was constructed based on the second ‘wave’ of the epidemic which occurred in Summer 2020, with clusters identified by applying spectral clustering on the similarity matrix and plotted in order of decreasing within-cluster median pairwise similarity.","element":"span"}],[{"text":"The resulting clusters are at once interpretable and interesting. Roughly speaking, the top 3 clusters (‘top’ when ordered according to median pairwise within-cluster agreement) correspond to states that experienced notable pandemic activity in the second, first, and third ‘waves’ of the U.S. coronavirus pandemic, respectively. The first cluster features several southern States (e.g., Georgia, Florida, Texas) which experienced a surge of COVID cases in June–July. The second cluster features east coast states (e.g., New Jersey and New York) which experienced an enormous pandemic toll in April–May. And the third features midwestern states (e.g., Kentucky, Indiana, Nebraska) which had upticks most recently in September-October.","element":"span"}]]},{"heading":"7 Discussion","paragraphs":[[{"text":"In this work we have introduced a new penalty-based method for performing regression on categorical data. ","element":"span"},{"text":"An attractive feature of a penalty-based approach is that it can be integrated easily with existing methods for regression with continuous data, such as the Lasso. Our penalty function is nonconvex, but in contrast to the use of nonconvex penalties in standard high-dimensional regression problems, the nonconvexity here is necessary in order to obtain sparse solutions, that is fusions of levels. 
Whilst computing the global optimum of nonconvex problems is typically very challenging, for the case with a single categorical variable with several hundred levels, our dynamic programming algorithm can typically solve the resulting optimisation problem in less than a second on a standard laptop computer. The algorithm is thus fast enough to be embedded within a block coordinate descent procedure for handling multiple categorical variables.","element":"span"}],[{"text":"We give sufficient conditions for SCOPE to recover the oracle least squares solution when ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p ","element":"span"},{"text":"= 1 involving a minimal separation between unequal coefficients that is optimal up to constant factors. For the multivariate case where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p > ","element":"span"},{"text":"1, we show that oracle least squares is a fixed point of our block coordinate descent algorithm, with high probability.","element":"span"}],[{"text":"Our work offers several avenues for further work. ","element":"span"},{"text":"On the theoretical front, it would be interesting to obtain guarantees for block coordinate descent to converge to a local optimum with good statistical properties, a phenomenon that we observe empirically. On the methodology side, it would be useful to generalise the penalty to allow for clustering multivariate coefficient vectors; such clustering could be helpful in the context of mixtures of regressions models, for example.","element":"span"}]]},{"heading":"References","paragraphs":[[{"id":"id-10","text":"H. D. Bondell and B. J. Reich. Simultaneous factor selection and collapsing levels in ANOVA. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Biometrics","element":"span"},{"text":", 65(1):169–177, 2009.","element":"span"}],[{"id":"id-49","text":"P. Breheny and J. Huang. 
Coordinate descent algorithms for nonconvex penalized regression, ","element":"span"},{"text":"with applications to biological feature selection. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"The Annals of Applied Statistics","element":"span"},{"text":", 5(1):232, 2011.","element":"span"}],[{"id":"id-51","text":"P. Breheny and J. Huang. Group descent algorithms for nonconvex penalized linear and logistic ","element":"span"},{"text":"regression models with grouped predictors. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Statistics and Computing","element":"span"},{"text":", 25(2):173–187, 2015.","element":"span"}],[{"id":"id-8","text":"L. Breiman. Random forests. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Machine Learning","element":"span"},{"text":", 45(1):5–32, 2001.","element":"span"}],[{"id":"id-7","text":"L. Breiman, J. Friedman, C. Stone, and R. Olshen. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Classification and Regression Trees","element":"span"},{"text":". The Wadsworth and Brooks-Cole statistics-probability series. Taylor & Francis, 1984.","element":"span"}],[{"id":"id-5","text":"T. Calinski and L. Corsten. Clustering means in anova by simultaneous testing. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Biometrics","element":"span"},{"text":", pages 39–48, 1985.","element":"span"}],[{"id":"id-86","text":"J. Chen and Z. Chen. Extended Bayesian information criteria for model selection with large ","element":"span"},{"text":"model spaces. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Biometrika","element":"span"},{"text":", 95(3):759–71, 2008.","element":"span"}],[{"id":"id-12","text":"J. Chiquet, P. Gutierrez, and G. Rigaill. Fast tree inference with weighted fusion penalties. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Computational and Graphical Statistics","element":"span"},{"text":", 26(1):205–216, 2017.","element":"span"}],[{"id":"id-85","text":"COVID-19 Forecast Hub. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"URL: https://covid19forecasthub.org","element":"span"},{"text":", 2020.","element":"span"}],[{"id":"id-79","text":"D. Dua and C. Graff. UCI machine learning repository, 2019.","element":"span"}],[{"id":"id-26","text":"J. Fan and R. Li. Variable selection via nonconcave penalized likelihood and its oracle properties. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of the American Statistical Association","element":"span"},{"text":", 96(456):1348–1360, 2001.","element":"span"}],[{"id":"id-31","text":"J. Fan, H. Liu, Q. Sun, and T. Zhang. I-LAMM for sparse learning: Simultaneous control of ","element":"span"},{"text":"algorithmic complexity and statistical error. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"The Annals of Statistics","element":"span"},{"text":", 46(2):814–841, 2018.","element":"span"}],[{"id":"id-55","text":"J. Friedman, T. Hastie, and R. Tibshirani. Regularization paths for generalized linear models ","element":"span"},{"text":"via coordinate descent. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Statistical Software","element":"span"},{"text":", 33(1):1–22, 2010.","element":"span"}],[{"id":"id-23","text":"J. Gertheiss and G. Tutz. Sparse modeling of categorial explanatory variables. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"The Annals of Applied Statistics","element":"span"},{"text":", 4(4):2150–2180, 2010.","element":"span"}],[{"id":"id-11","text":"T. D. Hocking, A. Joulin, F. Bach, and J.-P. Vert. Clusterpath an algorithm for clustering using ","element":"span"},{"text":"convex fusion penalties. 
In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"28th International Conference on Machine Learning","element":"span"},{"text":", page 1, 2011.","element":"span"}],[{"id":"id-1","text":"S. Hu, A. O’Hagan, and T. B. Murphy. Motor insurance claim modelling with factor collapsing ","element":"span"},{"text":"and bayesian model averaging. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Stat","element":"span"},{"text":", 7(1):e180, 2018.","element":"span"}],[{"id":"id-75","text":"L. Hubert and P. Arabie. Comparing partitions. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Classification","element":"span"},{"text":", 2(1):193–218, 1985.","element":"span"}],[{"id":"id-0","text":"P. B. Jensen, L. J. Jensen, and S. Brunak. Mining electronic health records: towards better ","element":"span"},{"text":"research applications and clinical care. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Nature Reviews Genetics","element":"span"},{"text":", 13(6):395, 2012.","element":"span"}],[{"id":"id-82","text":"Kaggle. Prudential Life Insurance Assessment. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"URL: https://www.kaggle.com/c/prudential-life-insurance-assessment/data","element":"span"},{"text":", 2015.","element":"span"}],[{"id":"id-69","text":"A. Liaw and M. Wiener. Classification and regression by randomforest. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"R News","element":"span"},{"text":", 2(3):18–22, 2002.","element":"span"}],[{"id":"id-28","text":"P.-L. Loh and M. J. Wainwright. High-dimensional regression with noisy and missing data: ","element":"span"},{"text":"Provable guarantees with nonconvexity. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"The Annals of Statistics","element":"span"},{"text":", 40(3):1637–1664, 2012.","element":"span"}],[{"id":"id-29","text":"P.-L. Loh and M. J. Wainwright. 
Regularized M-estimators with nonconvexity: Statistical and ","element":"span"},{"text":"algorithmic theory for local optima. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Machine Learning Research","element":"span"},{"text":", 16(19):559–616, 2015.","element":"span"}],[{"id":"id-60","text":"Y. Lu and H. H. Zhou. Statistical and computational guarantees of lloyd’s algorithm and its ","element":"span"},{"text":"variants. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint ","element":"span"},{"href":"http://arxiv.org/abs/1612.02099","style":{"fontStyle":"italic"},"text":"arXiv:1612.02099","element":"a"},{"text":", 2016.","element":"span"}],[{"id":"id-20","text":"S. Ma and J. Huang. A concave pairwise fusion approach to subgroup analysis. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of the American Statistical Association","element":"span"},{"text":", 112(517):410–423, 2017.","element":"span"}],[{"id":"id-6","text":"A. Maj-Kańska, P. Pokarowski, A. Prochenka, et al. Delete or merge regressors for linear model ","element":"span"},{"text":"selection. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Electronic Journal of Statistics","element":"span"},{"text":", 9(2):1749–1778, 2015.","element":"span"}],[{"id":"id-50","text":"R. Mazumder, J. H. Friedman, and T. Hastie. Sparsenet: Coordinate descent with nonconvex ","element":"span"},{"text":"penalties. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of the American Statistical Association","element":"span"},{"text":", 106(495):1125–1138, 2011.","element":"span"}],[{"id":"id-19","text":"M.-R. Oelker, W. Pößnecker, and G. Tutz. Selection and fusion of categorical predictors with l ","element":"span"},{"text":"0-type penalties. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Statistical Modelling","element":"span"},{"text":", 15(5):389–410, 2015.","element":"span"}],[{"id":"id-13","text":"D. Pauger and H. 
Wagner. Bayesian effect fusion for categorical predictors. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Bayesian Analysis","element":"span"},{"text":", 14(2):341–369, 2019.","element":"span"}],[{"id":"id-67","text":"D. Pauger, M. Leitner, H. Wagner, and G. Malsiner-Walli. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"effectFusion: Bayesian Effect Fusion for Categorical Predictors","element":"span"},{"text":", 2019. R package version 1.1.1.","element":"span"}],[{"id":"id-66","text":"A. Prochenka-Sołtys and P. Pokarowski. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"DMRnet: Delete or Merge Regressors Algorithms for Linear and Logistic Model Selection and High-Dimensional Data","element":"span"},{"text":", 2018. R package version 0.2.0.","element":"span"}],[{"id":"id-64","text":"R Core Team. R: A language and environment for statistical computing. R Foundation for ","element":"span"},{"text":"Statistical Computing, Vienna, Austria. 2020.","element":"span"}],[{"id":"id-4","text":"A. J. Scott and M. Knott. A cluster analysis method for grouping means in the analysis of ","element":"span"},{"text":"variance. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Biometrics","element":"span"},{"text":", pages 507–512, 1974.","element":"span"}],[{"id":"id-65","text":"B. Stokell. CatReg: Solution Paths for Linear and Logistic Regression Models with Categorical ","element":"span"},{"text":"Predictors, with SCOPE Penalty ","element":"span"},{"href":"https://CRAN.R-project.org/package=CatReg","style":{"fontFamily":"monospace"},"text":"https://CRAN.R-project.org/package=CatReg","element":"a"},{"text":", 2021.","element":"span"}],[{"id":"id-68","text":"T. Therneau and B. Atkinson. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"rpart: Recursive Partitioning and Regression Trees","element":"span"},{"text":", 2019. R package version 4.1-15.","element":"span"}],[{"id":"id-9","text":"R. Tibshirani. 
Regression shrinkage and selection via the lasso. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of the Royal Statistical Society: Series B (Methodological)","element":"span"},{"text":", 58(1):267–288, 1996.","element":"span"}],[{"id":"id-22","text":"R. Tibshirani, M. Saunders, S. Rosset, J. Zhu, and K. Knight. Sparsity and smoothness via the ","element":"span"},{"text":"fused lasso. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of the Royal Statistical Society: Series B (Statistical Methodology)","element":"span"},{"text":", 67 (1):91–108, 2005.","element":"span"}],[{"id":"id-52","text":"P. Tseng. Convergence of a block coordinate descent method for nondifferentiable minimization. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Optimization Theory and Applications","element":"span"},{"text":", 109(3):475–494, 2001.","element":"span"}],[{"id":"id-3","text":"J. W. Tukey. Comparing individual means in the analysis of variance. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Biometrics","element":"span"},{"text":", pages 99–114, 1949.","element":"span"}],[{"id":"id-14","text":"G. Tutz and J. Gertheiss. Regularized regression for categorical data. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Statistical Modelling","element":"span"},{"text":", 16 (3):161–200, 2016.","element":"span"}],[{"id":"id-30","text":"Z. Wang, H. Liu, and T. Zhang. Optimal computational and statistical rates of convergence for ","element":"span"},{"text":"sparse nonconvex learning problems. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"The Annals of Statistics","element":"span"},{"text":", 42(6):2164, 2014.","element":"span"}],[{"id":"id-78","text":"M. Yuan and Y. Lin. Model selection and estimation in regression with grouped variables. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of the Royal Statistical Society: Series B (Statistical Methodology)","element":"span"},{"text":", 68(1):49–67, 2006.","element":"span"}],[{"id":"id-25","text":"C.-H. Zhang. Nearly unbiased variable selection under minimax concave penalty. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"The Annals of Statistics","element":"span"},{"text":", 38(2):894–942, 2010.","element":"span"}],[{"id":"id-32","text":"T. Zhao, H. Liu, and T. Zhang. Pathwise coordinate optimization for sparse learning: Algorithm ","element":"span"},{"text":"and theory. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"The Annals of Statistics","element":"span"},{"text":", 46(1):180–218, 2018.","element":"span"}],[{"id":"id-18","text":"H. Zou. ","element":"span"},{"text":"The adaptive lasso and its oracle properties. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of the American Statistical Association","element":"span"},{"text":", 101(476):1418–1429, 2006.","element":"span"}]]},{"heading":"A Appendix","paragraphs":[[{"id":"id-42","style":{"fontWeight":"bold"},"text":"A.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Candidate minimiser functions","element":"span"}],[{"text":"In this section we give explicit forms of the functions ","element":"span"},{"style":{"height":13.24},"width":64.84,"height":33.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/27-0.png","element":"img","alt":" pk,r","inline":true,"padRight":true},{"text":"as defined in Section ","element":"span"},{"href":"#id-38","text":"3.1. 
","element":"a"},{"text":"We write ","element":"span"},{"style":{"height":19.98},"width":479,"height":49.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/27-1.png","element":"img","alt":"qk,r(x) = arx2 + brx + cr","inline":true,"padRight":true},{"text":"for simplicity, suppressing the subscript ","element":"span"},{"style":{"height":15.6},"width":528.79,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/27-2.png","element":"img","alt":" k. For S ⊆ R and a, b ∈ R,","inline":true,"padRight":true},{"text":"we write ","element":"span"},{"style":{"fontStyle":"italic"},"text":"aS ","element":"span"},{"text":"+ ","element":"span"},{"style":{"fontStyle":"italic"},"text":"b ","element":"span"},{"text":"for the set ","element":"span"},{"style":{"height":17.6},"width":319.61,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/27-3.png","element":"img","alt":" {ax + b : x ∈ S}.","inline":true}],[{"text":"Recall from Section ","element":"span"},{"href":"#id-38","text":"3.1 ","element":"a"},{"text":"that","element":"span"}],[{"style":{"width":"56%"},"width":1018,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/27-4.png","element":"img"}],[{"text":"For a function ","element":"span"},{"style":{"height":17.6},"width":329.28,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/27-5.png","element":"img","alt":" f : R → R ∪ {∞}","inline":true},{"text":", we denote the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"effective domain ","element":"span"},{"text":"of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"by","element":"span"}],[{"style":{"width":"31%"},"width":577,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/27-6.png","element":"img"}],[{"text":"For each ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r ","element":"span"},{"text":"= 
1","element":"span"},{"style":{"fontStyle":"italic"},"text":", . . . , m","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"k","element":"span"},{"text":"), there are cases corresponding to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"= 1 and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"= 2. The formulas are as follows:","element":"span"}],[{"style":{"width":"83%"},"width":1506,"height":160,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/27-7.png","element":"img"}],[{"text":"with ","element":"span"},{"text":"dom ","element":"span"},{"style":{"height":13.24},"width":141.68,"height":33.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/27-8.png","element":"img","alt":" uk,r,1 =","inline":true}],[{"style":{"width":"62%"},"width":1135,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/27-9.png","element":"img"}],[{"text":"If ","element":"span"},{"style":{"height":18.44},"width":542.68,"height":46.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/27-10.png","element":"img","alt":" gk(θk+1) = uk,r,1(θk+1), then","inline":true}],[{"style":{"width":"30%"},"width":544,"height":103,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/27-11.png","element":"img"}],[{"text":"The second case is","element":"span"}],[{"style":{"width":"76%"},"width":1372,"height":251,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/27-12.png","element":"img"}],[{"text":"Here, if ","element":"span"},{"style":{"height":18.44},"width":542.68,"height":46.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/27-13.png","element":"img","alt":" gk(θk+1) = uk,r,2(θk+1), 
then","inline":true}],[{"style":{"width":"20%"},"width":378,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/27-14.png","element":"img"}],[{"text":"Considering ","element":"span"},{"href":"#id-88","text":"(16)","element":"a"},{"text":", we see that we can also have the case where ","element":"span"},{"style":{"height":17.6},"width":358.78,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/27-15.png","element":"img","alt":" gk(θk+1) = fk(θk+1","inline":true},{"text":"). Thus we can form the set of quadratics ","element":"span"},{"style":{"height":13.24},"width":64.85,"height":33.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/27-16.png","element":"img","alt":" pk,r","inline":true,"padRight":true},{"text":"and associated intervals as the set of ","element":"span"},{"style":{"height":13.24},"width":88.63,"height":33.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/27-17.png","element":"img","alt":" uk,r,t","inline":true,"padRight":true},{"text":"as above for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"= 1","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"2 and the ","element":"span"},{"style":{"height":13.24},"width":62.38,"height":33.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/27-18.png","element":"img","alt":" qk,r","inline":true,"padRight":true},{"text":"themselves. 
Note that when ","element":"span"},{"style":{"height":18.44},"width":904.78,"height":46.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/27-19.png","element":"img","alt":" gk(θk+1) = qk,r(θk+1), we have bk(θk+1) = θk+1.","inline":true}],[{"id":"id-45","style":{"fontWeight":"bold"},"text":"A.2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Algorithm details","element":"span"}],[{"text":"Algorithm ","element":"span"},{"href":"#id-89","text":"1 ","element":"a"},{"text":"describes in detail how the optimisation routine works. In the algorithm we make use of the following objects:","element":"span"}],[{"text":"• ","element":"span"},{"text":"for ","element":"span"},{"style":{"height":17.6},"width":211.15,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/27-20.png","element":"img","alt":" x ∈ R, A(x","inline":true},{"text":") is the active set at ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":";","element":"span"}],[{"text":"• ","element":"span"},{"style":{"fontStyle":"italic"},"text":"E ","element":"span"},{"text":"is the set of points at which the active set changes;","element":"span"}],[{"text":"• ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":") is the intersection set at ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":";","element":"span"}],[{"id":"id-89","style":{"width":"100%"},"width":1806,"height":2616,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/28-0.png","element":"img"}],[{"text":"• ","element":"span"},{"style":{"fontStyle":"italic"},"text":"U ","element":"span"},{"text":"is a set of tuples 
(","element":"span"},{"style":{"height":17.6},"width":324.16,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/29-0.png","element":"img","alt":"I, r) where I ⊆ R","inline":true,"padRight":true},{"text":"is an interval and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r ","element":"span"},{"text":"is an integer, which is dynamically updated as the algorithm progresses.","element":"span"}],[{"text":"See Section ","element":"span"},{"href":"#id-90","text":"3.1.2 ","element":"a"},{"text":"for definitions of the sets above. We also use the convention that if ","element":"span"},{"style":{"height":8.4},"width":163.44,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/29-1.png","element":"img","alt":" x = −∞","inline":true,"padRight":true},{"text":"then [","element":"span"},{"style":{"height":17.6},"width":308.32,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/29-2.png","element":"img","alt":"x, y) = (−∞, y).","inline":true}],[{"text":"All of the ","element":"span"},{"style":{"height":19.15},"width":478.5,"height":47.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/29-3.png","element":"img","alt":" pk,1, . . . , pk,m(k) and Jk,m","inline":true,"padRight":true},{"text":"are computed at the start of each iterate ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k","element":"span"},{"text":". We then initialise","element":"span"}],[{"style":{"width":"17%"},"width":313,"height":135,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/29-4.png","element":"img"}],[{"text":"the set of all of the end-points of the intervals ","element":"span"},{"style":{"height":18.75},"width":386.02,"height":46.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/29-5.png","element":"img","alt":" Jk−1,1, . . . 
, Jk−1,n(k).","inline":true}],[{"text":"Here ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x ","element":"span"},{"text":"can be thought of as the ‘current position’ of the algorithm; ˜","element":"span"},{"style":{"fontStyle":"italic"},"text":"x ","element":"span"},{"text":"is used to store when the minimising function ","element":"span"},{"style":{"height":14.75},"width":154.6,"height":36.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/29-6.png","element":"img","alt":" pk−1,r(x)","inline":true,"padRight":true},{"text":"last changed. We initialise ˜","element":"span"},{"style":{"height":17.6},"width":633.9,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/29-7.png","element":"img","alt":"x = −∞ and x = −1 + max{y ∈","inline":true},{"style":{"height":18.52},"width":428.87,"height":46.29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/29-8.png","element":"img","alt":"Ik−1,1 : f′k−1(y−) ≤ 0}","inline":true},{"text":". 
This choice of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x ","element":"span"},{"text":"ensures that the active set ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":") contains only one ","element":"span"},{"text":"element (as mentioned in Section ","element":"span"},{"href":"#id-38","text":"3.1)","element":"a"},{"text":"; this will always be the index corresponding to the function ˜","element":"span"},{"style":{"height":13.24},"width":121.59,"height":33.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/29-9.png","element":"img","alt":"qk−1,1.","inline":true}],[{"style":{"width":"95%"},"width":1732,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/29-10.png","element":"img"}],[{"text":"the functions ˜","element":"span"},{"style":{"height":18.35},"width":285.27,"height":45.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/29-11.png","element":"img","alt":"qk,1, . . . , ˜qk,m(k)","inline":true,"padRight":true},{"text":"and their corresponding intervals ","element":"span"},{"style":{"height":18.75},"width":284.68,"height":46.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/29-12.png","element":"img","alt":" Ik,1, . . . , Ik,m(k)","inline":true,"padRight":true},{"text":"that partition ","element":"span"},{"text":"R","element":"span"},{"text":". 
Finally, we initialise the set ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":") which will contain the intersections between ","element":"span"},{"style":{"height":14.75},"width":154.6,"height":36.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/29-13.png","element":"img","alt":" pk−1,r(x)","inline":true,"padRight":true},{"text":"and other functions in the active set. As the active set begins with only one function, we set ","element":"span"},{"style":{"height":18},"width":190.7,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/29-14.png","element":"img","alt":"N(x) = ∅.","inline":true}],[{"text":"As mentioned in Section ","element":"span"},{"href":"#id-38","text":"3.1, ","element":"a"},{"text":"there are several modifications that can speed up the algorithm. One such modification follows from the fact that for each ","element":"span"},{"style":{"height":13.24},"width":143.72,"height":33.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/29-15.png","element":"img","alt":" r, uk,r,2","inline":true,"padRight":true},{"text":"is a constant function over its effective domain, and their effective domain is a semi-infinite interval (see Section ","element":"span"},{"href":"#id-42","text":"A.1 ","element":"a"},{"text":"of the Appendix for their expressions). 
Therefore, for a given point ","element":"span"},{"style":{"height":12.8},"width":119.87,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/29-16.png","element":"img","alt":" x ∈ R","inline":true},{"text":", we can remove all such functions from ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":") except for the one taking the minimal value.","element":"span"}],[{"text":"We also note that in Algorithm ","element":"span"},{"href":"#id-89","text":"1, ","element":"a"},{"text":"the set ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":") is not recomputed in its entirety at every point ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x ","element":"span"},{"text":"at which ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":") is updated, as is described in Section ","element":"span"},{"href":"#id-38","text":"3.1. ","element":"a"},{"text":"Line 13 shows how sometimes ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":") can instead be updated by adding or removing elements from it. 
Often, points 3 (i) and 3 (ii) from the description in Section ","element":"span"},{"href":"#id-38","text":"3.1 ","element":"a"},{"text":"will coincide, and in such instances some calls to ","element":"span"},{"text":"ChooseFunction ","element":"span"},{"text":"(Algorithm ","element":"span"},{"href":"#id-89","text":"2) ","element":"a"},{"text":"can be skipped.","element":"span"}]]},{"heading":"Supplementary material","paragraphs":[[{"text":"This supplementary material is organised as follows. In Section ","element":"span"},{"text":"S1 ","element":"span"},{"text":"we include further details of our algorithm and the proofs of results in Sections ","element":"span"},{"text":"2 ","element":"span"},{"text":"& ","element":"span"},{"text":"3. ","element":"span"},{"text":"The proofs of Theorems ","element":"span"},{"href":"#id-33","text":"5 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-91","text":"6 ","element":"a"},{"text":"along with a number of lemmas they require can be found in Section ","element":"span"},{"text":"S2. ","element":"span"},{"text":"Section ","element":"span"},{"text":"S3 ","element":"span"},{"text":"contains information regarding simulation settings and additional results for the experiments in Section ","element":"span"},{"text":"6 ","element":"span"},{"text":"of the main paper.","element":"span"}]]},{"heading":"S1 Additional algorithmic details","paragraphs":[[{"style":{"fontWeight":"bold"},"text":"S1.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Remarks on constrained and unconstrained formulations of the univariate objective","element":"span"}],[{"text":"It is clear why the identifiability constraint ","element":"span"},{"href":"#id-92","text":"(8) ","element":"a"},{"text":"is important when we consider the multivariate problem in Section ","element":"span"},{"href":"#id-39","text":"3.2. 
","element":"a"},{"text":"However, for the univariate problem, both constrained and unconstrained formulations of the objective can be clearly defined:","element":"span"}],[{"id":"id-94","style":{"width":"80%"},"width":1462,"height":286,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/30-0.png","element":"img"}],[{"text":"As discussed in Section ","element":"span"},{"href":"#id-93","text":"3.1.1, ","element":"a"},{"text":"we can enlarge the feasible set in ","element":"span"},{"href":"#id-94","text":"(30) ","element":"a"},{"text":"to be all of ","element":"span"},{"style":{"height":15.53},"width":197.4,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/30-1.png","element":"img","alt":" RK: simi-","inline":true,"padRight":true},{"text":"larly to the observation that ","element":"span"},{"style":{"height":22.32},"width":496.54,"height":55.8,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/30-2.png","element":"img","alt":"∑k wkˆθuk = ˆµ = ∑k wk ¯Yk","inline":true},{"text":", the minimiser of ","element":"span"},{"href":"#id-94","text":"(30) ","element":"a"},{"text":"over all of ","element":"span"},{"style":{"height":15.13},"width":61.51,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/30-3.png","element":"img","alt":" RK","inline":true,"padRight":true},{"text":"will always be in Θ. This can be shown by following the argument at the beginning of the proof of Lemma ","element":"span"},{"href":"#id-36","text":"10. 
","element":"a"},{"text":"Therefore the algorithm defined in Section ","element":"span"},{"href":"#id-38","text":"3.1 ","element":"a"},{"text":"can also be applied to the unconstrained formulation of the objective.","element":"span"}],[{"text":"It is clear that these problems are essentially identical, as ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":17.28},"width":44.91,"height":43.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/30-4.png","element":"img","alt":"θu","inline":true,"padRight":true},{"text":"is a minimiser of the unconstrained objective if and only if ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":21.28},"width":153.67,"height":53.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/30-5.png","element":"img","alt":"θu − ˆµ1","inline":true,"padRight":true},{"text":"is a minimiser of the constrained objective. Observe that while ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":18.08},"width":162.46,"height":45.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/30-6.png","element":"img","alt":"θu ∈ RK","inline":true},{"text":", the solution to the constrained objective is in fact (ˆ","element":"span"},{"style":{"height":21.68},"width":416.06,"height":54.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/30-7.png","element":"img","alt":"µ, ˆθc) ∈ R × Θ, which","inline":true,"padRight":true},{"text":"is the same ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K","element":"span"},{"text":"-dimensional space only with a different parameterisation. 
In particular, ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":17.68},"width":90.99,"height":44.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/30-8.png","element":"img","alt":"θc is","inline":true,"padRight":true},{"text":"non-unique if and only if ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":17.28},"width":44.91,"height":43.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/30-9.png","element":"img","alt":"θu","inline":true,"padRight":true},{"text":"is non-unique.","element":"span"}],[{"text":"Since one can obtain the solution to the constrained objective by solving the unconstrained one and then reparameterising (and vice versa), we are free to assume without loss of generality that ","element":"span"},{"style":{"height":19.13},"width":292.48,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/30-10.png","element":"img","alt":" wT ¯Y = 0, so ˆµ","inline":true,"padRight":true},{"text":"= 0, when solving the univariate problem, and will remark where we do this.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"S1.2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Proofs of results in Sections ","element":"span"},{"style":{"fontWeight":"bold"},"text":"2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"& ","element":"span"},{"style":{"fontWeight":"bold"},"text":"3","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"Proof of Proposition ","element":"span"},{"href":"#id-27","style":{"fontStyle":"italic"},"text":"1. ","element":"a"},{"text":"Assume, without loss of generality, that ˆ","element":"span"},{"style":{"height":12},"width":26,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/30-11.png","element":"img","alt":"µ","inline":true,"padRight":true},{"text":"= 0. 
Suppose that there exists ","element":"span"},{"style":{"height":16.8},"width":95.06,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/30-12.png","element":"img","alt":" l ̸= k","inline":true,"padRight":true},{"text":"such that ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":19.85},"width":129.63,"height":49.62,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/30-13.png","element":"img","alt":"θk = ˆθl","inline":true},{"text":". Without loss of generality we have that ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":21.41},"width":457.96,"height":53.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/30-14.png","element":"img","alt":"Yk ̸= ˆθk (if ¯Yk = ˆθk then","inline":true,"padRight":true},{"text":"¯","element":"span"},{"style":{"height":21.01},"width":126.48,"height":52.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/30-15.png","element":"img","alt":"Yl ̸= ˆθl","inline":true,"padRight":true},{"text":"and it can be seen that ˆ","element":"span"},{"style":{"height":23.76},"width":270.42,"height":59.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/30-16.png","element":"img","alt":"θ(1) < ¯Yl < ˆθK","inline":true},{"text":", in which case swap labels).","element":"span"}],[{"text":"Now we construct ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":12},"width":25,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/30-17.png","element":"img","alt":"θ","inline":true,"padRight":true},{"text":"by setting ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":20.2},"width":762.65,"height":50.51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/30-18.png","element":"img","alt":"θr = ˆθr ∧ ¯Yk for r = 1, . . . , k, and ˜θr = ˆθr","inline":true,"padRight":true},{"text":"otherwise. 
We have","element":"span"},{"style":{"height":21.41},"width":287.74,"height":53.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/30-19.png","element":"img","alt":"ℓ(ˆµ, ˜θ) < ℓ(ˆµ, ˆθ","inline":true},{"text":") and, by convexity of ","element":"span"},{"style":{"height":12},"width":23,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/30-20.png","element":"img","alt":" ρ","inline":true},{"text":", it follows that","element":"span"}],[{"style":{"width":"44%"},"width":795,"height":129,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/30-21.png","element":"img"}],[{"text":"This gives the conclusion ","element":"span"},{"style":{"height":21.41},"width":229,"height":53.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/30-22.png","element":"img","alt":" Q(˜θ) < Q(ˆθ","inline":true},{"text":"), contradicting the optimality of ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":12.8},"width":33.7,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/30-23.png","element":"img","alt":"θ.","inline":true}],[{"style":{"fontStyle":"italic"},"text":"Proof of Proposition ","element":"span"},{"href":"#id-95","style":{"fontStyle":"italic"},"text":"2. ","element":"a"},{"text":"Suppose, for a contradiction, that ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":19.85},"width":142.55,"height":49.62,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-0.png","element":"img","alt":"θk < ˆθl","inline":true},{"text":". 
Then at least one of the following must be true:","element":"span"}],[{"id":"id-96","style":{"width":"65%"},"width":1176,"height":176,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-1.png","element":"img"}],[{"text":"Let ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":12},"width":25,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-2.png","element":"img","alt":"θ","inline":true,"padRight":true},{"text":"be defined as follows. Set ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":21.01},"width":413.3,"height":52.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-3.png","element":"img","alt":"θr = ˆθr for all r ̸= k, l","inline":true},{"text":". If ","element":"span"},{"href":"#id-96","text":"(32) ","element":"a"},{"text":"holds set ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":19.85},"width":129.7,"height":49.62,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-4.png","element":"img","alt":"θk = ˆθl","inline":true,"padRight":true},{"text":"and if ","element":"span"},{"href":"#id-96","text":"(33) ","element":"a"},{"text":"holds set ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":19.85},"width":129.63,"height":49.62,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-5.png","element":"img","alt":"θl = ˆθk","inline":true},{"text":". 
Observe that","element":"span"}],[{"style":{"width":"42%"},"width":759,"height":122,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-6.png","element":"img"}],[{"text":"and that the squared loss of ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":12},"width":25,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-7.png","element":"img","alt":"θ","inline":true,"padRight":true},{"text":"is strictly smaller than the squared loss of ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":12},"width":25,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-8.png","element":"img","alt":"θ","inline":true},{"text":", thus contradicting optimality of ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":12},"width":37.91,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-9.png","element":"img","alt":"θ.","inline":true}],[{"style":{"fontStyle":"italic"},"text":"Proof of Proposition ","element":"span"},{"href":"#id-54","style":{"fontStyle":"italic"},"text":"3. ","element":"a"},{"text":"In this proof we consider the unconstrained formulation of the objective ","element":"span"},{"href":"#id-94","text":"(31) ","element":"a"},{"text":"discussed in Section ","element":"span"},{"href":"#id-42","text":"S1.1. 
","element":"a"},{"text":"Suppose that (","element":"span"},{"text":"¯","element":"span"},{"style":{"height":20.45},"width":124.6,"height":51.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-10.png","element":"img","alt":"Yk)Kk=1 ","inline":true,"padRight":true},{"text":"is such that there are two distinct solutions ","element":"span"},{"text":"to ","element":"span"},{"href":"#id-97","text":"(12)","element":"a"},{"text":", ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":25.28},"width":198.37,"height":63.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-11.png","element":"img","alt":"θ(1) ̸= ˆθ(2)","inline":true},{"text":". Let us assume that the levels are indexed such that ¯","element":"span"},{"style":{"height":17.51},"width":422.33,"height":43.78,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-12.png","element":"img","alt":"Y1 ≤ · · · ≤ ¯YK. Define","inline":true},{"style":{"height":24.44},"width":478.81,"height":61.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-13.png","element":"img","alt":"k∗ = max{k : ˆθ(1)k ̸= ˆθ(2)k }","inline":true,"padRight":true},{"text":"to be the largest index at which the two solutions take different values ","element":"span"},{"text":"and note that we must have ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":24.29},"width":311.8,"height":60.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-14.png","element":"img","alt":"θ(r)1 ≤ · · · ≤ ˆθ(r)K .","inline":true}],[{"text":"First consider the case where ","element":"span"},{"style":{"height":13.2},"width":272.03,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-15.png","element":"img","alt":" k∗ < K. 
Then","inline":true}],[{"style":{"width":"52%"},"width":948,"height":65,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-16.png","element":"img"}],[{"text":"for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r ","element":"span"},{"text":"= 1","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"2. We now argue that we must have ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":25.64},"width":913.74,"height":64.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-17.png","element":"img","alt":"θ(1)k∗+1 = ˆθ(2)k∗+1 =: t∗ ≥ (ˆθ(1)k∗ ∨ ˆθ(2)k∗ ) + γλ. Indeed,","inline":true,"padRight":true},{"text":"suppose not, and suppose that without loss of generality ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":24.44},"width":662.76,"height":61.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-18.png","element":"img","alt":"θ(2)k∗ > ˆθ(1)k∗ . Fix r ∈ {1, 2}. The","inline":true,"padRight":true},{"text":"directional derivative of the objective in the direction of the binary vector with ones at the indices given by ","element":"span"},{"style":{"height":15.02},"width":41.76,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-19.png","element":"img","alt":" Sr","inline":true,"padRight":true},{"text":"and zeroes elsewhere evaluated at ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":21.28},"width":68.31,"height":53.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-20.png","element":"img","alt":"θ(r)","inline":true,"padRight":true},{"text":"must be 0. 
But comparing these for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r ","element":"span"},{"text":"= 1","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"2, we see they are identical except for the term ","element":"span"},{"style":{"height":24.44},"width":273.44,"height":61.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-21.png","element":"img","alt":" ρ′(θk∗+1 − ˆθ(r)k∗","inline":true,"padRight":true},{"text":"), which will be strictly ","element":"span"},{"text":"larger for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r ","element":"span"},{"text":"= 2, giving a contradiction. This then implies that both ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":24.44},"width":226.57,"height":61.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-22.png","element":"img","alt":"θ(1)k∗ and ˆθ(2)k∗","inline":true,"padRight":true},{"text":"must minimise ","element":"span"},{"style":{"height":16.4},"width":386.06,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-23.png","element":"img","alt":"fk∗ over θ ≤ t∗ − γλ","inline":true,"padRight":true},{"text":"since the full objective value is","element":"span"}],[{"style":{"width":"78%"},"width":1418,"height":89,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-24.png","element":"img"}],[{"text":"for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r ","element":"span"},{"text":"= 1","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"2. 
We also have that when ","element":"span"},{"style":{"height":24.44},"width":502.24,"height":61.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-25.png","element":"img","alt":" k∗ = K, both ˆθ(1)k∗ and ˆθ(2)k∗","inline":true,"padRight":true},{"text":"must minimise ","element":"span"},{"style":{"height":16.4},"width":71.11,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-26.png","element":"img","alt":" fk∗.","inline":true}],[{"style":{"width":"95%"},"width":1730,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-27.png","element":"img"}],[{"style":{"height":21.29},"width":429.92,"height":53.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-28.png","element":"img","alt":"fk(θk) − 12wk( ¯Yk − θk)2","inline":true},{"text":". In particular, properties (i) and (iii) of Lemma ","element":"span"},{"href":"#id-41","text":"4 ","element":"a"},{"text":"hold with ","element":"span"},{"style":{"height":16.4},"width":212.64,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-29.png","element":"img","alt":" fk replaced","inline":true,"padRight":true},{"text":"by ","element":"span"},{"style":{"height":12},"width":82.64,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-30.png","element":"img","alt":" gk−1","inline":true},{"text":". 
These can be characterised as ","element":"span"},{"style":{"height":18.44},"width":1013.21,"height":46.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-31.png","element":"img","alt":" gk−1(θk) = ˇqk,r(θk) for θk ∈ Ik,r, where Ik,r are the","inline":true,"padRight":true},{"text":"intervals associated with ","element":"span"},{"style":{"height":21.29},"width":813.11,"height":53.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-32.png","element":"img","alt":" fk and ˇqk,r(θk) := qk,r(θk) − 12wk( ¯Yk − θk)2","inline":true},{"text":". Note that for each ","element":"span"},{"style":{"height":16.44},"width":109.78,"height":41.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-33.png","element":"img","alt":" r, ˇqk,r","inline":true,"padRight":true},{"text":"depends on the values of ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":18.01},"width":228.4,"height":45.03,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-34.png","element":"img","alt":"Y1, . . . 
, ¯Yk−1","inline":true,"padRight":true},{"text":"but not that of ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":14.84},"width":43.33,"height":37.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-35.png","element":"img","alt":"Yk","inline":true,"padRight":true},{"text":"(observe that ","element":"span"},{"style":{"height":18.44},"width":121.05,"height":46.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-36.png","element":"img","alt":" qk,r(θk","inline":true},{"text":") includes a term","element":"span"}],[{"style":{"width":"25%"},"width":468,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-37.png","element":"img"}],[{"text":"Now as ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":25.64},"width":958.04,"height":64.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-38.png","element":"img","alt":"θ(1)k∗ ≤ ˆθ(1)k∗+1 − γλ and ˆθ(2)k∗ ≤ ˆθ(2)k∗+1 − γλ (if k∗ < K","inline":true},{"text":"), by Lemma ","element":"span"},{"href":"#id-41","text":"4 ","element":"a"},{"text":"(iii) both must be ","element":"span"},{"text":"local minima of ","element":"span"},{"style":{"height":16.4},"width":54.85,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-39.png","element":"img","alt":" fk∗","inline":true},{"text":", and we have that there must exist distinct ","element":"span"},{"style":{"height":16.8},"width":136.9,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-40.png","element":"img","alt":" r1 ̸= r2","inline":true,"padRight":true},{"text":"such that ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":24.44},"width":218.35,"height":61.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-41.png","element":"img","alt":"θ(1)k∗ ∈ Ik∗,r1","inline":true,"padRight":true},{"text":"and 
","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":24.44},"width":313.73,"height":61.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-42.png","element":"img","alt":"θ(2)k∗ ∈ Ik∗,r2. Let","inline":true}],[{"style":{"width":"28%"},"width":517,"height":125,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/31-43.png","element":"img"}],[{"text":"Since ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":24.44},"width":64.79,"height":61.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-0.png","element":"img","alt":"θ(1)k∗","inline":true,"padRight":true},{"text":"must be the minimum of ˇ","element":"span"},{"style":{"height":21.29},"width":571.02,"height":53.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-1.png","element":"img","alt":"qk∗,r1(θk∗) + 12wk∗( ¯Yk∗ − θk∗)2","inline":true,"padRight":true},{"text":"(and similarly for ˆ","element":"span"},{"style":{"height":24.44},"width":163.68,"height":61.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-2.png","element":"img","alt":"θ(2)k∗ ), we","inline":true,"padRight":true},{"text":"must have that","element":"span"}],[{"id":"id-98","style":{"width":"94%"},"width":1713,"height":225,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-3.png","element":"img"}],[{"text":"This is a quadratic equation in ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":14.84},"width":58.82,"height":37.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-4.png","element":"img","alt":"Yk∗","inline":true},{"text":", so there are at most two values for which ","element":"span"},{"href":"#id-98","text":"(34) ","element":"a"},{"text":"holds. 
Considering all pairs ","element":"span"},{"style":{"height":11.2},"width":94.7,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-5.png","element":"img","alt":" r1, r2","inline":true},{"text":", we see that in order for there to exist two solutions ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":25.28},"width":285.49,"height":63.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-6.png","element":"img","alt":"θ(1) ̸= ˆθ(2), ¯Yk∗","inline":true,"padRight":true},{"text":"must take values in a set of size at most ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"K","element":"span"},{"text":"), for some function ","element":"span"},{"style":{"height":12.4},"width":198.16,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-7.png","element":"img","alt":" c : N → N.","inline":true}],[{"text":"Now let","element":"span"}],[{"style":{"width":"72%"},"width":1304,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-8.png","element":"img"}],[{"text":"What we have shown is that associated with each element ( ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":20.45},"width":208.9,"height":51.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-9.png","element":"img","alt":"Yk)Kk=1 ∈ S","inline":true},{"text":", there is at least one ","element":"span"},{"style":{"height":12.8},"width":41.09,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-10.png","element":"img","alt":" k∗","inline":true,"padRight":true},{"text":"such that","element":"span"}],[{"style":{"width":"40%"},"width":739,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-11.png","element":"img"}],[{"text":"is bounded above by 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"c","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"K","element":"span"},{"text":"). Now for each ","element":"span"},{"style":{"height":17.42},"width":363.56,"height":43.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-12.png","element":"img","alt":" j = 1, . . . , K, let Sj","inline":true,"padRight":true},{"text":"be the set of ( ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":20.45},"width":395.26,"height":51.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-13.png","element":"img","alt":"Yk)Kk=1 ∈ S for which","inline":true,"padRight":true},{"text":"there exists a ","element":"span"},{"style":{"height":12.8},"width":41.09,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-14.png","element":"img","alt":" k∗ ","inline":true,"padRight":true},{"text":"with the property above and ","element":"span"},{"style":{"height":16.4},"width":121.75,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-15.png","element":"img","alt":" k∗ = j","inline":true},{"text":". Note that ","element":"span"},{"style":{"height":20.15},"width":477.95,"height":50.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-16.png","element":"img","alt":" ∪jSj = S. 
Now Sj ⊂ RK","inline":true,"padRight":true},{"text":"has Lebesgue measure zero as a finite union of graphs of measurable functions ","element":"span"},{"style":{"height":19.13},"width":286.62,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-17.png","element":"img","alt":" f : RK−1 → R.","inline":true,"padRight":true},{"text":"Thus ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"text":"has Lebesgue measure zero.","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"Proof of Lemma ","element":"span"},{"href":"#id-41","style":{"fontStyle":"italic"},"text":"4. ","element":"a"},{"text":"Assume, without loss of generality, that ˆ","element":"span"},{"style":{"height":12},"width":26,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-18.png","element":"img","alt":"µ","inline":true,"padRight":true},{"text":"= 0. We proceed inductively, assuming that the properties (i) and (iii) hold for ","element":"span"},{"style":{"height":16.4},"width":39.36,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-19.png","element":"img","alt":" fk","inline":true},{"text":", and (ii) holds for ","element":"span"},{"style":{"height":16.44},"width":80.55,"height":41.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-20.png","element":"img","alt":" bk+1","inline":true},{"text":". 
Additionally we include in our inductive hypothesis that for all ","element":"span"},{"style":{"height":18.52},"width":347.14,"height":46.29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-21.png","element":"img","alt":" x, f′k(x−) ≥ f′k(x+","inline":true},{"text":"), where we define ","element":"span"},{"style":{"height":18.51},"width":211.84,"height":46.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-22.png","element":"img","alt":" f′k(x−) and","inline":true},{"style":{"height":18.51},"width":109.75,"height":46.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-23.png","element":"img","alt":"f′k(x+","inline":true},{"text":") to be the left-derivative and right-derivative of ","element":"span"},{"style":{"height":16.4},"width":131.95,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-24.png","element":"img","alt":" fk at x","inline":true},{"text":", respectively. We note that these ","element":"span"},{"text":"trivially hold for the base case ","element":"span"},{"style":{"height":16.4},"width":38.36,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-25.png","element":"img","alt":" f1","inline":true},{"text":", and the case ","element":"span"},{"style":{"height":15.02},"width":35.72,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-26.png","element":"img","alt":" b2","inline":true,"padRight":true},{"text":"can be checked by direct calculation.","element":"span"}],[{"text":"We first prove (i), that ","element":"span"},{"style":{"height":16.44},"width":83.19,"height":41.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-27.png","element":"img","alt":" fk+1","inline":true,"padRight":true},{"text":"is continuous, coercive, and piecewise quadratic and with finitely many pieces. 
We then show that ","element":"span"},{"style":{"height":19.71},"width":575.09,"height":49.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-28.png","element":"img","alt":" f′k+1(x−) ≥ f′k+1(x+) for all x","inline":true},{"text":", which allows us to show that ","element":"span"},{"text":"(iii) holds for ","element":"span"},{"style":{"height":16.44},"width":83.18,"height":41.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-29.png","element":"img","alt":" fk+1","inline":true},{"text":". Finally, we use these results to show that (ii) holds for ","element":"span"},{"style":{"height":16.44},"width":94.49,"height":41.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-30.png","element":"img","alt":" bk+2.","inline":true}],[{"text":"We now show that ","element":"span"},{"style":{"height":16.44},"width":83.18,"height":41.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-31.png","element":"img","alt":" fk+1","inline":true,"padRight":true},{"text":"is coercive and continuous. Clearly ","element":"span"},{"style":{"height":18.22},"width":535.65,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-32.png","element":"img","alt":" gk(x) ≥ miny≤x fk(y), so it","inline":true,"padRight":true},{"text":"follows that ","element":"span"},{"style":{"height":17.6},"width":568.18,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-33.png","element":"img","alt":" gk(x) → ∞ as x → −∞ as fk","inline":true,"padRight":true},{"text":"is coercive. 
Furthermore ","element":"span"},{"style":{"height":12},"width":38.81,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-34.png","element":"img","alt":" gk","inline":true,"padRight":true},{"text":"is bounded from below as ","element":"span"},{"style":{"height":16.4},"width":39.36,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-35.png","element":"img","alt":" fk","inline":true,"padRight":true},{"text":"is coercive and continuous. Thus since ","element":"span"},{"style":{"height":21.29},"width":712.24,"height":53.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-36.png","element":"img","alt":" fk+1(x) = gk(x) + 12wk+1( ¯Yk+1 − x)2","inline":true},{"text":", it follows ","element":"span"},{"text":"that ","element":"span"},{"style":{"height":16.44},"width":83.18,"height":41.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-37.png","element":"img","alt":" fk+1","inline":true,"padRight":true},{"text":"is coercive. 
Next as ","element":"span"},{"style":{"height":18.22},"width":881.99,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-38.png","element":"img","alt":" gk(x) = miny≤x fk(y) + ρ(y − x), and fk and ρ","inline":true,"padRight":true},{"text":"are continuous, it follows that ","element":"span"},{"style":{"height":12},"width":38.81,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-39.png","element":"img","alt":" gk","inline":true,"padRight":true},{"text":"is continuous and therefore that ","element":"span"},{"style":{"height":16.44},"width":83.18,"height":41.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-40.png","element":"img","alt":" fk+1","inline":true,"padRight":true},{"text":"is continuous.","element":"span"}],[{"text":"To see why ","element":"span"},{"style":{"height":16.44},"width":83.18,"height":41.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-41.png","element":"img","alt":" fk+1","inline":true,"padRight":true},{"text":"is piecewise quadratic with finitely many pieces, we observe that it can be written ","element":"span"},{"style":{"height":21.29},"width":1133.14,"height":53.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-42.png","element":"img","alt":" fk+1(x) = fk(bk+1(x)) + ρ(x − bk+1(x)) + 12wk+1( ¯Yk+1 − x)2","inline":true},{"text":". We have by our inductive ","element":"span"},{"text":"hypothesis that ","element":"span"},{"style":{"height":16.4},"width":39.36,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-43.png","element":"img","alt":" fk","inline":true,"padRight":true},{"text":"is piecewise quadratic and ","element":"span"},{"style":{"height":17.6},"width":124.46,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-44.png","element":"img","alt":" bk+1(x","inline":true},{"text":") is piecewise linear, both with finitely many pieces. 
Since the composition of a piecewise linear function inside a piecewise quadratic function is piecewise quadratic, the remainder of (i) is shown.","element":"span"}],[{"text":"We now turn our attention to (iii), and define for ","element":"span"},{"style":{"height":12.8},"width":121.78,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-45.png","element":"img","alt":" x ∈ R:","inline":true}],[{"style":{"width":"38%"},"width":686,"height":179,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-46.png","element":"img"}],[{"text":"We will first show that ","element":"span"},{"style":{"height":19.72},"width":670.84,"height":49.29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-47.png","element":"img","alt":" f′k+1(x+) ≤ f′k+1(x−) for all x ∈ R","inline":true},{"text":". Suppose that we are increasing ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x ","element":"span"},{"text":"and we have reached a point where ","element":"span"},{"style":{"height":17.6},"width":83.26,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/32-48.png","element":"img","alt":" gk(x","inline":true},{"text":") is not differentiable (that is, the left-derivative and the right-derivative do not match). 
By assumption (ii) for ","element":"span"},{"style":{"height":16.44},"width":80.55,"height":41.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-0.png","element":"img","alt":" bk+1","inline":true},{"text":", we can assume that there is some window ","element":"span"},{"style":{"height":13.2},"width":67.16,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-1.png","element":"img","alt":" δ >","inline":true,"padRight":true},{"text":"0 such that ","element":"span"},{"style":{"height":17.6},"width":73.3,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-2.png","element":"img","alt":" y∗(t","inline":true},{"text":") is linear for ","element":"span"},{"style":{"height":17.6},"width":630.67,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-3.png","element":"img","alt":" t ∈ (x − δ, x), say y∗(t) = α + βt.","inline":true}],[{"text":"In order to proceed with the following argument, we must show that for sufficiently small ","element":"span"},{"style":{"height":10.4},"width":66.86,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-4.png","element":"img","alt":"ϵ >","inline":true,"padRight":true},{"text":"0, we have ","element":"span"},{"style":{"height":17.6},"width":713.31,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-5.png","element":"img","alt":" α + β(x + ϵ) ≤ x + ϵ. If α + βx < x","inline":true},{"text":", this is immediate. 
Therefore it remains to consider the case ","element":"span"},{"style":{"height":16.4},"width":225.7,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-6.png","element":"img","alt":" α + βx = x","inline":true},{"text":", for which we show that we must have ","element":"span"},{"style":{"height":16.4},"width":404.94,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-7.png","element":"img","alt":" α = 0 and β = 1, i.e","inline":true},{"style":{"height":17.6},"width":462.25,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-8.png","element":"img","alt":"y∗(t) = t for t ∈ (x−δ, x","inline":true},{"text":"). This follows from the observation that if ","element":"span"},{"style":{"height":17.6},"width":164.22,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-9.png","element":"img","alt":" y∗(t) < t","inline":true},{"text":", then for all ","element":"span"},{"style":{"height":13.82},"width":108.89,"height":34.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-10.png","element":"img","alt":" t1 > t","inline":true,"padRight":true},{"text":"we have ","element":"span"},{"style":{"height":17.6},"width":304.69,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-11.png","element":"img","alt":" y∗(t1) /∈ (y∗(t), t","inline":true},{"text":"]. Indeed, suppose not, then","element":"span"}],[{"style":{"width":"99%"},"width":1801,"height":613,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-12.png","element":"img"}],[{"text":"Note that ","element":"span"},{"style":{"height":16.4},"width":39.36,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-13.png","element":"img","alt":" fk","inline":true,"padRight":true},{"text":"has both left-derivatives and right-derivatives at every point in ","element":"span"},{"text":"R","element":"span"},{"text":". 
Suppose first that ","element":"span"},{"style":{"height":16.4},"width":73.1,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-14.png","element":"img","alt":" β ≥","inline":true,"padRight":true},{"text":"0, and we observe that","element":"span"}],[{"style":{"width":"46%"},"width":848,"height":48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-15.png","element":"img"}],[{"text":"Then by the basic definition of the right-derivative,","element":"span"}],[{"style":{"width":"81%"},"width":1479,"height":538,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-16.png","element":"img"}],[{"text":"where the last inequality follows from our inductive hypothesis that ","element":"span"},{"style":{"height":18.52},"width":457.16,"height":46.29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-17.png","element":"img","alt":" f′k(y+) ≤ f′k(y−) for all","inline":true},{"style":{"height":16},"width":108.29,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-18.png","element":"img","alt":"y ∈ R","inline":true},{"text":". An analogous argument shows that the same conclusion holds when ","element":"span"},{"style":{"height":16.4},"width":118.98,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-19.png","element":"img","alt":" β < 0.","inline":true}],[{"text":"Now we use this to prove the claim. 
","element":"span"},{"text":"Because there are no points of ","element":"span"},{"style":{"height":16.44},"width":83.18,"height":41.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-20.png","element":"img","alt":" fk+1","inline":true,"padRight":true},{"text":"at which the left-derivative is less than the right-derivative, without loss of generality we claim that ","element":"span"},{"style":{"height":16.44},"width":83.18,"height":41.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-21.png","element":"img","alt":" fk+1","inline":true,"padRight":true},{"text":"is differentiable at ","element":"span"},{"style":{"height":17.6},"width":629.73,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-22.png","element":"img","alt":" y∗(x) for all x, unless y∗(x) = x","inline":true},{"text":". Indeed, suppose not, then we have that ","element":"span"},{"style":{"height":19.71},"width":535.22,"height":49.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-23.png","element":"img","alt":"f′k+1(y∗(x)−) > f′k+1(y∗(x)+","inline":true},{"text":") and necessarily that defining ","element":"span"},{"style":{"height":17.6},"width":677.62,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-24.png","element":"img","alt":" h(y) := fk+1(y) + ρ(x − y), we have","inline":true,"padRight":true},{"text":"0 ","element":"span"},{"style":{"height":17.6},"width":193.72,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-25.png","element":"img","alt":" ∈ ∂h(y∗(x","inline":true},{"text":")). 
But since ","element":"span"},{"style":{"height":17.6},"width":417.15,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-26.png","element":"img","alt":" h(y∗(x)+) < h(y∗(x)−","inline":true},{"text":"), we contradict the optimality of ","element":"span"},{"style":{"height":17.6},"width":240.39,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-27.png","element":"img","alt":" y∗(x) as this","inline":true,"padRight":true},{"text":"point is in fact a local maximum.","element":"span"}],[{"text":"We finally consider claim (ii). By (iii), we have that for every point ","element":"span"},{"style":{"height":17.6},"width":136.69,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-28.png","element":"img","alt":" x, y∗(x","inline":true},{"text":") is either ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x ","element":"span"},{"text":"or at the minimum of one of the quadratic pieces of ","element":"span"},{"style":{"height":17.6},"width":302.26,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-29.png","element":"img","alt":" fk+1(·) + ρ(x − ·","inline":true},{"text":"). In either case, we have that ","element":"span"},{"style":{"height":17.6},"width":83.85,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-30.png","element":"img","alt":"y∗(x","inline":true},{"text":") is linear in ","element":"span"},{"style":{"height":17.6},"width":674.92,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-31.png","element":"img","alt":" x and thus fk+1(y∗(x))+ρ(x−y∗(x","inline":true},{"text":")) is quadratic in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":". 
We can define ","element":"span"},{"style":{"height":17.6},"width":143.48,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/33-32.png","element":"img","alt":" gk+1(x)","inline":true,"padRight":true},{"text":"pointwise as the minimum of this finite set of quadratic functions of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":", whose expressions are given in Appendix ","element":"span"},{"href":"#id-42","text":"A.1. ","element":"a"},{"text":"Importantly, the coefficients in the linear expression ","element":"span"},{"style":{"height":17.6},"width":339.76,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/34-0.png","element":"img","alt":" y∗(x) of x depend","inline":true,"padRight":true},{"text":"only on which of these functions is the minimum at ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":". As the number of intersections between elements in this set of quadratic functions is bounded above by twice the square of the size of the set, we can conclude that ","element":"span"},{"style":{"height":17.6},"width":124.46,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/34-1.png","element":"img","alt":" bk+2(x","inline":true},{"text":") is piecewise linear and with a finite number of pieces, thus concluding the proof.","element":"span"}],[{"id":"id-46","style":{"fontWeight":"bold"},"text":"S1.3 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Computation time experiments","element":"span"}],[{"text":"A small experiment was performed to demonstrate the runtimes one can expect in practice for the univariate problem. Note that this clustering is applied iteratively in the block coordinate descent procedure we propose to use in multivariate settings. We considered 3 settings: one with no signal, one with 2 true clusters and one with 5 true clusters. 
Independent and identically distributed Gaussian noise was added to each of the subaverages. As in Section ","element":"span"},{"href":"#id-99","text":"6.3 ","element":"a"},{"text":"the number of categories was increased by random splitting of the levels. Each of these tests were repeated 25 times, on a computer with a 3.2GHz processor. The results are shown in Figure ","element":"span"},{"href":"#id-100","text":"8.","element":"a"}],[{"style":{"width":"93%"},"width":1679,"height":514,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/34-2.png","element":"img"}],[{"id":"id-100","text":"Figure 8: Computation times for solving the univariate problem.","element":"figcaption","subtype":"caption"}],[{"id":"id-48","style":{"fontWeight":"bold"},"text":"S1.4 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Discretised algorithm","element":"span"}],[{"text":"For very large-scale problems, speed can be improved if we only allow coefficients to take values in some fixed finite grid, rather than any real value. Below we describe how such an algorithm would approximately solve the univariate objective ","element":"span"},{"href":"#id-97","text":"(12)","element":"a"},{"text":". We will use the unconstrained objective as discussed in Section ","element":"span"},{"href":"#id-42","text":"S1.1. 
","element":"a"},{"text":"We would first fix ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L ","element":"span"},{"text":"grid points ","element":"span"},{"style":{"height":15.1},"width":262.63,"height":37.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/34-3.png","element":"img","alt":" ϑ1 < · · · < ϑL","inline":true},{"text":", and then proceed as described in Algorithm ","element":"span"},{"href":"#id-101","text":"3.","element":"a"}],[{"text":"This algorithm has the same basic structure to the approach we use in Section ","element":"span"},{"href":"#id-38","text":"3.1 ","element":"a"},{"text":"for computing the exact global optimum. The difference is that now, instead of as in ","element":"span"},{"href":"#id-102","text":"(14)","element":"a"},{"text":", we define ","element":"span"},{"style":{"height":16.4},"width":39.36,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/34-4.png","element":"img","alt":" fk","inline":true,"padRight":true},{"text":"in the following way:","element":"span"}],[{"style":{"width":"76%"},"width":1379,"height":154,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/34-5.png","element":"img"}],[{"text":"The objects ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"play analogous roles to ","element":"span"},{"style":{"height":16.4},"width":191.74,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/34-6.png","element":"img","alt":" fk and bk","inline":true,"padRight":true},{"text":"in Section ","element":"span"},{"href":"#id-38","text":"3.1. ","element":"a"},{"text":"Since we restrict ","element":"span"},{"style":{"height":17.6},"width":333.35,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/34-7.png","element":"img","alt":"θk ∈ {ϑ1, . . . 
, ϑL}","inline":true},{"text":", we only need to store the values that ","element":"span"},{"style":{"height":16.4},"width":39.36,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/34-8.png","element":"img","alt":" fk","inline":true,"padRight":true},{"text":"takes at these ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L ","element":"span"},{"text":"values; this is the purpose of the vector ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F ","element":"span"},{"text":"in Algorithm ","element":"span"},{"href":"#id-101","text":"3. ","element":"a"},{"text":"Similarly, the rows ","element":"span"},{"style":{"height":17.6},"width":107.74,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/34-9.png","element":"img","alt":" B(k, ·","inline":true},{"text":") serve the same purpose as the functions ","element":"span"},{"style":{"height":15.24},"width":36.72,"height":38.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/34-10.png","element":"img","alt":" bk","inline":true,"padRight":true},{"text":"where, again, we only need to store ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L ","element":"span"},{"text":"values corresponding to the different options for ","element":"span"},{"style":{"height":15.24},"width":52.96,"height":38.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/34-11.png","element":"img","alt":" θk.","inline":true}],[{"id":"id-101","style":{"width":"100%"},"width":1806,"height":886,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/35-0.png","element":"img"}],[{"text":"This algorithm returns the optimal solution ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":12},"width":25,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/35-1.png","element":"img","alt":"θ","inline":true,"padRight":true},{"text":"to the objective where each of the coefficients are 
restricted to take values only in ","element":"span"},{"style":{"height":17.6},"width":236.36,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/35-2.png","element":"img","alt":" {ϑ1, . . . , ϑL}","inline":true},{"text":". We must ensure that the grid of values has fine enough resolution that interesting answers can be obtained, which requires ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L ","element":"span"},{"text":"being sufficiently large. The number of clusters obtained by this approximate algorithm is bounded above by ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L","element":"span"},{"text":", so this must not be chosen too small.","element":"span"}],[{"text":"One can see that the computational complexity of this algorithm is linear in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K","element":"span"},{"text":", with a total of ","element":"span"},{"style":{"height":19.13},"width":138.35,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/35-3.png","element":"img","alt":" O(KL2","inline":true},{"text":") operations required. This is of course in addition to the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"O","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"n","element":"span"},{"text":") operations needed to compute ","element":"span"},{"style":{"height":18.01},"width":511.81,"height":45.03,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/35-4.png","element":"img","alt":" w1, . . . , wK and ¯Y1, . . . , ¯YK","inline":true,"padRight":true},{"text":"beforehand. 
In particular, choosing ","element":"span"},{"style":{"height":19.58},"width":172.01,"height":48.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/35-5.png","element":"img","alt":" L ≲√K","inline":true,"padRight":true},{"text":"guarantees that the complexity of this algorithm is at worst quadratic in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"K","element":"span"},{"text":".","element":"span"}]]},{"heading":"S2 Proofs of results in Section 4","paragraphs":[[{"style":{"fontWeight":"bold"},"text":"S2.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Proof of Theorem ","element":"span"},{"href":"#id-33","style":{"fontWeight":"bold"},"text":"5","element":"a"}],[{"text":"The proof of Theorem ","element":"span"},{"href":"#id-33","text":"5 ","element":"a"},{"text":"requires a number of auxiliary lemmas, which can be found in Section ","element":"span"},{"href":"#id-103","text":"S2.1.1.","element":"a"}],[{"text":"Let us define ","element":"span"},{"style":{"height":25.72},"width":1546.92,"height":64.3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/35-6.png","element":"img","alt":" Ri = Yi − ˆµ for i = 1, . . . , n, and ¯Rk = 1nk�ni=1 1{Xi=k}Ri for k = 1, . . . , K. Note","inline":true,"padRight":true},{"text":"that","element":"span"}],[{"style":{"width":"95%"},"width":1725,"height":699,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/35-7.png","element":"img"}],[{"text":"where we define ","element":"span"},{"style":{"height":27.07},"width":751.02,"height":67.67,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/36-0.png","element":"img","alt":" v(k) ∈ Rn by v(k)i = 1nk 1{Xi=k}. 
Since P","inline":true,"padRight":true},{"text":"is an orthogonal projection matrix, we ","element":"span"},{"text":"have that ","element":"span"},{"style":{"height":26.28},"width":495.64,"height":65.71,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/36-1.png","element":"img","alt":" ∥Pv(k)∥2 ≤ ∥v(k)∥2 = 1√nk","inline":true,"padRight":true},{"text":". It follows that ","element":"span"},{"style":{"height":20.05},"width":151.04,"height":50.13,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/36-2.png","element":"img","alt":" v(k)T Pε","inline":true,"padRight":true},{"text":"is sub-Gaussian with parameter ","element":"span"},{"style":{"height":18.85},"width":128.87,"height":47.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/36-3.png","element":"img","alt":"σ/√nk","inline":true},{"text":". Applying the standard sub-Gaussian tail bound, we obtain","element":"span"}],[{"style":{"width":"50%"},"width":912,"height":257,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/36-4.png","element":"img"}],[{"text":"where recall that ","element":"span"},{"style":{"height":17.6},"width":204.39,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/36-5.png","element":"img","alt":" wk = nk/n","inline":true},{"text":". Therefore, we have that","element":"span"}],[{"id":"id-118","style":{"width":"97%"},"width":1753,"height":131,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/36-6.png","element":"img"}],[{"text":"In the following we work on the intersection Λ := ","element":"span"},{"style":{"height":20.45},"width":141.16,"height":51.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/36-7.png","element":"img","alt":" ∩Kk=1Λk","inline":true},{"text":". 
This entails that for each ","element":"span"},{"style":{"height":19.21},"width":161.8,"height":48.03,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/36-8.png","element":"img","alt":" k, | ¯Rk −","inline":true},{"style":{"height":20.66},"width":282.65,"height":51.65,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/36-9.png","element":"img","alt":"θ0k| < √ηγ∗sλ/","inline":true},{"text":"2. We now relabel indices such that ¯","element":"span"},{"style":{"height":17.52},"width":287.18,"height":43.79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/36-10.png","element":"img","alt":"R1 ≤ · · · ≤ ¯RK","inline":true},{"text":", and so from Proposition ","element":"span"},{"href":"#id-95","text":"2 ","element":"a"},{"text":"that ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":19.71},"width":280.64,"height":49.27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/36-11.png","element":"img","alt":"θ1 ≤ · · · ≤ ˆθK","inline":true},{"text":". Since our assumption ","element":"span"},{"href":"#id-59","text":"(24) ","element":"a"},{"text":"implies ∆(","element":"span"},{"style":{"height":21.4},"width":279.31,"height":53.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/36-12.png","element":"img","alt":"θ0) ≥ √ηγ∗sλ","inline":true},{"text":", it follows that on Λ the observed ordering is consistent with the ordering of the true coefficients, i.e. 
there exist 0 = ","element":"span"},{"style":{"height":15.02},"width":640.53,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/36-13.png","element":"img","alt":" k0 < k1 < · · · < ks = K such that","inline":true}],[{"style":{"width":"82%"},"width":1490,"height":56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/36-14.png","element":"img"}],[{"text":"Indeed, observe that for ","element":"span"},{"style":{"height":16},"width":283.69,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/36-15.png","element":"img","alt":" j = 1, . . . , s −","inline":true,"padRight":true},{"text":"1, we have by the triangle inequality and ","element":"span"},{"href":"#id-59","text":"(24)","element":"a"},{"text":", the stronger property that","element":"span"}],[{"id":"id-108","style":{"width":"77%"},"width":1398,"height":203,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/36-16.png","element":"img"}],[{"text":"Our optimisation objective is therefore","element":"span"}],[{"id":"id-119","style":{"width":"76%"},"width":1377,"height":131,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/36-17.png","element":"img"}],[{"text":"Since ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":21.6},"width":774.53,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/36-18.png","element":"img","alt":"Rkj − ¯Rkj−1+1 < √ηγ∗sλ for j = 1, . . . 
, s","inline":true},{"text":", it follows from Lemma ","element":"span"},{"href":"#id-104","text":"8 ","element":"a"},{"text":"that ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":23.79},"width":316.31,"height":59.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/36-19.png","element":"img","alt":"θkj+1 − ˆθkj ≥ γλ","inline":true,"padRight":true},{"text":"for ","element":"span"},{"style":{"height":16},"width":358.9,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/36-20.png","element":"img","alt":" j = 1, . . . , s − 1, so","inline":true}],[{"id":"id-105","style":{"width":"90%"},"width":1625,"height":478,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/36-21.png","element":"img"}],[{"text":"Observe that we can have ","element":"span"},{"style":{"height":17.42},"width":524.24,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/36-22.png","element":"img","alt":" kj−1 + 1 > kj −1 for some j","inline":true},{"text":", in which case we take the sum over that range to be zero. Note that ","element":"span"},{"href":"#id-105","text":"(40) ","element":"a"},{"text":"can be optimised over (","element":"span"},{"style":{"height":19.19},"width":283.74,"height":47.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/36-23.png","element":"img","alt":"θkj−1+1, . . . , θkj","inline":true},{"text":") separately for each ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j ","element":"span"},{"text":"= 1","element":"span"},{"style":{"fontStyle":"italic"},"text":", . . . , s","element":"span"},{"text":". If ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s ","element":"span"},{"text":"= 1, i.e. the true signal is zero, then the result follows from Lemma ","element":"span"},{"href":"#id-36","text":"10. 
","element":"a"},{"text":"Now we see what happens when ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s > ","element":"span"},{"text":"1.","element":"span"}],[{"text":"Without loss of generality, consider ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j ","element":"span"},{"text":"= 1 and note that if ","element":"span"},{"style":{"height":15.02},"width":39.71,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-0.png","element":"img","alt":" k1","inline":true,"padRight":true},{"text":"= 1 it is immediate that ˆ","element":"span"},{"style":{"height":21.68},"width":147.26,"height":54.21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-1.png","element":"img","alt":"θ1 = ˆθ01","inline":true},{"text":". Hence, we can assume that ","element":"span"},{"style":{"height":15.02},"width":93.25,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-2.png","element":"img","alt":" k1 >","inline":true,"padRight":true},{"text":"1. We note that ˆ","element":"span"},{"style":{"height":22.22},"width":615.92,"height":55.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-3.png","element":"img","alt":"θ01 = �k1k=1 wk ¯Rk/w01, where we","inline":true,"padRight":true},{"text":"define ","element":"span"},{"style":{"height":20.05},"width":204.39,"height":50.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-4.png","element":"img","alt":" w0k = n0k/n","inline":true},{"text":". 
We see that our goal is to compute","element":"span"}],[{"style":{"width":"79%"},"width":1431,"height":287,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-5.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":15.93},"width":145.74,"height":39.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-6.png","element":"img","alt":" 1 ∈ Rk1 ","inline":true,"padRight":true},{"text":"is a vector of ones and ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":21.68},"width":603.09,"height":54.21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-7.png","element":"img","alt":"Rk := ¯Rk − ˆθ01 for k = 1, . . . , k1","inline":true},{"text":". Note that we subtract ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":19.41},"width":38.7,"height":48.53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-8.png","element":"img","alt":"θ01 ","inline":true,"padRight":true},{"text":"to ensure that","element":"span"}],[{"style":{"width":"14%"},"width":264,"height":121,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-9.png","element":"img"}],[{"text":"as required for application of Lemma ","element":"span"},{"href":"#id-36","text":"10. ","element":"a"},{"text":"We have by assumption that for ","element":"span"},{"style":{"height":15.6},"width":273.32,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-10.png","element":"img","alt":" k ∈ 1, . . . , k1,","inline":true},{"style":{"height":21.94},"width":728.63,"height":54.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-11.png","element":"img","alt":"| ˜Rk| ≤ √ηγ∗sλ/2 ≤ (2 ∧�w01γ)λ/w01","inline":true},{"text":". 
Thus, Lemma ","element":"span"},{"href":"#id-36","text":"10 ","element":"a"},{"text":"can be applied with ˇ","element":"span"},{"style":{"height":19.41},"width":285.73,"height":48.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-12.png","element":"img","alt":"w = w01 and it","inline":true,"padRight":true},{"text":"follows that ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":21.68},"width":475.8,"height":54.21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-13.png","element":"img","alt":"θk = ˆθ01 for k = 1, . . . , k1.","inline":true}],[{"id":"id-103","style":{"fontWeight":"bold"},"text":"S2.1.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Auxiliary lemmas","element":"span"}],[{"text":"Here we prove a number of results required to obtain conditions for recovering the oracle least squares estimate in the univariate case. Lemma ","element":"span"},{"href":"#id-36","text":"10 ","element":"a"},{"text":"gives conditions for recovery of the true solution, in the case where there is zero signal. Lemmas ","element":"span"},{"href":"#id-104","text":"8 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-106","text":"9 ","element":"a"},{"text":"ensure that the true levels are far enough apart that they can be separated. Once we have this separation, we apply Lemma ","element":"span"},{"href":"#id-36","text":"10 ","element":"a"},{"text":"on each of the levels to obtain the solution.","element":"span"}],[{"id":"id-113","style":{"fontWeight":"bold"},"text":"Lemma 7. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Consider the optimisation problem","element":"span"}],[{"style":{"width":"33%"},"width":605,"height":92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-14.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where ","element":"span"},{"style":{"height":17.6},"width":371.38,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-15.png","element":"img","alt":" τ > 0 and κ ∈ (0, 1]","inline":true},{"style":{"fontStyle":"italic"},"text":". Suppose further that ","element":"span"},{"style":{"height":19.12},"width":628.58,"height":47.81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-16.png","element":"img","alt":" τ < (1∧√κγ)λ/2κ. Then x∗ = 0","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is the unique optimum.","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"We first observe that","element":"span"}],[{"style":{"width":"72%"},"width":1312,"height":102,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-17.png","element":"img"}],[{"text":"For convenience, we define ","element":"span"},{"style":{"height":21.49},"width":635.83,"height":53.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-18.png","element":"img","alt":" F(x) := (2τ − x)2/2 + ρκγ,λ/κ(x","inline":true},{"text":"). It now suffices to show that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F ","element":"span"},{"text":"is uniquely minimised at 0 provided ","element":"span"},{"style":{"height":19.12},"width":372.83,"height":47.81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-19.png","element":"img","alt":" τ < (1 ∧ √κγ)λ/2κ","inline":true},{"text":". 
We can clearly see that ","element":"span"},{"style":{"height":17.6},"width":224.62,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-20.png","element":"img","alt":" x∗ ∈ [0, 2τ].","inline":true,"padRight":true},{"text":"Equation (2.3) of ","element":"span"},{"href":"#id-107","text":"Breheny and Huang ","element":"a"},{"href":"#id-107","text":"[2011] ","element":"a"},{"text":"gives the result when ","element":"span"},{"style":{"height":15.2},"width":142.16,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-21.png","element":"img","alt":" κγ ≥ 1.","inline":true}],[{"text":"When ","element":"span"},{"style":{"height":13.2},"width":99.72,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-22.png","element":"img","alt":" κγ <","inline":true,"padRight":true},{"text":"1, we see that any stationary point of ","element":"span"},{"style":{"height":17.6},"width":303.51,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-23.png","element":"img","alt":" F in [0, γλ ∧ 2τ","inline":true},{"text":"] must be a maximum, since on this interval ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":") is a quadratic function with a negative coefficient of ","element":"span"},{"style":{"height":15.13},"width":41.94,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-24.png","element":"img","alt":" x2","inline":true},{"text":". 
Therefore its minimum over [0","element":"span"},{"style":{"height":16},"width":70.42,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-25.png","element":"img","alt":", γλ","inline":true},{"text":"] is attained at either ","element":"span"},{"style":{"height":16},"width":634.46,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-26.png","element":"img","alt":" x = 0 or x = γλ ∧ 2τ. If 2τ ≤ γλ","inline":true},{"text":", then it suffices to check that ","element":"span"},{"style":{"height":17.6},"width":243.97,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-27.png","element":"img","alt":" F(0) < F(2τ","inline":true},{"text":"). This holds if and only if ","element":"span"},{"style":{"height":17.6},"width":221.5,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-28.png","element":"img","alt":" τ < γλ/(γκ","inline":true,"padRight":true},{"text":"+ 1), but since we are assuming ","element":"span"},{"style":{"height":17.6},"width":371.97,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-29.png","element":"img","alt":"τ ≤ γλ/2 and κγ <","inline":true,"padRight":true},{"text":"1, this is always satisfied.","element":"span"}],[{"text":"If ","element":"span"},{"style":{"height":16},"width":153.47,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-30.png","element":"img","alt":" γλ < 2τ","inline":true},{"text":", then we can see that the minimum of ","element":"span"},{"style":{"height":17.6},"width":267.04,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-31.png","element":"img","alt":" F over [γλ, 2τ","inline":true},{"text":"] will be attained at exactly 
2","element":"span"},{"style":{"height":8},"width":23,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-32.png","element":"img","alt":"τ","inline":true},{"text":". Thus, here it also suffices to check ","element":"span"},{"style":{"height":17.6},"width":243.97,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-33.png","element":"img","alt":" F(0) < F(2τ","inline":true},{"text":"), which holds if and only if ","element":"span"},{"style":{"height":20.8},"width":278.9,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-34.png","element":"img","alt":" τ < √(γ/κ)λ/2.","inline":true,"padRight":true},{"text":"The final bound ","element":"span"},{"style":{"height":19.12},"width":367.05,"height":47.81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/37-35.png","element":"img","alt":" τ < (1 ∧ √κγ)λ/2κ","inline":true,"padRight":true},{"text":"follows from combining the results for these cases.","element":"span"}],[{"text":"The following is a deterministic result to establish separation between groups of coefficients.","element":"span"}],[{"id":"id-104","style":{"fontWeight":"bold"},"text":"Lemma 8. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Consider the setup of Theorem ","element":"span"},{"href":"#id-33","style":{"fontStyle":"italic"},"text":"5, ","element":"a"},{"style":{"fontStyle":"italic"},"text":"and assume that ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":15.6},"width":106.5,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-0.png","element":"img","alt":"µ = 0","inline":true},{"style":{"fontStyle":"italic"},"text":". 
Suppose that ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":14.62},"width":199.52,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-1.png","element":"img","alt":"Y1 ≤ · · · ≤","inline":true,"padRight":true},{"text":"¯","element":"span"},{"style":{"height":14.7},"width":55.33,"height":36.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-2.png","element":"img","alt":"YK","inline":true},{"style":{"fontStyle":"italic"},"text":", and that for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j ","element":"span"},{"text":"= 1","element":"span"},{"style":{"fontStyle":"italic"},"text":", . . . , s ","element":"span"},{"style":{"fontStyle":"italic"},"text":"we have","element":"span"}],[{"id":"id-111","style":{"width":"62%"},"width":1134,"height":55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-3.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where ","element":"span"},{"style":{"height":17.42},"width":222.21,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-4.png","element":"img","alt":" kj and kj−1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"are as defined in ","element":"span"},{"href":"#id-108","text":"(36)","element":"a"},{"style":{"fontStyle":"italic"},"text":". Suppose further that for ","element":"span"},{"style":{"height":16},"width":306.04,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-5.png","element":"img","alt":" j = 1, . . . 
, s − 1,","inline":true}],[{"style":{"width":"77%"},"width":1390,"height":59,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-6.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Then for ","element":"span"},{"style":{"height":23.79},"width":993.91,"height":59.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-7.png","element":"img","alt":" j = 1, . . . , s, we have ¯Ykj−1+1 ≤ ˆθkj−1+1 ≤ ˆθkj ≤ ¯Ykj.","inline":true}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"For convenience, within this lemma we define ","element":"span"},{"style":{"height":18.32},"width":249.07,"height":45.81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-8.png","element":"img","alt":" ζ := √ηγ∗sλ","inline":true},{"text":". Recall that the objective function which ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":12},"width":25,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-9.png","element":"img","alt":"θ","inline":true,"padRight":true},{"text":"optimises takes the form","element":"span"}],[{"style":{"width":"48%"},"width":883,"height":131,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-10.png","element":"img"}],[{"text":"We first claim that ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":19.21},"width":560.7,"height":48.03,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-11.png","element":"img","alt":"θk ∈ [ ¯Y1, ¯YK] for k = 1, . . . , K","inline":true},{"text":". 
To see this, suppose that this is not the case and define ","element":"span"},{"text":"ˇ","element":"span"},{"style":{"height":12},"width":25,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-12.png","element":"img","alt":"θ","inline":true,"padRight":true},{"text":"by projecting ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":21.41},"width":1276.32,"height":53.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-13.png","element":"img","alt":"θ onto [ ¯Y1, ¯YK]K (i.e. ˇθk = ¯YK ∧ ( ¯Y1 ∨ ˆθk) for k = 1, . . . , K). The","inline":true,"padRight":true},{"text":"penalty contribution from ","element":"span"},{"text":"ˇ","element":"span"},{"style":{"height":12},"width":25,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-14.png","element":"img","alt":"θ","inline":true,"padRight":true},{"text":"is no larger than that of ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":12},"width":25,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-15.png","element":"img","alt":"θ","inline":true},{"text":", and the loss contribution is strictly smaller, so we obtain the contradiction ","element":"span"},{"style":{"height":21.4},"width":258.87,"height":53.51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-16.png","element":"img","alt":" Q(ˇθ) < Q(ˆθ).","inline":true}],[{"text":"We now proceed to show that for ","element":"span"},{"style":{"height":16},"width":258.56,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-17.png","element":"img","alt":" j = 1, . . . , s −","inline":true,"padRight":true},{"text":"1, we have ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":23.79},"width":624.16,"height":59.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-18.png","element":"img","alt":"θkj ≤ ¯Ykj and ˆθkj+1 ≥ ¯Ykj+1. 
We","inline":true,"padRight":true},{"text":"prove the first of these sets of inequalities, since the second follows similarly by considering the problem with ","element":"span"},{"style":{"height":20.21},"width":156.12,"height":50.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-19.png","element":"img","alt":" −ˆθ, − ¯Y","inline":true,"padRight":true},{"text":"and reversing the indices. Suppose, for contradiction, that there exists some ","element":"span"},{"style":{"height":23.8},"width":853.5,"height":59.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-20.png","element":"img","alt":" j in {1, . . . , s − 1} with ˆθkj > ¯Ykj. Let this j","inline":true,"padRight":true},{"text":"be minimal, such that for all ","element":"span"},{"style":{"fontStyle":"italic"},"text":"l < j ","element":"span"},{"text":"we have ˆ","element":"span"},{"style":{"height":19.91},"width":178.45,"height":49.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-21.png","element":"img","alt":"θkl ≤ ¯Ykl.","inline":true}],[{"text":"Next define ","element":"span"},{"style":{"height":15.02},"width":30.02,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-22.png","element":"img","alt":" l1","inline":true,"padRight":true},{"text":"to be the maximal element of ","element":"span"},{"style":{"height":18.22},"width":426.55,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-23.png","element":"img","alt":" {kj−1 + 1, . . . 
, kj − 1}","inline":true,"padRight":true},{"text":"such that ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":21.6},"width":194.56,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-24.png","element":"img","alt":"θl1 ≤ ¯Ykj.","inline":true,"padRight":true},{"text":"Similarly, we define ","element":"span"},{"style":{"height":18.22},"width":425.52,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-25.png","element":"img","alt":" l2 ∈ {kj + 1, . . . , kj+1}","inline":true,"padRight":true},{"text":"to be minimal such that ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":21.6},"width":209.66,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-26.png","element":"img","alt":"θl2 ≥ ¯Ykj+1","inline":true},{"text":". The existence of ","element":"span"},{"style":{"height":15.02},"width":161.36,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-27.png","element":"img","alt":" l1 and l2","inline":true,"padRight":true},{"text":"is guaranteed by Lemma ","element":"span"},{"href":"#id-106","text":"9.","element":"a"}],[{"text":"We note that for ","element":"span"},{"href":"#id-106","style":{"height":23.79},"width":477.39,"height":59.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-28.png","element":"img","alt":" l = l1 + 1, . . . , kj, ˆθl = ˆθkj","inline":true,"padRight":true},{"text":"and hence ( ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":23.79},"width":693.31,"height":59.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-29.png","element":"img","alt":"Yl − ˆθl)2 ≥ ( ¯Ykj − ˆθl)2 = ( ¯Ykj − ˆθkj)2.","inline":true,"padRight":true},{"text":"This can be shown by contradiction, as in ","element":"span"},{"href":"#id-109","text":"(55)","element":"a"},{"text":". 
For such ","element":"span"},{"style":{"fontStyle":"italic"},"text":"l","element":"span"},{"text":", we have from optimality of ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":12.8},"width":117.26,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-30.png","element":"img","alt":"θ that","inline":true,"padRight":true},{"text":"¯","element":"span"},{"style":{"height":23.79},"width":333.05,"height":59.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-31.png","element":"img","alt":"Yl − ˆθl1 ≥ ˆθkj − ¯Yl","inline":true,"padRight":true},{"text":"(otherwise one could improve the objective by setting ˆ","element":"span"},{"style":{"height":21.62},"width":137.86,"height":54.05,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-32.png","element":"img","alt":"θl1 = ˆθl","inline":true},{"text":") which implies that ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":19.43},"width":142.71,"height":48.57,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-33.png","element":"img","alt":"θl1 < ¯Yl","inline":true},{"text":". From this it follows that ( ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":23.8},"width":859.91,"height":59.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-34.png","element":"img","alt":"Yl − ˆθl1)2 ≤ ( ¯Ykj − ˆθl1)2, since ˆθl1 < ¯Yl ≤ ¯Ykj.","inline":true}],[{"text":"Similarly, if ","element":"span"},{"style":{"height":17.42},"width":146.59,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-35.png","element":"img","alt":" l2 > kj","inline":true,"padRight":true},{"text":"+ 1, then for ","element":"span"},{"style":{"height":17.42},"width":390.07,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-36.png","element":"img","alt":" l = kj + 1, . . . 
, l2 −","inline":true,"padRight":true},{"text":"1 we have ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":23.8},"width":426.84,"height":59.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-37.png","element":"img","alt":"θl = ˆθkj+1 and hence","inline":true,"padRight":true},{"text":"( ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":23.79},"width":1059.71,"height":59.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-38.png","element":"img","alt":"Yl − ˆθl)2 ≥ ( ¯Ykj+1 − ˆθl)2 = ( ¯Ykj+1 − ˆθkj+1)2. For such l","inline":true},{"text":", it follows that ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":19.43},"width":144.84,"height":48.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-39.png","element":"img","alt":"θl2 > ¯Yl","inline":true,"padRight":true},{"text":"and therefore that ( ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":23.79},"width":505.67,"height":59.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-40.png","element":"img","alt":"Yl − ˆθl2)2 ≤ ( ¯Ykj+1 − ˆθl2)2.","inline":true}],[{"text":"Now, we define","element":"span"}],[{"style":{"width":"52%"},"width":944,"height":268,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-41.png","element":"img"}],[{"text":"We also define ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":15.94},"width":140.76,"height":39.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-42.png","element":"img","alt":"θ ∈ RK ","inline":true,"padRight":true},{"text":"according 
to","element":"span"}],[{"style":{"width":"27%"},"width":502,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/38-43.png","element":"img"}],[{"style":{"width":"99%"},"width":1804,"height":345,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/39-0.png","element":"img"}],[{"text":"Thus,","element":"span"}],[{"id":"id-112","style":{"width":"83%"},"width":1514,"height":748,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/39-1.png","element":"img"}],[{"text":"We specify the infimum in ","element":"span"},{"href":"#id-110","text":"(47) ","element":"a"},{"text":"because ( ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":23.79},"width":125.57,"height":59.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/39-2.png","element":"img","alt":"Ykj, ˆθl2","inline":true},{"text":"] is not closed, and let (","element":"span"},{"style":{"height":10.62},"width":53.06,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/39-3.png","element":"img","alt":"am","inline":true},{"text":") be a convergent sequence in ( ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":23.79},"width":125.56,"height":59.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/39-4.png","element":"img","alt":"Ykj, ˆθl2","inline":true},{"text":"] whose limit attains this infimum. 
We define ","element":"span"},{"style":{"height":15.42},"width":334.77,"height":38.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/39-5.png","element":"img","alt":" a∗ = limm→∞ am.","inline":true}],[{"text":"By assumption ","element":"span"},{"href":"#id-111","text":"(43)","element":"a"},{"text":", at least one of (","element":"span"},{"style":{"height":21.62},"width":420.25,"height":54.05,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/39-6.png","element":"img","alt":"a∗ − ˆθl1) and (ˆθl2 − a∗","inline":true},{"text":") is greater than or equal to ","element":"span"},{"style":{"height":16},"width":62.47,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/39-7.png","element":"img","alt":" γλ.","inline":true,"padRight":true},{"text":"Here, we use that the separation ","element":"span"},{"href":"#id-111","text":"(43) ","element":"a"},{"style":{"height":21.62},"width":467.76,"height":54.05,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/39-8.png","element":"img","alt":" ≥ 2γλ. If ˆθl2 − a∗ ≥ γλ","inline":true,"padRight":true},{"text":"then we denote this case (A1) and ","element":"span"},{"href":"#id-112","text":"(45) ","element":"a"},{"text":"becomes","element":"span"}],[{"id":"id-110","style":{"width":"88%"},"width":1592,"height":253,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/39-9.png","element":"img"}],[{"text":"We define ˜","element":"span"},{"style":{"height":12.74},"width":40.06,"height":31.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/39-10.png","element":"img","alt":"a∗ ","inline":true,"padRight":true},{"text":"to be the minimiser over ˜","element":"span"},{"style":{"fontStyle":"italic"},"text":"a ","element":"span"},{"text":"of ","element":"span"},{"href":"#id-110","text":"(47)","element":"a"},{"text":". 
We can observe that since ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":23.79},"width":333.98,"height":59.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/39-11.png","element":"img","alt":"Ykj − ˆθl1 < ζ and","inline":true},{"style":{"height":23.94},"width":1207.61,"height":59.85,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/39-12.png","element":"img","alt":"ζ < (1 ∧ √(γ ˜wkj))λ/ ˜wkj, we have ¯Ykj − ˆθl1 < (1 ∧ √(γ ˜wkj))λ/ ˜wkj","inline":true},{"text":". Thus, we have by Lemma ","element":"span"},{"href":"#id-113","text":"7 ","element":"a"},{"text":"that the uniquely optimal ˜","element":"span"},{"style":{"height":21.62},"width":145.78,"height":54.05,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/39-13.png","element":"img","alt":"a∗ = ˆθl1","inline":true},{"text":". This gives that the value of ","element":"span"},{"href":"#id-110","text":"(47) ","element":"a"},{"text":"is zero.","element":"span"}],[{"text":"It is straightforward to see from ","element":"span"},{"href":"#id-110","text":"(46) ","element":"a"},{"text":"that ","element":"span"},{"style":{"height":21.6},"width":155.15,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/39-14.png","element":"img","alt":" a∗ = ¯Ykj","inline":true,"padRight":true},{"text":"must be the unique limit of (","element":"span"},{"style":{"height":17.6},"width":216.4,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/39-15.png","element":"img","alt":"am). 
As we","inline":true,"padRight":true},{"text":"have assumed that ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":21.6},"width":171.17,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/39-16.png","element":"img","alt":"θkj > ¯Ykj","inline":true,"padRight":true},{"text":"and the infimum is not attained in (","element":"span"},{"text":"¯","element":"span"},{"style":{"height":21.6},"width":181.88,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/39-17.png","element":"img","alt":"Ykj, ¯Ykj+1","inline":true},{"text":"), the inequality in line ","element":"span"},{"href":"#id-110","text":"(46) ","element":"a"},{"text":"can be made strict. It follows that ","element":"span"},{"style":{"height":21.41},"width":258.88,"height":53.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/39-18.png","element":"img","alt":" Q(ˆθ) > Q(˜θ).","inline":true}],[{"text":"Thus, it remains for us to consider the case where ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":17.01},"width":240.84,"height":42.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/39-19.png","element":"img","alt":"θl2 −a∗ < γλ","inline":true},{"text":", which implies that ","element":"span"},{"style":{"height":21.62},"width":177.76,"height":54.05,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/39-20.png","element":"img","alt":" a∗− ˆθl1 ≥","inline":true},{"style":{"height":16},"width":51.05,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/39-21.png","element":"img","alt":"γλ","inline":true},{"text":". We denote this case (A2). 
Now, from ","element":"span"},{"href":"#id-112","text":"(45) ","element":"a"},{"text":"we can obtain","element":"span"}],[{"id":"id-114","style":{"width":"88%"},"width":1592,"height":114,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/39-22.png","element":"img"}],[{"text":"The objective is piecewise quadratic (and continuously differentiable), with two pieces: [","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":21.62},"width":576.54,"height":54.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/39-23.png","element":"img","alt":"θl1, ˆθl2 − γλ] and (ˆθl2 − γλ, ˆθl2","inline":true},{"text":"]. On the first region, the objective is a convex quadratic with minimum at ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":23.8},"width":371.74,"height":59.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/39-24.png","element":"img","alt":"Ykj ∈ [ˆθl1, ˆθl2 − γλ].","inline":true}],[{"text":"By the assumption that ","element":"span"},{"style":{"height":21.62},"width":274.7,"height":54.05,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/40-0.png","element":"img","alt":" a∗ > ˆθl2 − γλ","inline":true},{"text":", we know that the objective must be concave on (","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":21.62},"width":220.23,"height":54.05,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/40-1.png","element":"img","alt":"θl2 − γλ, ˆθl2","inline":true},{"text":"]. It is clear that the derivative of the objective at ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":17.01},"width":155.81,"height":42.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/40-2.png","element":"img","alt":"θl2 − γλ","inline":true,"padRight":true},{"text":"is positive. 
Hence, if ˜","element":"span"},{"style":{"height":21.62},"width":237.53,"height":54.05,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/40-3.png","element":"img","alt":"a∗ = ˆθl2−γλ","inline":true},{"text":", then the objective will take a strictly lower value at some ˜","element":"span"},{"style":{"height":21.62},"width":477.09,"height":54.05,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/40-4.png","element":"img","alt":"a∗ ∈ (ˆθl2−γλ−ϵ, ˆθl2−γλ)","inline":true,"padRight":true},{"text":"(for some small ","element":"span"},{"style":{"height":10.4},"width":63.84,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/40-5.png","element":"img","alt":" ϵ >","inline":true,"padRight":true},{"text":"0), contradicting optimality of ˜","element":"span"},{"style":{"height":12.73},"width":40.06,"height":31.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/40-6.png","element":"img","alt":"a∗","inline":true},{"text":". It therefore follows that ˜","element":"span"},{"style":{"height":21.62},"width":161.37,"height":54.05,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/40-7.png","element":"img","alt":"a∗ = ˆθl2.","inline":true}],[{"text":"With this knowledge, we can further simplify ","element":"span"},{"href":"#id-114","text":"(48) ","element":"a"},{"text":"to obtain","element":"span"}],[{"style":{"width":"56%"},"width":1025,"height":93,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/40-8.png","element":"img"}],[{"text":"The second inequality follows from ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":23.8},"width":628.33,"height":59.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/40-9.png","element":"img","alt":"Ykj − ˆθl1 ≤ ζ and ˆθl2 − ¯Ykj > ζ","inline":true},{"text":". 
Hence, we obtain that ","element":"span"},{"style":{"height":21.4},"width":258.88,"height":53.51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/40-10.png","element":"img","alt":"Q(ˆθ) > Q(˜θ).","inline":true}],[{"text":"We now direct our attention towards case (B), where similarly to before we observe that the penalty contributions between ","element":"span"},{"style":{"height":21.41},"width":396.55,"height":53.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/40-11.png","element":"img","alt":" l1 and l2 in Q(ˆθ) are","inline":true}],[{"style":{"width":"66%"},"width":1206,"height":136,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/40-12.png","element":"img"}],[{"text":"Similarly to ","element":"span"},{"href":"#id-112","text":"(44) ","element":"a"},{"text":"in case (A), we obtain","element":"span"}],[{"id":"id-115","style":{"width":"86%"},"width":1566,"height":680,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/40-13.png","element":"img"}],[{"text":"We specify the infimum in ","element":"span"},{"href":"#id-115","text":"(50) ","element":"a"},{"text":"because ( ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":21.6},"width":181.89,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/40-14.png","element":"img","alt":"Ykj, ¯Ykj+1","inline":true},{"text":") is not closed and therefore a minimum may not exist. Let (","element":"span"},{"style":{"height":15.6},"width":123.14,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/40-15.png","element":"img","alt":"am, bm","inline":true},{"text":") be a convergent sequence in ( ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":21.6},"width":181.88,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/40-16.png","element":"img","alt":"Ykj, ¯Ykj+1","inline":true},{"text":") whose limit achieves this infimum. 
We now define (","element":"span"},{"style":{"height":17.6},"width":474.61,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/40-17.png","element":"img","alt":"a∗, b∗) = limm→∞(am, bm","inline":true},{"text":"). By assumption ","element":"span"},{"href":"#id-111","text":"(43)","element":"a"},{"text":", we know that ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":18.79},"width":140.46,"height":46.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/40-18.png","element":"img","alt":"Ykj+1−","inline":true,"padRight":true},{"text":"¯","element":"span"},{"style":{"height":19.19},"width":200.34,"height":47.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/40-19.png","element":"img","alt":"Ykj ≥ 3γλ","inline":true},{"text":", which implies that ˆ","element":"span"},{"style":{"height":21.62},"width":294.97,"height":54.05,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/40-20.png","element":"img","alt":"θl2 − ˆθl1 ≥ 3γλ","inline":true},{"text":". Thus, one of ","element":"span"},{"style":{"height":21.62},"width":605.6,"height":54.05,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/40-21.png","element":"img","alt":" {(ˆθl2 − b∗), (b∗ − a∗), (a∗ − ˆθl1)}","inline":true,"padRight":true},{"text":"must be at least ","element":"span"},{"style":{"height":16},"width":62.47,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/40-22.png","element":"img","alt":" γλ.","inline":true}],[{"text":"We first consider if ","element":"span"},{"style":{"height":16},"width":242.21,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/40-23.png","element":"img","alt":" b∗ − a∗ ≥ γλ","inline":true},{"text":", and denote this case (B1). 
Here, ","element":"span"},{"href":"#id-115","text":"(50) ","element":"a"},{"text":"becomes","element":"span"}],[{"style":{"width":"19%"},"width":345,"height":12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/40-24.png","element":"img"}],[{"style":{"height":37.74},"width":548.32,"height":94.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/40-25.png","element":"img","alt":"Q(ˆθ) − Q(˜θ) ≥ inf¯Ykj Q(˜θ).","inline":true}],[{"text":"It therefore remains for us to obtain the result in the case that ","element":"span"},{"style":{"height":16},"width":237.94,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/41-1.png","element":"img","alt":" b∗ − a∗ < γλ","inline":true},{"text":", and we denote this case (B2). Using that the separation ","element":"span"},{"href":"#id-111","text":"(43) ","element":"a"},{"style":{"height":16.4},"width":213.33,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/41-2.png","element":"img","alt":" ≥ 3γλ + 2ζ","inline":true},{"text":", it is straightforward to see that one of ( ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":21.6},"width":491.38,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/41-3.png","element":"img","alt":"Ykj+1 − b∗) and (a∗ − ¯Ykj","inline":true},{"text":") must be at least ","element":"span"},{"style":{"height":16.4},"width":128.49,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/41-4.png","element":"img","alt":" γλ + ζ","inline":true},{"text":". By the symmetry of the problem, it is sufficient for us to consider the case where ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":19.19},"width":382.44,"height":47.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/41-5.png","element":"img","alt":"Ykj+1 − b∗ ≥ γλ + ζ","inline":true},{"text":". 
In this case, we can obtain from ","element":"span"},{"href":"#id-115","text":"(50) ","element":"a"},{"text":"that","element":"span"}],[{"id":"id-117","style":{"width":"82%"},"width":1492,"height":291,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/41-6.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":32},"width":1061.69,"height":80,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/41-7.png","element":"img","alt":" B = {(˜a,˜b): ˆθl1 ≤ ˜a ≤ ˜b ≤ ¯Ykj+1 − γλ − ζ, ˜b − ˜a < γλ}","inline":true},{"text":". From this, we can extract the terms dependent on ","element":"span"},{"text":"˜","element":"span"},{"style":{"fontStyle":"italic"},"text":"b ","element":"span"},{"text":"to obtain","element":"span"}],[{"style":{"width":"74%"},"width":1342,"height":116,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/41-8.png","element":"img"}],[{"text":"This objective is piecewise quadratic (and continuously differentiable), with two pieces; [˜","element":"span"},{"style":{"height":15.53},"width":141.58,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/41-9.png","element":"img","alt":"a∗, ˜a∗+","inline":true},{"style":{"height":21.62},"width":369.8,"height":54.05,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/41-10.png","element":"img","alt":"γλ) and [˜a∗+γλ, ˆθl2","inline":true},{"text":"]. Over the second region, the objective is a convex quadratic with minimum at ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":23.8},"width":381.43,"height":59.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/41-11.png","element":"img","alt":"Ykj+1 ∈ [˜a∗ + γλ, ˆθl2","inline":true},{"text":"]. 
By following the same argument as for ","element":"span"},{"href":"#id-114","text":"(48) ","element":"a"},{"text":"in case (A2), we see that ˜","element":"span"},{"style":{"height":12.8},"width":149.83,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/41-12.png","element":"img","alt":"b∗ = ˜a∗.","inline":true}],[{"text":"With this knowledge, we can further simplify ","element":"span"},{"href":"#id-117","text":"(53) ","element":"a"},{"text":"to obtain","element":"span"}],[{"style":{"width":"90%"},"width":1633,"height":233,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/41-13.png","element":"img"}],[{"text":"Since ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":19.19},"width":274.75,"height":47.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/41-14.png","element":"img","alt":"Ykj+1 − ˜a∗ > ζ","inline":true},{"text":", we can see that ( ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":23.79},"width":582.88,"height":59.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/41-15.png","element":"img","alt":"Ykj+1 − ˜a∗)2 − ( ¯Ykj+1 − ˆθl2)2 >","inline":true,"padRight":true},{"text":"0. Thus, it suffices for us to show that","element":"span"}],[{"style":{"width":"69%"},"width":1248,"height":120,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/41-16.png","element":"img"}],[{"text":"This objective is exactly as in ","element":"span"},{"href":"#id-110","text":"(47) ","element":"a"},{"text":"in case (A1), minimised over a smaller feasible set. 
Hence, it follows immediately that this holds and we can conclude that ","element":"span"},{"style":{"height":21.41},"width":258.87,"height":53.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/41-17.png","element":"img","alt":" Q(ˆθ) > Q(˜θ).","inline":true}],[{"text":"We now have for all cases that ","element":"span"},{"style":{"height":21.4},"width":232.24,"height":53.51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/41-18.png","element":"img","alt":" Q(ˆθ) > Q(˜θ","inline":true},{"text":"), which contradicts the optimality of ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":15.6},"width":164.17,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/41-19.png","element":"img","alt":"θ. Thus,","inline":true,"padRight":true},{"text":"we can conclude that for ","element":"span"},{"style":{"height":23.8},"width":866.75,"height":59.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/41-20.png","element":"img","alt":" j = 1, . . . , s, ˆθkj ≤ ¯Ykj and ˆθkj−1+1 ≥ ¯Ykj−1+1.","inline":true}],[{"id":"id-106","style":{"fontWeight":"bold"},"text":"Lemma 9. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Consider the setup of Lemma ","element":"span"},{"href":"#id-104","style":{"fontStyle":"italic"},"text":"8. ","element":"a"},{"style":{"fontStyle":"italic"},"text":"For each ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j ","element":"span"},{"text":"= 1","element":"span"},{"style":{"fontStyle":"italic"},"text":", . . . 
, s","element":"span"},{"style":{"fontStyle":"italic"},"text":", there exists ","element":"span"},{"style":{"height":20.49},"width":263.61,"height":51.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/41-21.png","element":"img","alt":" k∗j in {kj−1 +","inline":true,"padRight":true},{"text":"1","element":"span"},{"style":{"height":26.09},"width":733.37,"height":65.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/41-22.png","element":"img","alt":", . . . , kj} such that ˆθk∗j ∈ [ ¯Ykj−1+1, ¯Ykj].","inline":true}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"We first show that if ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":21.6},"width":169.74,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/41-23.png","element":"img","alt":"θkj > ¯Ykj","inline":true},{"text":", then for any ","element":"span"},{"style":{"height":23.79},"width":807.25,"height":59.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/41-24.png","element":"img","alt":" k with kj−1 + 1 ≤ k ≤ kj, if ˆθk > ¯Ykj then","inline":true,"padRight":true},{"text":"ˆ","element":"span"},{"style":{"height":23.79},"width":166.87,"height":59.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/41-25.png","element":"img","alt":"θk = ˆθkj.","inline":true}],[{"text":"We prove the first case since the proof for the second is identical. Suppose that this does not hold, i.e. ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":21.6},"width":191.81,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/41-26.png","element":"img","alt":"θkj > ¯Ykj","inline":true,"padRight":true},{"text":"and there exists some (minimal) ","element":"span"},{"style":{"height":18.22},"width":642.37,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/41-27.png","element":"img","alt":" k in {kj−1 + 1, . . . 
, kj − 1} with","inline":true}],[{"text":"¯","element":"span"},{"style":{"height":23.79},"width":268.02,"height":59.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-0.png","element":"img","alt":"Ykj < ˆθk < ˆθkj","inline":true},{"text":". Then we construct ˇ","element":"span"},{"style":{"height":16.4},"width":86.5,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-1.png","element":"img","alt":"θ by","inline":true}],[{"id":"id-109","style":{"width":"67%"},"width":1222,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-2.png","element":"img"}],[{"text":"We observe that the penalty contribution from ","element":"span"},{"text":"ˇ","element":"span"},{"style":{"height":12},"width":25,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-3.png","element":"img","alt":"θ","inline":true,"padRight":true},{"text":"is no more than that of ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":12},"width":25,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-4.png","element":"img","alt":"θ","inline":true,"padRight":true},{"text":"and that the quadratic loss for ","element":"span"},{"text":"ˇ","element":"span"},{"style":{"height":12},"width":25,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-5.png","element":"img","alt":"θ","inline":true,"padRight":true},{"text":"will be strictly less than that of ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":12},"width":25,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-6.png","element":"img","alt":"θ","inline":true},{"text":". 
This gives us that ","element":"span"},{"style":{"height":21.41},"width":229,"height":53.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-7.png","element":"img","alt":" Q(ˇθ) < Q(ˆθ","inline":true},{"text":"), contradicting the optimality of ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":12},"width":37.91,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-8.png","element":"img","alt":"θ.","inline":true}],[{"text":"Similarly, if ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":21.6},"width":334.38,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-9.png","element":"img","alt":"θkj−1+1 < ¯Ykj−1+1","inline":true,"padRight":true},{"text":"then the corresponding statement that for any ","element":"span"},{"style":{"height":17.42},"width":261.14,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-10.png","element":"img","alt":" k with kj−1 +","inline":true,"padRight":true},{"text":"1 ","element":"span"},{"style":{"height":23.79},"width":755.34,"height":59.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-11.png","element":"img","alt":" ≤ kj, if ˆθk < ¯Ykj−1+1 then ˆθk = ˆθkj−1+1.","inline":true}],[{"text":"We now establish a simple preliminary result. Suppose that for some ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j ","element":"span"},{"text":"in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"{","element":"span"},{"text":"1","element":"span"},{"style":{"fontStyle":"italic"},"text":", . . . , s","element":"span"},{"style":{"fontStyle":"italic"},"text":"} ","element":"span"},{"text":"there exists ","element":"span"},{"style":{"height":23.79},"width":905.2,"height":59.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-12.png","element":"img","alt":" k in {kj−1 + 1, . . . 
, kj} with ˆθk /∈ [ ¯Ykj−1+1, ¯Ykj","inline":true},{"text":"], such that ","element":"span"},{"style":{"height":22.61},"width":523.13,"height":56.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-13.png","element":"img","alt":"�{l: ˆθl=ˆθk} wl ≥ η/2s. We","inline":true,"padRight":true},{"text":"claim that if ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":23.79},"width":873.7,"height":59.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-14.png","element":"img","alt":"θk > ¯Ykj then ˆθk ≤ ¯Ykj + (�2s/η√γλ ∨ γλ","inline":true},{"text":"). Similarly, if ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":21.6},"width":359.94,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-15.png","element":"img","alt":"θk < ¯Ykj−1+1 then","inline":true,"padRight":true},{"text":"ˆ","element":"span"},{"style":{"height":22.51},"width":656.66,"height":56.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-16.png","element":"img","alt":"θk ≥ ¯Ykj−1+1 − (�2s/η√γλ ∨ γλ).","inline":true}],[{"text":"To prove the claim, we consider the case ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":21.6},"width":169.79,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-17.png","element":"img","alt":"θk > ¯Ykj","inline":true,"padRight":true},{"text":"(the other is identical). By the first observation, if ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":23.79},"width":879.44,"height":59.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-18.png","element":"img","alt":"θl > ¯Ykj for l in {kj−1 + 1, . . . kj} then ˆθl = ˆθk","inline":true},{"text":". 
Now, for contradiction, suppose ˆ","element":"span"},{"style":{"height":22.51},"width":547.34,"height":56.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-19.png","element":"img","alt":"θk > ¯Ykj + (�2s/η√γλ ∨ γλ","inline":true},{"text":") and let this ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"be minimal. Then we can construct ","element":"span"},{"text":"ˇ","element":"span"},{"style":{"height":16.4},"width":86.49,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-20.png","element":"img","alt":"θ by","inline":true}],[{"style":{"width":"47%"},"width":856,"height":133,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-21.png","element":"img"}],[{"text":"By appealing to the optimality of ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":12},"width":25,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-22.png","element":"img","alt":"θ","inline":true},{"text":", we can easily observe that ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":17.66},"width":229.59,"height":44.14,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-23.png","element":"img","alt":"θk−1 ≤ ¯Yk−1","inline":true,"padRight":true},{"text":"and therefore that the ordering of the entries of ","element":"span"},{"text":"ˇ","element":"span"},{"style":{"height":12},"width":25,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-24.png","element":"img","alt":"θ","inline":true,"padRight":true},{"text":"matches that of ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":12},"width":25,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-25.png","element":"img","alt":"θ","inline":true},{"text":". 
Here, we use that (","element":"span"},{"style":{"height":21.65},"width":454.37,"height":54.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-26.png","element":"img","alt":"�2s/η√γλ ∨ γλ) ≥ γλ.","inline":true}],[{"style":{"width":"99%"},"width":1804,"height":667,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-27.png","element":"img"}],[{"text":"We will without loss of generality take the second statement to be true (the proof for the first case follows identically). Let ","element":"span"},{"style":{"height":12.8},"width":40.09,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-28.png","element":"img","alt":" k′ ","inline":true,"padRight":true},{"text":"denote the minimal element in ","element":"span"},{"style":{"height":18.22},"width":547.02,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-29.png","element":"img","alt":" {kj−1 + 1, . . . , kj} such that","inline":true,"padRight":true},{"text":"ˆ","element":"span"},{"style":{"height":23.79},"width":164.52,"height":59.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-30.png","element":"img","alt":"θk′ = ˆθkj","inline":true},{"text":". From the preliminary result established earlier, ˆ","element":"span"},{"style":{"height":22.51},"width":676.14,"height":56.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-31.png","element":"img","alt":"θkj ≤ ¯Ykj + (�2s/η√γλ ∨ γλ). 
By","inline":true,"padRight":true},{"text":"appealing to the optimality of ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":12},"width":25,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-32.png","element":"img","alt":"θ","inline":true},{"text":", we see that ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":23.79},"width":316.79,"height":59.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-33.png","element":"img","alt":"θkj+1 < ˆθkj + γλ","inline":true,"padRight":true},{"text":"(otherwise, we could take ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":19.19},"width":109.12,"height":47.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-34.png","element":"img","alt":"θkj to","inline":true,"padRight":true},{"text":"be ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":18.79},"width":54.97,"height":46.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-35.png","element":"img","alt":"Ykj","inline":true,"padRight":true},{"text":"and strictly reduce the value of the objective).","element":"span"}],[{"text":"Now, we will use that the separation is at least 2(","element":"span"},{"style":{"height":21.65},"width":445.61,"height":54.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-36.png","element":"img","alt":"�2s/η√γλ ∨ γλ) + γλ","inline":true},{"text":". By our earlier observation ","element":"span"},{"href":"#id-109","text":"(55)","element":"a"},{"text":", it is clear that any ","element":"span"},{"style":{"height":23.79},"width":1109.96,"height":59.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-37.png","element":"img","alt":" l ∈ {kj + 1, . . . , kj+1} with ˆθl < ¯Ykj+1 has ˆθl = ˆθkj+1. 
Note","inline":true,"padRight":true},{"text":"that since ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":22.51},"width":717.09,"height":56.27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-38.png","element":"img","alt":"θkj+1 − ¯Ykj < (�2s/η√γλ ∨ γλ) + γλ","inline":true},{"text":", it follows that ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":23.79},"width":583.3,"height":59.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-39.png","element":"img","alt":"Ykj+1 − ˆθkj+1 > (�2s/η√γλ ∨","inline":true},{"style":{"height":17.6},"width":129.31,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-40.png","element":"img","alt":"γλ)+ζ","inline":true,"padRight":true},{"text":"and therefore that ","element":"span"},{"style":{"height":26.7},"width":476.54,"height":66.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-41.png","element":"img","alt":"�{k: ˆθk=ˆθkj+1} wk < η/2s","inline":true,"padRight":true},{"text":"by the preliminary result. Since ","element":"span"},{"style":{"height":19.75},"width":213.95,"height":49.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-42.png","element":"img","alt":" w0min ≥ η/s","inline":true}],[{"text":"and separation ","element":"span"},{"href":"#id-111","text":"(43) ","element":"a"},{"style":{"height":21.65},"width":563.38,"height":54.13,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-43.png","element":"img","alt":" ≥ 2(�2s/η√γλ∨γλ)+γλ+ζ","inline":true},{"text":", we can define ","element":"span"},{"style":{"height":18.22},"width":573.62,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-44.png","element":"img","alt":" l′ ∈ {kj +1, . . . 
, kj+1} minimal","inline":true,"padRight":true},{"text":"such that ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":21.6},"width":217.74,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/42-45.png","element":"img","alt":"θl′ ≥ ¯Ykj+1.","inline":true}],[{"text":"Now, in order to contradict the optimality of ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":12},"width":25,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/43-0.png","element":"img","alt":"θ","inline":true,"padRight":true},{"text":"we construct a new feasible point ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":12},"width":25,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/43-1.png","element":"img","alt":"θ","inline":true,"padRight":true},{"text":"by setting","element":"span"}],[{"style":{"width":"38%"},"width":696,"height":184,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/43-2.png","element":"img"}],[{"text":"It follows that for ","element":"span"},{"style":{"height":17.42},"width":550.07,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/43-3.png","element":"img","alt":" l = kj + 1, . . . , l′ − 1 we have","inline":true}],[{"style":{"width":"35%"},"width":637,"height":130,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/43-4.png","element":"img"}],[{"text":"It is also straightforward to see that ","element":"span"},{"style":{"height":23.8},"width":763.18,"height":59.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/43-5.png","element":"img","alt":" |ˆθkj − ¯Yl| ≥ | ¯Ykj − ¯Yl| for l = k′, . . . , kj","inline":true},{"text":". 
It follows that the loss contribution in ","element":"span"},{"style":{"height":20.6},"width":76.46,"height":51.51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/43-6.png","element":"img","alt":" Q(˜θ","inline":true},{"text":") is strictly less than that in ","element":"span"},{"style":{"height":21.4},"width":76.47,"height":53.51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/43-7.png","element":"img","alt":" Q(ˆθ","inline":true},{"text":"). Hence, using ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":23.79},"width":337.81,"height":59.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/43-8.png","element":"img","alt":"θl′ − ˆθkj > γλ, we","inline":true,"padRight":true},{"text":"obtain","element":"span"}],[{"style":{"width":"64%"},"width":1167,"height":225,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/43-9.png","element":"img"}],[{"text":"contradicting the optimality of ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":12},"width":25,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/43-10.png","element":"img","alt":"θ","inline":true},{"text":". We conclude that for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j ","element":"span"},{"text":"= 1","element":"span"},{"style":{"fontStyle":"italic"},"text":", . . . , s","element":"span"},{"text":", there exists ","element":"span"},{"style":{"height":20.49},"width":256.55,"height":51.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/43-11.png","element":"img","alt":" k∗j in {kj−1 +","inline":true,"padRight":true},{"text":"1","element":"span"},{"style":{"height":18.22},"width":159.21,"height":45.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/43-12.png","element":"img","alt":", . . . 
, kj}","inline":true,"padRight":true},{"text":"such that ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":23.89},"width":369.28,"height":59.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/43-13.png","element":"img","alt":"θk∗j ∈ [ ¯Ykj−1+1, ¯Ykj].","inline":true}],[{"id":"id-36","style":{"fontWeight":"bold"},"text":"Lemma 10. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Consider the univariate objective ","element":"span"},{"href":"#id-40","text":"(11)","element":"a"},{"style":{"fontStyle":"italic"},"text":", relaxing the normalisation constraint to ","element":"span"},{"text":"ˇ","element":"span"},{"style":{"height":18.25},"width":308.43,"height":45.63,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/43-14.png","element":"img","alt":"w := ∑k wk ≤ 1","inline":true},{"style":{"fontStyle":"italic"},"text":". Suppose that ","element":"span"},{"style":{"height":21.4},"width":1100.64,"height":53.51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/43-15.png","element":"img","alt":" wT ¯Y = 0, and that ∥ ¯Y ∥∞ < (2 ∧ √γ ˇw) λ/ ˇw. Then ˆθ = 0.","inline":true}],[{"style":{"height":19.53},"width":904.68,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/43-16.png","element":"img","alt":"Proof. 
Let Pw = I − 1wT / ˇw and Dw ∈ RK×K ","inline":true,"padRight":true},{"text":"be the diagonal matrix with entries ","element":"span"},{"style":{"height":17.65},"width":175.18,"height":44.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/43-17.png","element":"img","alt":" Dkk√wk.","inline":true,"padRight":true},{"text":"First note that","element":"span"}],[{"style":{"width":"69%"},"width":1249,"height":410,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/43-18.png","element":"img"}],[{"text":"Thus for all ","element":"span"},{"style":{"height":18.33},"width":321.14,"height":45.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/43-19.png","element":"img","alt":" θ ∈ RK, we have","inline":true}],[{"style":{"width":"88%"},"width":1589,"height":500,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/43-20.png","element":"img"}],[{"text":"Consider minimising ","element":"span"},{"style":{"height":19.53},"width":821.05,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/43-21.png","element":"img","alt":" F over RK × [−τ, τ]K × S, where S ⊆ RK ","inline":true,"padRight":true},{"text":"is the unit simplex scaled by ˇ","element":"span"},{"style":{"fontStyle":"italic"},"text":"w","element":"span"},{"text":". We aim to show this minimum is 0. 
As with the first claim in the proof of Lemma ","element":"span"},{"href":"#id-104","text":"8, ","element":"a"},{"text":"it is straightforward to see that for any feasible (","element":"span"},{"style":{"height":16.4},"width":116.82,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/44-0.png","element":"img","alt":"θ, ξ, w","inline":true},{"text":"), there exists ","element":"span"},{"style":{"height":17.6},"width":514.61,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/44-1.png","element":"img","alt":" θ′ with ∥θ′∥∞ ≤ ∥ξ∥∞ and","inline":true}],[{"style":{"width":"84%"},"width":1534,"height":146,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/44-2.png","element":"img"}],[{"text":"As on the RHS we are minimising a continuous function over a compact set, we know a minimiser must exist. Let (","element":"span"},{"text":"˜","element":"span"},{"style":{"height":20.21},"width":116.8,"height":50.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/44-3.png","element":"img","alt":"θ, ˜ξ, ˜w","inline":true},{"text":") be a minimiser (to be specified later). Observe that","element":"span"}],[{"style":{"width":"68%"},"width":1237,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/44-4.png","element":"img"}],[{"text":"is linear as a function of ","element":"span"},{"style":{"height":16.4},"width":20,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/44-5.png","element":"img","alt":" ξ","inline":true},{"text":". 
Hence it is minimised over the set","element":"span"}],[{"style":{"width":"35%"},"width":632,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/44-6.png","element":"img"}],[{"text":"at some point in ","element":"span"},{"style":{"height":19.53},"width":172.58,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/44-7.png","element":"img","alt":" {−τ, τ}K","inline":true},{"text":". Here conv(","element":"span"},{"style":{"height":5.6},"width":12,"height":14,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/44-8.png","element":"img","alt":"·","inline":true},{"text":") denotes the convex hull operation. We thus have","element":"span"}],[{"style":{"width":"99%"},"width":1802,"height":438,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/44-9.png","element":"img"}],[{"text":"Since the penalty contribution from ","element":"span"},{"text":"ˇ","element":"span"},{"style":{"height":12},"width":25,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/44-10.png","element":"img","alt":"θ","inline":true,"padRight":true},{"text":"is not greater than that of ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":12},"width":25,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/44-11.png","element":"img","alt":"θ","inline":true},{"text":", it follows that ","element":"span"},{"style":{"height":20.61},"width":258.88,"height":51.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/44-12.png","element":"img","alt":" Q(ˇθ) ≤ Q(˜θ).","inline":true,"padRight":true},{"text":"Thus, we can assume that entries of ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":12},"width":25,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/44-13.png","element":"img","alt":"θ","inline":true,"padRight":true},{"text":"can take one of only two distinct 
values.","element":"span"}],[{"text":"Next we write ˜","element":"span"},{"style":{"height":21.4},"width":323.3,"height":53.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/44-14.png","element":"img","alt":"α = �k:˜ξk=−τ ˜wk","inline":true,"padRight":true},{"text":"and observe that ˜","element":"span"},{"style":{"height":20.61},"width":329.01,"height":51.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/44-15.png","element":"img","alt":"wT ˜ξ = ( ˇw − 2˜α)τ","inline":true},{"text":". Let us set ","element":"span"},{"style":{"height":19.05},"width":217.6,"height":47.62,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/44-16.png","element":"img","alt":" s = mink ˜θk","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":19.05},"width":425.34,"height":47.62,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/44-17.png","element":"img","alt":" x = maxk ˜θk − mink ˜θk","inline":true},{"text":". 
Then we have","element":"span"}],[{"style":{"width":"88%"},"width":1599,"height":401,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/44-18.png","element":"img"}],[{"text":"In the second line above, we have solved for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s ","element":"span"},{"text":"to find that","element":"span"}],[{"style":{"width":"40%"},"width":732,"height":90,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/44-19.png","element":"img"}],[{"text":"In the third line above, we have solved for ˜","element":"span"},{"style":{"height":8.4},"width":28,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/44-20.png","element":"img","alt":"α","inline":true,"padRight":true},{"text":"to obtain ˜","element":"span"},{"style":{"height":17.6},"width":140.68,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/44-21.png","element":"img","alt":"α = ˇw/","inline":true},{"text":"2 and hence ˜","element":"span"},{"style":{"height":17.6},"width":375.06,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/44-22.png","element":"img","alt":"α( ˇw − ˜α)/ ˇw = ˇw/4.","inline":true,"padRight":true},{"text":"These follow from optimality of ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":12.8},"width":147.91,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/44-23.png","element":"img","alt":"θ and ˜w","inline":true,"padRight":true},{"text":"respectively. 
The result follows from applying Lemma ","element":"span"},{"href":"#id-113","text":"7, ","element":"a"},{"text":"setting ","element":"span"},{"style":{"height":17.6},"width":171.37,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/44-24.png","element":"img","alt":" κ = ˇw/4.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"S2.2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Proof of Theorem ","element":"span"},{"href":"#id-91","style":{"fontWeight":"bold"},"text":"6","element":"a"}],[{"text":"We begin by defining ","element":"span"},{"style":{"height":14.73},"width":51.07,"height":36.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/44-25.png","element":"img","alt":" P 0 ","inline":true,"padRight":true},{"text":"to be the orthogonal projection onto the linear space","element":"span"}],[{"style":{"width":"53%"},"width":961,"height":157,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/44-26.png","element":"img"}],[{"text":"The residuals from the oracle least-squares fit are (","element":"span"},{"style":{"height":19.13},"width":413.84,"height":47.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/45-0.png","element":"img","alt":"I − P 0)Y = (I − P 0)ε","inline":true},{"text":". 
The partial residuals ","element":"span"},{"style":{"height":16.33},"width":75.18,"height":40.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/45-1.png","element":"img","alt":"R(j) ","inline":true,"padRight":true},{"text":"as defined in ","element":"span"},{"href":"#id-53","text":"(18) ","element":"a"},{"text":"for the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j","element":"span"},{"text":"th variable are therefore","element":"span"}],[{"style":{"width":"69%"},"width":1262,"height":136,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/45-2.png","element":"img"}],[{"text":"For ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j ","element":"span"},{"text":"= 1","element":"span"},{"style":{"fontStyle":"italic"},"text":", . . . , p","element":"span"},{"text":", we define ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":26.3},"width":912.66,"height":65.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/45-3.png","element":"img","alt":"R(j)k = �ni=1 1{Xij=k}R(j)i /njk for k = 1, . . . , Kj","inline":true},{"text":", reordering the ","element":"span"},{"text":"labels such that ","element":"span"},{"text":"¯","element":"span"},{"style":{"height":28.24},"width":332.88,"height":70.6,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/45-4.png","element":"img","alt":"R(j)1 ≤ · · · ≤ ¯R(j)Kj","inline":true},{"text":". 
We then aim to apply the arguments of Theorem ","element":"span"},{"href":"#id-33","text":"5 ","element":"a"},{"text":"to ˆ","element":"span"},{"style":{"height":17.02},"width":40.91,"height":42.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/45-5.png","element":"img","alt":"θj","inline":true,"padRight":true},{"text":"defined by","element":"span"}],[{"style":{"width":"80%"},"width":1453,"height":137,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/45-6.png","element":"img"}],[{"text":"In order to do this, we define the events (for some ","element":"span"},{"style":{"height":12.62},"width":34.08,"height":31.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/45-7.png","element":"img","alt":" τj","inline":true,"padRight":true},{"text":"to be determined later):","element":"span"}],[{"style":{"width":"79%"},"width":1443,"height":228,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/45-8.png","element":"img"}],[{"text":"On the intersection of events ","element":"span"},{"style":{"height":26.84},"width":166.25,"height":67.09,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/45-9.png","element":"img","alt":" ∩Kjk=1Λ(2)jk ","inline":true,"padRight":true},{"text":", we have that ","element":"span"},{"style":{"height":26.51},"width":473.12,"height":66.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/45-10.png","element":"img","alt":" | ¯R(j)k −ˆθ0jk| < √ηγ∗jsjλj/","inline":true},{"text":"2. 
By following ","element":"span"},{"text":"an identical approach to that involved in computing ","element":"span"},{"href":"#id-118","text":"(35)","element":"a"},{"text":", we have that","element":"span"}],[{"style":{"width":"62%"},"width":1122,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/45-11.png","element":"img"}],[{"text":"where we recall that ","element":"span"},{"style":{"height":18.44},"width":247.65,"height":46.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/45-12.png","element":"img","alt":" wjk = njk/n.","inline":true}],[{"text":"We now turn our attention to the event Λ","element":"span"},{"style":{"height":26.41},"width":43.09,"height":66.03,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/45-13.png","element":"img","alt":"(1)j ","inline":true,"padRight":true},{"text":". Note that if ","element":"span"},{"style":{"height":13.02},"width":35.46,"height":32.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/45-14.png","element":"img","alt":" sj","inline":true,"padRight":true},{"text":"= 1, then this is immediately ","element":"span"},{"text":"satisfied since ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":26.8},"width":387.62,"height":66.99,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/45-15.png","element":"img","alt":"θ0j = θ0j = 0. 
If sj >","inline":true,"padRight":true},{"text":"1, we use that the oracle least squares estimate ˆ","element":"span"},{"style":{"height":20.48},"width":214.63,"height":51.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/45-16.png","element":"img","alt":"θ0 = AY is","inline":true,"padRight":true},{"text":"a linear transformation ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A ","element":"span"},{"text":"of the responses (","element":"span"},{"style":{"height":18.09},"width":597.4,"height":45.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/45-17.png","element":"img","alt":"Yi)ni=1. For each i = 1, . . . , n, Yi","inline":true,"padRight":true},{"text":"has an independent ","element":"span"},{"text":"(non-central) sub-Gaussian distribution with parameter ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/45-18.png","element":"img","alt":" σ","inline":true},{"text":". Therefore for each ","element":"span"},{"style":{"height":17.42},"width":276.86,"height":43.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/45-19.png","element":"img","alt":" k = 1, . . . 
, Kj,","inline":true,"padRight":true},{"text":"ˆ","element":"span"},{"style":{"height":22.45},"width":166.4,"height":56.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/45-20.png","element":"img","alt":"θ0jk − θ0jk ","inline":true,"padRight":true},{"text":"also has a sub-Gaussian distribution, with parameter at most ","element":"span"},{"style":{"height":24.13},"width":122.6,"height":60.33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/45-21.png","element":"img","alt":" σc−1/2min","inline":true,"padRight":true},{"text":"(recalling that ","element":"span"},{"style":{"height":19.53},"width":455.28,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/45-22.png","element":"img","alt":"cmin = (maxl(AAT )ll)−1","inline":true},{"text":"). This enables us to show that","element":"span"}],[{"style":{"width":"45%"},"width":828,"height":132,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/45-23.png","element":"img"}],[{"text":"We can now set ","element":"span"},{"style":{"height":20.76},"width":316.39,"height":51.91,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/45-24.png","element":"img","alt":" τj = √ηγ∗jsjλj/","inline":true},{"text":"2. 
From ","element":"span"},{"href":"#id-61","text":"(26) ","element":"a"},{"text":"and the triangle inequality, on the event Λ","element":"span"},{"style":{"height":26.41},"width":43.09,"height":66.03,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/45-25.png","element":"img","alt":"(1)j","inline":true,"padRight":true},{"text":"we have that","element":"span"}],[{"style":{"width":"33%"},"width":599,"height":220,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/45-26.png","element":"img"}],[{"text":"Thus, on the intersection of events Λ","element":"span"},{"style":{"height":31.6},"width":313.94,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/45-27.png","element":"img","alt":"(1)j ∩�∩Kjk=1Λ(2)jk�","inline":true},{"text":", we can proceed as in the proof of Theo-","element":"span"}],[{"text":"rem ","element":"span"},{"href":"#id-33","text":"5 ","element":"a"},{"text":"from ","element":"span"},{"href":"#id-119","text":"(38)","element":"a"},{"text":", to conclude that ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":26.79},"width":158.46,"height":66.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/45-28.png","element":"img","alt":"θj = ˆθ0j.","inline":true}],[{"text":"It immediately follows that on the intersection of events ","element":"span"},{"style":{"height":31.6},"width":668.3,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/46-0.png","element":"img","alt":" ∩pj=1�Λ(1)j ∩�∩Kjk=1Λ(2)jk��, we have","inline":true}],[{"text":"ˆ","element":"span"},{"style":{"height":20.08},"width":127,"height":50.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/46-1.png","element":"img","alt":"θ = ˆθ0","inline":true},{"text":". 
By a union bound, this occurs with probability at least","element":"span"}],[{"style":{"width":"91%"},"width":1646,"height":438,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/46-2.png","element":"img"}],[{"text":"where in the final line we use ","element":"span"},{"style":{"height":17.02},"width":162.76,"height":42.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/46-3.png","element":"img","alt":" sj ≤ Kj.","inline":true}]]},{"heading":"S3 Additional experimental information","paragraphs":[[{"id":"id-70","style":{"fontWeight":"bold"},"text":"S3.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Details of methods","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Tree-based methods","element":"span"}],[{"text":"We used the implementation of the random forest procedure ","element":"span"},{"href":"#id-120","text":"[Breiman, ","element":"a"},{"href":"#id-120","text":"2001] ","element":"a"},{"text":"in the R package ","element":"span"},{"style":{"fontFamily":"monospace"},"text":"randomForest ","element":"span"},{"href":"#id-121","text":"[Liaw and Wiener, ","element":"a"},{"href":"#id-121","text":"2002] ","element":"a"},{"text":"with default settings. 
CART ","element":"span"},{"href":"#id-122","text":"[Breiman et al., ","element":"a"},{"href":"#id-122","text":"1984] ","element":"a"},{"text":"was implemented in the R package ","element":"span"},{"style":{"fontFamily":"monospace"},"text":"rpart ","element":"span"},{"href":"#id-123","text":"[Therneau and Atkinson, ","element":"a"},{"href":"#id-123","text":"2019]","element":"a"},{"text":", with pruning according to the 1-SE rule (as described in the package documentation).","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"CAS-ANOVA","element":"span"}],[{"text":"The CAS-ANOVA estimator ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":17.28},"width":70.89,"height":43.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/46-4.png","element":"img","alt":"θcas","inline":true,"padRight":true},{"text":"optimises over (","element":"span"},{"style":{"height":16},"width":70.68,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/46-5.png","element":"img","alt":"µ, θ","inline":true},{"text":") a sum of a squared loss term ","element":"span"},{"href":"#id-15","text":"(3) ","element":"a"},{"text":"and an all-pairs penalty term ","element":"span"},{"href":"#id-15","text":"(4)","element":"a"},{"text":". In particular, ","element":"span"},{"href":"#id-124","text":"Bondell and Reich ","element":"a"},{"href":"#id-124","text":"[2009] ","element":"a"},{"text":"consider two regimes of weight vectors ","element":"span"},{"style":{"fontStyle":"italic"},"text":"w","element":"span"},{"text":". 
The first is not data-dependent and sets ","element":"span"},{"style":{"height":20.29},"width":656.08,"height":50.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/46-6.png","element":"img","alt":" wj,k1k2 = (Kj + 1)−1√njk1 + njk2.","inline":true,"padRight":true},{"text":"The second, ‘adaptive CAS-ANOVA’, uses the ordinary least squares estimate for ","element":"span"},{"style":{"height":12},"width":25,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/46-7.png","element":"img","alt":" θ","inline":true,"padRight":true},{"text":"to scale the weights. Here, ","element":"span"},{"style":{"height":24.72},"width":952.41,"height":61.81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/46-8.png","element":"img","alt":" wj,k1k2 = (Kj + 1)−1√njk1 + njk2|ˆθOLSjk1 − ˆθOLSjk2 |−1.","inline":true},{"text":"Here we introduce a new variant of adaptive CAS-ANOVA, following ideas in ","element":"span"},{"href":"#id-125","text":"Bühlmann and ","element":"a"},{"href":"#id-125","text":"Van De Geer ","element":"a"},{"href":"#id-125","text":"[2011] ","element":"a"},{"text":"for a 2-stage adaptive Lasso procedure. Instead of using the ordinary least squares estimate ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":20.48},"width":92.41,"height":51.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/46-9.png","element":"img","alt":"θOLS","inline":true,"padRight":true},{"text":"in the above expression, an initial (standard) CAS-ANOVA estimate is used to scale the weights, with ","element":"span"},{"style":{"height":12.8},"width":26,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/46-10.png","element":"img","alt":" λ","inline":true,"padRight":true},{"text":"selected for the initial estimate by 5-fold cross-validation. 
In simulations, this outperformed the adaptive CAS-ANOVA estimate using ordinary least squares initial estimates so in the interests of time and computational resources the latter was omitted from the simulation study. Henceforth adaptive CAS-ANOVA will refer to this 2-stage procedure.","element":"span"}],[{"text":"The authors describe the optimisation of ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":17.28},"width":70.88,"height":43.2,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/46-11.png","element":"img","alt":"θcas","inline":true,"padRight":true},{"text":"as a quadratic programming problem, which was solved using the R package ","element":"span"},{"style":{"fontFamily":"monospace"},"text":"rosqp ","element":"span"},{"href":"#id-126","text":"[Anderson, ","element":"a"},{"href":"#id-126","text":"2018]","element":"a"},{"text":". Here we used our own implementation of the quadratic programming approach described by the authors. We found it considerably faster than the code available from the authors’ website, and it uses ADMM-based optimisation ","element":"span"},{"href":"#id-127","text":"[Boyd et al., ","element":"a"},{"href":"#id-127","text":"2011] ","element":"a"},{"text":"tools not available at the time of its publication. We also found, as discussed in Section 5.1 of ","element":"span"},{"href":"#id-128","text":"Maj-Kańska et al. ","element":"a"},{"href":"#id-128","text":"[2015]","element":"a"},{"text":", that we could not achieve the best results using the publicly available code. 
","element":"span"},{"text":"Lastly, using our own implementation allowed us to explore a modification of CAS-ANOVA using the more modern approach of adaptive weights via a 2-stage procedure ","element":"span"},{"href":"#id-125","text":"[Bühlmann and Van De Geer, ","element":"a"},{"href":"#id-125","text":"2011] ","element":"a"},{"text":"to compare SCOPE to a wider class of all-pairs penalty procedures.","element":"span"}],[{"text":"For large categorical variables, solutions are slow to compute and consume large amounts of memory. In the case of binary response, CAS-ANOVA models were fitted by iterating a locally quadratic approximation to the loss function.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"DMR","element":"span"}],[{"text":"The DMR algorithm ","element":"span"},{"href":"#id-128","text":"[Maj-Kańska et al., ","element":"a"},{"href":"#id-128","text":"2015] ","element":"a"},{"text":"is implemented in the R package ","element":"span"},{"style":{"fontFamily":"monospace"},"text":"DMRnet ","element":"span"},{"href":"#id-129","text":"[Prochenk","element":"a"},{"text":"a-","element":"span"},{"href":"#id-129","text":"Sołtys and Pokarowski, ","element":"a"},{"href":"#id-129","text":"2018]","element":"a"},{"text":". The degrees of freedom in the model is decided by 5-fold cross-validation. It is based on pruning variables using the Group Lasso ","element":"span"},{"href":"#id-130","text":"[Yuan and Lin, ","element":"a"},{"href":"#id-130","text":"2006] ","element":"a"},{"text":"to obtain a low-dimensional model, then performing backwards selection based on ranking ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":"-statistics for hypotheses corresponding to each fusion between levels in categorical variables.","element":"span"}],[{"text":"The cross-validation routine appeared to error when not all levels of all categorical variables were present in one of the folds. 
In Section ","element":"span"},{"href":"#id-62","text":"6.2, ","element":"a"},{"text":"cross-validation was therefore not possible so model selection was performed based on Generalized Information Criterion (GIC) ","element":"span"},{"href":"#id-131","text":"[Zheng ","element":"a"},{"href":"#id-131","text":"and Loh, ","element":"a"},{"href":"#id-131","text":"1995]","element":"a"},{"text":". In all other examples, models were selected via 5-fold cross-validation.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Bayesian effect fusion","element":"span"}],[{"text":"In Section ","element":"span"},{"href":"#id-132","text":"6.1.1 ","element":"a"},{"text":"we include Bayesian effect fusion ","element":"span"},{"href":"#id-133","text":"[Pauger and Wagner, ","element":"a"},{"href":"#id-133","text":"2019]","element":"a"},{"text":", implemented in the R package ","element":"span"},{"style":{"fontFamily":"monospace"},"text":"effectFusion ","element":"span"},{"href":"#id-134","text":"[Pauger et al., ","element":"a"},{"href":"#id-134","text":"2019]","element":"a"},{"text":". Coefficients within each categorical variable were modelled with a sparse Gaussian mixture model. The posterior mean was estimated with 1000 samples.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Lasso","element":"span"}],[{"text":"In Section ","element":"span"},{"href":"#id-135","text":"6.1.2 ","element":"a"},{"text":"we also include Lasso ","element":"span"},{"href":"#id-136","text":"[Tibshirani, ","element":"a"},{"href":"#id-136","text":"1996] ","element":"a"},{"text":"fits, to serve as a reference point. 
Of course, this is unsuitable for models where levels in categorical variables should be clustered together, but the advanced development of the well-known R package ","element":"span"},{"style":{"fontFamily":"monospace"},"text":"glmnet ","element":"span"},{"href":"#id-137","text":"[Friedman et al., ","element":"a"},{"href":"#id-137","text":"2010] ","element":"a"},{"text":"nevertheless sees its use in practice.","element":"span"}],[{"text":"In order to make the fit symmetric across the categories within each variable, models were fitted with an unpenalised intercept and featuring dummy variables for all of the categories within each variable. This is instead of the corner-point dummy variable encoding of factor variables that is commonly used when fitting linear models. Models are fitted and cross-validated with ","element":"span"},{"style":{"fontFamily":"monospace"},"text":"cv.glmnet ","element":"span"},{"text":"using the default settings.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"SCOPE","element":"span"}],[{"text":"For SCOPE, we have provided the R package ","element":"span"},{"style":{"fontFamily":"monospace"},"text":"CatReg ","element":"span"},{"href":"#id-138","text":"[Stokell, ","element":"a"},{"href":"#id-138","text":"2021]","element":"a"},{"text":". The univariate update step (see Section ","element":"span"},{"href":"#id-38","text":"3.1) ","element":"a"},{"text":"is implemented in C++ using Rcpp ","element":"span"},{"href":"#id-139","text":"[Eddelbuettel and Fran¸cois, ","element":"a"},{"href":"#id-139","text":"2011]","element":"a"},{"text":", with models fitted using a wrapper in R. For the binary response case, the outer loop to iterate the local quadratic approximations in the proximal Newton algorithm are done within R. 
In the future, performance could be improved by iterating the univariate update step (and the local quadratic approximations, as in Sections ","element":"span"},{"href":"#id-62","text":"6.2 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-99","text":"6.3) ","element":"a"},{"text":"within some lower-level language. In higher-dimensional experiments, SCOPE was slowed by cycling through all the variables; an active-set approach to this could make it faster still.","element":"span"}],[{"id":"id-47","style":{"fontWeight":"bold"},"text":"S3.2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Further details of numerical experiments","element":"span"}],[{"text":"For the experiments in Section ","element":"span"},{"href":"#id-140","text":"6.1, ","element":"a"},{"text":"we define the signal-to-noise ratio (SNR) as ","element":"span"},{"style":{"height":17.6},"width":294.24,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/47-0.png","element":"img","alt":" σS/σ, where σS","inline":true,"padRight":true},{"text":"is the standard deviation of the signal ","element":"span"},{"style":{"height":15.6},"width":245.23,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/47-1.png","element":"img","alt":" Y − ε, and σ","inline":true,"padRight":true},{"text":"is the standard deviation of the noise ","element":"span"},{"style":{"height":8.4},"width":32.35,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/47-2.png","element":"img","alt":" ε.","inline":true}],[{"style":{"width":"95%"},"width":1715,"height":1162,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/48-0.png","element":"img"}],[{"id":"id-142","text":"Figure 9: Prediction performance of various methods: (A) SCOPE-8; (B) SCOPE-32; (C) ","element":"figcaption","subtype":"caption"},{"text":"SCOPE-CV; (D) Linear regression; (E) Oracle least squares; (F) CAS-ANOVA; (G) Adaptive CAS-ANOVA; (H) DMR; (I) 
BEF; (J) CART; (K) RF. Note that some ‘boxes’ are not visible in some of the plots; this is due to the MSPE in the tests being beyond the range of the plot.","element":"figcaption","subtype":"caption"}],[{"id":"id-72","style":{"fontWeight":"bold"},"text":"S3.2.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Low-dimensional simulations","element":"span"}],[{"text":"In Table ","element":"span"},{"href":"#id-141","text":"7 ","element":"a"},{"text":"we include details of computation time and dimension of the fitted models. Figure ","element":"span"},{"href":"#id-142","text":"9 ","element":"a"},{"text":"visualises the results also summarised in Table ","element":"span"},{"href":"#id-71","text":"1 ","element":"a"},{"text":"in the main paper.","element":"span"}],[{"style":{"width":"82%"},"width":1495,"height":580,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/48-1.png","element":"img"}],[{"id":"id-141","text":"Table 7: Mean fitted model dimension and computation time for the various methods.","element":"figcaption","subtype":"caption"}],[{"style":{"width":"95%"},"width":1715,"height":1537,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/49-0.png","element":"img"}],[{"id":"id-143","text":"Figure 10: Prediction performance of various methods: (A) SCOPE-8; (B) SCOPE-32; (C) ","element":"figcaption","subtype":"caption"},{"text":"SCOPE-CV; (D) Oracle least squares; (E) DMR; (F) CART; (G) RF; (H) Lasso. Note that some ‘boxes’ are not visible in some of the plots; this is due to the MSPE in the tests being beyond the range of the plot.","element":"figcaption","subtype":"caption"}],[{"id":"id-77","style":{"fontWeight":"bold"},"text":"S3.2.2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"High-dimensional simulations","element":"span"}],[{"text":"Here we include additional results relating to the high-dimensional experiments. 
","element":"span"},{"text":"Figure ","element":"span"},{"href":"#id-143","text":"10 ","element":"a"},{"text":"visualises the results in Table ","element":"span"},{"href":"#id-73","text":"2 ","element":"a"},{"text":"of the main paper.","element":"span"}],[{"style":{"width":"79%"},"width":1443,"height":354,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/49-1.png","element":"img"}],[{"text":"Table 8: Mean computation time (s)","element":"figcaption","subtype":"caption"}],[{"style":{"width":"71%"},"width":1295,"height":264,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/50-0.png","element":"img"}],[{"text":"Table 9: Mean fitted model dimension","element":"figcaption","subtype":"caption"}],[{"style":{"width":"51%"},"width":935,"height":400,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/50-1.png","element":"img"}],[{"text":"Table 10: Proportion of times each ","element":"figcaption","subtype":"caption"},{"style":{"height":11.6},"width":24,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/50-2.png","element":"img","alt":" γ","inline":true,"padRight":true},{"text":"was selected by cross-validation.","element":"figcaption","subtype":"caption"}]]},{"heading":"References","paragraphs":[[{"id":"id-126","text":"E. Anderson. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"rosqp: Quadratic Programming Solver using the ’OSQP’ Library","element":"span"},{"text":", 2018. R package version 0.1.0.","element":"span"}],[{"id":"id-124","text":"H. D. Bondell and B. J. Reich. Simultaneous factor selection and collapsing levels in ANOVA. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Biometrics","element":"span"},{"text":", 65(1):169–177, 2009.","element":"span"}],[{"id":"id-127","text":"S. Boyd, N. Parikh, E. Chu, B. Peleato, J. Eckstein, et al. 
","element":"span"},{"text":"Distributed optimization and statistical learning via the alternating direction method of multipliers. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Foundations and ","element":"span"},{"style":{"height":18.4},"width":575.96,"height":46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.12606/images/50-3.png","element":"img","alt":"Trends® in Machine learning","inline":true},{"text":", 3(1):1–122, 2011.","element":"span"}],[{"id":"id-107","text":"P. Breheny and J. Huang. Coordinate descent algorithms for nonconvex penalized regression, ","element":"span"},{"text":"with applications to biological feature selection. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"The Annals of Applied Statistics","element":"span"},{"text":", 5(1):232, 2011.","element":"span"}],[{"id":"id-120","text":"L. Breiman. Random forests. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Machine Learning","element":"span"},{"text":", 45(1):5–32, 2001.","element":"span"}],[{"id":"id-122","text":"L. Breiman, J. Friedman, C. Stone, and R. Olshen. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Classification and Regression Trees","element":"span"},{"text":". The Wadsworth and Brooks-Cole statistics-probability series. Taylor & Francis, 1984.","element":"span"}],[{"id":"id-125","text":"P. Bühlmann and S. Van De Geer. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Statistics for high-dimensional data: methods, theory and applications","element":"span"},{"text":". Springer Science & Business Media, 2011.","element":"span"}],[{"id":"id-139","text":"D. Eddelbuettel and R. François. Rcpp: Seamless R and C++ integration. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Statistical Software","element":"span"},{"text":", 40(8):1–18, 2011.","element":"span"}],[{"id":"id-137","text":"J. Friedman, T. Hastie, and R. Tibshirani. 
Regularization paths for generalized linear models ","element":"span"},{"text":"via coordinate descent. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Statistical Software","element":"span"},{"text":", 33(1):1–22, 2010.","element":"span"}],[{"id":"id-121","text":"A. Liaw and M. Wiener. Classification and regression by randomForest. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"R News","element":"span"},{"text":", 2(3):18–22, 2002.","element":"span"}],[{"id":"id-128","text":"A. Maj-Kańska, P. Pokarowski, A. Prochenka, et al. Delete or merge regressors for linear model ","element":"span"},{"text":"selection. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Electronic Journal of Statistics","element":"span"},{"text":", 9(2):1749–1778, 2015.","element":"span"}],[{"id":"id-133","text":"D. Pauger and H. Wagner. Bayesian effect fusion for categorical predictors. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Bayesian Analysis","element":"span"},{"text":", 14(2):341–369, 2019.","element":"span"}],[{"id":"id-134","text":"D. Pauger, M. Leitner, H. Wagner, and G. Malsiner-Walli. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"effectFusion: Bayesian Effect Fusion for Categorical Predictors","element":"span"},{"text":", 2019. R package version 1.1.1.","element":"span"}],[{"id":"id-129","text":"A. Prochenka-Sołtys and P. Pokarowski. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"DMRnet: Delete or Merge Regressors Algorithms for Linear and Logistic Model Selection and High-Dimensional Data","element":"span"},{"text":", 2018. R package version 0.2.0.","element":"span"}],[{"id":"id-138","text":"B. Stokell. 
CatReg: Solution Paths for Linear and Logistic Regression Models with Categorical ","element":"span"},{"text":"Predictors, with SCOPE Penalty ","element":"span"},{"href":"https://CRAN.R-project.org/package=CatReg","style":{"fontFamily":"monospace"},"text":"https://CRAN.R-project.org/package=CatReg","element":"a"},{"text":", 2021.","element":"span"}],[{"id":"id-123","text":"T. Therneau and B. Atkinson. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"rpart: Recursive Partitioning and Regression Trees","element":"span"},{"text":", 2019. R package version 4.1-15.","element":"span"}],[{"id":"id-136","text":"R. Tibshirani. Regression shrinkage and selection via the lasso. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of the Royal Statistical Society: Series B (Methodological)","element":"span"},{"text":", 58(1):267–288, 1996.","element":"span"}],[{"id":"id-130","text":"M. Yuan and Y. Lin. Model selection and estimation in regression with grouped variables. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of the Royal Statistical Society: Series B (Statistical Methodology)","element":"span"},{"text":", 68(1):49–67, 2006.","element":"span"}],[{"id":"id-131","text":"X. Zheng and W.-Y. Loh. Consistent variable selection in linear models. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of the American Statistical Association","element":"span"},{"text":", 90(429):151–156, 1995.","element":"span"}]]}],"_version":"3.3.2"},"paperNode":"$1b:props:children:props:children:0:props:product"}]]]}]}]