1b:["$","$L29",null,{"isWhiteLabelled":false,"children":["$","$Lb",null,{"pt":{"compact":0,"expanded":3},"children":[["$","$L2a",null,{"noStar":true,"publisher":true,"task":true,"params":true,"size":"xl","product":{"id":"eyJwYXBlcklEIjoiMTMwMy40MjA3IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","updated":"2013-10-01T06:31:11.000Z","paperID":"1303.4207","published":"2013-03-18T11:17:55.000Z","authors":"[\"Shusen Wang\",\"Zhihua Zhang\"]","title":"Improving CUR Matrix Decomposition and the Nyström Approximation via Adaptive Sampling","scoreTrending":null,"summary":"$2b","lastCheckedForCode":"2022-09-03T19:54:59.232Z","links":[{"id":"eyJ1cmwiOiJodHRwczovL3BhcGVyc3dpdGhjb2RlLmNvbS9wYXBlci9pbXByb3ZpbmctY3VyLW1hdHJpeC1kZWNvbXBvc2l0aW9uLWFuZC10aGUifQ==","type":"pwc","url":"https://paperswithcode.com/paper/improving-cur-matrix-decomposition-and-the","data":null}],"reposConnection":{"edges":[]},"models":[],"tags":[],"summaries":[],"emailsConnection":{"edges":[{"author":"zhihua zhang","node":{"id":"eyJhZGRyZXNzIjoiemhpaHVhQHNqdHUuZWR1LmNuIn0=","address":"zhihua@sjtu.edu.cn","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[],"scholar":[{"thirdPartyID":"M5YT8IoAAAAJ"}],"twitter":[],"location":[],"owner":[]}},{"author":"shusen 
wang","node":{"id":"eyJhZGRyZXNzIjoid3NzQHpqdS5lZHUuY24ifQ==","address":"wss@zju.edu.cn","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[{"avatar":"https://avatars.githubusercontent.com/u/12660689?v=4","username":"wangshusen"}],"scholar":[{"thirdPartyID":"HAf4pEoAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiJiMjU0YmJlNC01YjJiLTRmNzItYmY2MC0yZjczMzZiOWRhOGMifQ==","name":"leixia wang","github":[],"email":[{"avatar":"https://img.fullcontact.com/static/d836a813fd360ddac664b8ed73396ecd_720eb05bd79ed4269d263de83d9d3fbca60755ddfb798ad17261527b67ed3e14"}],"authored":[{"id":"eyJwYXBlcklEIjoiMTkwNy4wMjE4OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1907.02189"},{"id":"eyJwYXBlcklEIjoiMTMwMy40MjA3IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1303.4207"},{"id":"eyJwYXBlcklEIjoiMTUwNS4wNzU3MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1505.07570"},{"id":"eyJwYXBlcklEIjoiMTcwNi4wMjgwMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1706.02803"},{"id":"eyJwYXBlcklEIjoiMTQwNi41Njc1IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1406.5675"},{"id":"eyJwYXBlcklEIjoiMTcwMi4wNDgzNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1702.04837"},{"id":"eyJwYXBlcklEIjoiMTkxMC4wOTEyNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1910.09126"},{"id":"eyJwYXBlcklEIjoiMTcwOS4wMzUyOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1709.03528"},{"id":"eyJwYXBlcklEIjoiMTQwNC4wMTM4IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1404.0138"},{"id":"eyJwYXBlcklEIjoiMjMwNi4wNTc4MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2306.05783"},{"id":"eyJwYXBlcklEIjoiMTQwMy43NzM3IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","pape
rID":"1403.7737"},{"id":"eyJwYXBlcklEIjoiMTcwOC4wMTk0NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1708.01945"},{"id":"eyJwYXBlcklEIjoiMTUwMy4wODM5NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1503.08395"},{"id":"eyJwYXBlcklEIjoiMTgwMy4wODAyMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1803.08021"},{"id":"eyJwYXBlcklEIjoiMjEwNy4wNDIyNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2107.04225"},{"id":"eyJwYXBlcklEIjoiMjMwNS4xNDk5NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2305.14994"},{"id":"eyJwYXBlcklEIjoiMjAwMi4wODAxNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.08014"},{"id":"eyJwYXBlcklEIjoiMTkwNi4wMjYwNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1906.02606"},{"id":"eyJwYXBlcklEIjoiMjIwMy4wMTIxNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2203.01214"},{"id":"eyJwYXBlcklEIjoiMjQwNS4wODI5OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2405.08299"},{"id":"eyJwYXBlcklEIjoiMjMwMi4wNDQ3NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2302.04477"},{"id":"eyJwYXBlcklEIjoiMTkxMi4xMjAwOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1912.12008"},{"id":"eyJwYXBlcklEIjoiMjEwMy4wMDcwNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2103.00704"},{"id":"eyJwYXBlcklEIjoiMjIwMy4xMzMwMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2203.13301"},{"id":"eyJwYXBlcklEIjoiMTkwMi4wNDk1MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1902.04952"},{"id":"eyJwYXBlcklEIjoiMTkwNi4wMTE3OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1906.01178"},{"id":"eyJwYXBlcklEIjoiMTkwOS4xMTIwNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1909.11207"},{"id":"eyJwYXBlcklEIjoiMjEwMy4wOTcxMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2103.09713"},{"id":"eyJwYXBlcklEIjoiMjIwNC4wMjYzNCIsInB1Ymxpc2hlciI6ImFyeG
l2In0=","publisher":"arxiv","paperID":"2204.02634"},{"id":"eyJwYXBlcklEIjoiMTkwOS4xMTIwMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1909.11201"},{"id":"eyJwYXBlcklEIjoiMTQxMi43OTM4IiwicHVibGlzaGVyIjoiYXJ4aXYifQ==","publisher":"arxiv","paperID":"1412.7938"},{"id":"eyJwYXBlcklEIjoiMjAwNS4wMjE3NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2005.02177"},{"id":"eyJwYXBlcklEIjoiMjIwMy4xMzA2MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2203.13061"},{"id":"eyJwYXBlcklEIjoiMjIwOC4wMjQyNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2208.02424"},{"id":"eyJwYXBlcklEIjoiMjMwOC4wMTIwNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2308.01204"}]}]}}]},"__typename":"paper","authorArray":["Shusen Wang","Zhihua Zhang"]}}],["$","$L18",null,{"container":true,"columns":100,"spacing":{"compact":0,"expanded":2,"large":3},"children":[["$","$L18",null,{"size":{"compact":100,"expanded":100,"large":68},"children":[["$","$7",null,{"children":["$","$L2c",null,{"publisher":"arxiv","paperID":"1303.4207","product":{"paper":"$1b:props:children:props:children:0:props:product","models":"$1b:props:children:props:children:0:props:product:models"},"isWhiteLabelled":false}]}],["$","$7",null,{"children":["$","$L2d",null,{"article":"$L2e","model":"$undefined"}]}]]}],["$","$L18",null,{"size":"grow","children":["$","$L2f",null,{}]}]]}],["$","$7",null,{"children":null}],[["$","audio",null,{"id":"tts"}],["$","$L30",null,{"paperID":"1303.4207","publisher":"arxiv","paperJSON":{"title":"Improving CUR Matrix Decomposition and the Nyström Approximation via Adaptive Sampling","paperID":"1303.4207","avgLineHeight":13.55,"imgScale":4,"sections":[{"heading":"Abstract","paragraphs":[[{"text":"The CUR matrix decomposition and the Nystr¨om approximation are two important low-rank matrix approximation techniques. 
The Nyström method approximates a symmetric positive semidefinite matrix in terms of a small number of its columns, while CUR approximates an arbitrary data matrix by a small number of its columns and rows. Thus, CUR decomposition can be regarded as an extension of the Nyström approximation.","element":"span"}],[{"text":"In this paper we establish a more general error bound for the adaptive column/row sampling algorithm, based on which we propose more accurate CUR and Nyström algorithms with expected relative-error bounds. The proposed CUR and Nyström algorithms also have low time complexity and can avoid maintaining the whole data matrix in RAM. In addition, we give theoretical analysis for the lower error bounds of the standard Nyström method and the ensemble Nyström method. The main theoretical results established in this paper are novel, and our analysis makes no special assumption on the data matrices. ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Keywords: ","element":"span"},{"text":"large-scale matrix computation, CUR matrix decomposition, the Nyström method, randomized algorithms, adaptive sampling","element":"span"}]]},{"heading":"1. Introduction","paragraphs":[[{"text":"Large-scale matrices emerging from stocks, genomes, web documents, web images and videos everyday bring new challenges in modern data analysis. Most efforts have been focused on manipulating, understanding and interpreting large-scale data matrices. In many cases, matrix factorization methods are employed for constructing parsimonious and informative representations to facilitate computation and interpretation. A principled approach is the truncated singular value decomposition (SVD) which finds the best low-rank approximation of a data matrix. 
Applications of SVD such as eigenfaces (","element":"span"},{"href":"#id-0","referenceIndex":40,"text":"Sirovich and Kirby","element":"a"},{"text":", ","element":"span"},{"href":"#id-0","referenceIndex":40,"text":"1987","element":"a"},{"text":"; ","element":"span"},{"href":"#id-1","referenceIndex":44,"text":"Turk and Pentland","element":"a"},{"text":", ","element":"span"},{"href":"#id-1","referenceIndex":44,"text":"1991","element":"a"},{"text":") and latent semantic analysis (","element":"span"},{"href":"#id-2","referenceIndex":9,"text":"Deerwester et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-2","referenceIndex":9,"text":"1990","element":"a"},{"text":") have been illustrated to be very successful.","element":"span"}],[{"id":"id-115","text":"However, using SVD to find basis vectors and low-rank approximations has its limitations. ","element":"span"},{"text":"As pointed out by ","element":"span"},{"href":"#id-3","referenceIndex":2,"text":"Berry et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-3","referenceIndex":2,"text":"2005","element":"a"},{"text":"), it is often useful to find a low-rank matrix approximation which possesses additional structures such as sparsity or nonnegativity. Since SVD or the standard QR decomposition for sparse matrices does not preserve sparsity in general, when the sparse matrix is large, computing or even storing such decompositions becomes challenging. Therefore it is useful to compute a low-rank matrix decomposition which preserves such structural properties of the original data matrix.","element":"span"}],[{"text":"Another limitation of SVD is that the basis vectors resulting from SVD have little concrete meaning, which makes it very difficult for us to understand and interpret the data in question. An example of ","element":"span"},{"href":"#id-4","referenceIndex":15,"text":"Drineas et al. 
","element":"a"},{"text":"(","element":"span"},{"href":"#id-4","referenceIndex":15,"text":"2008","element":"a"},{"text":") and ","element":"span"},{"href":"#id-5","referenceIndex":36,"text":"Mahoney and Drineas ","element":"a"},{"text":"(","element":"span"},{"href":"#id-5","referenceIndex":36,"text":"2009","element":"a"},{"text":") has well shown this viewpoint; that is, the vector [(1","element":"span"},{"style":{"height":19.64},"width":282.78,"height":49.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/1-0.png","element":"img","alt":"/2)age − (1/√","inline":true},{"text":"2)height + (1","element":"span"},{"style":{"fontStyle":"italic"},"text":"/","element":"span"},{"text":"2)income], the sum of the significant uncorrelated features from a data set of people’s features, is not particularly informative. ","element":"span"},{"href":"#id-6","referenceIndex":32,"text":"Kuruvilla et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-6","referenceIndex":32,"text":"2002","element":"a"},{"text":") have also claimed: “it would be interesting to try to find basis vectors for all experiment vectors, using actual experiment vectors and not artificial bases that offer little insight.” Therefore, it is of great interest to represent a data matrix in terms of a small number of actual columns and/or actual rows of the matrix. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Matrix column selection ","element":"span"},{"text":"and the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"CUR matrix decomposition ","element":"span"},{"text":"provide such techniques.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"1.1 Matrix Column Selection","element":"span"}],[{"text":"Column selection has been extensively studied in the theoretical computer science (TCS) and numerical linear algebra (NLA) communities. 
The work in TCS mainly focuses on choosing good columns by randomized algorithms with provable error bounds (","element":"span"},{"href":"#id-7","referenceIndex":20,"text":"Frieze et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-7","referenceIndex":20,"text":"2004","element":"a"},{"text":"; ","element":"span"},{"href":"#id-8","referenceIndex":11,"text":"Deshpande et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-8","referenceIndex":11,"text":"2006","element":"a"},{"text":"; ","element":"span"},{"href":"#id-4","referenceIndex":15,"text":"Drineas et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-4","referenceIndex":15,"text":"2008","element":"a"},{"text":"; ","element":"span"},{"href":"#id-9","referenceIndex":10,"text":"Deshpande and Rademacher","element":"a"},{"text":", ","element":"span"},{"href":"#id-9","referenceIndex":10,"text":"2010","element":"a"},{"text":"; ","element":"span"},{"href":"#id-10","referenceIndex":5,"text":"Bout- ","element":"a"},{"href":"#id-10","referenceIndex":5,"text":"sidis et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-10","referenceIndex":5,"text":"2011","element":"a"},{"text":"; ","element":"span"},{"href":"#id-11","referenceIndex":25,"text":"Guruswami and Sinop","element":"a"},{"text":", ","element":"span"},{"href":"#id-11","referenceIndex":25,"text":"2012","element":"a"},{"text":"). 
The focus in NLA is then on deterministic algorithms, especially the rank-revealing QR factorizations, that select columns by pivoting rules (","element":"span"},{"href":"#id-12","referenceIndex":17,"text":"Foster","element":"a"},{"text":", ","element":"span"},{"href":"#id-12","referenceIndex":17,"text":"1986","element":"a"},{"text":"; ","element":"span"},{"href":"#id-13","referenceIndex":6,"text":"Chan","element":"a"},{"text":", ","element":"span"},{"href":"#id-13","referenceIndex":6,"text":"1987","element":"a"},{"text":"; ","element":"span"},{"href":"#id-14","referenceIndex":41,"text":"Stewart","element":"a"},{"text":", ","element":"span"},{"href":"#id-14","referenceIndex":41,"text":"1999","element":"a"},{"text":"; ","element":"span"},{"href":"#id-15","referenceIndex":4,"text":"Bischof and Hansen","element":"a"},{"text":", ","element":"span"},{"href":"#id-15","referenceIndex":4,"text":"1991","element":"a"},{"text":"; ","element":"span"},{"href":"#id-16","referenceIndex":28,"text":"Hong and Pan","element":"a"},{"text":", ","element":"span"},{"href":"#id-16","referenceIndex":28,"text":"1992","element":"a"},{"text":"; ","element":"span"},{"href":"#id-17","referenceIndex":7,"text":"Chandrasekaran and Ipsen","element":"a"},{"text":", ","element":"span"},{"href":"#id-17","referenceIndex":7,"text":"1994","element":"a"},{"text":"; ","element":"span"},{"href":"#id-18","referenceIndex":24,"text":"Gu and Eisenstat","element":"a"},{"text":", ","element":"span"},{"href":"#id-18","referenceIndex":24,"text":"1996","element":"a"},{"text":"; ","element":"span"},{"href":"#id-3","referenceIndex":2,"text":"Berry et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-3","referenceIndex":2,"text":"2005","element":"a"},{"text":"). 
In this paper we focus on randomized algorithms for column selection.","element":"span"}],[{"text":"Given a matrix ","element":"span"},{"style":{"height":13.93},"width":202.83,"height":34.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/1-1.png","element":"img","alt":" A ∈ Rm×n","inline":true},{"text":", column selection algorithms aim to choose ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"text":"columns of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"to construct a matrix ","element":"span"},{"style":{"height":20.38},"width":706.41,"height":50.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/1-2.png","element":"img","alt":" C ∈ Rm×c such that ∥A − CC†A∥ξ","inline":true,"padRight":true},{"text":"achieves the minimum. Here “","element":"span"},{"style":{"height":16.4},"width":539.19,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/1-3.png","element":"img","alt":"ξ = 2,” “ξ = F,” and “ξ = ∗","inline":true},{"text":"” respectively represent the matrix spectral norm, the matrix Frobenius norm, and the matrix nuclear norm, and ","element":"span"},{"style":{"height":15.53},"width":51.24,"height":38.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/1-4.png","element":"img","alt":" C† ","inline":true,"padRight":true},{"text":"denotes the Moore-Penrose inverse of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"C","element":"span"},{"text":". 
Since there are (","element":"span"},{"style":{"height":16.25},"width":21,"height":40.62,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/1-5.png","element":"img","alt":"nc ","inline":true,"padRight":true},{"text":") possible choices of constructing ","element":"span"},{"style":{"fontWeight":"bold"},"text":"C","element":"span"},{"text":", selecting the best subset is a ","element":"span"},{"text":"hard problem.","element":"span"}],[{"text":"In recent years, many polynomial-time approximate algorithms have been proposed. Among them we are especially interested in those algorithms with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"multiplicative upper bounds","element":"span"},{"text":"; that is, there exists a polynomial function ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"m, n, k, c","element":"span"},{"text":") such that with ","element":"span"},{"style":{"height":17.6},"width":152.7,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/1-6.png","element":"img","alt":" c (≥ k)","inline":true,"padRight":true},{"text":"columns selected from ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"the following inequality holds","element":"span"}],[{"style":{"width":"46%"},"width":799,"height":55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/1-7.png","element":"img"}],[{"text":"with high probability (w.h.p.) or in expectation w.r.t. ","element":"span"},{"style":{"fontWeight":"bold"},"text":"C","element":"span"},{"text":". We call ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"approximation factor","element":"span"},{"text":". 
The bounds are strong when ","element":"span"},{"style":{"height":16.4},"width":167.12,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/1-8.png","element":"img","alt":" f = 1+ϵ","inline":true,"padRight":true},{"text":"for an error parameter ","element":"span"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/1-9.png","element":"img","alt":" ϵ","inline":true},{"text":"—they are known as ","element":"span"},{"style":{"fontStyle":"italic"},"text":"relative-error bounds","element":"span"},{"text":". Particularly, the bounds are called ","element":"span"},{"style":{"fontStyle":"italic"},"text":"constant-factor bounds ","element":"span"},{"text":"when ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"does not depend on ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n ","element":"span"},{"text":"(","element":"span"},{"href":"#id-19","referenceIndex":35,"text":"Mahoney","element":"a"},{"text":", ","element":"span"},{"href":"#id-19","referenceIndex":35,"text":"2011","element":"a"},{"text":"). The relative-error bounds and constant-factor bounds of the CUR matrix decomposition and the Nyström approximation are similarly defined.","element":"span"}],[{"id":"id-116","text":"However, the column selection method, also known as the ","element":"span"},{"style":{"height":12.8},"width":190.62,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/2-0.png","element":"img","alt":" A ≈ CX","inline":true,"padRight":true},{"text":"decomposition in some applications, has its limitations. 
For a large sparse matrix ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A","element":"span"},{"text":", its submatrix ","element":"span"},{"style":{"fontWeight":"bold"},"text":"C ","element":"span"},{"text":"is sparse, but the coefficient matrix ","element":"span"},{"style":{"height":13.93},"width":184.8,"height":34.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/2-1.png","element":"img","alt":" X ∈ Rc×n ","inline":true,"padRight":true},{"text":"is not sparse in general. The ","element":"span"},{"style":{"fontWeight":"bold"},"text":"CX ","element":"span"},{"text":"decomposition suffices when ","element":"span"},{"style":{"height":15.6},"width":363.89,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/2-2.png","element":"img","alt":" m ≫ n, because X","inline":true,"padRight":true},{"text":"is small in size. However, when ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n ","element":"span"},{"text":"are near equal, computing and storing the dense matrix ","element":"span"},{"style":{"fontWeight":"bold"},"text":"X ","element":"span"},{"text":"in RAM becomes infeasible. 
In such an occasion the CUR matrix decomposition is a very useful alternative.","element":"span"}],[{"id":"id-67","style":{"fontWeight":"bold"},"text":"1.2 The CUR Matrix Decomposition","element":"span"}],[{"text":"The CUR matrix decomposition problem has been widely discussed in the literature (","element":"span"},{"href":"#id-20","referenceIndex":22,"text":"Gor- ","element":"a"},{"href":"#id-20","referenceIndex":22,"text":"einov et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-20","referenceIndex":22,"text":"1997a","element":"a"},{"text":",","element":"span"},{"href":"#id-21","referenceIndex":23,"text":"b","element":"a"},{"text":"; ","element":"span"},{"href":"#id-14","referenceIndex":41,"text":"Stewart","element":"a"},{"text":", ","element":"span"},{"href":"#id-14","referenceIndex":41,"text":"1999","element":"a"},{"text":"; ","element":"span"},{"href":"#id-22","referenceIndex":45,"text":"Tyrtyshnikov","element":"a"},{"text":", ","element":"span"},{"href":"#id-22","referenceIndex":45,"text":"2000","element":"a"},{"text":"; ","element":"span"},{"href":"#id-3","referenceIndex":2,"text":"Berry et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-3","referenceIndex":2,"text":"2005","element":"a"},{"text":"; ","element":"span"},{"href":"#id-23","referenceIndex":13,"text":"Drineas and ","element":"a"},{"href":"#id-23","referenceIndex":13,"text":"Mahoney","element":"a"},{"text":", ","element":"span"},{"href":"#id-23","referenceIndex":13,"text":"2005","element":"a"},{"text":"; ","element":"span"},{"href":"#id-24","referenceIndex":37,"text":"Mahoney et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-24","referenceIndex":37,"text":"2008","element":"a"},{"text":"; ","element":"span"},{"href":"#id-25","referenceIndex":3,"text":"Bien et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-25","referenceIndex":3,"text":"2010","element":"a"},{"text":"), and it has been shown to be very useful in high dimensional data 
analysis. ","element":"span"},{"text":"Particularly, a CUR decomposition algorithm seeks to find a subset of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"text":"columns of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"to form a matrix ","element":"span"},{"style":{"height":13.93},"width":192.48,"height":34.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/2-3.png","element":"img","alt":" C ∈ Rm×c","inline":true},{"text":", a subset of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r ","element":"span"},{"text":"rows to form a matrix ","element":"span"},{"style":{"height":13.93},"width":187.86,"height":34.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/2-4.png","element":"img","alt":" R ∈ Rr×n","inline":true},{"text":", and an intersection matrix ","element":"span"},{"style":{"height":18.44},"width":704.1,"height":46.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/2-5.png","element":"img","alt":" U ∈ Rc×r such that ∥A − CUR∥ξ is","inline":true,"padRight":true},{"text":"small. Accordingly, we use ","element":"span"},{"text":"˜","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"= ","element":"span"},{"style":{"fontWeight":"bold"},"text":"CUR ","element":"span"},{"text":"to approximate ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A","element":"span"},{"text":".","element":"span"}],[{"href":"#id-26","referenceIndex":14,"text":"Drineas et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-26","referenceIndex":14,"text":"2006","element":"a"},{"text":") proposed a CUR algorithm with additive-error bound. Later on, ","element":"span"},{"href":"#id-4","referenceIndex":15,"text":"Drineas et al. 
","element":"a"},{"text":"(","element":"span"},{"href":"#id-4","referenceIndex":15,"text":"2008","element":"a"},{"text":") devised a randomized CUR algorithm which has relative-error bound w.h.p. if sufficiently many columns and rows are sampled. ","element":"span"},{"href":"#id-27","referenceIndex":34,"text":"Mackey et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-27","referenceIndex":34,"text":"2011","element":"a"},{"text":") established a divide-and-conquer method which solves the CUR problem in parallel. The CUR algorithms guaranteed by relative-error bounds are of great interest.","element":"span"}],[{"text":"Unfortunately, the existing CUR algorithms usually require a large number of columns and rows to be chosen. ","element":"span"},{"text":"For example, for an ","element":"span"},{"style":{"height":12.8},"width":307.62,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/2-6.png","element":"img","alt":" m×n matrix A","inline":true,"padRight":true},{"text":"and a target rank ","element":"span"},{"style":{"height":13.6},"width":92.46,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/2-7.png","element":"img","alt":" k ≪","inline":true,"padRight":true},{"text":"min","element":"span"},{"style":{"fontStyle":"italic"},"text":"{","element":"span"},{"style":{"fontStyle":"italic"},"text":"m, n","element":"span"},{"style":{"fontStyle":"italic"},"text":"}","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"the subspace sampling algorithm ","element":"span"},{"text":"(","element":"span"},{"href":"#id-4","referenceIndex":15,"text":"Drineas et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-4","referenceIndex":15,"text":"2008","element":"a"},{"text":")—a classical CUR algorithm— requires 
","element":"span"},{"style":{"height":19.13},"width":233.91,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/2-8.png","element":"img","alt":" O(kϵ−2 log k","inline":true},{"text":") columns and ","element":"span"},{"style":{"height":19.87},"width":252.84,"height":49.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/2-9.png","element":"img","alt":" O(kϵ−4 log2 k","inline":true},{"text":") rows to achieve relative-error bound w.h.p. The subspace sampling algorithm selects columns/rows according to the statistical leverage scores, so the computational cost of this algorithm is at least equal to the cost of the truncated SVD of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A","element":"span"},{"text":", that is, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"O","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"mnk","element":"span"},{"text":") in general. However, maintaining a large scale matrix in RAM is often impractical, not to mention performing SVD. Recently, ","element":"span"},{"href":"#id-28","referenceIndex":16,"text":"Drineas ","element":"a"},{"href":"#id-28","referenceIndex":16,"text":"et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-28","referenceIndex":16,"text":"2012","element":"a"},{"text":") devised fast approximation to statistical leverage scores which can be used to speedup the subspace sampling algorithm heuristically—yet no theoretical results have been reported that the leverage scores approximation can give provably efficient subspace sampling algorithm.","element":"span"}],[{"text":"The CUR matrix decomposition problem has a close connection with the column selection problem. 
Especially, most CUR algorithms such as those of ","element":"span"},{"href":"#id-29","referenceIndex":12,"text":"Drineas and Kannan ","element":"a"},{"text":"(","element":"span"},{"href":"#id-29","referenceIndex":12,"text":"2003","element":"a"},{"text":"); ","element":"span"},{"href":"#id-26","referenceIndex":14,"text":"Drineas et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-26","referenceIndex":14,"text":"2006","element":"a"},{"text":", ","element":"span"},{"href":"#id-4","referenceIndex":15,"text":"2008","element":"a"},{"text":") work in a two-stage manner where the first stage is a standard column selection procedure. Despite their strong resemblance, CUR is a harder problem than column selection because “one can get good columns or rows separately” does not mean that one can get good columns and rows together. If the second stage is naïvely solved by a column selection algorithm on ","element":"span"},{"style":{"height":15.13},"width":61.94,"height":37.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/2-10.png","element":"img","alt":" AT ","inline":true,"padRight":true},{"text":", then the approximation factor will trivially be","element":"span"},{"style":{"height":19.64},"width":133.53,"height":49.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/2-11.png","element":"img","alt":"√2f1 (","inline":true},{"href":"#id-5","referenceIndex":36,"text":"Mahoney and Drineas","element":"a"},{"text":", ","element":"span"},{"href":"#id-5","referenceIndex":36,"text":"2009","element":"a"},{"text":"). Thus, more sophisticated error analysis techniques for the second stage are indispensable in order to achieve relative-error bound.","element":"span"}],[{"id":"id-124","style":{"fontWeight":"bold"},"text":"1.3 The Nyström Methods","element":"span"}],[{"text":"The Nyström approximation is closely related to CUR, and it can potentially benefit from the advances in CUR techniques. 
Different from CUR, the Nyström methods are used for approximating symmetric positive semidefinite (SPSD) matrices. The methods approximate an SPSD matrix only using a subset of its columns, so they can alleviate computation and storage costs when the SPSD matrix in question is large in size. ","element":"span"},{"text":"In fact, the Nyström methods have been extensively used in the machine learning community. For example, they have been applied to Gaussian processes (","element":"span"},{"href":"#id-30","referenceIndex":46,"text":"Williams and Seeger","element":"a"},{"text":", ","element":"span"},{"href":"#id-30","referenceIndex":46,"text":"2001","element":"a"},{"text":"), kernel SVMs (","element":"span"},{"href":"#id-31","referenceIndex":48,"text":"Zhang ","element":"a"},{"href":"#id-31","referenceIndex":48,"text":"et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-31","referenceIndex":48,"text":"2008","element":"a"},{"text":"), spectral clustering (","element":"span"},{"href":"#id-32","referenceIndex":18,"text":"Fowlkes et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-32","referenceIndex":18,"text":"2004","element":"a"},{"text":"), kernel PCA (","element":"span"},{"href":"#id-33","referenceIndex":43,"text":"Talwalkar et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-33","referenceIndex":43,"text":"2008","element":"a"},{"text":"; ","element":"span"},{"href":"#id-31","referenceIndex":48,"text":"Zhang et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-31","referenceIndex":48,"text":"2008","element":"a"},{"text":"; ","element":"span"},{"href":"#id-34","referenceIndex":47,"text":"Zhang and Kwok","element":"a"},{"text":", ","element":"span"},{"href":"#id-34","referenceIndex":47,"text":"2010","element":"a"},{"text":"), etc.","element":"span"}],[{"text":"The Nyström methods approximate any SPSD matrix in terms of a subset of its columns. 
Specifically, given an ","element":"span"},{"style":{"height":9.2},"width":111.25,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/3-0.png","element":"img","alt":" m×m","inline":true,"padRight":true},{"text":"SPSD matrix ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A","element":"span"},{"text":", they require sampling ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"< m","element":"span"},{"text":") columns of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"to construct an ","element":"span"},{"style":{"height":12.8},"width":309.04,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/3-1.png","element":"img","alt":" m × c matrix C","inline":true},{"text":". Since there exists an ","element":"span"},{"style":{"height":9.2},"width":111.25,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/3-2.png","element":"img","alt":" m×m","inline":true,"padRight":true},{"text":"permutation matrix ","element":"span"},{"style":{"height":12.8},"width":138.93,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/3-3.png","element":"img","alt":" Π such","inline":true,"padRight":true},{"text":"that ","element":"span"},{"style":{"height":12.8},"width":76.27,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/3-4.png","element":"img","alt":" ΠC","inline":true,"padRight":true},{"text":"consists of the first ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"text":"columns of ","element":"span"},{"style":{"height":15.73},"width":140.48,"height":39.32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/3-5.png","element":"img","alt":" ΠAΠT ","inline":true,"padRight":true},{"text":", we always assume that 
","element":"span"},{"style":{"fontWeight":"bold"},"text":"C ","element":"span"},{"text":"consists of the first ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"text":"columns of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"without loss of generality. We partition ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontWeight":"bold"},"text":"C ","element":"span"},{"text":"as","element":"span"}],[{"id":"id-68","style":{"width":"46%"},"width":802,"height":107,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/3-6.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":15.02},"width":227.04,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/3-7.png","element":"img","alt":" W and A21","inline":true,"padRight":true},{"text":"are of sizes ","element":"span"},{"style":{"height":17.6},"width":395.38,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/3-8.png","element":"img","alt":" c × c and (m−c) × c","inline":true},{"text":", respectively. There are three models which are defined as follows.","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"• ","element":"span"},{"style":{"height":16},"width":701.66,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/3-9.png","element":"img","alt":" The Standard Nystr¨om Method","inline":true},{"text":". 
The standard Nyström approximation to ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"is","element":"span"}],[{"style":{"width":"71%"},"width":1234,"height":107,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/3-10.png","element":"img"}],[{"text":"Here ","element":"span"},{"style":{"height":15.54},"width":67.58,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/3-11.png","element":"img","alt":" W† ","inline":true,"padRight":true},{"text":"is called the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"intersection matrix","element":"span"},{"text":". The matrix (","element":"span"},{"style":{"height":19.54},"width":535.88,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/3-12.png","element":"img","alt":"Wk)†, where k ≤ c and Wk","inline":true,"padRight":true},{"text":"is the best ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k","element":"span"},{"text":"-rank approximation to ","element":"span"},{"style":{"fontWeight":"bold"},"text":"W","element":"span"},{"text":", is also used as an intersection matrix for constructing approximations with even lower rank. 
But using ","element":"span"},{"style":{"height":15.53},"width":67.58,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/3-13.png","element":"img","alt":" W† ","inline":true,"padRight":true},{"text":"results in a tighter approximation than using (","element":"span"},{"style":{"height":19.53},"width":264.13,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/3-14.png","element":"img","alt":"Wk)† usually.","inline":true}],[{"style":{"fontStyle":"italic"},"text":"• ","element":"span"},{"style":{"height":17.6},"width":747.94,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/3-15.png","element":"img","alt":" The Ensemble Nystr¨om Method (","inline":true},{"href":"#id-35","referenceIndex":30,"text":"Kumar et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-35","referenceIndex":30,"text":"2009","element":"a"},{"text":"). It selects a collection of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"samples, each sample ","element":"span"},{"style":{"height":20.33},"width":355.86,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/3-16.png","element":"img","alt":" C(i), (i = 1, · · · , t","inline":true},{"text":"), containing ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"text":"columns of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A","element":"span"},{"text":". 
Then the ensemble method combines the samples to construct an approximation in the form of","element":"span"}],[{"id":"id-69","style":{"width":"64%"},"width":1107,"height":128,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/3-17.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":19.93},"width":64,"height":49.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/3-18.png","element":"img","alt":" µ(i) ","inline":true,"padRight":true},{"text":"are the weights of the samples. Typically, the ensemble Nystr¨om method seeks to find out the weights by minimizing ","element":"span"},{"style":{"height":22.78},"width":767.29,"height":56.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/3-19.png","element":"img","alt":" ∥A − ˜Aenst,c ∥F or ∥A − ˜Aenst,c ∥2. A simple","inline":true,"padRight":true},{"text":"but effective strategy is to set the weights as ","element":"span"},{"style":{"height":21.95},"width":402.39,"height":54.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/3-20.png","element":"img","alt":" µ(1) = · · · = µ(t) = 1t .","inline":true}],[{"style":{"fontStyle":"italic"},"text":"• ","element":"span"},{"style":{"height":16.4},"width":695.85,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/3-21.png","element":"img","alt":" The Modified Nystr¨om Method","inline":true,"padRight":true},{"text":"(proposed in this paper). It is defined as","element":"span"}],[{"style":{"width":"31%"},"width":539,"height":56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/3-22.png","element":"img"}],[{"id":"id-119","text":"This model is not strictly the Nystr¨om method because it uses a quite different in- ","element":"span"},{"text":"tersection matrix ","element":"span"},{"style":{"height":19.53},"width":533.8,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/4-0.png","element":"img","alt":" C†A(C†)T . 
It costs O(mc2","inline":true},{"text":") time to compute the Moore-Penrose inverse ","element":"span"},{"style":{"height":15.53},"width":236.18,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/4-1.png","element":"img","alt":" C† and m2c","inline":true,"padRight":true},{"text":"flops to compute matrix multiplications. The matrix multiplications can be executed very efficiently in multi-processor environment, so ideally computing the intersection matrix costs time only linear in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m","element":"span"},{"text":". This model is more accurate (which will be justified in Section ","element":"span"},{"href":"#id-36","text":"4.3 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-37","text":"4.4","element":"a"},{"text":") but more costly than the conventional ones, so there is a trade-off between time and accuracy when deciding which model to use.","element":"span"}],[{"text":"Here and later, we call those which use intersection matrix ","element":"span"},{"style":{"height":19.53},"width":589.85,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/4-2.png","element":"img","alt":" W† or (Wk)† the conventional","inline":true},{"style":{"height":16.4},"width":331.92,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/4-3.png","element":"img","alt":"Nystr¨om methods","inline":true},{"text":", including the standard Nystr¨om and the ensemble Nystr¨om.","element":"span"}],[{"text":"To generate effective approximations, much work has been built on the upper error bounds of the sampling techniques for the Nystr¨om method. 
Most of the work, for example, ","element":"span"},{"href":"#id-23","referenceIndex":13,"text":"Drineas and Mahoney ","element":"a"},{"text":"(","element":"span"},{"href":"#id-23","referenceIndex":13,"text":"2005","element":"a"},{"text":"), ","element":"span"},{"href":"#id-38","referenceIndex":33,"text":"Li et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-38","referenceIndex":33,"text":"2010","element":"a"},{"text":"), ","element":"span"},{"href":"#id-35","referenceIndex":30,"text":"Kumar et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-35","referenceIndex":30,"text":"2009","element":"a"},{"text":"), ","element":"span"},{"href":"#id-39","referenceIndex":29,"text":"Jin et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-39","referenceIndex":29,"text":"2011","element":"a"},{"text":"), and ","element":"span"},{"href":"#id-40","referenceIndex":31,"text":"Kumar et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-40","referenceIndex":31,"text":"2012","element":"a"},{"text":"), studied the additive-error bound. ","element":"span"},{"text":"With assumptions on matrix coherence, better additive-error bounds were obtained by ","element":"span"},{"href":"#id-41","referenceIndex":42,"text":"Talwalkar and Rostamizadeh ","element":"a"},{"text":"(","element":"span"},{"href":"#id-41","referenceIndex":42,"text":"2010","element":"a"},{"text":"), ","element":"span"},{"href":"#id-39","referenceIndex":29,"text":"Jin et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-39","referenceIndex":29,"text":"2011","element":"a"},{"text":"), and ","element":"span"},{"href":"#id-27","referenceIndex":34,"text":"Mackey et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-27","referenceIndex":34,"text":"2011","element":"a"},{"text":"). 
However, as stated by ","element":"span"},{"href":"#id-19","referenceIndex":35,"text":"Mahoney ","element":"a"},{"text":"(","element":"span"},{"href":"#id-19","referenceIndex":35,"text":"2011","element":"a"},{"text":"), additive-error bounds are less compelling than relative-error bounds. In one recent work, ","element":"span"},{"href":"#id-42","referenceIndex":21,"text":"Gittens and Mahoney ","element":"a"},{"text":"(","element":"span"},{"href":"#id-42","referenceIndex":21,"text":"2013","element":"a"},{"text":") provided a relative-error bound for the first time, where the bound is in nuclear norm.","element":"span"}],[{"text":"However, the error bounds of the previous Nyström methods are much weaker than those of the existing CUR algorithms, especially the relative-error bounds in which we are more interested (","element":"span"},{"href":"#id-19","referenceIndex":35,"text":"Mahoney","element":"a"},{"text":", ","element":"span"},{"href":"#id-19","referenceIndex":35,"text":"2011","element":"a"},{"text":"). ","element":"span"},{"text":"Actually, as will be proved in this paper, the lower error bounds of the standard Nyström method and the ensemble Nyström method are even much worse than the upper bounds of some existing CUR algorithms. This motivates us to improve the Nyström method by borrowing the techniques in CUR matrix decomposition.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"1.4 Contributions and Outline","element":"span"}],[{"text":"The main technical contribution of this work is the adaptive sampling bound in Theorem ","element":"span"},{"href":"#id-43","text":"5","element":"a"},{"text":", which is an extension of Theorem 2.1 of ","element":"span"},{"href":"#id-8","referenceIndex":11,"text":"Deshpande et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-8","referenceIndex":11,"text":"2006","element":"a"},{"text":"). 
Theorem 2.1 of ","element":"span"},{"href":"#id-8","referenceIndex":11,"text":"Deshpande ","element":"a"},{"href":"#id-8","referenceIndex":11,"text":"et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-8","referenceIndex":11,"text":"2006","element":"a"},{"text":") bounds the error incurred by projection onto column or row space, while our Theorem ","element":"span"},{"href":"#id-43","text":"5 ","element":"a"},{"text":"bounds the error incurred by the projection simultaneously onto column space and row space. We also show that Theorem 2.1 of ","element":"span"},{"href":"#id-8","referenceIndex":11,"text":"Deshpande et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-8","referenceIndex":11,"text":"2006","element":"a"},{"text":") can be regarded as a special case of Theorem ","element":"span"},{"href":"#id-43","text":"5","element":"a"},{"text":".","element":"span"}],[{"text":"More importantly, our adaptive sampling bound provides an approach for improving CUR and the Nyström approximation: no matter which relative-error column selection algorithm is employed, Theorem ","element":"span"},{"href":"#id-43","text":"5 ","element":"a"},{"text":"ensures relative-error bounds for CUR and the Nyström approximation. We present the results in Corollary ","element":"span"},{"href":"#id-44","text":"7","element":"a"},{"text":".","element":"span"}],[{"text":"Based on the adaptive sampling bound in Theorem ","element":"span"},{"href":"#id-43","text":"5 ","element":"a"},{"text":"and its corollary ","element":"span"},{"href":"#id-44","text":"7","element":"a"},{"text":", we provide a concrete CUR algorithm which beats the best existing algorithm—the subspace sampling algorithm—both theoretically and empirically. The CUR algorithm is described in Algorithm ","element":"span"},{"href":"#id-45","text":"2 ","element":"a"},{"text":"and analyzed in Theorem ","element":"span"},{"href":"#id-46","text":"8","element":"a"},{"text":". 
","element":"span"},{"text":"In Table ","element":"span"},{"href":"#id-47","text":"1 ","element":"a"},{"text":"we present a comparison between our proposed CUR algorithm and the subspace sampling algorithm. As we see, our algorithm requires much fewer columns and rows to achieve relative-error bound. ","element":"span"},{"text":"Our method is more scalable for it works on only a few columns or rows of the data matrix in question;","element":"span"}],[{"id":"id-121","style":{"width":"94%"},"width":1632,"height":159,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/5-0.png","element":"img"}],[{"id":"id-47","text":"Table 1: Comparisons between our ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"adaptive sampling ","element":"figcaption","subtype":"caption"},{"text":"based CUR algorithm and the best existing algorithm—the ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"subspace sampling ","element":"figcaption","subtype":"caption"},{"text":"algorithm of ","element":"figcaption","subtype":"caption"},{"href":"#id-4","referenceIndex":15,"text":"Drineas et al. ","element":"a","subtype":"caption"},{"text":"(","element":"figcaption","subtype":"caption"},{"href":"#id-4","referenceIndex":15,"text":"2008","element":"a","subtype":"caption"},{"text":").","element":"figcaption","subtype":"caption"}],[{"style":{"width":"95%"},"width":1652,"height":248,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/5-1.png","element":"img"}],[{"id":"id-51","text":"Table 2: Lower bounds of the standard Nystr¨om method and the ensemble Nystr¨om ","element":"figcaption","subtype":"caption"},{"text":"method. ","element":"figcaption","subtype":"caption"},{"text":"The blanks indicate the lower bounds are unknown to us. 
","element":"figcaption","subtype":"caption"},{"text":"Here ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"m ","element":"figcaption","subtype":"caption"},{"text":"denotes the column/row number of the SPSD matrix, ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"c ","element":"figcaption","subtype":"caption"},{"text":"denotes the number of selected columns, and ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"k ","element":"figcaption","subtype":"caption"},{"text":"denotes the target rank.","element":"figcaption","subtype":"caption"}],[{"text":"in contrast, the subspace sampling algorithm maintains the whole data matrix in RAM to implement SVD.","element":"span"}],[{"text":"Another important application of the adaptive sampling bound is to yield an algorithm for the modified Nystr¨om method. The algorithm has a strong relative-error upper bound: for a target rank ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k","element":"span"},{"text":", by sampling ","element":"span"},{"style":{"height":22.15},"width":235.24,"height":55.37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/5-2.png","element":"img","alt":"2kϵ2�1 + o(1)�","inline":true},{"text":"columns it achieves relative-error bound in expectation. The results are shown in Theorem ","element":"span"},{"href":"#id-48","text":"10","element":"a"},{"text":".","element":"span"}],[{"text":"Finally, we establish a collection of lower error bounds of the standard Nystr¨om and the ensemble Nystr¨om that use ","element":"span"},{"style":{"height":15.53},"width":67.58,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/5-3.png","element":"img","alt":" W† ","inline":true,"padRight":true},{"text":"as the intersection matrix. 
We show the lower bounds in Theorem ","element":"span"},{"href":"#id-49","text":"12 ","element":"a"},{"text":"and Table ","element":"span"},{"href":"#id-50","text":"3","element":"a"},{"text":"; here Table ","element":"span"},{"href":"#id-51","text":"2 ","element":"a"},{"text":"briefly summarizes the lower bounds in Table ","element":"span"},{"href":"#id-50","text":"3","element":"a"},{"text":". From the table we can see that the upper error bound of our adaptive sampling algorithm for the modified Nyström method is even better than the lower bounds of the conventional Nyström methods.","element":"span"},{"style":{"height":8.4},"width":17,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/5-4.png","element":"img","alt":"2","inline":true}],[{"text":"The remainder of the paper is organized as follows. In Section ","element":"span"},{"text":"2 ","element":"span"},{"text":"we give the notation that will be used in this paper. In Section ","element":"span"},{"text":"3 ","element":"span"},{"text":"we survey the previous work on the randomized column selection, CUR matrix decomposition, and Nyström approximation. In Section ","element":"span"},{"text":"4 ","element":"span"},{"text":"we present our theoretical results and corresponding algorithms. In Section ","element":"span"},{"text":"5 ","element":"span"},{"text":"we empirically evaluate our proposed CUR and Nyström algorithms. Finally, we conclude our work in Section ","element":"span"},{"text":"6","element":"span"},{"text":". All proofs are deferred to the appendices.","element":"span"}]]},{"heading":"2. Notation","paragraphs":[[{"text":"First of all, we present the notation and notion that are used here and later. 
We let ","element":"span"},{"style":{"height":14.62},"width":49.03,"height":36.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-0.png","element":"img","alt":" Im","inline":true,"padRight":true},{"text":"denote the ","element":"span"},{"style":{"height":9.2},"width":128.05,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-1.png","element":"img","alt":" m × m","inline":true,"padRight":true},{"text":"identity matrix, ","element":"span"},{"style":{"height":14.22},"width":55.09,"height":35.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-2.png","element":"img","alt":" 1m","inline":true,"padRight":true},{"text":"denote the ","element":"span"},{"style":{"height":9.2},"width":72.31,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-3.png","element":"img","alt":" m×","inline":true},{"text":"1 vector of ones, and ","element":"span"},{"style":{"fontWeight":"bold"},"text":"0 ","element":"span"},{"text":"denote a zero vector or matrix with appropriate size. 
For a matrix ","element":"span"},{"style":{"height":20.95},"width":693.11,"height":52.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-4.png","element":"img","alt":" A = [aij] ∈ Rm×n, we let a(i) be its","inline":true},{"style":{"height":17.42},"width":375.27,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-5.png","element":"img","alt":"i-th row, aj be its j","inline":true},{"text":"-th column, and ","element":"span"},{"style":{"height":17.42},"width":73.88,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-6.png","element":"img","alt":" Ai:j","inline":true,"padRight":true},{"text":"be a submatrix consisting of its ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i ","element":"span"},{"text":"to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j","element":"span"},{"text":"-th columns (","element":"span"},{"style":{"height":17.6},"width":122.65,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-7.png","element":"img","alt":"i ≤ j).","inline":true}],[{"text":"Let ","element":"span"},{"style":{"height":17.6},"width":713.34,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-8.png","element":"img","alt":" ρ = rank(A) ≤ min{m, n} and k ≤ ρ","inline":true},{"text":". 
The singular value decomposition (SVD) of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"can be written as","element":"span"}],[{"style":{"width":"95%"},"width":1647,"height":125,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-9.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":18.44},"width":815.04,"height":46.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-10.png","element":"img","alt":" UA,k (m×k), ΣA,k (k×k), and VA,k (n×k","inline":true},{"text":") correspond to the top ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"singular values. We denote ","element":"span"},{"style":{"height":22.85},"width":404.77,"height":57.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-11.png","element":"img","alt":" Ak = UA,kΣA,kVTA,k ","inline":true,"padRight":true},{"text":"which is the best (or closest) rank-","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"approximation to ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A","element":"span"},{"text":". ","element":"span"},{"text":"We also use ","element":"span"},{"style":{"height":18.34},"width":247.02,"height":45.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-12.png","element":"img","alt":" σi(A) = σA,i","inline":true,"padRight":true},{"text":"to denote the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":"-th largest singular value. 
When ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"is SPSD, the SVD is identical to the eigenvalue decomposition, in which case we have ","element":"span"},{"style":{"height":14.74},"width":209.51,"height":36.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-13.png","element":"img","alt":" UA = VA.","inline":true}],[{"text":"We define the matrix norms as follows. Let ","element":"span"},{"style":{"height":20.76},"width":814,"height":51.9,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-14.png","element":"img","alt":" ∥A∥1 = �i,j |aij| be the ℓ1-norm, ∥A∥F =","inline":true,"padRight":true},{"text":"(","element":"span"},{"style":{"height":23.54},"width":501.84,"height":58.86,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-15.png","element":"img","alt":"�i,j a2ij)1/2 = (�i σ2A,i)1/2","inline":true,"padRight":true},{"text":"be the Frobenius norm, ","element":"span"},{"style":{"height":19.95},"width":727.5,"height":49.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-16.png","element":"img","alt":" ∥A∥2 = maxx∈Rn,∥x∥2=1 ∥Ax∥2 = σA,1","inline":true,"padRight":true},{"text":"be the spectral norm, and ","element":"span"},{"style":{"height":18.36},"width":308.78,"height":45.9,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-17.png","element":"img","alt":" ∥A∥∗ = �i σA,i","inline":true,"padRight":true},{"text":"be the nuclear norm. 
We always use ","element":"span"},{"style":{"height":18.44},"width":152.04,"height":46.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-18.png","element":"img","alt":" ∥ · ∥ξ to","inline":true,"padRight":true},{"text":"represent ","element":"span"},{"style":{"height":17.6},"width":410.24,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-19.png","element":"img","alt":" ∥ · ∥2, ∥ · ∥F , or ∥ · ∥∗.","inline":true}],[{"text":"Based on SVD, the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"statistical leverage scores ","element":"span"},{"text":"of the columns of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"relative to the best rank-","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"approximation to ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"is defined as","element":"span"}],[{"id":"id-63","style":{"width":"66%"},"width":1151,"height":68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-20.png","element":"img"}],[{"text":"We have that ","element":"span"},{"style":{"height":26.41},"width":262.36,"height":66.03,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-21.png","element":"img","alt":"�nj=1 ℓ[k]j = k","inline":true},{"text":". The leverage scores of the rows of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"are defined according to ","element":"span"},{"style":{"height":17.24},"width":95.41,"height":43.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-22.png","element":"img","alt":" UA,k","inline":true},{"text":". The leverage scores play an important role in low-rank matrix approximation. 
Informally speaking, the columns (or rows) with high leverage scores have greater influence in rank-","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"approximation than those with low leverage scores.","element":"span"}],[{"text":"Additionally, let ","element":"span"},{"style":{"height":24.07},"width":406.75,"height":60.18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-23.png","element":"img","alt":" A† = VA,ρΣ−1A,ρUTA,ρ ","inline":true,"padRight":true},{"text":"be the Moore-Penrose inverse of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"(","element":"span"},{"href":"#id-52","referenceIndex":1,"text":"Ben-Israel ","element":"a"},{"href":"#id-52","referenceIndex":1,"text":"and Greville","element":"a"},{"text":", ","element":"span"},{"href":"#id-52","referenceIndex":1,"text":"2003","element":"a"},{"text":"). ","element":"span"},{"text":"When ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"is nonsingular, the Moore-Penrose inverse is identical to the matrix inverse. 
Given matrices ","element":"span"},{"style":{"height":18.33},"width":1013.05,"height":45.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-24.png","element":"img","alt":" A ∈ Rm×n, X ∈ Rm×p, and Y ∈ Rq×n, XX†A =","inline":true},{"style":{"height":20.34},"width":363.14,"height":50.86,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-25.png","element":"img","alt":"UXUTXA ∈ Rm×n","inline":true,"padRight":true},{"text":"is the projection of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"onto the column space of ","element":"span"},{"style":{"height":18.33},"width":351.84,"height":45.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-26.png","element":"img","alt":" X, and AY†Y =","inline":true},{"style":{"height":20.34},"width":335.14,"height":50.86,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-27.png","element":"img","alt":"AVYVTY ∈ Rm×n","inline":true,"padRight":true},{"text":"is the projection of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"onto the row space of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Y","element":"span"},{"text":".","element":"span"}],[{"text":"Finally, we discuss the computational costs of the matrix operations mentioned above. 
For an ","element":"span"},{"style":{"height":9.2},"width":98.25,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-28.png","element":"img","alt":" m×n","inline":true,"padRight":true},{"text":"general matrix ","element":"span"},{"style":{"height":17.6},"width":344.05,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-29.png","element":"img","alt":" A (assume m ≥ n","inline":true},{"text":"), it takes ","element":"span"},{"style":{"height":19.13},"width":134.42,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-30.png","element":"img","alt":" O(mn2","inline":true},{"text":") flops to compute the full SVD and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"O","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"mnk","element":"span"},{"text":") flops to compute the truncated SVD of rank ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"< n","element":"span"},{"text":"). The computation of ","element":"span"},{"style":{"height":19.53},"width":394.54,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-31.png","element":"img","alt":"A† also takes O(mn2","inline":true},{"text":") flops. 
It is worth mentioning that, although multiplying an ","element":"span"},{"style":{"height":9.2},"width":183.26,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-32.png","element":"img","alt":" m×n ma-","inline":true,"padRight":true},{"text":"trix by an ","element":"span"},{"style":{"height":12.4},"width":84.13,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-33.png","element":"img","alt":" n×p","inline":true,"padRight":true},{"text":"matrix runs in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"mnp ","element":"span"},{"text":"flops, it can be easily performed in parallel (","element":"span"},{"href":"#id-53","referenceIndex":27,"text":"Halko et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-53","referenceIndex":27,"text":"2011","element":"a"},{"text":"). In contrast, implementing operations like SVD and QR decomposition in parallel is much more difficult. So we denote the time complexity of such a matrix multiplication by ","element":"span"},{"style":{"height":18.44},"width":261.36,"height":46.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/6-34.png","element":"img","alt":"TMultiply(mnp","inline":true},{"text":"), which can be tremendously smaller than ","element":"span"},{"style":{"fontStyle":"italic"},"text":"O","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"mnp","element":"span"},{"text":") in practice.","element":"span"}]]},{"heading":"3. Previous Work","paragraphs":[[{"text":"In Section ","element":"span"},{"href":"#id-54","text":"3.1 ","element":"a"},{"text":"we present an adaptive sampling algorithm and its relative-error bound established by ","element":"span"},{"href":"#id-8","referenceIndex":11,"text":"Deshpande et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-8","referenceIndex":11,"text":"2006","element":"a"},{"text":"). 
In Section ","element":"span"},{"href":"#id-55","text":"3.2 ","element":"a"},{"text":"we highlight the near-optimal column selection algorithm of ","element":"span"},{"href":"#id-10","referenceIndex":5,"text":"Boutsidis et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-10","referenceIndex":5,"text":"2011","element":"a"},{"text":") which we will use in our CUR and Nyström algorithms for column/row sampling. ","element":"span"},{"text":"In Section ","element":"span"},{"href":"#id-56","text":"3.3 ","element":"a"},{"text":"we introduce two important CUR algorithms. In Section ","element":"span"},{"href":"#id-57","text":"3.4 ","element":"a"},{"text":"we introduce the only known relative-error algorithm for the standard Nyström method.","element":"span"}],[{"id":"id-54","style":{"fontWeight":"bold"},"text":"3.1 The Adaptive Sampling Algorithm","element":"span"}],[{"text":"Adaptive sampling is an effective and efficient column sampling algorithm for reducing the error incurred by the first round of sampling. After one has selected a small subset of columns (denoted ","element":"span"},{"style":{"height":15.02},"width":53.24,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/7-0.png","element":"img","alt":" C1","inline":true},{"text":"), an adaptive sampling method is used to further select a proportion of columns according to the residual of the first round, that is, ","element":"span"},{"style":{"height":22.29},"width":221.86,"height":55.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/7-1.png","element":"img","alt":" A−C1C†1A","inline":true},{"text":". 
The approximation ","element":"span"},{"text":"error is guaranteed to be decreasing by a factor after the adaptive sampling (","element":"span"},{"href":"#id-8","referenceIndex":11,"text":"Deshpande ","element":"a"},{"href":"#id-8","referenceIndex":11,"text":"et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-8","referenceIndex":11,"text":"2006","element":"a"},{"text":"). We show the result of ","element":"span"},{"href":"#id-8","referenceIndex":11,"text":"Deshpande et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-8","referenceIndex":11,"text":"2006","element":"a"},{"text":") in the following lemma.","element":"span"}],[{"id":"id-58","style":{"fontWeight":"bold"},"text":"Lemma 1 (The Adaptive Sampling Algorithm) ","element":"span"},{"text":"(","element":"span"},{"href":"#id-8","referenceIndex":11,"text":"Deshpande et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-8","referenceIndex":11,"text":"2006","element":"a"},{"text":") ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Given a matrix ","element":"span"},{"style":{"height":17.13},"width":845.2,"height":42.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/7-2.png","element":"img","alt":" A ∈ Rm×n, we let C1 ∈ Rm×c1 consist of c1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"columns of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A","element":"span"},{"style":{"fontStyle":"italic"},"text":", and define the residual ","element":"span"},{"style":{"height":22.29},"width":333.49,"height":55.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/7-3.png","element":"img","alt":"B = A − C1C†1A","inline":true},{"style":{"fontStyle":"italic"},"text":". 
Additionally, for ","element":"span"},{"style":{"height":16.4},"width":431.57,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/7-4.png","element":"img","alt":" i = 1, · · · , n, we define","inline":true}],[{"style":{"width":"20%"},"width":361,"height":51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/7-5.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"We further sample ","element":"span"},{"style":{"height":10.62},"width":35.88,"height":26.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/7-6.png","element":"img","alt":" c2","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"columns i.i.d. from ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A","element":"span"},{"style":{"fontStyle":"italic"},"text":", in each trial of which the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"style":{"fontStyle":"italic"},"text":"-th column is chosen with probability ","element":"span"},{"style":{"height":16.73},"width":368.31,"height":41.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/7-7.png","element":"img","alt":" pi. 
Let C2 ∈ Rm×c2 ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"contain the ","element":"span"},{"style":{"height":10.62},"width":35.88,"height":26.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/7-8.png","element":"img","alt":" c2","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"sampled columns and let ","element":"span"},{"style":{"height":17.6},"width":289.52,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/7-9.png","element":"img","alt":" C = [C1, C2] ∈","inline":true},{"style":{"height":15.93},"width":202.89,"height":39.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/7-10.png","element":"img","alt":"Rm×(c1+c2)","inline":true},{"style":{"fontStyle":"italic"},"text":". Then, for any integer ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k > ","element":"span"},{"text":"0","element":"span"},{"style":{"fontStyle":"italic"},"text":", the following inequality holds:","element":"span"}],[{"style":{"width":"60%"},"width":1045,"height":97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/7-11.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where the expectation is taken w.r.t. ","element":"span"},{"style":{"height":15.02},"width":68.17,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/7-12.png","element":"img","alt":" C2.","inline":true}],[{"text":"We will establish in Theorem ","element":"span"},{"href":"#id-43","text":"5 ","element":"a"},{"text":"a more general and more useful error bound for this adaptive sampling algorithm. 
It can be shown that Lemma ","element":"span"},{"href":"#id-58","text":"1 ","element":"a"},{"text":"is a special case of Theorem ","element":"span"},{"href":"#id-43","text":"5","element":"a"},{"text":".","element":"span"}],[{"id":"id-55","style":{"fontWeight":"bold"},"text":"3.2 The Near-Optimal Column Selection Algorithm","element":"span"}],[{"href":"#id-10","referenceIndex":5,"text":"Boutsidis et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-10","referenceIndex":5,"text":"2011","element":"a"},{"text":") proposed a relative-error column selection algorithm which requires only ","element":"span"},{"style":{"height":19.13},"width":279.7,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/7-13.png","element":"img","alt":" c = 2kϵ−1(1+o","inline":true},{"text":"(1)) columns get selected. ","element":"span"},{"href":"#id-10","referenceIndex":5,"text":"Boutsidis et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-10","referenceIndex":5,"text":"2011","element":"a"},{"text":") also proved the lower bound of the column selection problem which shows that no column selection algorithm can achieve relative-error bound by selecting less than ","element":"span"},{"style":{"height":15.13},"width":165.06,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/7-14.png","element":"img","alt":" c = kϵ−1 ","inline":true,"padRight":true},{"text":"columns. Thus this algorithm is near optimal. Though an optimal algorithm recently proposed by ","element":"span"},{"href":"#id-11","referenceIndex":25,"text":"Guruswami and Sinop ","element":"a"},{"text":"(","element":"span"},{"href":"#id-11","referenceIndex":25,"text":"2012","element":"a"},{"text":") attains the lower bound, this algorithm is quite inefficient in comparison with the near-optimal algorithm. 
So we prefer to use the near-optimal algorithm in our CUR and Nyström algorithms for column/row sampling.","element":"span"}],[{"text":"The near-optimal algorithm consists of three steps: the approximate SVD via random projection (","element":"span"},{"href":"#id-10","referenceIndex":5,"text":"Boutsidis et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-10","referenceIndex":5,"text":"2011","element":"a"},{"text":"; ","element":"span"},{"href":"#id-53","referenceIndex":27,"text":"Halko et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-53","referenceIndex":27,"text":"2011","element":"a"},{"text":"), the dual set sparsification algorithm","element":"span"}],[{"id":"id-59","style":{"width":"99%"},"width":1728,"height":574,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/8-0.png","element":"img"}],[{"text":"(","element":"span"},{"href":"#id-10","referenceIndex":5,"text":"Boutsidis et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-10","referenceIndex":5,"text":"2011","element":"a"},{"text":"), and the adaptive sampling algorithm (","element":"span"},{"href":"#id-8","referenceIndex":11,"text":"Deshpande et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-8","referenceIndex":11,"text":"2006","element":"a"},{"text":"). 
We describe the near-optimal algorithm in Algorithm ","element":"span"},{"href":"#id-59","text":"1 ","element":"a"},{"text":"and present the theoretical analysis in Lemma ","element":"span"},{"href":"#id-60","text":"2","element":"a"},{"text":".","element":"span"}],[{"id":"id-60","style":{"fontWeight":"bold"},"text":"Lemma 2 (The Near-Optimal Column Selection Algorithm) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Given a matrix ","element":"span"},{"style":{"height":13.2},"width":83.42,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/8-1.png","element":"img","alt":" A ∈","inline":true},{"style":{"height":17.14},"width":298.97,"height":42.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/8-2.png","element":"img","alt":"Rm×n of rank ρ","inline":true},{"style":{"fontStyle":"italic"},"text":", a target rank ","element":"span"},{"style":{"height":17.6},"width":550.11,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/8-3.png","element":"img","alt":" k (2 ≤ k < ρ), and 0 < ϵ < 1","inline":true},{"style":{"fontStyle":"italic"},"text":". 
Algorithm ","element":"span"},{"href":"#id-59","style":{"fontStyle":"italic"},"text":"1 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"selects","element":"span"}],[{"style":{"width":"20%"},"width":352,"height":91,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/8-4.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"columns of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"style":{"fontStyle":"italic"},"text":"to form a matrix ","element":"span"},{"style":{"height":13.93},"width":192.4,"height":34.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/8-5.png","element":"img","alt":" C ∈ Rm×c","inline":true},{"style":{"fontStyle":"italic"},"text":", then the following inequality holds:","element":"span"}],[{"style":{"width":"44%"},"width":767,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/8-6.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where the expectation is taken w.r.t. ","element":"span"},{"style":{"fontWeight":"bold"},"text":"C","element":"span"},{"style":{"fontStyle":"italic"},"text":". ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Furthermore, the matrix ","element":"span"},{"style":{"fontWeight":"bold"},"text":"C ","element":"span"},{"style":{"fontStyle":"italic"},"text":"can be obtained in ","element":"span"},{"style":{"height":21.8},"width":1020.5,"height":54.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/8-7.png","element":"img","alt":"O(mk2ϵ−4/3 + nk3ϵ−2/3) + TMultiply(mnkϵ−2/3) time.","inline":true}],[{"text":"This algorithm has the merits of low time complexity and space complexity. None of the three steps—the randomized SVD, the dual set sparsification algorithm, and the adaptive sampling—requires loading the whole of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"into RAM. 
All of the three steps can work on only a small subset of the columns of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A","element":"span"},{"text":". Though a relative-error algorithm recently proposed by ","element":"span"},{"href":"#id-11","referenceIndex":25,"text":"Guruswami and Sinop ","element":"a"},{"text":"(","element":"span"},{"href":"#id-11","referenceIndex":25,"text":"2012","element":"a"},{"text":") requires even fewer columns, it is less efficient than the near-optimal algorithm.","element":"span"}],[{"id":"id-56","style":{"fontWeight":"bold"},"text":"3.3 Previous Work in CUR Matrix Decomposition","element":"span"}],[{"text":"We introduce in this section two highly effective CUR algorithms: one is deterministic and the other is randomized.","element":"span"}],[{"text":"3.3.1 The Sparse Column-Row Approximation (SCRA)","element":"span"}],[{"href":"#id-14","referenceIndex":41,"text":"Stewart ","element":"a"},{"text":"(","element":"span"},{"href":"#id-14","referenceIndex":41,"text":"1999","element":"a"},{"text":") proposed a deterministic CUR algorithm and called it the sparse column-row approximation (SCRA). SCRA is based on the truncated pivoted QR decomposition via a quasi Gram-Schmidt algorithm. 
Given a matrix ","element":"span"},{"style":{"height":13.94},"width":209.88,"height":34.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/8-8.png","element":"img","alt":" A ∈ Rm×n","inline":true},{"text":", the truncated pivoted QR decomposition procedure deterministically finds a set of columns ","element":"span"},{"style":{"height":17.13},"width":359.99,"height":42.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/8-9.png","element":"img","alt":" C ∈ Rm×c by col-","inline":true,"padRight":true},{"text":"umn pivoting, whose span approximates the column space of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A","element":"span"},{"text":", and computes an upper triangular matrix ","element":"span"},{"style":{"height":15.87},"width":222.39,"height":39.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/9-0.png","element":"img","alt":" TC ∈ Rc×c ","inline":true,"padRight":true},{"text":"that orthogonalizes those columns. SCRA runs the same procedure again on ","element":"span"},{"style":{"height":15.13},"width":61.94,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/9-1.png","element":"img","alt":" AT ","inline":true,"padRight":true},{"text":"to select a set of rows ","element":"span"},{"style":{"height":13.93},"width":198.23,"height":34.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/9-2.png","element":"img","alt":" R ∈ Rr×n ","inline":true,"padRight":true},{"text":"and computes the corresponding upper triangular matrix ","element":"span"},{"style":{"height":18.73},"width":1158.74,"height":46.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/9-3.png","element":"img","alt":" TR ∈ Rr×r. Let C = QCTC and RT = QRTR denote","inline":true,"padRight":true},{"text":"the resulting truncated pivoted QR decomposition. 
The intersection matrix is computed by ","element":"span"},{"style":{"height":20.34},"width":691.54,"height":50.85,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/9-4.png","element":"img","alt":" U = (TTCTC)−1CT ART (TTRTR)−1","inline":true},{"text":". According to our experiments, this algorithm is ","element":"span"},{"text":"quite effective but very time expensive, especially when ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r ","element":"span"},{"text":"are large. Moreover, this algorithm does not have data-independent error bound.","element":"span"}],[{"id":"id-62","text":"3.3.2 The Subspace Sampling CUR Algorithm","element":"span"}],[{"href":"#id-4","referenceIndex":15,"text":"Drineas et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-4","referenceIndex":15,"text":"2008","element":"a"},{"text":") proposed a two-stage randomized CUR algorithm which has a relative-error bound with high probability (w.h.p.). 
","element":"span"},{"text":"In the first stage the algorithm samples ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"text":"columns of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"to construct ","element":"span"},{"style":{"fontWeight":"bold"},"text":"C","element":"span"},{"text":", and in the second stage it samples ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r ","element":"span"},{"text":"rows from ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontWeight":"bold"},"text":"C ","element":"span"},{"text":"simultaneously to construct ","element":"span"},{"style":{"height":15.53},"width":552.92,"height":38.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/9-5.png","element":"img","alt":" R and W and let U = W†","inline":true},{"text":". The sampling probabilities in the two stages are proportional to the leverage scores of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontWeight":"bold"},"text":"C","element":"span"},{"text":", respectively. 
That is, in the first stage the sampling probabilities are proportional to the squared ","element":"span"},{"style":{"height":15.02},"width":35.18,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/9-6.png","element":"img","alt":" ℓ2","inline":true},{"text":"-norm of the rows of ","element":"span"},{"style":{"height":17.24},"width":94.74,"height":43.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/9-7.png","element":"img","alt":" VA,k","inline":true},{"text":"; in the second stage the sampling probabilities are proportional to the squared ","element":"span"},{"style":{"height":15.02},"width":35.18,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/9-8.png","element":"img","alt":"ℓ2","inline":true},{"text":"-norm of the rows of ","element":"span"},{"style":{"height":14.74},"width":66.6,"height":36.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/9-9.png","element":"img","alt":" UC","inline":true},{"text":". That is why it is called the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"subspace sampling algorithm","element":"span"},{"text":". 
Here we show the main results of the subspace sampling algorithm in the following lemma.","element":"span"}],[{"style":{"height":17.6},"width":1458.25,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/9-10.png","element":"img","alt":"Lemma 3 (Subspace Sampling for CUR) Given an m × n matrix A","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and a target rank ","element":"span"},{"style":{"height":17.6},"width":306.34,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/9-11.png","element":"img","alt":" k ≪ min{m, n}","inline":true},{"style":{"fontStyle":"italic"},"text":", the subspace sampling algorithm selects ","element":"span"},{"style":{"height":19.13},"width":505.25,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/9-12.png","element":"img","alt":" c = O(kϵ−2 log k log(1/δ))","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"columns ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r ","element":"span"},{"text":"= ","element":"span"},{"style":{"height":20.8},"width":409.87,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/9-13.png","element":"img","alt":"O(cϵ−2 log c log(1/δ))","inline":true},{"style":{"fontStyle":"italic"},"text":"rows without replacement. 
Then","element":"span"}],[{"style":{"width":"63%"},"width":1105,"height":58,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/9-14.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"holds with probability at least ","element":"span"},{"text":"1 ","element":"span"},{"style":{"height":15.6},"width":279.22,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/9-15.png","element":"img","alt":" − δ, where W","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"contains the rows of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"C ","element":"span"},{"style":{"fontStyle":"italic"},"text":"with scaling. The running time is dominated by the truncated SVD of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A","element":"span"},{"style":{"fontStyle":"italic"},"text":", that is, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"O","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"mnk","element":"span"},{"text":")","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"}],[{"id":"id-57","style":{"fontWeight":"bold"},"text":"3.4 Previous Work in the Nyström Approximation","element":"span"}],[{"text":"In a very recent work, ","element":"span"},{"href":"#id-42","referenceIndex":21,"text":"Gittens and Mahoney ","element":"a"},{"text":"(","element":"span"},{"href":"#id-42","referenceIndex":21,"text":"2013","element":"a"},{"text":") established a framework for analyzing errors incurred by the standard Nyström method. Especially, the authors provided the first and the only known relative-error (in nuclear norm) algorithm for the standard Nyström method. 
The algorithm is described as follows, and its bound is shown in Lemma ","element":"span"},{"href":"#id-61","text":"4","element":"a"},{"text":".","element":"span"}],[{"text":"Like the CUR algorithm in Section ","element":"span"},{"href":"#id-62","text":"3.3.2","element":"a"},{"text":", the Nyström algorithm also samples columns by the subspace sampling of ","element":"span"},{"href":"#id-4","referenceIndex":15,"text":"Drineas et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-4","referenceIndex":15,"text":"2008","element":"a"},{"text":"). Each column is selected with probability ","element":"span"},{"style":{"height":26.41},"width":192.88,"height":66.03,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/9-16.png","element":"img","alt":"pj = 1kℓ[k]j","inline":true,"padRight":true},{"text":"with replacement, where ","element":"span"},{"style":{"height":23.8},"width":216.8,"height":59.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/9-17.png","element":"img","alt":" ℓ[k]1 , · · · , ℓ[k]m ","inline":true,"padRight":true},{"text":"are leverage scores defined in (","element":"span"},{"href":"#id-63","text":"3","element":"a"},{"text":"). 
After ","element":"span"},{"text":"column sampling, ","element":"span"},{"style":{"fontWeight":"bold"},"text":"C ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontWeight":"bold"},"text":"W ","element":"span"},{"text":"are obtained by scaling the selected columns, that is,","element":"span"}],[{"style":{"width":"46%"},"width":801,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/9-18.png","element":"img"}],[{"text":"Here ","element":"span"},{"style":{"height":13.93},"width":184.04,"height":34.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/9-19.png","element":"img","alt":" S ∈ Rm×c ","inline":true,"padRight":true},{"text":"is a column selection matrix that ","element":"span"},{"style":{"height":17.42},"width":271.25,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/9-20.png","element":"img","alt":" sij = 1 if the i","inline":true},{"text":"-th column of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"is the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j","element":"span"},{"text":"-th ","element":"span"},{"id":"id-61","text":"column selected, and ","element":"span"},{"style":{"height":13.93},"width":179.35,"height":34.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/9-21.png","element":"img","alt":" D ∈ Rc×c ","inline":true,"padRight":true},{"text":"is a diagonal scaling matrix satisfying ","element":"span"},{"style":{"height":25.74},"width":391.07,"height":64.35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/9-22.png","element":"img","alt":" djj = 1√cpi if sij = 1.","inline":true}],[{"id":"id-120","style":{"height":17.6},"width":1728.68,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/10-0.png","element":"img","alt":"Lemma 4 (Subspace Sampling for the Nyström Approximation) Given an 
m×m","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"SPSD matrix ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and a target rank ","element":"span"},{"style":{"height":13.6},"width":130.97,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/10-1.png","element":"img","alt":" k ≪ m","inline":true},{"style":{"fontStyle":"italic"},"text":", the subspace sampling algorithm selects","element":"span"}],[{"style":{"width":"26%"},"width":455,"height":51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/10-2.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"columns without replacement and constructs ","element":"span"},{"style":{"fontWeight":"bold"},"text":"C ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"fontWeight":"bold"},"text":"W ","element":"span"},{"style":{"fontStyle":"italic"},"text":"by scaling the selected columns. Then the inequality","element":"span"}],[{"style":{"width":"71%"},"width":1239,"height":116,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/10-3.png","element":"img"}]]},{"heading":"4. Main Results","paragraphs":[[{"text":"We now present our main results. We establish a new error bound for the adaptive sampling algorithm in Section ","element":"span"},{"href":"#id-64","text":"4.1","element":"a"},{"text":". We apply adaptive sampling to the CUR and modified Nyström problems, obtaining effective and efficient CUR and Nyström algorithms in Section ","element":"span"},{"href":"#id-65","text":"4.2 ","element":"a"},{"text":"and Section ","element":"span"},{"href":"#id-36","text":"4.3 ","element":"a"},{"text":"respectively. In Section ","element":"span"},{"href":"#id-37","text":"4.4 ","element":"a"},{"text":"we study lower bounds of the conventional Nyström methods to demonstrate the advantages of our approach. 
Finally, in Section ","element":"span"},{"href":"#id-66","text":"4.5 ","element":"a"},{"text":"we show that our expected bounds can extend to with high probability (w.h.p.) bounds.","element":"span"}],[{"id":"id-64","style":{"fontWeight":"bold"},"text":"4.1 Adaptive Sampling","element":"span"}],[{"text":"The relative-error adaptive sampling algorithm is originally established in Theorem 2.1 of ","element":"span"},{"href":"#id-8","referenceIndex":11,"text":"Deshpande et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-8","referenceIndex":11,"text":"2006","element":"a"},{"text":") (see also Lemma ","element":"span"},{"href":"#id-58","text":"1 ","element":"a"},{"text":"in Section ","element":"span"},{"href":"#id-54","text":"3.1","element":"a"},{"text":"). The algorithm is based on the following idea: after selecting a proportion of columns from ","element":"span"},{"style":{"height":15.02},"width":263.48,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/10-4.png","element":"img","alt":" A to form C1","inline":true,"padRight":true},{"text":"by an arbitrary algorithm, the algorithm randomly samples additional ","element":"span"},{"style":{"height":10.62},"width":35.88,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/10-5.png","element":"img","alt":" c2","inline":true,"padRight":true},{"text":"columns according to the residual ","element":"span"},{"style":{"height":22.29},"width":243.16,"height":55.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/10-6.png","element":"img","alt":"A − C1C†1A","inline":true},{"text":". 
Here we prove a new and more general error bound for the same adaptive ","element":"span"},{"text":"sampling algorithm.","element":"span"}],[{"id":"id-43","style":{"fontWeight":"bold"},"text":"Theorem 5 (The Adaptive Sampling Algorithm) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Given a matrix ","element":"span"},{"style":{"height":13.93},"width":337.84,"height":34.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/10-7.png","element":"img","alt":" A ∈ Rm×n and a","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"matrix ","element":"span"},{"style":{"height":19.53},"width":1585.8,"height":48.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/10-8.png","element":"img","alt":" C ∈ Rm×c such that rank(C) = rank(CC†A) = ρ (ρ ≤ c ≤ n). We let R1 ∈ Rr1×n","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"consist of ","element":"span"},{"style":{"height":16.4},"width":264.24,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/10-9.png","element":"img","alt":" r1 rows of A","inline":true},{"style":{"fontStyle":"italic"},"text":", and define the residual ","element":"span"},{"style":{"height":22.29},"width":382.7,"height":55.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/10-10.png","element":"img","alt":" B = A − AR†1R1.","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"Additionally, for","element":"span"}],[{"style":{"width":"60%"},"width":1054,"height":88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/10-11.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"We further sample ","element":"span"},{"style":{"height":10.62},"width":36.69,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/10-12.png","element":"img","alt":" r2","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"rows i.i.d. 
from ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A","element":"span"},{"style":{"fontStyle":"italic"},"text":", in each trial of which the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"style":{"fontStyle":"italic"},"text":"-th row is chosen with probability ","element":"span"},{"style":{"height":16.73},"width":401.8,"height":41.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/10-13.png","element":"img","alt":" pi. Let R2 ∈ Rr2×n ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"contain the ","element":"span"},{"style":{"height":10.62},"width":36.69,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/10-14.png","element":"img","alt":" r2","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"sampled rows and let ","element":"span"},{"style":{"height":19.81},"width":361.41,"height":49.53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/10-15.png","element":"img","alt":" R = [RT1 , RT2 ]T ∈","inline":true},{"style":{"height":15.93},"width":195.33,"height":39.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/10-16.png","element":"img","alt":"R(r1+r2)×n","inline":true},{"style":{"fontStyle":"italic"},"text":". Then we have","element":"span"}],[{"id":"id-70","style":{"width":"70%"},"width":1211,"height":86,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/10-17.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where the expectation is taken w.r.t. 
","element":"span"},{"style":{"height":14.62},"width":69.57,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/10-18.png","element":"img","alt":" R2.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Remark 6 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"This theorem shows a more general bound for adaptive sampling than the original one in Theorem 2.1 of ","element":"span"},{"href":"#id-8","referenceIndex":11,"style":{"fontStyle":"italic"},"text":"Deshpande et al. ","element":"a"},{"style":{"fontStyle":"italic"},"text":"(","element":"span"},{"href":"#id-8","referenceIndex":11,"style":{"fontStyle":"italic"},"text":"2006","element":"a"},{"style":{"fontStyle":"italic"},"text":"). The original one bounds the error incurred by projection onto the column space of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"C","element":"span"},{"style":{"fontStyle":"italic"},"text":", while Theorem ","element":"span"},{"href":"#id-43","style":{"fontStyle":"italic"},"text":"5 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"bounds the error incurred by projection onto the column space of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"C ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and row space of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"R ","element":"span"},{"style":{"fontStyle":"italic"},"text":"simultaneously—such situation rises in problems such as CUR and the Nystr¨om approximation. It is worth pointing out that Theorem 2.1 of ","element":"span"},{"href":"#id-8","referenceIndex":11,"style":{"fontStyle":"italic"},"text":"Deshpande et al. 
","element":"a"},{"style":{"fontStyle":"italic"},"text":"(","element":"span"},{"href":"#id-8","referenceIndex":11,"style":{"fontStyle":"italic"},"text":"2006","element":"a"},{"style":{"fontStyle":"italic"},"text":") is a direct corollary of this theorem when ","element":"span"},{"style":{"height":19.53},"width":901.25,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/10-19.png","element":"img","alt":" C = Ak (i.e., c = n, ρ = k, and CC†A = Ak).","inline":true}],[{"id":"id-117","text":"As discussed in Section ","element":"span"},{"href":"#id-67","text":"1.2","element":"a"},{"text":", selecting good columns or rows separately does not ensure good columns and rows together for CUR and the Nyström approximation. Theorem ","element":"span"},{"href":"#id-43","text":"5 ","element":"a"},{"text":"is thereby important for it guarantees the combined effect of column and row selection. Guaranteed by Theorem ","element":"span"},{"href":"#id-43","text":"5","element":"a"},{"text":", any column selection algorithm with relative-error bound can be applied to CUR and the Nyström approximation. 
We show the result in the following corollary.","element":"span"}],[{"id":"id-44","style":{"height":17.6},"width":1806.64,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/11-0.png","element":"img","alt":"Corollary 7 (Adaptive Sampling for CUR and the Nystr¨om Approximation) Given","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"a matrix ","element":"span"},{"style":{"height":13.93},"width":200.1,"height":34.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/11-1.png","element":"img","alt":" A ∈ Rm×n","inline":true},{"style":{"fontStyle":"italic"},"text":", a target rank ","element":"span"},{"style":{"height":17.6},"width":211.22,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/11-2.png","element":"img","alt":" k (≪ m, n)","inline":true},{"style":{"fontStyle":"italic"},"text":", and a column selection algorithm ","element":"span"},{"style":{"height":15.64},"width":199.3,"height":39.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/11-3.png","element":"img","alt":" Acol which","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"achieves relative-error upper bound by selecting ","element":"span"},{"style":{"height":17.6},"width":221.83,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/11-4.png","element":"img","alt":" c ≥ C(k, ϵ)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"columns. 
Then we have the following results for CUR and the Nystr¨om approximation.","element":"span"}],[{"text":"(1) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"By selecting ","element":"span"},{"style":{"height":17.6},"width":214.57,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/11-5.png","element":"img","alt":" c ≥ C(k, ϵ)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"columns of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"style":{"fontStyle":"italic"},"text":"to construct ","element":"span"},{"style":{"height":15.02},"width":264.87,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/11-6.png","element":"img","alt":" C and r1 = c","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"rows to construct ","element":"span"},{"style":{"height":14.62},"width":54.64,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/11-7.png","element":"img","alt":"R1","inline":true},{"style":{"fontStyle":"italic"},"text":", both using algorithm ","element":"span"},{"style":{"height":15.64},"width":77.34,"height":39.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/11-8.png","element":"img","alt":" Acol","inline":true},{"style":{"fontStyle":"italic"},"text":", followed by selecting additional ","element":"span"},{"style":{"height":17.6},"width":155.51,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/11-9.png","element":"img","alt":" r2 = c/ϵ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"rows using the adaptive sampling algorithm to construct ","element":"span"},{"style":{"height":14.62},"width":54.64,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/11-10.png","element":"img","alt":" R2","inline":true},{"style":{"fontStyle":"italic"},"text":", the CUR matrix decomposition achieves relative-error 
upper bound in expectation:","element":"span"}],[{"style":{"width":"68%"},"width":1182,"height":153,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/11-11.png","element":"img"}],[{"text":"(2) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Suppose ","element":"span"},{"style":{"height":12.8},"width":284.79,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/11-12.png","element":"img","alt":" A is an m × m","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"symmetric matrix. By selecting ","element":"span"},{"style":{"height":17.6},"width":225.47,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/11-13.png","element":"img","alt":" c1 ≥ C(k, ϵ)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"columns of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"style":{"fontStyle":"italic"},"text":"to construct ","element":"span"},{"style":{"height":16.8},"width":265.16,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/11-14.png","element":"img","alt":" C1 using Acol","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and selecting ","element":"span"},{"style":{"height":17.6},"width":177.57,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/11-15.png","element":"img","alt":" c2 = c1/ϵ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"columns of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"style":{"fontStyle":"italic"},"text":"to construct ","element":"span"},{"style":{"height":16.4},"width":173.07,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/11-16.png","element":"img","alt":" C2 using","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"the adaptive sampling algorithm, the modified Nystr¨om method achieves relative-error 
upper bound in expectation:","element":"span"}],[{"style":{"width":"69%"},"width":1195,"height":154,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/11-17.png","element":"img"}],[{"text":"Based on Corollary ","element":"span"},{"href":"#id-44","text":"7","element":"a"},{"text":", we attempt to solve CUR and the Nyström approximation by adaptive sampling algorithms. We present concrete algorithms in Sections ","element":"span"},{"href":"#id-65","text":"4.2 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-36","text":"4.3","element":"a"},{"text":".","element":"span"}],[{"id":"id-65","style":{"fontWeight":"bold"},"text":"4.2 Adaptive Sampling for CUR Matrix Decomposition","element":"span"}],[{"text":"Guaranteed by the novel adaptive sampling bound in Theorem ","element":"span"},{"href":"#id-43","text":"5","element":"a"},{"text":", we combine the near-optimal column selection algorithm of ","element":"span"},{"href":"#id-10","referenceIndex":5,"text":"Boutsidis et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-10","referenceIndex":5,"text":"2011","element":"a"},{"text":") and the adaptive sampling algorithm for solving the CUR problem, giving rise to an algorithm with a much tighter theoretical bound than existing algorithms. The algorithm is described in Algorithm ","element":"span"},{"href":"#id-45","text":"2 ","element":"a"},{"text":"and its analysis is given in Theorem ","element":"span"},{"href":"#id-46","text":"8","element":"a"},{"text":". 
Theorem ","element":"span"},{"href":"#id-46","text":"8 ","element":"a"},{"text":"follows immediately from Lemma ","element":"span"},{"href":"#id-60","text":"2 ","element":"a"},{"text":"and Corollary ","element":"span"},{"href":"#id-44","text":"7","element":"a"},{"text":".","element":"span"}],[{"id":"id-46","style":{"fontWeight":"bold"},"text":"Theorem 8 (Adaptive Sampling for CUR) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Given a matrix ","element":"span"},{"style":{"height":13.93},"width":200.1,"height":34.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/11-18.png","element":"img","alt":" A ∈ Rm×n ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and a positive integer ","element":"span"},{"style":{"height":17.6},"width":316.09,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/11-19.png","element":"img","alt":" k ≪ min{m, n}","inline":true},{"style":{"fontStyle":"italic"},"text":", the CUR algorithm described in Algorithm ","element":"span"},{"href":"#id-45","style":{"fontStyle":"italic"},"text":"2 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"randomly selects ","element":"span"},{"style":{"height":21.69},"width":289.69,"height":54.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/11-20.png","element":"img","alt":"c = 2kϵ (1+o(1))","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"columns of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"style":{"fontStyle":"italic"},"text":"to construct ","element":"span"},{"style":{"height":13.93},"width":193.4,"height":34.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/11-21.png","element":"img","alt":" C ∈ Rm×c","inline":true},{"style":{"fontStyle":"italic"},"text":", and then selects 
","element":"span"},{"style":{"height":19.22},"width":373.1,"height":48.05,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/11-22.png","element":"img","alt":" r = cϵ(1+ϵ) rows of","inline":true,"padRight":true},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"style":{"fontStyle":"italic"},"text":"to construct ","element":"span"},{"style":{"height":13.93},"width":186.06,"height":34.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/11-23.png","element":"img","alt":" R ∈ Rr×n","inline":true},{"style":{"fontStyle":"italic"},"text":". Then we have","element":"span"}],[{"style":{"width":"73%"},"width":1273,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/11-24.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"The algorithm costs time ","element":"span"},{"style":{"height":21.8},"width":1226.03,"height":54.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/11-25.png","element":"img","alt":" O�(m + n)k3ϵ−2/3 + mk2ϵ−2 + nk2ϵ−4�+ TMultiply�mnkϵ−1�to","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"compute matrices ","element":"span"},{"style":{"fontWeight":"bold"},"text":"C","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"style":{"fontWeight":"bold"},"text":"U ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"fontWeight":"bold"},"text":"R","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"}],[{"id":"id-45","style":{"width":"99%"},"width":1728,"height":375,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/12-0.png","element":"img"}],[{"text":"When the algorithm is executed in a single-core processor, the time complexity of the CUR algorithm is linear in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"mn","element":"span"},{"text":"; when executed in multi-processor environment where matrix 
multiplication is performed in parallel, ideally the algorithm costs time only linear in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m","element":"span"},{"text":"+","element":"span"},{"style":{"fontStyle":"italic"},"text":"n","element":"span"},{"text":". Another advantage of this algorithm is that it avoids loading the whole ","element":"span"},{"style":{"height":9.2},"width":98.25,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/12-1.png","element":"img","alt":" m×n","inline":true,"padRight":true},{"text":"data matrix ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"into RAM. Neither the near-optimal column selection algorithm nor the adaptive sampling algorithm requires loading the whole of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"into RAM. The most space-expensive operation throughout this algorithm is computation of the Moore-Penrose inverses of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"C ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontWeight":"bold"},"text":"R","element":"span"},{"text":", which requires maintaining an ","element":"span"},{"style":{"height":9.2},"width":91.25,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/12-2.png","element":"img","alt":" m×c","inline":true,"padRight":true},{"text":"matrix or an ","element":"span"},{"style":{"height":9.2},"width":80.84,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/12-3.png","element":"img","alt":" r×n","inline":true,"padRight":true},{"text":"matrix in RAM. 
To compute the intersection matrix ","element":"span"},{"style":{"height":15.53},"width":143.87,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/12-4.png","element":"img","alt":" C†AR†","inline":true},{"text":", the algorithm needs to visit each entry of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A","element":"span"},{"text":", but it is not RAM expensive because the multiplication can be done by computing ","element":"span"},{"style":{"height":20.15},"width":402.57,"height":50.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/12-5.png","element":"img","alt":" C†aj for j = 1, · · · , n","inline":true,"padRight":true},{"text":"separately. The above analysis is also valid for the Nystr¨om algorithm in Theorem ","element":"span"},{"href":"#id-48","text":"10","element":"a"},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Remark 9 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"If we replace the near-optimal column selection algorithm in Theorem ","element":"span"},{"href":"#id-46","style":{"fontStyle":"italic"},"text":"8 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"by the optimal algorithm of ","element":"span"},{"href":"#id-11","referenceIndex":25,"style":{"fontStyle":"italic"},"text":"Guruswami and Sinop ","element":"a"},{"style":{"fontStyle":"italic"},"text":"(","element":"span"},{"href":"#id-11","referenceIndex":25,"style":{"fontStyle":"italic"},"text":"2012","element":"a"},{"style":{"fontStyle":"italic"},"text":"), it suffices to select ","element":"span"},{"style":{"height":19.13},"width":354.97,"height":47.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/12-6.png","element":"img","alt":" c = kϵ−1(1 + o(1))","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"columns and 
","element":"span"},{"style":{"height":19.13},"width":295.34,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/12-7.png","element":"img","alt":" r = cϵ−1(1 + ϵ)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"rows totally. But the optimal algorithm is less efficient than the near-optimal algorithm.","element":"span"}],[{"id":"id-36","style":{"fontWeight":"bold"},"text":"4.3 Adaptive Sampling for the Nystr¨om Approximation","element":"span"}],[{"text":"Theorem ","element":"span"},{"href":"#id-43","text":"5 ","element":"a"},{"text":"provides an approach for bounding the approximation errors incurred by projection simultaneously onto column space and row space. Thus this approach can be applied to solve the modified Nystr¨om method. The following theorem follows directly from Lemma ","element":"span"},{"href":"#id-60","text":"2 ","element":"a"},{"text":"and Corollary ","element":"span"},{"href":"#id-44","text":"7","element":"a"},{"text":".","element":"span"}],[{"id":"id-48","style":{"height":17.6},"width":1513.5,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/12-8.png","element":"img","alt":"Theorem 10 (Adaptive Sampling for the Modified Nystr¨om Method)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"Given a symmetric matrix ","element":"span"},{"style":{"height":13.93},"width":210.43,"height":34.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/12-9.png","element":"img","alt":" A ∈ Rm×m ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and a target rank ","element":"span"},{"style":{"height":21.69},"width":483.61,"height":54.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/12-10.png","element":"img","alt":" k, with c1 = 2kϵ�1 + o(1)�","inline":true},{"style":{"fontStyle":"italic"},"text":"columns sampled by Algorithm 
","element":"span"},{"href":"#id-59","style":{"fontStyle":"italic"},"text":"1 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"height":17.6},"width":185.09,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/12-11.png","element":"img","alt":" c2 = c1/ϵ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"columns sampled by the adaptive sampling algorithm, that is, with totally ","element":"span"},{"style":{"height":22.15},"width":322.88,"height":55.37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/12-12.png","element":"img","alt":" c = 2kϵ2�1 + o(1)�","inline":true},{"style":{"fontStyle":"italic"},"text":"columns being sampled, the approximation error incurred by the modified Nystr¨om method is upper bounded by","element":"span"}],[{"style":{"width":"81%"},"width":1411,"height":83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/12-13.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"The algorithm costs time ","element":"span"},{"style":{"height":21.79},"width":841.58,"height":54.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/12-14.png","element":"img","alt":" O�mk2ϵ−4 + mk3ϵ−2/3�+ TMultiply�m2kϵ−2�","inline":true},{"style":{"fontStyle":"italic"},"text":"in computing ","element":"span"},{"style":{"fontWeight":"bold"},"text":"C ","element":"span"},{"style":{"fontStyle":"italic"},"text":"and ","element":"span"},{"style":{"fontWeight":"bold"},"text":"U","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Remark 11 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"The error bound in Theorem ","element":"span"},{"href":"#id-48","style":{"fontStyle":"italic"},"text":"10 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"is the only Frobenius norm relative-error bound for the Nystr¨om approximation at present, and it is also 
a constant-factor bound. If","element":"span"}],[{"id":"id-125","style":{"width":"96%"},"width":1666,"height":487,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/13-0.png","element":"img"}],[{"id":"id-50","text":"Table 3: Lower bounds of the standard Nystr¨om method and the ensemble Nystr¨om ","element":"figcaption","subtype":"caption"},{"text":"method. ","element":"figcaption","subtype":"caption"},{"text":"The blanks indicate the lower bounds are unknown to us. ","element":"figcaption","subtype":"caption"},{"text":"Here ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"m ","element":"figcaption","subtype":"caption"},{"text":"denotes the column/row number of the SPSD matrix, ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"c ","element":"figcaption","subtype":"caption"},{"text":"denotes the number of selected columns, and ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"k ","element":"figcaption","subtype":"caption"},{"text":"denotes the target rank.","element":"figcaption","subtype":"caption"}],[{"style":{"fontStyle":"italic"},"text":"one uses the optimal column selection algorithm of ","element":"span"},{"href":"#id-11","referenceIndex":25,"style":{"fontStyle":"italic"},"text":"Guruswami and Sinop ","element":"a"},{"style":{"fontStyle":"italic"},"text":"(","element":"span"},{"href":"#id-11","referenceIndex":25,"style":{"fontStyle":"italic"},"text":"2012","element":"a"},{"style":{"fontStyle":"italic"},"text":"), which is less efficient, the error bound is further improved: only ","element":"span"},{"style":{"height":22.15},"width":323.01,"height":55.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/13-1.png","element":"img","alt":" c = kϵ2 (1 + o(1))","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"columns are ","element":"span"},{"style":{"fontStyle":"italic"},"text":"required. 
Furthermore, the theorem requires the matrix ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"style":{"fontStyle":"italic"},"text":"to be symmetric, which is milder than the SPSD requirement made in the previous work.","element":"span"}],[{"text":"This is yet the strongest result for the Nyström approximation problem—much stronger than the best possible algorithms for the conventional Nyström method. We will illustrate this point by revealing the lower error bounds of the conventional Nyström methods.","element":"span"}],[{"id":"id-37","style":{"fontWeight":"bold"},"text":"4.4 Lower Error Bounds of the Conventional Nyström Methods","element":"span"}],[{"text":"We now demonstrate to what extent our modified Nyström method is superior to the conventional Nyström methods (namely the standard Nyström defined in (","element":"span"},{"href":"#id-68","text":"1","element":"a"},{"text":") and the ensemble Nyström in (","element":"span"},{"href":"#id-69","text":"2","element":"a"},{"text":")) by showing the lower error bounds of the conventional Nyström methods. The conventional Nyström methods work no better than the lower error bounds unless additional assumptions are made on the original matrix ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A","element":"span"},{"text":". We show in Theorem ","element":"span"},{"href":"#id-49","text":"12 ","element":"a"},{"text":"the lower error bounds of the conventional Nyström methods; the results are briefly summarized previously in Table ","element":"span"},{"href":"#id-51","text":"2","element":"a"},{"text":".","element":"span"}],[{"text":"To derive lower error bounds, we construct two adversarial cases for the Nyström methods. 
To derive the spectral norm lower bounds, we use an SPSD matrix ","element":"span"},{"style":{"fontWeight":"bold"},"text":"B ","element":"span"},{"text":"whose diagonal entries equal to 1 and off-diagonal entries equal to ","element":"span"},{"style":{"height":17.6},"width":129.09,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/13-2.png","element":"img","alt":" α ∈ [0,","inline":true,"padRight":true},{"text":"1). For the Frobenius norm and nuclear norm bounds, we construct an ","element":"span"},{"style":{"height":9.2},"width":119.81,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/13-3.png","element":"img","alt":" m×m","inline":true,"padRight":true},{"text":"block diagonal matrix ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"which has ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"diagonal blocks, each of which is ","element":"span"},{"style":{"height":18.49},"width":116.88,"height":46.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/13-4.png","element":"img","alt":"mk × mk ","inline":true,"padRight":true},{"text":"in size and constructed in the same way as ","element":"span"},{"style":{"fontWeight":"bold"},"text":"B","element":"span"},{"text":". 
For the lower ","element":"span"},{"text":"bounds on ","element":"span"},{"style":{"height":31.37},"width":230.49,"height":78.42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/13-5.png","element":"img","alt":"∥A− ˜A∥ξmaxi,j |aij|, α","inline":true,"padRight":true},{"text":"is set to be constant; for the bounds on ","element":"span"},{"style":{"height":31.68},"width":217.02,"height":79.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/13-6.png","element":"img","alt":"∥A− ˜A∥ξ∥A−Ak∥ξ , α","inline":true,"padRight":true},{"text":"is set to be ","element":"span"},{"style":{"height":9.6},"width":84.19,"height":24,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/13-7.png","element":"img","alt":"α →","inline":true,"padRight":true},{"text":"1. The detailed proof of Theorem ","element":"span"},{"href":"#id-49","text":"12 ","element":"a"},{"text":"is deferred to Appendix ","element":"span"},{"text":"C","element":"span"},{"text":".","element":"span"}],[{"id":"id-49","style":{"height":17.6},"width":1332.36,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/13-8.png","element":"img","alt":"Theorem 12 (Lower Error Bounds of the Nystr¨om Methods)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"Assume we are given an SPSD matrix ","element":"span"},{"style":{"height":13.93},"width":209.1,"height":34.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/13-9.png","element":"img","alt":" A ∈ Rm×m ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"and a target rank ","element":"span"},{"style":{"height":15.24},"width":186.14,"height":38.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/13-10.png","element":"img","alt":" k. 
Let Ak","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"denote the best rank-","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"style":{"fontStyle":"italic"},"text":"approximation to ","element":"span"},{"style":{"height":16.46},"width":185.08,"height":41.15,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/13-11.png","element":"img","alt":" A. Let ˜A","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"denote either the rank-","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"style":{"fontStyle":"italic"},"text":"approximation to ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"style":{"fontStyle":"italic"},"text":"constructed by the standard","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"Nystr¨om method in (","element":"span"},{"href":"#id-68","style":{"fontStyle":"italic"},"text":"1","element":"a"},{"style":{"fontStyle":"italic"},"text":"), or the approximation constructed by the ensemble Nystr¨om method in (","element":"span"},{"href":"#id-69","style":{"fontStyle":"italic"},"text":"2","element":"a"},{"style":{"fontStyle":"italic"},"text":") with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"style":{"fontStyle":"italic"},"text":"non-overlapping samples, each of which contains ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"style":{"fontStyle":"italic"},"text":"columns of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A","element":"span"},{"style":{"fontStyle":"italic"},"text":". 
Then there exists an SPSD matrix such that for any sampling strategy the approximation errors of the conventional Nystr¨om methods, that is, ","element":"span"},{"style":{"height":21.3},"width":579.69,"height":53.26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/14-0.png","element":"img","alt":" ∥A − ˜A∥ξ, (ξ = 2, F, or “∗”)","inline":true},{"style":{"fontStyle":"italic"},"text":", are lower bounded by some factors which are shown in Table ","element":"span"},{"href":"#id-50","style":{"fontStyle":"italic"},"text":"3","element":"a"},{"style":{"fontStyle":"italic"},"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Remark 13 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"The lower bounds in Table ","element":"span"},{"href":"#id-50","style":{"fontStyle":"italic"},"text":"3 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"(or Table ","element":"span"},{"href":"#id-51","style":{"fontStyle":"italic"},"text":"2","element":"a"},{"style":{"fontStyle":"italic"},"text":") show the conventional Nystr¨om methods can be sometimes very ineffective. The spectral norm and Frobenius norm bounds even depend on ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m","element":"span"},{"style":{"fontStyle":"italic"},"text":", so such bounds are not constant-factor bounds. 
Notice that the lower error bounds do not meet if ","element":"span"},{"style":{"height":15.53},"width":67.57,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/14-1.png","element":"img","alt":" W† ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is replaced by ","element":"span"},{"style":{"height":19.53},"width":202.46,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/14-2.png","element":"img","alt":" C†A(C†)T ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":", so our modified Nystr¨om method is not limited by such lower bounds.","element":"span"}],[{"id":"id-66","style":{"fontWeight":"bold"},"text":"4.5 Discussions of the Expected Relative-Error Bounds","element":"span"}],[{"text":"The upper error bounds established in this paper all hold in expectation. Now we show that the expected error bounds immediately extend to w.h.p. bounds using Markov’s inequality. Let the random variable ","element":"span"},{"style":{"height":20.46},"width":539.88,"height":51.15,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/14-3.png","element":"img","alt":" X = ∥A − ˜A∥F /∥A − Ak∥F","inline":true,"padRight":true},{"text":"denote the error ratio, where","element":"span"}],[{"style":{"width":"25%"},"width":446,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/14-4.png","element":"img"}],[{"text":"Then we have ","element":"span"},{"style":{"height":17.6},"width":259.75,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/14-5.png","element":"img","alt":" E(X) ≤ 1 + ϵ","inline":true,"padRight":true},{"text":"by the preceding theorems. 
By applying Markov’s inequality we have that","element":"span"}],[{"style":{"width":"40%"},"width":697,"height":84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/14-6.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"s ","element":"span"},{"text":"is an arbitrary constant greater than 1. Repeating the sampling procedure for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"times and letting ","element":"span"},{"style":{"height":18.75},"width":73.86,"height":46.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/14-7.png","element":"img","alt":" X(i)","inline":true,"padRight":true},{"text":"correspond to the error ratio of the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":"-th sample, we obtain an upper bound on the failure probability:","element":"span"}],[{"style":{"width":"92%"},"width":1607,"height":93,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/14-8.png","element":"img"}],[{"text":"which decays exponentially with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":". Therefore, by repeating the sampling procedure multiple times and choosing the best sample, our CUR and Nystr¨om algorithms are also guaranteed with w.h.p. relative-error bounds. 
It follows directly from (","element":"span"},{"href":"#id-70","text":"4","element":"a"},{"text":") that, by repeating the sampling procedure for","element":"span"}],[{"style":{"width":"99%"},"width":1727,"height":546,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/14-9.png","element":"img"}],[{"text":"holds with probability at least 1 ","element":"span"},{"style":{"height":12.8},"width":76.68,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/14-10.png","element":"img","alt":" − δ.","inline":true}],[{"id":"id-122","text":"For another instance, we let ","element":"span"},{"style":{"height":16.4},"width":1100.74,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/15-0.png","element":"img","alt":" s = 2, then by repeating the sampling procedure for t ≥","inline":true,"padRight":true},{"text":"(1 + 1","element":"span"},{"style":{"height":17.6},"width":200.74,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/15-1.png","element":"img","alt":"/ϵ) log(1/δ","inline":true},{"text":") times, the inequality","element":"span"}],[{"style":{"width":"37%"},"width":655,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/15-2.png","element":"img"}],[{"text":"holds with probability at least 1 ","element":"span"},{"style":{"height":12.8},"width":76.68,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/15-3.png","element":"img","alt":" − δ.","inline":true}]]},{"heading":"5. Empirical Analysis","paragraphs":[[{"text":"In Section ","element":"span"},{"href":"#id-71","text":"5.1 ","element":"a"},{"text":"we empirically evaluate our CUR algorithms in comparison with the algorithms introduced in Section ","element":"span"},{"href":"#id-56","text":"3.3","element":"a"},{"text":". 
In Section ","element":"span"},{"href":"#id-72","text":"5.2 ","element":"a"},{"text":"we conduct empirical comparisons between the standard Nystr¨om and our modified Nystr¨om, and comparisons among three sampling algorithms. We report the approximation error incurred by each algorithm on each data set. The error ratio is defined by","element":"span"}],[{"style":{"width":"31%"},"width":544,"height":112,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/15-4.png","element":"img"}],[{"text":"where ","element":"span"},{"text":"˜","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"= ","element":"span"},{"style":{"fontWeight":"bold"},"text":"CUR ","element":"span"},{"text":"for the CUR matrix decomposition, ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":15.53},"width":281.04,"height":38.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/15-5.png","element":"img","alt":"A = CW†CT ","inline":true,"padRight":true},{"text":"for the standard Nystr¨om method, and ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":21},"width":437.48,"height":52.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/15-6.png","element":"img","alt":"A = C�C†A(C†)T �CT ","inline":true,"padRight":true},{"text":"for the modified Nystr¨om method.","element":"span"}],[{"text":"We conduct experiments on a workstation with two Intel Xeon 2","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"},{"text":"40GHz CPUs, 24GB RAM, and 64bit Windows Server 2008 system. We implement the algorithms in MATLAB R2011b, and use the MATLAB function ‘svds’ for truncated SVD. 
To compare the running time, all the computations are carried out in a single thread by setting ‘maxNumCompThreads(1)’ in MATLAB.","element":"span"}],[{"id":"id-71","style":{"fontWeight":"bold"},"text":"5.1 Comparison among the CUR Algorithms","element":"span"}],[{"text":"In this section we empirically compare our adaptive sampling based CUR algorithm (Algorithm ","element":"span"},{"href":"#id-45","text":"2","element":"a"},{"text":") with the subspace sampling algorithm of ","element":"span"},{"href":"#id-4","referenceIndex":15,"text":"Drineas et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-4","referenceIndex":15,"text":"2008","element":"a"},{"text":") and the deterministic sparse column-row approximation (SCRA) algorithm of ","element":"span"},{"href":"#id-14","referenceIndex":41,"text":"Stewart ","element":"a"},{"text":"(","element":"span"},{"href":"#id-14","referenceIndex":41,"text":"1999","element":"a"},{"text":"). For SCRA, we use the MATLAB code released by ","element":"span"},{"href":"#id-14","referenceIndex":41,"text":"Stewart ","element":"a"},{"text":"(","element":"span"},{"href":"#id-14","referenceIndex":41,"text":"1999","element":"a"},{"text":"). As for the subspace sampling algorithm, we compute the leverage scores exactly via the truncated SVD. 
Although the fast approximation to leverage scores (","element":"span"},{"href":"#id-28","referenceIndex":16,"text":"Drineas et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-28","referenceIndex":16,"text":"2012","element":"a"},{"text":") can significantly speed up subspace sampling, we do not use it because the approximation has no theoretical guarantee when applied to subspace sampling.","element":"span"}],[{"style":{"width":"98%"},"width":1694,"height":225,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/15-7.png","element":"img"}],[{"id":"id-74","text":"Table 4: A summary of the data sets for CUR matrix decomposition.","element":"figcaption","subtype":"caption"}],[{"text":"We conduct experiments on four UCI data sets (","element":"span"},{"href":"#id-73","referenceIndex":19,"text":"Frank and Asuncion","element":"a"},{"text":", ","element":"span"},{"href":"#id-73","referenceIndex":19,"text":"2010","element":"a"},{"text":") which are summarized in Table ","element":"span"},{"href":"#id-74","text":"4","element":"a"},{"text":". Each data set is represented as a data matrix, upon which we apply the CUR algorithms. 
According to our analysis, the target rank ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"should be far less than ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m","element":"span"}],[{"style":{"width":"78%"},"width":1353,"height":970,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/16-0.png","element":"img"}],[{"id":"id-75","text":"Figure 1: Results of the CUR algorithms on the Enron data set.","element":"figcaption","subtype":"caption"}],[{"style":{"width":"78%"},"width":1353,"height":970,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/16-1.png","element":"img"}],[{"id":"id-76","text":"Figure 2: Results of the CUR algorithms on the Dexter data set.","element":"figcaption","subtype":"caption"}],[{"style":{"width":"78%"},"width":1353,"height":970,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/17-0.png","element":"img"}],[{"id":"id-77","text":"Figure 3: Results of the CUR algorithms on the Farm Ads data set.","element":"figcaption","subtype":"caption"}],[{"style":{"width":"78%"},"width":1353,"height":970,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/17-1.png","element":"img"}],[{"id":"id-78","text":"Figure 4: Results of the CUR algorithms on the Gisette data set.","element":"figcaption","subtype":"caption"}],[{"id":"id-123","text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n","element":"span"},{"text":", and the column number ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"text":"and row number ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r ","element":"span"},{"text":"should be strictly greater than ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k","element":"span"},{"text":". 
For each data set and each algorithm, we set ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"= 10 or 50, and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"text":"= ","element":"span"},{"style":{"fontStyle":"italic"},"text":"ak","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r ","element":"span"},{"text":"= ","element":"span"},{"style":{"fontStyle":"italic"},"text":"ac","element":"span"},{"text":", where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"a ","element":"span"},{"text":"ranges in each set of experiments. We repeat each of the two randomized algorithms 10 times, and report the minimum error ratio and the total elapsed time of the 10 rounds. We depict the error ratios and the elapsed time of the three CUR matrix decomposition algorithms in Figures ","element":"span"},{"href":"#id-75","text":"1","element":"a"},{"text":", ","element":"span"},{"href":"#id-76","text":"2","element":"a"},{"text":", ","element":"span"},{"href":"#id-77","text":"3","element":"a"},{"text":", and ","element":"span"},{"href":"#id-78","text":"4","element":"a"},{"text":".","element":"span"}],[{"text":"We can see from Figures ","element":"span"},{"href":"#id-75","text":"1","element":"a"},{"text":", ","element":"span"},{"href":"#id-76","text":"2","element":"a"},{"text":", ","element":"span"},{"href":"#id-77","text":"3","element":"a"},{"text":", and ","element":"span"},{"href":"#id-78","text":"4 ","element":"a"},{"text":"that our adaptive sampling based CUR algorithm has much lower approximation error than the subspace sampling algorithm in all cases. Our adaptive sampling based algorithm is better than the deterministic SCRA on the Farm Ads data set and the Gisette data set, worse than SCRA on the Enron data set, and comparable to SCRA on the Dexter data set. 
In addition, the experimental results match our theoretical analysis in Section ","element":"span"},{"text":"4 ","element":"span"},{"text":"very well. The empirical results all obey the theoretical relative-error upper bound","element":"span"}],[{"style":{"width":"62%"},"width":1077,"height":104,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/18-0.png","element":"img"}],[{"text":"As for the running time, the subspace sampling algorithm and our adaptive sampling based algorithm are much more efficient than SCRA, especially when ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r ","element":"span"},{"text":"are large. Our adaptive sampling based algorithm is comparable to the subspace sampling algorithm when ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r ","element":"span"},{"text":"are small; however, our algorithm becomes less efficient when ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r ","element":"span"},{"text":"are large. This is due to the following reasons. First, the computational cost of the subspace sampling algorithm is dominated by the truncated SVD of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A","element":"span"},{"text":", which is determined by the target rank ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"and the size and sparsity of the data matrix. However, the cost of our algorithm grows with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r","element":"span"},{"text":". 
Thus, our algorithm becomes less efficient when ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r ","element":"span"},{"text":"are large. Second, the truncated SVD operation in MATLAB, that is, the ‘svds’ function, gains from sparsity, but our algorithm does not. The four data sets are all very sparse, so the subspace sampling algorithm has advantages. Third, the truncated SVD functions are very well implemented by MATLAB (not in MATLAB language but in Fortran/C). In contrast, our algorithm is implemented in MATLAB language, which is usually less efficient than Fortran/C.","element":"span"}],[{"id":"id-72","style":{"fontWeight":"bold"},"text":"5.2 Comparison among the Nystr¨om Algorithms","element":"span"}],[{"text":"In this section we empirically compare our adaptive sampling algorithm (in Theorem ","element":"span"},{"href":"#id-48","text":"10","element":"a"},{"text":") with some other sampling algorithms including the subspace sampling of ","element":"span"},{"href":"#id-4","referenceIndex":15,"text":"Drineas et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-4","referenceIndex":15,"text":"2008","element":"a"},{"text":") and the uniform sampling, both without replacement. We also conduct comparison between the standard Nystr¨om and our modified Nystr¨om, both use the three sampling algorithms to select columns.","element":"span"}],[{"text":"We test the algorithms on three data sets which are summarized in Table ","element":"span"},{"href":"#id-79","text":"5","element":"a"},{"text":". ","element":"span"},{"text":"The experiment setting follows ","element":"span"},{"href":"#id-42","referenceIndex":21,"text":"Gittens and Mahoney ","element":"a"},{"text":"(","element":"span"},{"href":"#id-42","referenceIndex":21,"text":"2013","element":"a"},{"text":"). 
For each data set we generate a radial basis function (RBF) kernel matrix ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"which is defined by","element":"span"}],[{"style":{"width":"30%"},"width":519,"height":109,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/18-1.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":17.42},"width":178.6,"height":43.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/18-2.png","element":"img","alt":" xi and xj","inline":true,"padRight":true},{"text":"are data instances and ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/18-3.png","element":"img","alt":" σ","inline":true,"padRight":true},{"text":"is a scale parameter. Notice that the RBF kernel is dense in general. We set ","element":"span"},{"style":{"height":12},"width":118.5,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/18-4.png","element":"img","alt":" σ = 0.","inline":true},{"text":"2 or 1 in our experiments. For each data set with different","element":"span"}],[{"id":"id-118","style":{"width":"76%"},"width":1328,"height":588,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/19-0.png","element":"img"}],[{"id":"id-79","text":"Table 5: A summary of the data sets for the Nystr¨om approximation. 
In the second tabular","element":"figcaption","subtype":"caption"}],[{"style":{"width":"89%"},"width":1550,"height":160,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/19-1.png","element":"img"}],[{"text":"settings of ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/19-2.png","element":"img","alt":" σ","inline":true},{"text":", we fix a target rank ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"= 10, 20 or 50 and vary ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"text":"in a very large range. We will discuss the choice of ","element":"span"},{"style":{"height":12.8},"width":149.66,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/19-3.png","element":"img","alt":" σ and k","inline":true,"padRight":true},{"text":"in the following two paragraphs. We run each algorithm for 10 times, and report the minimum error ratio as well as the total elapsed time of the 10 repeats. The results are shown in Figures ","element":"span"},{"href":"#id-80","text":"5","element":"a"},{"text":", ","element":"span"},{"href":"#id-81","text":"6","element":"a"},{"text":", and ","element":"span"},{"href":"#id-82","text":"7","element":"a"},{"text":".","element":"span"}],[{"text":"Table ","element":"span"},{"href":"#id-79","text":"5 ","element":"a"},{"text":"provides useful implications on choosing the target rank ","element":"span"},{"href":"#id-79","style":{"height":27.65},"width":434.34,"height":69.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/19-4.png","element":"img","alt":" k. 
In Table 5, ∥A−Ak∥F∥A∥F","inline":true,"padRight":true},{"text":"denotes ratio that is not captured by the best rank-","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"approximation to the RBF kernel, and the parameter ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/19-5.png","element":"img","alt":" σ","inline":true,"padRight":true},{"text":"has an influence on the ratio ","element":"span"},{"style":{"height":17.6},"width":722.25,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/19-6.png","element":"img","alt":" ∥A − Ak∥F /∥A∥F . When σ is large,","inline":true,"padRight":true},{"text":"the RBF kernel can be well approximated by a low-rank matrix, which implies that (i) a small ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"suffices when ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/19-7.png","element":"img","alt":" σ","inline":true,"padRight":true},{"text":"is large, and (ii) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"should be set large when ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/19-8.png","element":"img","alt":" σ","inline":true,"padRight":true},{"text":"is small. So the settings (","element":"span"},{"style":{"height":17.6},"width":1547.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/19-9.png","element":"img","alt":"σ = 1, k = 10) and (σ = 0.2, k = 50) are more reasonable than the rest. Let","inline":true,"padRight":true},{"text":"us take the RBF kernel in the Abalone data set as an example. 
When ","element":"span"},{"style":{"height":15.6},"width":358.37,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/19-10.png","element":"img","alt":" σ = 1, the rank-10","inline":true,"padRight":true},{"text":"approximation well captures the kernel, so ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"can be safely set as small as 10; when ","element":"span"},{"style":{"height":14.8},"width":152.44,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/19-11.png","element":"img","alt":" σ = 0.2,","inline":true,"padRight":true},{"text":"the target rank ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"should be set large, say larger than 50, otherwise the approximation is rough.","element":"span"}],[{"text":"The standard deviation of the leverage scores reflects whether the advanced importance sampling techniques such as the subspace sampling and adaptive sampling are useful. Figures ","element":"span"},{"href":"#id-80","text":"5","element":"a"},{"text":", ","element":"span"},{"href":"#id-81","text":"6","element":"a"},{"text":", and ","element":"span"},{"href":"#id-82","text":"7 ","element":"a"},{"text":"show that the advantage of the subspace sampling and adaptive sampling over the uniform sampling is significant whenever the standard deviation of the leverage scores is large (see Table ","element":"span"},{"href":"#id-79","text":"5","element":"a"},{"text":"), and vice versa. Actually, as reflected in Table ","element":"span"},{"href":"#id-79","text":"5","element":"a"},{"text":", the parameter ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/19-12.png","element":"img","alt":" σ","inline":true,"padRight":true},{"text":"influences the homogeneity/heterogeneity of the leverage scores. 
Usually, when ","element":"span"},{"style":{"height":12},"width":70.73,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/19-13.png","element":"img","alt":" σ is","inline":true,"padRight":true},{"text":"small, the leverage scores become heterogeneous, and the effect of choosing “good” columns is significant.","element":"span"}],[{"text":"The experimental results also show that the subspace sampling and adaptive sampling algorithms significantly outperform the uniform sampling when ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"text":"is reasonably small, say ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c < ","element":"span"},{"text":"10","element":"span"},{"style":{"fontStyle":"italic"},"text":"k","element":"span"},{"text":". This indicates that the subspace sampling and adaptive sampling algorithms are good at choosing “good” columns as basis vectors. The effect is especially evident on the","element":"span"}],[{"style":{"width":"99%"},"width":1721,"height":2283,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/20-0.png","element":"img"}],[{"id":"id-80","text":"Figure 5: Results of the Nystr¨om algorithms on the RBF kernel in the Abalone data set.","element":"figcaption","subtype":"caption"}],[{"style":{"width":"99%"},"width":1715,"height":2284,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/21-0.png","element":"img"}],[{"id":"id-81","text":"Figure 6: Results of the Nystr¨om algorithms on the RBF kernel in the Wine Quality data ","element":"figcaption","subtype":"caption"},{"text":"set.","element":"figcaption","subtype":"caption"}],[{"style":{"width":"98%"},"width":1710,"height":2283,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/22-0.png","element":"img"}],[{"id":"id-82","text":"Figure 7: Results of the Nystr¨om algorithms on the RBF kernel in the Letters data set.","element":"figcaption","subtype":"caption"}],[{"text":"RBF 
kernel with the scale parameter ","element":"span"},{"style":{"height":12},"width":118.5,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/23-0.png","element":"img","alt":" σ = 0.","inline":true},{"text":"2, where the leverage scores are heterogeneous. In most cases our adaptive sampling algorithm achieves the lowest approximation error among the three algorithms. The error ratios of our adaptive sampling for the modified Nystr¨om are in accordance with the theoretical bound in Theorem ","element":"span"},{"href":"#id-48","text":"10","element":"a"},{"text":"; that is,","element":"span"}],[{"style":{"width":"68%"},"width":1189,"height":113,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/23-1.png","element":"img"}],[{"text":"As for the running time, our adaptive sampling algorithm is more efficient than the subspace sampling algorithm. This is partly because the RBF kernel matrix is dense, and hence the subspace sampling algorithm costs ","element":"span"},{"style":{"height":19.13},"width":133.16,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/23-2.png","element":"img","alt":" O(m2k","inline":true},{"text":") time to compute the truncated SVD.","element":"span"}],[{"text":"Furthermore, the experimental results show that using ","element":"span"},{"style":{"height":19.53},"width":299.24,"height":48.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/23-3.png","element":"img","alt":" U = C†A(C†)T ","inline":true,"padRight":true},{"text":"as the intersection matrix (denoted by “modified” in the figures) always leads to much lower error than using ","element":"span"},{"style":{"height":15.53},"width":176.27,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/23-4.png","element":"img","alt":" U = W† ","inline":true,"padRight":true},{"text":"(denoted by “standard”). 
However, our modified Nystr¨om method costs more time to compute the intersection matrix than the standard Nystr¨om method costs. Recall that the standard Nystr¨om costs ","element":"span"},{"style":{"height":19.13},"width":88.8,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/23-5.png","element":"img","alt":" O(c3","inline":true},{"text":") time to compute ","element":"span"},{"style":{"height":15.53},"width":164.36,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/23-6.png","element":"img","alt":" U = W† ","inline":true,"padRight":true},{"text":"and that the mod-ified Nystr¨om costs ","element":"span"},{"style":{"height":19.98},"width":440,"height":49.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/23-7.png","element":"img","alt":" O(mc2)+TMultiply(m2c","inline":true},{"text":") time to compute ","element":"span"},{"style":{"height":19.53},"width":299.24,"height":48.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/23-8.png","element":"img","alt":" U = C†A(C†)T ","inline":true,"padRight":true},{"text":". So the users should make a trade-off between time and accuracy and decide whether it is worthwhile to sacrifice extra computational overhead for the improvement in accuracy by using the modified Nystr¨om method.","element":"span"}]]},{"heading":"6. Conclusion","paragraphs":[[{"text":"In this paper we have built a novel and more general relative-error bound for the adaptive sampling algorithm. Accordingly, we have devised novel CUR matrix decomposition and Nystr¨om approximation algorithms which demonstrate significant improvement over the classical counterparts. 
Our relative-error CUR algorithm requires only ","element":"span"},{"style":{"height":19.13},"width":370.47,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/23-9.png","element":"img","alt":" c = 2kϵ−1(1 + o(1))","inline":true,"padRight":true},{"text":"columns and ","element":"span"},{"style":{"height":19.13},"width":264.07,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/23-10.png","element":"img","alt":" r = cϵ−1(1+ϵ","inline":true},{"text":") rows selected from the original matrix. To achieve relative-error bound, the best previous algorithm—the subspace sampling algorithm—requires ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"text":"= ","element":"span"},{"style":{"height":19.13},"width":233.91,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/23-11.png","element":"img","alt":"O(kϵ−2 log k","inline":true},{"text":") columns and ","element":"span"},{"style":{"height":19.13},"width":303.78,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/23-12.png","element":"img","alt":" r = O(cϵ−2 log c","inline":true},{"text":") rows. Our modified Nystr¨om method is differ-ent from the conventional Nystr¨om methods in that it uses a different intersection matrix. We have shown that our adaptive sampling algorithm for the modified Nystr¨om achieves relative-error upper bound by sampling only ","element":"span"},{"style":{"height":19.13},"width":279.69,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/23-13.png","element":"img","alt":" c = 2kϵ−2(1+o","inline":true},{"text":"(1)) columns, which even beats the lower error bounds of the standard Nystr¨om and the ensemble Nystr¨om. 
Our proposed CUR and Nyström algorithms are scalable because they need only to maintain a small fraction of columns or rows in RAM, and their time complexities are low provided that matrix multiplication can be highly efficiently executed. Finally, the empirical comparison has also demonstrated the effectiveness and efficiency of our algorithms.","element":"span"}]]},{"heading":"Acknowledgments","paragraphs":[[{"text":"This work has been supported in part by the Natural Science Foundations of China (No. 61070239) and the Scholarship Award for Excellent Doctoral Student granted by Chinese Ministry of Education.","element":"span"}]]},{"heading":"Appendix A. The Dual Set Sparsification Algorithm","paragraphs":[[{"text":"For the sake of self-containedness, we attach the dual set sparsification algorithm and describe some implementation details. The deterministic dual set sparsification algorithm is established by ","element":"span"},{"href":"#id-10","referenceIndex":5,"text":"Boutsidis et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-10","referenceIndex":5,"text":"2011","element":"a"},{"text":") and serves as an important step in the near-optimal column selection algorithm (described in Lemma ","element":"span"},{"href":"#id-60","text":"2 ","element":"a"},{"text":"and Algorithm ","element":"span"},{"href":"#id-59","text":"1 ","element":"a"},{"text":"in this paper). 
We show the dual set sparsification algorithm in Algorithm ","element":"span"},{"href":"#id-83","text":"3 ","element":"a"},{"text":"and its bounds in Lemma ","element":"span"},{"href":"#id-84","text":"14","element":"a"},{"text":", and we also analyze the time complexity using our defined notation.","element":"span"}],[{"id":"id-84","style":{"height":17.6},"width":1728.06,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/24-0.png","element":"img","alt":"Lemma 14 (Dual Set Spectral-Frobenius Sparsification) Let U = {x1, · · · , xn} ⊂","inline":true},{"style":{"height":19.53},"width":187.63,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/24-1.png","element":"img","alt":"Rl (l < n)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"contain the columns of an arbitrary matrix ","element":"span"},{"style":{"height":19.53},"width":723.86,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/24-2.png","element":"img","alt":" X ∈ Rl×n. Let V = {v1, · · · , vn} ⊂ Rk","inline":true,"padRight":true},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"k < n","element":"span"},{"text":") ","element":"span"},{"style":{"fontStyle":"italic"},"text":"be a decomposition of the identity, that is, ","element":"span"},{"style":{"height":20.29},"width":298.96,"height":50.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/24-3.png","element":"img","alt":"∑_{i=1}^{n} v_i v_i^T = I_k","inline":true},{"style":{"fontStyle":"italic"},"text":". 
Given an integer ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r ","element":"span"},{"style":{"fontStyle":"italic"},"text":"with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k < r < n","element":"span"},{"style":{"fontStyle":"italic"},"text":", Algorithm ","element":"span"},{"href":"#id-83","style":{"fontStyle":"italic"},"text":"3 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"deterministically computes a set of weights ","element":"span"},{"style":{"height":17.6},"width":445,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/24-4.png","element":"img","alt":" si ≥ 0 (i = 1, · · · , n) at","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"most ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r ","element":"span"},{"style":{"fontStyle":"italic"},"text":"of which are non-zero, such that","element":"span"}],[{"style":{"width":"76%"},"width":1329,"height":126,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/24-5.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"The weights ","element":"span"},{"style":{"height":10.62},"width":32.45,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/24-6.png","element":"img","alt":" si","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"can be computed deterministically in ","element":"span"},{"style":{"height":20.8},"width":567.62,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/24-7.png","element":"img","alt":" O(rnk2) + TMultiply(nl) time.","inline":true}],[{"text":"Here we mention some implementation issues of Algorithm ","element":"span"},{"href":"#id-83","text":"3 ","element":"a"},{"text":"which were not described in detail by ","element":"span"},{"href":"#id-10","referenceIndex":5,"text":"Boutsidis et al. 
","element":"a"},{"text":"(","element":"span"},{"href":"#id-10","referenceIndex":5,"text":"2011","element":"a"},{"text":"). In each iteration the algorithm performs once eigenvalue decomposition: ","element":"span"},{"style":{"height":17.75},"width":471.86,"height":44.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/24-8.png","element":"img","alt":" Aτ = WΛWT . Here Aτ","inline":true,"padRight":true},{"text":"is guaranteed to be SPSD in each iteration. Since","element":"span"}],[{"style":{"width":"61%"},"width":1065,"height":81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/24-9.png","element":"img"}],[{"text":"(","element":"span"},{"style":{"height":17.6},"width":349.2,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/24-10.png","element":"img","alt":"Aτ − (Lτ + 1)Ik)q ","inline":true,"padRight":true},{"text":"can be efficiently computed based on the eigenvalue decomposition of ","element":"span"},{"style":{"height":15.02},"width":55.94,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/24-11.png","element":"img","alt":"Aτ","inline":true},{"text":". With the eigenvalues at hand, ","element":"span"},{"style":{"height":17.6},"width":148,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/24-12.png","element":"img","alt":" φ(L, Aτ","inline":true},{"text":") can also be computed directly.","element":"span"}],[{"text":"The algorithm runs in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r ","element":"span"},{"text":"iterations. 
In each iteration, the eigenvalue decomposition of ","element":"span"},{"style":{"height":15.02},"width":55.94,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/24-13.png","element":"img","alt":" Aτ","inline":true,"padRight":true},{"text":"requires ","element":"span"},{"style":{"height":19.13},"width":320.65,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/24-14.png","element":"img","alt":" O(k3), and the n","inline":true,"padRight":true},{"text":"comparisons in Line ","element":"span"},{"href":"#id-83","text":"6 ","element":"a"},{"text":"each requires ","element":"span"},{"style":{"height":19.13},"width":94.01,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/24-15.png","element":"img","alt":" O(k2","inline":true},{"text":"). Moreover, computing ","element":"span"},{"style":{"height":19.98},"width":705.14,"height":49.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/24-16.png","element":"img","alt":"∥xi∥22 for each xi requires TMultiply(nl","inline":true},{"text":"). 
Overall, the running time of Algorithm ","element":"span"},{"href":"#id-83","text":"3 ","element":"a"},{"text":"is at most ","element":"span"},{"style":{"height":19.98},"width":1143.81,"height":49.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/24-17.png","element":"img","alt":"O(rk3) + O(rnk2) + TMultiply(nl) = O(rnk2) + TMultiply(nl).","inline":true}],[{"text":"The near-optimal column selection algorithm described in Lemma ","element":"span"},{"href":"#id-60","text":"2 ","element":"a"},{"text":"has three steps: randomized SVD via random projection which costs ","element":"span"},{"style":{"height":21.8},"width":791.73,"height":54.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/24-18.png","element":"img","alt":" O(mk2ϵ−4/3) + TMultiply(mnkϵ−2/3) time,","inline":true,"padRight":true},{"text":"the dual set sparsification algorithm which costs ","element":"span"},{"style":{"height":21.8},"width":545.54,"height":54.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/24-19.png","element":"img","alt":" O(nk3ϵ−2/3) + TMultiply(mn)","inline":true},{"text":"time, and the adaptive sampling algorithm which costs ","element":"span"},{"style":{"height":21.8},"width":684.85,"height":54.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/24-20.png","element":"img","alt":" O(mk2ϵ−4/3) + TMultiply(mnkϵ−2/3)","inline":true},{"text":"time. Therefore, the near-optimal column selection algorithm costs in total ","element":"span"},{"style":{"height":21.8},"width":518.2,"height":54.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/24-21.png","element":"img","alt":" O(mk2ϵ−4/3 + nk3ϵ−2/3) +","inline":true},{"style":{"height":21.8},"width":492.73,"height":54.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/24-22.png","element":"img","alt":"TMultiply(mnkϵ−2/3) time.","inline":true}]]},{"heading":"Appendix B. 
Proofs of the Adaptive Sampling Bounds","paragraphs":[[{"text":"We present the proofs of Theorem ","element":"span"},{"href":"#id-43","text":"5","element":"a"},{"text":", Corollary ","element":"span"},{"href":"#id-44","text":"7","element":"a"},{"text":", Theorem ","element":"span"},{"href":"#id-46","text":"8","element":"a"},{"text":", and Theorem ","element":"span"},{"href":"#id-48","text":"10 ","element":"a"},{"text":"in Appendices ","element":"span"},{"href":"#id-85","text":"B.1","element":"a"},{"text":", ","element":"span"},{"href":"#id-86","text":"B.2","element":"a"},{"text":", ","element":"span"},{"href":"#id-87","text":"B.3","element":"a"},{"text":", and ","element":"span"},{"href":"#id-88","text":"B.4","element":"a"},{"text":", respectively.","element":"span"}],[{"id":"id-85","style":{"fontWeight":"bold"},"text":"B.1 The Proof of Theorem ","element":"span"},{"href":"#id-43","style":{"fontWeight":"bold"},"text":"5","element":"a"}],[{"text":"Theorem ","element":"span"},{"href":"#id-43","text":"5 ","element":"a"},{"text":"can be equivalently expressed in Theorem ","element":"span"},{"href":"#id-89","text":"15","element":"a"},{"text":". 
In order to stick to the column ","element":"span"},{"id":"id-89","text":"space convention throughout this paper, we prove Theorem ","element":"span"},{"href":"#id-89","text":"15 ","element":"a"},{"text":"instead of Theorem ","element":"span"},{"href":"#id-43","text":"5","element":"a"},{"text":".","element":"span"}],[{"id":"id-83","style":{"width":"99%"},"width":1728,"height":1016,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/25-0.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Theorem 15 (The Adaptive Sampling Algorithm) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Given a matrix ","element":"span"},{"style":{"height":13.93},"width":323.21,"height":34.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/25-1.png","element":"img","alt":" A ∈ Rm×n and a","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"matrix ","element":"span"},{"style":{"height":19.53},"width":1581.47,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/25-2.png","element":"img","alt":" R ∈ Rr×n such that rank(R) = rank(AR†R) = ρ (ρ ≤ r ≤ m), let C1 ∈ Rm×c1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"consist of ","element":"span"},{"style":{"height":10.62},"width":35.88,"height":26.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/25-3.png","element":"img","alt":" c1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"columns of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A","element":"span"},{"style":{"fontStyle":"italic"},"text":", and define the residual ","element":"span"},{"style":{"height":22.29},"width":750.12,"height":55.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/25-4.png","element":"img","alt":" B = A − C1C†1A. 
For i = 1, · · · , n, let","inline":true}],[{"style":{"width":"20%"},"width":362,"height":51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/25-5.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where ","element":"span"},{"style":{"height":15.02},"width":189.5,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/25-6.png","element":"img","alt":" bi is the i","inline":true},{"style":{"fontStyle":"italic"},"text":"-th column of the matrix ","element":"span"},{"style":{"fontWeight":"bold"},"text":"B","element":"span"},{"style":{"fontStyle":"italic"},"text":". Sample further ","element":"span"},{"style":{"height":10.62},"width":35.88,"height":26.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/25-7.png","element":"img","alt":" c2","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"columns from ","element":"span"},{"style":{"height":15.02},"width":247.54,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/25-8.png","element":"img","alt":" A in c2 i.i.d.","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"trials, where in each trial the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"style":{"fontStyle":"italic"},"text":"-th column is chosen with probability ","element":"span"},{"style":{"height":16.73},"width":392,"height":41.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/25-9.png","element":"img","alt":" pi. 
Let C2 ∈ Rm×c2","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"contain the ","element":"span"},{"style":{"height":10.62},"width":35.88,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/25-10.png","element":"img","alt":" c2","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"sampled columns and ","element":"span"},{"style":{"height":20.33},"width":504.63,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/25-11.png","element":"img","alt":" C = [C1, C2] ∈ Rm×(c1+c2) ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"contain the columns of both ","element":"span"},{"style":{"height":15.02},"width":208.78,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/25-12.png","element":"img","alt":"C1 and C2","inline":true},{"style":{"fontStyle":"italic"},"text":", all of which are columns of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A","element":"span"},{"style":{"fontStyle":"italic"},"text":". Then the following inequality holds:","element":"span"}],[{"style":{"width":"69%"},"width":1209,"height":86,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/25-13.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where the expectation is taken w.r.t. 
","element":"span"},{"style":{"height":15.02},"width":68.17,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/25-14.png","element":"img","alt":" C2.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Proof ","element":"span"},{"text":"With a little abuse of symbols, we use bold uppercase letters to denote random matrices and bold lowercase to denote random vectors, without distinguishing between random matrices/vectors and non-random matrices/vectors.","element":"span"}],[{"text":"We denote the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j","element":"span"},{"text":"-th column of ","element":"span"},{"style":{"height":20.2},"width":425.5,"height":50.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/25-15.png","element":"img","alt":" VAR†R,ρ ∈ Rn×ρ as vj","inline":true},{"text":", and the (","element":"span"},{"style":{"fontStyle":"italic"},"text":"i, j","element":"span"},{"text":")-th entry of ","element":"span"},{"style":{"height":38.7},"width":1725.66,"height":96.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/25-16.png","element":"img","alt":" VAR†R,ρas vij","inline":true},{"text":". 
Define random vectors ","element":"span"},{"style":{"height":18.75},"width":203.22,"height":46.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/25-17.png","element":"img","alt":" xj,(l) ∈ Rm ","inline":true,"padRight":true},{"text":"such that for ","element":"span"},{"style":{"height":16.4},"width":578.23,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/25-18.png","element":"img","alt":" j = 1, · · · , n and l = 1, · · · , c2,","inline":true}],[{"id":"id-93","style":{"width":"81%"},"width":1416,"height":90,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/25-19.png","element":"img"}],[{"text":"Notice that ","element":"span"},{"style":{"height":14.75},"width":86.21,"height":36.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/26-0.png","element":"img","alt":" xj,(l)","inline":true,"padRight":true},{"text":"is a linear function of a column of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"sampled from the above defined distribution. 
We have that","element":"span"}],[{"style":{"width":"80%"},"width":1387,"height":268,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/26-1.png","element":"img"}],[{"text":"Then we let ","element":"span"},{"style":{"height":23.06},"width":525.35,"height":57.66,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/26-2.png","element":"img","alt":" xj = 1c2�c2l=1 xj,(l), we have","inline":true}],[{"style":{"width":"91%"},"width":1587,"height":161,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/26-3.png","element":"img"}],[{"text":"According to the construction of ","element":"span"},{"style":{"height":13.42},"width":193.14,"height":33.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/26-4.png","element":"img","alt":" x1, · · · , xρ","inline":true},{"text":", we define the ","element":"span"},{"style":{"height":10.62},"width":35.88,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/26-5.png","element":"img","alt":" c2","inline":true,"padRight":true},{"text":"columns of ","element":"span"},{"style":{"height":15.02},"width":281.14,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/26-6.png","element":"img","alt":" A to be C2 ∈","inline":true},{"style":{"height":13.13},"width":117.5,"height":32.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/26-7.png","element":"img","alt":"Rm×c2","inline":true},{"text":". 
Note that all the random vectors ","element":"span"},{"style":{"height":13.42},"width":181.02,"height":33.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/26-8.png","element":"img","alt":" x1 · · · , xρ","inline":true,"padRight":true},{"text":"lie in the subspace span(","element":"span"},{"style":{"height":17.6},"width":300.57,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/26-9.png","element":"img","alt":"C1)+span(C2).","inline":true,"padRight":true},{"text":"We define random vectors","element":"span"}],[{"style":{"width":"71%"},"width":1244,"height":58,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/26-10.png","element":"img"}],[{"text":"where the second equality follows from Lemma ","element":"span"},{"href":"#id-90","text":"16","element":"a"},{"text":"; that is, ","element":"span"},{"style":{"height":20.15},"width":593.64,"height":50.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/26-11.png","element":"img","alt":" AR†Rvj = Avj if vj is one of","inline":true,"padRight":true},{"text":"the top ","element":"span"},{"style":{"height":12},"width":23,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/26-12.png","element":"img","alt":" ρ","inline":true,"padRight":true},{"text":"right singular vectors of ","element":"span"},{"style":{"height":15.53},"width":130.62,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/26-13.png","element":"img","alt":" AR†R","inline":true},{"text":". 
Then we have that any set of random vectors ","element":"span"},{"style":{"height":18.62},"width":258.81,"height":46.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/26-14.png","element":"img","alt":"{w1, · · · , wρ}","inline":true,"padRight":true},{"text":"lies in span(","element":"span"},{"style":{"height":18.62},"width":958.73,"height":46.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/26-15.png","element":"img","alt":"C) = span(C1) + span(C2). Let W = [w1, · · · , wρ","inline":true},{"text":"] be a random matrix, we have that span(","element":"span"},{"style":{"height":17.6},"width":269.21,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/26-16.png","element":"img","alt":"W) ⊂ span(C","inline":true},{"text":"). The expectation of ","element":"span"},{"style":{"height":16.62},"width":97.44,"height":41.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/26-17.png","element":"img","alt":" wj is","inline":true}],[{"style":{"width":"80%"},"width":1383,"height":198,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/26-18.png","element":"img"}],[{"text":"The expectation of ","element":"span"},{"style":{"height":19.75},"width":295.3,"height":49.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/26-19.png","element":"img","alt":" ∥wj − Avj∥22 is","inline":true}],[{"style":{"width":"90%"},"width":1564,"height":426,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/26-20.png","element":"img"}],[{"text":"To complete the proof, we denote","element":"span"}],[{"style":{"width":"29%"},"width":513,"height":130,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/26-21.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":17.42},"width":187.06,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/26-22.png","element":"img","alt":" σq is the 
q","inline":true},{"text":"-th largest singular value of ","element":"span"},{"style":{"height":20.15},"width":267.06,"height":50.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/26-23.png","element":"img","alt":" AR†R and uq","inline":true,"padRight":true},{"text":"is the corresponding left singular vector of ","element":"span"},{"style":{"height":15.53},"width":130.62,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/26-24.png","element":"img","alt":" AR†R","inline":true},{"text":". The column space of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"F ","element":"span"},{"text":"is contained in span(","element":"span"},{"style":{"height":17.6},"width":288.6,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/26-25.png","element":"img","alt":"W) (⊂ span(C","inline":true},{"text":")), and thus","element":"span"}],[{"style":{"width":"80%"},"width":1389,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/26-26.png","element":"img"}],[{"text":"We use ","element":"span"},{"style":{"fontWeight":"bold"},"text":"F ","element":"span"},{"text":"to bound the error ","element":"span"},{"style":{"height":20.31},"width":657.11,"height":50.77,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/27-0.png","element":"img","alt":" ∥AR†R − CC†AR†R∥2F . 
That is,","inline":true}],[{"id":"id-91","style":{"width":"90%"},"width":1569,"height":213,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/27-1.png","element":"img"}],[{"text":"where (","element":"span"},{"href":"#id-91","text":"6","element":"a"},{"text":") is due to that ","element":"span"},{"style":{"height":19.53},"width":220.26,"height":48.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/27-2.png","element":"img","alt":" A(I − R†R","inline":true},{"text":") is orthogonal to (","element":"span"},{"style":{"height":19.53},"width":670.92,"height":48.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/27-3.png","element":"img","alt":"I − CC†)AR†R. Since AR†R and","inline":true,"padRight":true},{"style":{"fontWeight":"bold"},"text":"F ","element":"span"},{"text":"both lie on the space spanned by the right singular vectors of ","element":"span"},{"style":{"height":22.74},"width":488.26,"height":56.86,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/27-4.png","element":"img","alt":" AR†R (i.e., {vj}ρj=1), we","inline":true,"padRight":true},{"text":"decompose ","element":"span"},{"style":{"height":22.75},"width":492.6,"height":56.86,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/27-5.png","element":"img","alt":" AR†R − F along {vj}ρj=1","inline":true},{"text":", obtaining that","element":"span"}],[{"id":"id-92","style":{"width":"97%"},"width":1679,"height":768,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/27-6.png","element":"img"}],[{"text":"where (","element":"span"},{"href":"#id-92","text":"7","element":"a"},{"text":") follows from Lemma ","element":"span"},{"href":"#id-90","text":"16 ","element":"a"},{"text":"and (","element":"span"},{"href":"#id-92","text":"8","element":"a"},{"text":") follows from 
(","element":"span"},{"href":"#id-93","text":"5","element":"a"},{"text":").","element":"span"}],[{"id":"id-90","style":{"width":"104%"},"width":1807,"height":216,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/27-7.png","element":"img"}],[{"style":{"height":18.67},"width":599.26,"height":46.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/27-8.png","element":"img","alt":"Proof First let VR,ρ ∈ Rn×ρ ","inline":true,"padRight":true},{"text":"contain the top ","element":"span"},{"style":{"height":12},"width":23,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/27-9.png","element":"img","alt":" ρ","inline":true,"padRight":true},{"text":"right singular vectors of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"R","element":"span"},{"text":". Then the projection of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"onto the row space of ","element":"span"},{"style":{"height":23.14},"width":524.48,"height":57.85,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/27-10.png","element":"img","alt":" R is AR†R = AVR,ρVTR,ρ","inline":true},{"text":". Let the thin SVD of ","element":"span"},{"style":{"height":21.6},"width":839.59,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/27-11.png","element":"img","alt":"AVR,ρ ∈ Rm×ρ be ˜U ˜Σ ˜VT , where ˜V ∈ Rρ×ρ","inline":true},{"text":". 
Then the compact SVD of ","element":"span"},{"style":{"height":15.53},"width":173.93,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/27-12.png","element":"img","alt":" AR†R is","inline":true}],[{"style":{"width":"44%"},"width":762,"height":59,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/27-13.png","element":"img"}],[{"text":"According to the definition, ","element":"span"},{"style":{"height":17.42},"width":207.88,"height":43.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/27-14.png","element":"img","alt":" vj is the j","inline":true},{"text":"-th column of (","element":"span"},{"style":{"height":21.6},"width":659.5,"height":53.99,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/27-15.png","element":"img","alt":"VR,ρ ˜V) ∈ Rn×ρ. Thus vj lies on","inline":true,"padRight":true},{"text":"the column space of ","element":"span"},{"style":{"height":17.94},"width":257.04,"height":44.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/27-16.png","element":"img","alt":" VR,ρ, and vj","inline":true,"padRight":true},{"text":"is orthogonal to ","element":"span"},{"style":{"height":17.64},"width":119.87,"height":44.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/27-17.png","element":"img","alt":" VR,ρ⊥","inline":true},{"text":". 
Finally, since ","element":"span"},{"style":{"height":15.53},"width":278.54,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/27-18.png","element":"img","alt":" A − AR†R =","inline":true},{"style":{"height":23.25},"width":274.45,"height":58.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/27-19.png","element":"img","alt":"AVR,ρ⊥VTR,ρ⊥","inline":true},{"text":", we have that ","element":"span"},{"style":{"height":13.02},"width":41.48,"height":32.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/27-20.png","element":"img","alt":" vj","inline":true,"padRight":true},{"text":"is orthogonal to ","element":"span"},{"style":{"height":15.53},"width":221.34,"height":38.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/27-21.png","element":"img","alt":" A − AR†R","inline":true},{"text":", that is, (","element":"span"},{"style":{"height":20.15},"width":377.24,"height":50.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/27-22.png","element":"img","alt":"A − AR†R)vj = 0,","inline":true,"padRight":true},{"text":"which directly proves the lemma.","element":"span"}],[{"id":"id-86","style":{"fontWeight":"bold"},"text":"B.2 The Proof of Corollary ","element":"span"},{"href":"#id-44","style":{"fontWeight":"bold"},"text":"7","element":"a"}],[{"text":"Since ","element":"span"},{"style":{"fontWeight":"bold"},"text":"C ","element":"span"},{"text":"is constructed by columns of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"and the column space of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"C ","element":"span"},{"text":"is contained in the column space of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A","element":"span"},{"text":", we have 
rank(","element":"span"},{"style":{"height":19.53},"width":537.12,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/28-0.png","element":"img","alt":"CC†A) = rank(C) = ρ ≤ c","inline":true},{"text":". Consequently, the assumptions of Theorem ","element":"span"},{"href":"#id-43","text":"5 ","element":"a"},{"text":"are satisfied. The assumptions in turn imply","element":"span"}],[{"style":{"width":"45%"},"width":780,"height":124,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/28-1.png","element":"img"}],[{"text":"and ","element":"span"},{"style":{"height":17.6},"width":155.5,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/28-2.png","element":"img","alt":" c/r2 = ϵ","inline":true},{"text":". It then follows from Theorem ","element":"span"},{"href":"#id-43","text":"5 ","element":"a"},{"text":"that","element":"span"}],[{"style":{"width":"90%"},"width":1565,"height":809,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/28-3.png","element":"img"}],[{"text":"which yields the error bound for CUR matrix decomposition.","element":"span"}],[{"text":"When the matrix ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"is symmetric, the matrix ","element":"span"},{"style":{"height":19.81},"width":60.24,"height":49.53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/28-4.png","element":"img","alt":" CT1 ","inline":true,"padRight":true},{"text":"consists of the rows of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A","element":"span"},{"text":", and thus we ","element":"span"},{"text":"can use Theorem ","element":"span"},{"href":"#id-89","text":"15 ","element":"a"},{"text":"(which is identical to Theorem ","element":"span"},{"href":"#id-43","text":"5","element":"a"},{"text":") to prove the error bound for the Nyström approximation. 
By replacing ","element":"span"},{"style":{"fontWeight":"bold"},"text":"R ","element":"span"},{"text":"in Theorem ","element":"span"},{"href":"#id-89","text":"15 ","element":"a"},{"text":"by ","element":"span"},{"style":{"height":19.81},"width":60.24,"height":49.53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/28-5.png","element":"img","alt":" CT1 ","inline":true,"padRight":true},{"text":", we have that","element":"span"}],[{"style":{"width":"90%"},"width":1570,"height":371,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/28-6.png","element":"img"}],[{"text":"given by Lemma ","element":"span"},{"href":"#id-94","text":"17","element":"a"},{"text":", we have that","element":"span"}],[{"style":{"width":"76%"},"width":1316,"height":286,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/28-7.png","element":"img"}],[{"id":"id-94","text":"Hence ","element":"span"},{"style":{"height":32.4},"width":1039.39,"height":81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/28-8.png","element":"img","alt":" E��A−CC†A(C†)T CT ��F ≤�E��A−CC†A(C†)T CT ��2F","inline":true}],[{"id":"id-126","style":{"height":17.6},"width":1448.59,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-0.png","element":"img","alt":"Lemma 17 Given an m×m matrix A and an m×c matrix C = [C1, C2]","inline":true},{"style":{"fontStyle":"italic"},"text":", the following inequality holds:","element":"span"}],[{"style":{"width":"59%"},"width":1032,"height":70,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-1.png","element":"img"}],[{"style":{"height":17.87},"width":531.82,"height":44.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-2.png","element":"img","alt":"Proof Let PCA = CC†A","inline":true,"padRight":true},{"text":"denote the projection of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"onto the column space of 
","element":"span"},{"style":{"fontWeight":"bold"},"text":"C","element":"span"},{"text":", and ¯","element":"span"},{"style":{"height":17.87},"width":308.86,"height":44.68,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-3.png","element":"img","alt":"PC = Im − CC† ","inline":true,"padRight":true},{"text":"denote the projector onto the space orthogonal to the column space of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"C","element":"span"},{"text":". It has been shown by ","element":"span"},{"href":"#id-53","referenceIndex":27,"text":"Halko et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-53","referenceIndex":27,"text":"2011","element":"a"},{"text":") that, for any matrix ","element":"span"},{"style":{"height":17.6},"width":514.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-4.png","element":"img","alt":" A, if span(M) ⊂ span(N),","inline":true,"padRight":true},{"text":"then the following inequalities hold:","element":"span"}],[{"style":{"width":"56%"},"width":970,"height":51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-5.png","element":"img"}],[{"text":"Accordingly, ","element":"span"},{"style":{"height":21.76},"width":314.57,"height":54.4,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-6.png","element":"img","alt":" APTRT = AR†R","inline":true,"padRight":true},{"text":"is the projection of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"onto the row space of ","element":"span"},{"style":{"height":13.93},"width":285.96,"height":34.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-7.png","element":"img","alt":" R ∈ Rr×n. 
We","inline":true,"padRight":true},{"text":"further have that","element":"span"}],[{"style":{"width":"77%"},"width":1347,"height":118,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-8.png","element":"img"}],[{"text":"and","element":"span"}],[{"style":{"width":"81%"},"width":1409,"height":127,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-9.png","element":"img"}],[{"text":"where the last equalities follow from ","element":"span"},{"style":{"height":17.55},"width":213.77,"height":43.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-10.png","element":"img","alt":" PC ⊥ ¯PC.","inline":true,"padRight":true},{"text":"Since span(","element":"span"},{"style":{"height":17.6},"width":503.27,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-11.png","element":"img","alt":"C1) ⊂ span(C), we have","inline":true},{"style":{"height":22.11},"width":534.27,"height":55.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-12.png","element":"img","alt":"∥PCA ¯PTC1∥2F ≥ ∥PCA ¯PTC∥2F ","inline":true,"padRight":true},{"text":", which proves the lemma.","element":"span"}],[{"id":"id-87","style":{"fontWeight":"bold"},"text":"B.3 The Proof of Theorem ","element":"span"},{"href":"#id-46","style":{"fontWeight":"bold"},"text":"8","element":"a"}],[{"text":"The error bound follows directly from Lemma ","element":"span"},{"href":"#id-60","text":"2 ","element":"a"},{"text":"and Corollary ","element":"span"},{"href":"#id-44","text":"7","element":"a"},{"text":". 
The near-optimal column selection algorithm costs ","element":"span"},{"style":{"height":21.8},"width":876.82,"height":54.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-13.png","element":"img","alt":" O�mk2ϵ−4/3+nk3ϵ−2/3�+TMultiply�mnkϵ−2/3�","inline":true},{"text":"time to construct ","element":"span"},{"style":{"fontWeight":"bold"},"text":"C ","element":"span"},{"text":"and ","element":"span"},{"style":{"height":21.8},"width":874.93,"height":54.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-14.png","element":"img","alt":" O�nk2ϵ−4/3+mk3ϵ−2/3�+TMultiply�mnkϵ−2/3�","inline":true},{"text":"time to construct ","element":"span"},{"style":{"height":14.62},"width":54.64,"height":36.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-15.png","element":"img","alt":" R1","inline":true},{"text":". Then the adaptive sampling algorithm costs ","element":"span"},{"style":{"height":20.8},"width":597.64,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-16.png","element":"img","alt":" O�nk2ϵ−2�+TMultiply�mnkϵ−1�","inline":true},{"text":"time to construct ","element":"span"},{"style":{"height":14.62},"width":54.64,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-17.png","element":"img","alt":" R2","inline":true},{"text":". Computing the Moore-Penrose inverses of ","element":"span"},{"style":{"height":20.8},"width":1129.26,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-18.png","element":"img","alt":" C and R costs O(mc2) + O(nr2) = O�mk2ϵ−2 + nk2ϵ−4�","inline":true,"padRight":true},{"text":"time. The multiplication of ","element":"span"},{"style":{"height":20.38},"width":921.46,"height":50.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-19.png","element":"img","alt":" C†AR† costs TMultiply(mnc) = TMultiply(mnkϵ−1","inline":true},{"text":") time. 
So the total time complexity is ","element":"span"},{"style":{"height":21.8},"width":1163.61,"height":54.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-20.png","element":"img","alt":" O�(m + n)k3ϵ−2/3 + mk2ϵ−2 + nk2ϵ−4�+ TMultiply�mnkϵ−1�.","inline":true}],[{"id":"id-88","style":{"fontWeight":"bold"},"text":"B.4 The Proof of Theorem ","element":"span"},{"href":"#id-48","style":{"fontWeight":"bold"},"text":"10","element":"a"}],[{"text":"The error bound follows immediately from Lemma ","element":"span"},{"href":"#id-60","text":"2 ","element":"a"},{"text":"and Corollary ","element":"span"},{"href":"#id-44","text":"7","element":"a"},{"text":". The near-optimal column selection algorithm costs ","element":"span"},{"style":{"height":21.8},"width":1080.1,"height":54.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-21.png","element":"img","alt":" O�mk2ϵ−4/3 + mk3ϵ−2/3�+ TMultiply�m2kϵ−2/3�time to","inline":true,"padRight":true},{"text":"select ","element":"span"},{"style":{"height":19.13},"width":245.46,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-22.png","element":"img","alt":" c1 = O(kϵ−1","inline":true},{"text":") columns of ","element":"span"},{"style":{"height":15.02},"width":304.45,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-23.png","element":"img","alt":" A construct C1","inline":true},{"text":". 
Then the adaptive sampling algorithm costs ","element":"span"},{"style":{"height":20.8},"width":616.36,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-24.png","element":"img","alt":" O�mk2ϵ−2�+ TMultiply�m2kϵ−1�","inline":true},{"text":"time to select ","element":"span"},{"style":{"height":19.13},"width":240.41,"height":47.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-25.png","element":"img","alt":" c2 = O(kϵ−2","inline":true},{"text":") columns construct ","element":"span"},{"style":{"height":15.02},"width":67.17,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-26.png","element":"img","alt":" C2.","inline":true,"padRight":true},{"text":"Finally it costs ","element":"span"},{"style":{"height":20.8},"width":1104.32,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-27.png","element":"img","alt":" O(mc2)+TMultiply(m2c) = O(mk2ϵ−4)+TMultiply�m2kϵ−2�","inline":true},{"text":"time to construct the intersection matrix ","element":"span"},{"style":{"height":19.53},"width":333.34,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-28.png","element":"img","alt":" U = C†A(C†)T .","inline":true,"padRight":true},{"text":"So the total time complexity is ","element":"span"},{"style":{"height":20.8},"width":247.9,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-29.png","element":"img","alt":" O�mk2ϵ−4 +","inline":true},{"style":{"height":21.8},"width":603.74,"height":54.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/29-30.png","element":"img","alt":"mk3ϵ−2/3�+ TMultiply�m2kϵ−2�.","inline":true}]]},{"heading":"Appendix C. Proofs of the Lower Error Bounds","paragraphs":[[{"text":"In Appendix ","element":"span"},{"href":"#id-95","text":"C.1 ","element":"a"},{"text":"we construct two adversarial cases which will be used throughout this appendix. 
In Appendix ","element":"span"},{"href":"#id-96","text":"C.2 ","element":"a"},{"text":"we prove the lower bounds of the standard Nyström method. In Appendix ","element":"span"},{"href":"#id-97","text":"C.3 ","element":"a"},{"text":"we prove the lower bounds of the ensemble Nyström method. Theorems ","element":"span"},{"href":"#id-98","text":"20","element":"a"},{"text":", ","element":"span"},{"href":"#id-99","text":"21","element":"a"},{"text":", ","element":"span"},{"href":"#id-100","text":"22","element":"a"},{"text":", ","element":"span"},{"href":"#id-101","text":"24","element":"a"},{"text":", and ","element":"span"},{"href":"#id-102","text":"25 ","element":"a"},{"text":"are used for proving Theorem ","element":"span"},{"href":"#id-49","text":"12","element":"a"},{"text":".","element":"span"}],[{"id":"id-95","style":{"fontWeight":"bold"},"text":"C.1 Construction of the Adversarial Cases","element":"span"}],[{"text":"We now consider the construction of adversarial cases for the spectral norm bounds and the Frobenius norm and nuclear norm bounds, respectively.","element":"span"}],[{"style":{"width":"76%"},"width":1330,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/30-0.png","element":"img"}],[{"text":"We construct an ","element":"span"},{"style":{"height":9.2},"width":111.25,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/30-1.png","element":"img","alt":" m×m","inline":true,"padRight":true},{"text":"positive definite matrix ","element":"span"},{"style":{"fontWeight":"bold"},"text":"B ","element":"span"},{"text":"as follows:","element":"span"}],[{"id":"id-104","style":{"width":"85%"},"width":1480,"height":237,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/30-2.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":17.6},"width":134.57,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/30-3.png","element":"img","alt":" α ∈ 
[0,","inline":true,"padRight":true},{"text":"1). It is easy to verify ","element":"span"},{"style":{"height":15.93},"width":164.82,"height":39.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/30-4.png","element":"img","alt":" xT Bx >","inline":true,"padRight":true},{"text":"0 for any nonzero ","element":"span"},{"style":{"height":12.8},"width":148.56,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/30-5.png","element":"img","alt":" x ∈ Rm","inline":true},{"text":". We show some properties of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"B ","element":"span"},{"text":"in Lemma ","element":"span"},{"href":"#id-103","text":"18","element":"a"},{"text":".","element":"span"}],[{"id":"id-103","style":{"height":14.84},"width":377.96,"height":37.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/30-6.png","element":"img","alt":"Lemma 18 Let Bk","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"be the best rank-","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"style":{"fontStyle":"italic"},"text":"approximation to the matrix ","element":"span"},{"style":{"fontWeight":"bold"},"text":"B ","element":"span"},{"style":{"fontStyle":"italic"},"text":"defined in (","element":"span"},{"href":"#id-104","style":{"fontStyle":"italic"},"text":"9","element":"a"},{"style":{"fontStyle":"italic"},"text":"). 
Then we have that","element":"span"}],[{"id":"id-105","style":{"width":"75%"},"width":1300,"height":183,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/30-7.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where ","element":"span"},{"text":"1 ","element":"span"},{"style":{"height":14.8},"width":254.8,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/30-8.png","element":"img","alt":" ≤ k ≤ m − 1.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Proof ","element":"span"},{"text":"The squared Frobenius norm of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"B ","element":"span"},{"text":"is","element":"span"}],[{"style":{"width":"41%"},"width":716,"height":101,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/30-9.png","element":"img"}],[{"text":"Then we study the singular values of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"B","element":"span"},{"text":". Since ","element":"span"},{"style":{"fontWeight":"bold"},"text":"B ","element":"span"},{"text":"is SPSD, here we do not distinguish between its singular values and eigenvalues. The spectral norm, that is, the largest singular value, of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"B ","element":"span"},{"text":"is","element":"span"}],[{"style":{"width":"93%"},"width":1621,"height":80,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/30-10.png","element":"img"}],[{"text":"where the maximum is attained when ","element":"span"},{"style":{"height":25.5},"width":605.14,"height":63.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/30-11.png","element":"img","alt":" x = 1√m1m. Thus u1 = 1√m1m","inline":true,"padRight":true},{"text":"is the top singular vector of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"B","element":"span"},{"text":". 
Then the projection of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"B ","element":"span"},{"text":"onto the subspace orthogonal to ","element":"span"},{"style":{"height":14.22},"width":90.48,"height":35.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/30-12.png","element":"img","alt":" u1 is","inline":true}],[{"style":{"width":"63%"},"width":1092,"height":90,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/30-13.png","element":"img"}],[{"text":"Then for all ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j > ","element":"span"},{"text":"1, the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j","element":"span"},{"text":"-th top eigenvalue ","element":"span"},{"style":{"height":12.62},"width":39.93,"height":31.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/31-0.png","element":"img","alt":" σj","inline":true,"padRight":true},{"text":"and eigenvector ","element":"span"},{"style":{"height":13.02},"width":42.88,"height":32.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/31-1.png","element":"img","alt":" uj","inline":true},{"text":", that is, the singular value and singular vector, of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"B ","element":"span"},{"text":"satisfy","element":"span"}],[{"style":{"width":"82%"},"width":1425,"height":90,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/31-2.png","element":"img"}],[{"text":"where the last equality follows from ","element":"span"},{"style":{"height":20.15},"width":952.74,"height":50.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/31-3.png","element":"img","alt":" uj ⊥ u1, that is, 1Tmuj = 0. 
Thus σj = 1 − α, and","inline":true}],[{"style":{"width":"32%"},"width":558,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/31-4.png","element":"img"}],[{"text":"for all 1 ","element":"span"},{"style":{"height":14.8},"width":167.36,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/31-5.png","element":"img","alt":" ≤ k < m","inline":true},{"text":". Finally we have that","element":"span"}],[{"style":{"width":"71%"},"width":1241,"height":335,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/31-6.png","element":"img"}],[{"text":"which completes the proof.","element":"span"}],[{"text":"C.1.2 The Adversarial Case for the Frobenius Norm and Nuclear Norm Bounds","element":"span"}],[{"text":"Then we construct another adversarial case for proving the Frobenius norm and nuclear norm bounds. Let ","element":"span"},{"style":{"height":16},"width":244.7,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/31-7.png","element":"img","alt":" B be a p × p","inline":true,"padRight":true},{"text":"matrix with diagonal entries equal to one and off-diagonal entries equal to ","element":"span"},{"style":{"height":16},"width":301.19,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/31-8.png","element":"img","alt":" α. 
Let m = kp","inline":true,"padRight":true},{"text":"and we construct an ","element":"span"},{"style":{"height":9.2},"width":134.07,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/31-9.png","element":"img","alt":" m × m","inline":true,"padRight":true},{"text":"block diagonal matrix ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"as follows:","element":"span"}],[{"style":{"width":"79%"},"width":1379,"height":237,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/31-10.png","element":"img"}],[{"id":"id-106","style":{"height":15.24},"width":378.19,"height":38.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/31-11.png","element":"img","alt":"Lemma 19 Let Ak","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"be the best rank-","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"style":{"fontStyle":"italic"},"text":"approximation to the matrix ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"style":{"fontStyle":"italic"},"text":"defined in (","element":"span"},{"href":"#id-105","style":{"fontStyle":"italic"},"text":"10","element":"a"},{"style":{"fontStyle":"italic"},"text":"). 
Then we have that","element":"span"}],[{"style":{"width":"49%"},"width":860,"height":253,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/31-12.png","element":"img"}],[{"text":"Lemma ","element":"span"},{"href":"#id-106","text":"19 ","element":"a"},{"text":"can be easily proved using Lemma ","element":"span"},{"href":"#id-103","text":"18","element":"a"},{"text":".","element":"span"}],[{"id":"id-96","style":{"fontWeight":"bold"},"text":"C.2 Lower Bounds of the Standard Nyström Method","element":"span"}],[{"id":"id-98","style":{"height":12.8},"width":748.5,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/32-0.png","element":"img","alt":"Theorem 20 For an m × m matrix B","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"with diagonal entries equal to one and off-diagonal entries equal to ","element":"span"},{"style":{"height":17.6},"width":173.56,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/32-1.png","element":"img","alt":" α ∈ [0, 1)","inline":true},{"style":{"fontStyle":"italic"},"text":", the approximation error incurred by the standard Nyström method is lower bounded by","element":"span"}],[{"style":{"width":"63%"},"width":1104,"height":399,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/32-2.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Furthermore, the matrix ","element":"span"},{"text":"(","element":"span"},{"style":{"height":20.46},"width":380.59,"height":51.15,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/32-3.png","element":"img","alt":"B − ˜Bnysc ) is SPSD.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Proof ","element":"span"},{"text":"The matrix ","element":"span"},{"style":{"fontWeight":"bold"},"text":"B ","element":"span"},{"text":"is partitioned as in (","element":"span"},{"href":"#id-104","text":"9","element":"a"},{"text":"). 
The residual of the Nystr¨om approximation is","element":"span"}],[{"style":{"width":"70%"},"width":1221,"height":50,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/32-4.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":19.53},"width":920.68,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/32-5.png","element":"img","alt":" ξ = 2, F, or ∗. Since W = (1 − α)Ic + α1c1Tc ","inline":true,"padRight":true},{"text":"is nonsingular when ","element":"span"},{"style":{"height":17.6},"width":254.53,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/32-6.png","element":"img","alt":" α ∈ [0, 1), so","inline":true},{"style":{"height":15.53},"width":223.72,"height":38.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/32-7.png","element":"img","alt":"W† = W−1","inline":true},{"text":". We apply the Sherman-Morrison-Woodbury formula","element":"span"}],[{"style":{"width":"63%"},"width":1101,"height":51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/32-8.png","element":"img"}],[{"text":"to compute ","element":"span"},{"style":{"height":19.13},"width":245.61,"height":47.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/32-9.png","element":"img","alt":" W†, yielding","inline":true}],[{"style":{"width":"48%"},"width":839,"height":101,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/32-10.png","element":"img"}],[{"text":"According to the construction, ","element":"span"},{"style":{"height":17.6},"width":396.62,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/32-11.png","element":"img","alt":" B21 is an (m−c) × c","inline":true,"padRight":true},{"text":"matrix with all entries equal to ","element":"span"},{"style":{"height":14.8},"width":86.36,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/32-12.png","element":"img","alt":" α, 
it","inline":true,"padRight":true},{"text":"follows that ","element":"span"},{"style":{"height":19.81},"width":599.01,"height":49.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/32-13.png","element":"img","alt":" B21W†BT21 is an (m−c)×(m−c","inline":true},{"text":") matrix with all entries equal to","element":"span"}],[{"id":"id-110","style":{"width":"67%"},"width":1171,"height":102,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/32-14.png","element":"img"}],[{"text":"Then we obtain that","element":"span"}],[{"id":"id-107","style":{"width":"99%"},"width":1724,"height":700,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/32-15.png","element":"img"}],[{"text":"which proves the Frobenius norm of the residual.","element":"span"}],[{"text":"Now we compute the spectral norm of the residual. Based on the results above we have that","element":"span"}],[{"style":{"width":"56%"},"width":974,"height":53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/33-0.png","element":"img"}],[{"text":"Similar to the proof of Lemma ","element":"span"},{"href":"#id-103","text":"18","element":"a"},{"text":", it is easily obtained that ","element":"span"},{"style":{"height":25.5},"width":200.39,"height":63.75,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/33-1.png","element":"img","alt":"1√m−c1m−c","inline":true,"padRight":true},{"text":"is the top singular vector of the SPSD matrix (1 ","element":"span"},{"style":{"height":19.53},"width":567.06,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/33-2.png","element":"img","alt":" − α)Im−c + (α − η)1m−c1Tm−c","inline":true},{"text":", so the top singular value is","element":"span"}],[{"id":"id-108","style":{"width":"85%"},"width":1479,"height":145,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/33-3.png","element":"img"}],[{"text":"which proves the spectral norm bound because 
","element":"span"},{"style":{"height":21.92},"width":568.28,"height":54.81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/33-4.png","element":"img","alt":" ∥B − ˜Bnysc ∥2 = σ1�B − ˜Bnysc �.","inline":true,"padRight":true},{"text":"It is also easy to show the rest singular values obey","element":"span"}],[{"id":"id-109","style":{"width":"66%"},"width":1156,"height":129,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/33-5.png","element":"img"}],[{"text":"Thus we have, for ","element":"span"},{"style":{"height":15.2},"width":321.8,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/33-6.png","element":"img","alt":" i = 2, · · · , m − c,","inline":true}],[{"style":{"width":"67%"},"width":1162,"height":104,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/33-7.png","element":"img"}],[{"text":"The nuclear norm of the residual","element":"span"},{"style":{"height":21.92},"width":259.51,"height":54.81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/33-8.png","element":"img","alt":"�B − ˜Bnysc �is","inline":true}],[{"style":{"width":"82%"},"width":1431,"height":382,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/33-9.png","element":"img"}],[{"text":"The theorem follows from equalities (","element":"span"},{"href":"#id-107","text":"14","element":"a"},{"text":"), (","element":"span"},{"href":"#id-108","text":"15","element":"a"},{"text":"), and (","element":"span"},{"href":"#id-109","text":"16","element":"a"},{"text":").","element":"span"}],[{"text":"Now we use the matrix ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"constructed in (","element":"span"},{"href":"#id-105","text":"10","element":"a"},{"text":") to show the Frobenius norm and nuclear norm lower bound. 
The bound is stronger than the one in Theorem ","element":"span"},{"href":"#id-98","text":"20 ","element":"a"},{"text":"by a factor of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k","element":"span"},{"text":".","element":"span"}],[{"id":"id-99","style":{"height":12.8},"width":585.25,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/33-10.png","element":"img","alt":"Theorem 21 For the m × m","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"SPSD matrix ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"style":{"fontStyle":"italic"},"text":"defined in (","element":"span"},{"href":"#id-105","style":{"fontStyle":"italic"},"text":"10","element":"a"},{"style":{"fontStyle":"italic"},"text":"), the approximation error incurred by the standard Nyström method is lower bounded by","element":"span"}],[{"id":"id-113","style":{"width":"66%"},"width":1153,"height":256,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/33-11.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k < m ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is an arbitrary positive integer.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Proof ","element":"span"},{"text":"Let ","element":"span"},{"style":{"fontWeight":"bold"},"text":"C ","element":"span"},{"text":"consist of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"text":"columns sampled from ","element":"span"},{"style":{"height":19.48},"width":188.64,"height":48.7,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/34-0.png","element":"img","alt":" A and ˆCi","inline":true,"padRight":true},{"text":"consist of ","element":"span"},{"style":{"height":10.62},"width":30.88,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/34-1.png","element":"img","alt":" 
ci","inline":true,"padRight":true},{"text":"columns sampled from the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":"-th block diagonal matrix in ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A","element":"span"},{"text":". Without loss of generality, we assume ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":15.02},"width":208.04,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/34-2.png","element":"img","alt":"Ci consists","inline":true,"padRight":true},{"text":"of the first ","element":"span"},{"style":{"height":10.62},"width":30.88,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/34-3.png","element":"img","alt":" ci","inline":true,"padRight":true},{"text":"columns of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"B","element":"span"},{"text":", and accordingly ","element":"span"},{"text":"ˆ","element":"span"},{"style":{"height":14.62},"width":63.88,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/34-4.png","element":"img","alt":"Wi","inline":true,"padRight":true},{"text":"consists of the top left ","element":"span"},{"style":{"height":15.02},"width":345.87,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/34-5.png","element":"img","alt":" ci × ci block of B.","inline":true,"padRight":true},{"text":"Thus ","element":"span"},{"style":{"height":22.72},"width":1152.48,"height":56.81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/34-6.png","element":"img","alt":" C = BlkDiag� ˆC1, · · · , ˆCk�and W = BlkDiag� ˆW1, · · · , ˆWk�.","inline":true}],[{"style":{"width":"94%"},"width":1625,"height":624,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/34-7.png","element":"img"}],[{"text":"Then it follows from Theorem ","element":"span"},{"href":"#id-98","text":"20 
","element":"a"},{"text":"that","element":"span"}],[{"id":"id-112","style":{"width":"64%"},"width":1113,"height":591,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/34-8.png","element":"img"}],[{"text":"where ˆ","element":"span"},{"style":{"height":22.86},"width":1601.51,"height":57.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/34-9.png","element":"img","alt":"p = p + 1−αα and ˆci = ci + 1−αα . Since �ki=1 ˆci = c + 1−αα k ≜ ˆc, the term �ki=1 ˆc−2i is","inline":true,"padRight":true},{"text":"minimized when ˆ","element":"span"},{"style":{"height":24.25},"width":851.16,"height":60.63,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/34-10.png","element":"img","alt":"c1 = · · · = ˆck. Thus �ki=1 ˆc−2i = k k2ˆc2 = k3ˆc−2","inline":true},{"text":". Finally we have that","element":"span"}],[{"style":{"width":"63%"},"width":1101,"height":268,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/34-11.png","element":"img"}],[{"text":"by which the Frobenius norm bound follows.","element":"span"}],[{"id":"id-127","text":"Since the matrices ","element":"span"},{"style":{"height":22.5},"width":259.22,"height":56.26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/35-0.png","element":"img","alt":" B− ˆCi ˆW†i ˆCTi ","inline":true,"padRight":true},{"text":"are all SPSD by Theorem ","element":"span"},{"href":"#id-98","text":"20","element":"a"},{"text":", so the matrix (","element":"span"},{"style":{"height":20.46},"width":187.88,"height":51.15,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/35-1.png","element":"img","alt":"A− ˜Anysc )","inline":true,"padRight":true},{"text":"is also SPSD. 
We have that","element":"span"}],[{"style":{"width":"59%"},"width":1022,"height":534,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/35-2.png","element":"img"}],[{"text":"where the former inequality follows from Theorem ","element":"span"},{"href":"#id-98","text":"20","element":"a"},{"text":", and the latter inequality follows by minimizing w.r.t. ","element":"span"},{"style":{"height":11.2},"width":178.94,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/35-3.png","element":"img","alt":" c1, · · · , ck","inline":true,"padRight":true},{"text":"subject to ","element":"span"},{"style":{"height":13.24},"width":323.82,"height":33.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/35-4.png","element":"img","alt":" c1 + · · · + ck = c.","inline":true}],[{"id":"id-100","style":{"fontWeight":"bold"},"text":"Theorem 22 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"There exists an ","element":"span"},{"style":{"height":9.2},"width":111.25,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/35-5.png","element":"img","alt":" m×m","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"SPSD matrix ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"style":{"fontStyle":"italic"},"text":"such that the approximation error incurred by the standard Nyström method is lower bounded by","element":"span"}],[{"style":{"width":"45%"},"width":781,"height":380,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/35-6.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k < m ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is an arbitrary positive integer.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Proof ","element":"span"},{"text":"For the spectral norm bound we use the matrix 
","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"constructed in (","element":"span"},{"href":"#id-104","text":"9","element":"a"},{"text":") and set ","element":"span"},{"style":{"height":14.8},"width":129.77,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/35-7.png","element":"img","alt":" α → 1,","inline":true,"padRight":true},{"text":"then it follows directly from Lemma ","element":"span"},{"href":"#id-103","text":"18 ","element":"a"},{"text":"and Theorem ","element":"span"},{"href":"#id-98","text":"20","element":"a"},{"text":". For the Frobenius norm and nuclear norm bounds, we use the matrix ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"text":"constructed in (","element":"span"},{"href":"#id-105","text":"10","element":"a"},{"text":") and set ","element":"span"},{"style":{"height":9.6},"width":84.2,"height":24,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/35-8.png","element":"img","alt":" α →","inline":true,"padRight":true},{"text":"1, then it follows directly from Lemma ","element":"span"},{"href":"#id-106","text":"19 ","element":"a"},{"text":"and Theorem ","element":"span"},{"href":"#id-99","text":"21","element":"a"},{"text":".","element":"span"}],[{"id":"id-97","style":{"fontWeight":"bold"},"text":"C.3 Lower Bounds of the Ensemble Nystr¨om Method","element":"span"}],[{"text":"The ensemble Nystr¨om method (","element":"span"},{"href":"#id-35","referenceIndex":30,"text":"Kumar et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-35","referenceIndex":30,"text":"2009","element":"a"},{"text":") is previously defined in (","element":"span"},{"href":"#id-69","text":"2","element":"a"},{"text":"). To derive lower bounds of the ensemble Nystr¨om method, we assume that the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"samples are non-overlapping. 
","element":"span"},{"text":"According to the construction of the matrix ","element":"span"},{"style":{"fontWeight":"bold"},"text":"B ","element":"span"},{"text":"in (","element":"span"},{"href":"#id-104","text":"9","element":"a"},{"text":"), each of the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"non-overlapping samples are equally “important”, so without loss of generality we set the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"samples with equal weights: ","element":"span"},{"style":{"height":21.96},"width":402.38,"height":54.89,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/35-9.png","element":"img","alt":" µ(1) = · · · = µ(t) = 1t .","inline":true}],[{"id":"id-114","style":{"fontWeight":"bold"},"text":"Lemma 23 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Assume that the ensemble Nystr¨om method selects a collection of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"style":{"fontStyle":"italic"},"text":"samples, each sample ","element":"span"},{"style":{"height":20.33},"width":544.38,"height":50.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/35-10.png","element":"img","alt":" C(i) (i = 1, · · · , t) contains c","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"columns of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"B ","element":"span"},{"style":{"fontStyle":"italic"},"text":"without overlapping. 
For an ","element":"span"},{"style":{"height":9.2},"width":122.64,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/35-11.png","element":"img","alt":" m×m","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"matrix ","element":"span"},{"style":{"fontWeight":"bold"},"text":"B ","element":"span"},{"style":{"fontStyle":"italic"},"text":"with all diagonal entries equal to one and off-diagonal entries equal to ","element":"span"},{"style":{"height":17.6},"width":186.53,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/36-0.png","element":"img","alt":" α ∈ [0, 1),","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"the approximation error incurred by the ensemble Nystr¨om method is lower bounded by","element":"span"}],[{"style":{"width":"71%"},"width":1230,"height":266,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/36-1.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"where ","element":"span"},{"text":"˜","element":"span"},{"style":{"height":26.37},"width":574.48,"height":65.92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/36-2.png","element":"img","alt":"Benst,c = 1t�ti=1 C(i)W(i)†C(i)T","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":". Furthermore, the matrix ","element":"span"},{"text":"(","element":"span"},{"style":{"height":22.78},"width":379.72,"height":56.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/36-3.png","element":"img","alt":"B − ˜Benst,c ) is SPSD.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Proof ","element":"span"},{"text":"We use the matrix ","element":"span"},{"style":{"fontWeight":"bold"},"text":"B ","element":"span"},{"text":"constructed in (","element":"span"},{"href":"#id-104","text":"9","element":"a"},{"text":"). 
It is easy to check that ","element":"span"},{"style":{"height":19.13},"width":370.27,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/36-4.png","element":"img","alt":" W(1) = · · · = W(t),","inline":true,"padRight":true},{"text":"so we use the notation ","element":"span"},{"style":{"fontWeight":"bold"},"text":"W ","element":"span"},{"text":"instead. We assume that the samples contain the first ","element":"span"},{"style":{"fontStyle":"italic"},"text":"tc ","element":"span"},{"text":"columns of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"B ","element":"span"},{"text":"and each sample contains neighboring columns, that is,","element":"span"}],[{"style":{"width":"36%"},"width":630,"height":59,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/36-5.png","element":"img"}],[{"text":"If a sample ","element":"span"},{"style":{"fontWeight":"bold"},"text":"C ","element":"span"},{"text":"contains the first ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"text":"columns of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"B","element":"span"},{"text":", then","element":"span"}],[{"style":{"width":"97%"},"width":1692,"height":338,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/36-6.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":12},"width":40,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/36-7.png","element":"img","alt":" Π","inline":true,"padRight":true},{"text":"is a permutation matrix. 
","element":"span"},{"text":"As was shown in Equation (","element":"span"},{"href":"#id-110","text":"12","element":"a"},{"text":"), ","element":"span"},{"style":{"height":19.81},"width":328.91,"height":49.53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/36-8.png","element":"img","alt":" B21W†BT21 is an","inline":true,"padRight":true},{"text":"(","element":"span"},{"style":{"height":17.6},"width":250.27,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/36-9.png","element":"img","alt":"m−c)×(m−c","inline":true},{"text":") matrix with all entries equal to","element":"span"}],[{"style":{"width":"18%"},"width":327,"height":101,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/36-10.png","element":"img"}],[{"text":"Based on the properties of the matrix ","element":"span"},{"style":{"height":20.05},"width":376.62,"height":50.13,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/36-11.png","element":"img","alt":" B − C(i)W(i)†C(i)T ","inline":true,"padRight":true},{"text":", we study the values of the entries of ","element":"span"},{"style":{"height":22.78},"width":171.58,"height":56.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/36-12.png","element":"img","alt":" B − ˜Benst,c ","inline":true,"padRight":true},{"text":". We can express it as","element":"span"}],[{"style":{"width":"86%"},"width":1502,"height":128,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/36-13.png","element":"img"}],[{"text":"and then a discreet examination reveals that ","element":"span"},{"style":{"height":22.77},"width":166.49,"height":56.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/36-14.png","element":"img","alt":" B − ˜Benst,c ","inline":true,"padRight":true},{"text":"can be partitioned into four kinds of ","element":"span"},{"text":"regions as illustrated in Figure ","element":"span"},{"href":"#id-111","text":"8","element":"a"},{"text":". 
We annotate the regions in the figure and summarize the values of entries in each region in the table below. (Region 1 and 4 are further partitioned into diagonal entries and off-diagonal entries.)","element":"span"}],[{"style":{"width":"95%"},"width":1649,"height":136,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/36-15.png","element":"img"}],[{"style":{"width":"43%"},"width":754,"height":586,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/37-0.png","element":"img"}],[{"id":"id-111","text":"Figure 8: An illustration of the matrix ","element":"figcaption","subtype":"caption"},{"style":{"height":18.71},"width":172.29,"height":46.78,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/37-1.png","element":"img","alt":" B − Benst,c ","inline":true,"padRight":true},{"text":"for the ensemble Nystr¨om method where ","element":"figcaption","subtype":"caption"},{"style":{"fontWeight":"bold"},"text":"B ","element":"figcaption","subtype":"caption"},{"text":"is defined in (","element":"figcaption","subtype":"caption"},{"href":"#id-104","text":"9","element":"a","subtype":"caption"},{"text":"). Here we set ","element":"figcaption","subtype":"caption"},{"style":{"height":15.6},"width":898.17,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/37-2.png","element":"img","alt":" m = 100, c = 20, α = 0.8, and t = 3. 
For the","inline":true,"padRight":true},{"text":"ensemble Nystr¨om method without overlapping, the matrix ","element":"figcaption","subtype":"caption"},{"style":{"height":19.12},"width":389.19,"height":47.79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/37-3.png","element":"img","alt":" B − Benst,c can always","inline":true,"padRight":true},{"text":"be partitioned into four regions as annotated.","element":"figcaption","subtype":"caption"}],[{"text":"Now we do summation over the entries of ","element":"span"},{"style":{"height":22.78},"width":173.87,"height":56.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/37-4.png","element":"img","alt":" B − ˜Benst,c ","inline":true,"padRight":true},{"text":"to compute its squared Frobenius ","element":"span"},{"text":"norm:","element":"span"}],[{"style":{"width":"99%"},"width":1712,"height":537,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/37-5.png","element":"img"}],[{"text":"Furthermore, since the matrices ","element":"span"},{"style":{"height":20.05},"width":326.69,"height":50.12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/37-6.png","element":"img","alt":" B−C(i)W†C(i)T ","inline":true,"padRight":true},{"text":"are all SPSD by Theorem ","element":"span"},{"href":"#id-98","text":"20","element":"a"},{"text":", so their sum is also SPSD. Then the SPSD property of (","element":"span"},{"style":{"height":22.78},"width":165.58,"height":56.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/37-7.png","element":"img","alt":"B − ˜Benst,c ","inline":true,"padRight":true},{"text":") follows from (","element":"span"},{"href":"#id-112","text":"18","element":"a"},{"text":"). 
Therefore, the ","element":"span"},{"text":"nuclear norm of (","element":"span"},{"style":{"height":22.78},"width":171.58,"height":56.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/37-8.png","element":"img","alt":"B − ˜Benst,c ","inline":true,"padRight":true},{"text":") equals to the matrix trace, that is,","element":"span"}],[{"style":{"width":"59%"},"width":1022,"height":296,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/37-9.png","element":"img"}],[{"id":"id-101","text":"which proves the nuclear norm bound in the lemma.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Theorem 24 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Assume that the ensemble Nystr¨om method selects a collection of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"style":{"fontStyle":"italic"},"text":"samples, each sample ","element":"span"},{"style":{"height":20.33},"width":572.9,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/38-0.png","element":"img","alt":" C(i) (i = 1, · · · , t) contains c","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"columns of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"style":{"fontStyle":"italic"},"text":"without overlapping. 
For the matrix ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"style":{"fontStyle":"italic"},"text":"defined in (","element":"span"},{"href":"#id-105","style":{"fontStyle":"italic"},"text":"10","element":"a"},{"style":{"fontStyle":"italic"},"text":"), the approximation error incurred by the ensemble Nyström method is lower bounded by","element":"span"}],[{"style":{"width":"90%"},"width":1567,"height":366,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/38-1.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Proof ","element":"span"},{"text":"According to the construction of ","element":"span"},{"href":"#id-105","style":{"height":20.33},"width":626.92,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/38-2.png","element":"img","alt":" A in (10), the i-th sample C(i) ","inline":true,"padRight":true},{"text":"is also block diagonal. We denote it by ","element":"span"},{"style":{"height":24.66},"width":576.94,"height":61.66,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/38-3.png","element":"img","alt":" C(i) = BlkDiag( ˆC(i)1 , · · · , ˆC(i)k )","inline":true},{"text":". 
Akin to (","element":"span"},{"href":"#id-113","text":"17","element":"a"},{"text":"), we have","element":"span"}],[{"style":{"width":"74%"},"width":1282,"height":203,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/38-4.png","element":"img"}],[{"text":"Thus the approximation error of the ensemble Nyström method is","element":"span"}],[{"style":{"width":"89%"},"width":1545,"height":456,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/38-5.png","element":"img"}],[{"text":"where the inequality follows from Lemma ","element":"span"},{"href":"#id-114","text":"23","element":"a"},{"text":", and the last equality follows from ","element":"span"},{"style":{"height":24.4},"width":227.74,"height":61.01,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/38-6.png","element":"img","alt":"∑_{j=1}^{k} c_j = c","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"kp ","element":"span"},{"text":"= ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m","element":"span"},{"text":". 
The summation in the last equality equals to","element":"span"}],[{"style":{"width":"65%"},"width":1131,"height":436,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/38-7.png","element":"img"}],[{"text":"Here the inequality holds because the function is minimized when ","element":"span"},{"style":{"height":17.6},"width":418.08,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/38-8.png","element":"img","alt":" c1 = · · · = ck = c/k.","inline":true,"padRight":true},{"text":"Finally we have that","element":"span"}],[{"style":{"width":"80%"},"width":1397,"height":121,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/38-9.png","element":"img"}],[{"style":{"width":"99%"},"width":1727,"height":346,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/39-0.png","element":"img"}],[{"text":"which proves the nuclear norm bound in the theorem.","element":"span"}],[{"id":"id-102","style":{"fontWeight":"bold"},"text":"Theorem 25 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Assume that the ensemble Nystr¨om method selects a collection of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"style":{"fontStyle":"italic"},"text":"samples, each sample ","element":"span"},{"style":{"height":20.33},"width":562.7,"height":50.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/39-1.png","element":"img","alt":" C(i) (i = 1, · · · , t) contains c","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"columns of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"style":{"fontStyle":"italic"},"text":"without overlapping. 
Then there exists an ","element":"span"},{"style":{"height":9.2},"width":111.25,"height":23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/39-2.png","element":"img","alt":" m×m","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"SPSD matrix ","element":"span"},{"style":{"fontWeight":"bold"},"text":"A ","element":"span"},{"style":{"fontStyle":"italic"},"text":"such that the relative-error ratio of the ensemble Nystr¨om method is lower bounded by","element":"span"}],[{"style":{"width":"84%"},"width":1457,"height":365,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/39-3.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Proof ","element":"span"},{"text":"The theorem follows directly from Theorem ","element":"span"},{"href":"#id-101","text":"24 ","element":"a"},{"text":"and Lemma ","element":"span"},{"href":"#id-106","text":"19 ","element":"a"},{"text":"by setting ","element":"span"},{"style":{"height":12},"width":129.77,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1303.4207/images/39-4.png","element":"img","alt":" α → 1.","inline":true}]]},{"heading":"References","paragraphs":[[{"id":"id-52","text":"A. Ben-Israel and T. N. E. Greville. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Generalized Inverses: Theory and Applications. Second Edition","element":"span"},{"text":". Springer, 2003. ","element":"span"},{"text":"7","element":"span"}],[{"id":"id-3","text":"M. W. Berry, S. A. Pulatova, and G. W. Stewart. Algorithm 844: computing sparse reduced- ","element":"span"},{"text":"rank approximations to sparse matrices. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"ACM Transactions on Mathematical Software","element":"span"},{"text":", 31(2):252–269, 2005. ","element":"span"},{"href":"#id-115","text":"2","element":"a"},{"text":", ","element":"span"},{"href":"#id-116","text":"3","element":"a"}],[{"id":"id-25","text":"J. Bien, Y. Xu, and M. W. Mahoney. 
","element":"span"},{"text":"CUR from a sparse optimization viewpoint. ","element":"span"},{"text":"In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems (NIPS)","element":"span"},{"text":". 2010. ","element":"span"},{"href":"#id-116","text":"3","element":"a"}],[{"id":"id-15","text":"C. H. Bischof and P. C. Hansen. Structure-preserving and rank-revealing QR-factorizations. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"SIAM Journal on Scientific and Statistical Computing","element":"span"},{"text":", 12(6):1332–1350, 1991. ","element":"span"},{"href":"#id-115","text":"2","element":"a"}],[{"id":"id-10","text":"C. Boutsidis, P. Drineas, and M. Magdon-Ismail. Near-optimal column-based matrix recon- ","element":"span"},{"text":"struction. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"CoRR","element":"span"},{"text":", abs/1103.0995, 2011. ","element":"span"},{"href":"#id-115","text":"2","element":"a"},{"text":", ","element":"span"},{"text":"8","element":"span"},{"text":", ","element":"span"},{"href":"#id-59","text":"9","element":"a"},{"text":", ","element":"span"},{"href":"#id-117","text":"12","element":"a"},{"text":", ","element":"span"},{"text":"25","element":"span"}],[{"id":"id-13","text":"T. F. Chan. Rank revealing QR factorizations. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Linear Algebra and Its Applications","element":"span"},{"text":", 88: 67–82, 1987. ","element":"span"},{"href":"#id-115","text":"2","element":"a"}],[{"id":"id-17","text":"S. Chandrasekaran and I. C. F. Ipsen. On rank-revealing factorisations. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"SIAM Journal on Matrix Analysis and Applications","element":"span"},{"text":", 15(2):592–622, 1994. ","element":"span"},{"href":"#id-115","text":"2","element":"a"}],[{"text":"P. Cortez, A. Cerdeira, F. Almeida, T. Matos, and J. Reis. 
Modeling wine preferences by ","element":"span"},{"text":"data mining from physicochemical properties. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Decision Support Systems","element":"span"},{"text":", 47(4):547–553, 2009. ","element":"span"},{"href":"#id-118","text":"20","element":"a"}],[{"id":"id-2","text":"S. Deerwester, S. T. Dumais, G. W. Furnas, T. K. Landauer, and R. Harshman. Indexing ","element":"span"},{"text":"by latent semantic analysis. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of The American Society for Information Science","element":"span"},{"text":", 41(6):391–407, 1990. ","element":"span"},{"text":"1","element":"span"}],[{"id":"id-9","text":"A. Deshpande and L. Rademacher. Efficient volume sampling for row/column subset selec- ","element":"span"},{"text":"tion. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 51st IEEE Annual Symposium on Foundations of Computer Science (FOCS)","element":"span"},{"text":", pages 329–338, 2010. ","element":"span"},{"href":"#id-115","text":"2","element":"a"}],[{"id":"id-8","text":"A. Deshpande, L. Rademacher, S. Vempala, and G. Wang. ","element":"span"},{"text":"Matrix approximation and projective clustering via volume sampling. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Theory of Computing","element":"span"},{"text":", 2(2006):225–247, 2006. ","element":"span"},{"href":"#id-115","text":"2","element":"a"},{"text":", ","element":"span"},{"href":"#id-119","text":"5","element":"a"},{"text":", ","element":"span"},{"text":"8","element":"span"},{"text":", ","element":"span"},{"href":"#id-59","text":"9","element":"a"},{"text":", ","element":"span"},{"href":"#id-120","text":"11","element":"a"}],[{"id":"id-29","text":"P. Drineas and R. Kannan. Pass-efficient algorithms for approximating large matrices. 
In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 14th Annual ACM-SIAM Symposium on Discrete Algorithms (SODA)","element":"span"},{"text":", pages 223–232, 2003. ","element":"span"},{"href":"#id-116","text":"3","element":"a"}],[{"id":"id-23","text":"P. Drineas and M. W. Mahoney. On the Nyström method for approximating a gram matrix ","element":"span"},{"text":"for improved kernel-based learning. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Machine Learning Research","element":"span"},{"text":", 6:2153–2175, 2005. ","element":"span"},{"href":"#id-116","text":"3","element":"a"},{"text":", ","element":"span"},{"href":"#id-119","text":"5","element":"a"}],[{"id":"id-26","text":"P. Drineas, R. Kannan, and M. W. Mahoney. ","element":"span"},{"text":"Fast Monte Carlo algorithms for matrices III: computing a compressed approximate matrix decomposition. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"SIAM Journal on Computing","element":"span"},{"text":", 36(1):184–206, 2006. ","element":"span"},{"href":"#id-116","text":"3","element":"a"}],[{"id":"id-4","text":"P. Drineas, M. W. Mahoney, and S. Muthukrishnan. Relative-error CUR matrix decompo- ","element":"span"},{"text":"sitions. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"SIAM Journal on Matrix Analysis and Applications","element":"span"},{"text":", 30(2):844–881, September 2008. ","element":"span"},{"href":"#id-115","text":"2","element":"a"},{"text":", ","element":"span"},{"href":"#id-116","text":"3","element":"a"},{"text":", ","element":"span"},{"href":"#id-121","text":"6","element":"a"},{"text":", ","element":"span"},{"text":"10","element":"span"},{"text":", ","element":"span"},{"href":"#id-122","text":"16","element":"a"},{"text":", ","element":"span"},{"href":"#id-123","text":"19","element":"a"}],[{"id":"id-28","text":"P. Drineas, M. Magdon-Ismail, M. W. Mahoney, and D. P. Woodruff. 
Fast approximation ","element":"span"},{"text":"of matrix coherence and statistical leverage. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Machine Learning (ICML)","element":"span"},{"text":", 2012. ","element":"span"},{"href":"#id-116","text":"3","element":"a"},{"text":", ","element":"span"},{"href":"#id-122","text":"16","element":"a"}],[{"id":"id-12","text":"L. V. Foster. Rank and null space calculations using matrix decomposition without column ","element":"span"},{"text":"interchanges. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Linear Algebra and its Applications","element":"span"},{"text":", 74:47–71, 1986. ","element":"span"},{"href":"#id-115","text":"2","element":"a"}],[{"id":"id-32","text":"C. Fowlkes, S. Belongie, F. Chung, and J. Malik. Spectral grouping using the Nystr¨om ","element":"span"},{"text":"method. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"IEEE Transactions on Pattern Analysis and Machine Intelligence","element":"span"},{"text":", 26(2):214– 225, 2004. ","element":"span"},{"href":"#id-124","text":"4","element":"a"}],[{"id":"id-73","text":"A. Frank and A. Asuncion. UCI machine learning repository, 2010. URL ","element":"span"},{"href":"http://archive.ics.uci.edu/ml","style":{"fontFamily":"monospace"},"text":"http://archive. ","element":"a"},{"href":"http://archive.ics.uci.edu/ml","style":{"fontFamily":"monospace"},"text":"ics.uci.edu/ml","element":"a"},{"text":". ","element":"span"},{"href":"#id-122","text":"16","element":"a"},{"text":", ","element":"span"},{"href":"#id-118","text":"20","element":"a"}],[{"id":"id-7","text":"A. Frieze, R. Kannan, and S. Vempala. Fast Monte Carlo algorithms for finding low-rank ","element":"span"},{"text":"approximations. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of the ACM","element":"span"},{"text":", 51(6):1025–1041, November 2004. ISSN 0004-5411. 
","element":"span"},{"href":"#id-115","text":"2","element":"a"}],[{"id":"id-42","text":"A. Gittens and M. W. Mahoney. Revisiting the Nystr¨om method for improved large-scale ","element":"span"},{"text":"machine learning. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1303.1849","element":"span"},{"text":", 2013. ","element":"span"},{"href":"#id-119","text":"5","element":"a"},{"text":", ","element":"span"},{"text":"10","element":"span"},{"text":", ","element":"span"},{"href":"#id-123","text":"19","element":"a"}],[{"id":"id-20","text":"S. A. Goreinov, E. E. Tyrtyshnikov, and N. L. Zamarashkin. A theory of pseudoskeleton ","element":"span"},{"text":"approximations. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Linear Algebra and Its Applications","element":"span"},{"text":", 261:1–21, 1997a. ","element":"span"},{"href":"#id-116","text":"3","element":"a"}],[{"id":"id-21","text":"S. A. Goreinov, N. L. Zamarashkin, and E. E. Tyrtyshnikov. Pseudo-skeleton approxima- ","element":"span"},{"text":"tions by matrices of maximal volume. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Mathematical Notes","element":"span"},{"text":", 62(4):619–623, 1997b. ","element":"span"},{"href":"#id-116","text":"3","element":"a"}],[{"id":"id-18","text":"M. Gu and S. C. Eisenstat. Efficient algorithms for computing a strong rank-revealing QR ","element":"span"},{"text":"factorization. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"SIAM Journal on Scientific Computing","element":"span"},{"text":", 17(4):848–869, 1996. ","element":"span"},{"href":"#id-115","text":"2","element":"a"}],[{"id":"id-11","text":"V. Guruswami and A. K. Sinop. Optimal column-based low-rank matrix reconstruction. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 23rd Annual ACM-SIAM Symposium on Discrete Algorithms (SODA)","element":"span"},{"text":", 2012. 
","element":"span"},{"href":"#id-115","text":"2","element":"a"},{"text":", ","element":"span"},{"text":"8","element":"span"},{"text":", ","element":"span"},{"href":"#id-59","text":"9","element":"a"},{"text":", ","element":"span"},{"href":"#id-45","text":"13","element":"a"},{"text":", ","element":"span"},{"href":"#id-125","text":"14","element":"a"}],[{"text":"I. Guyon, S. Gunn, A. Ben-Hur, and G. Dror. Result analysis of the NIPS 2003 feature ","element":"span"},{"text":"selection challenge. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems (NIPS)","element":"span"},{"text":", 2004. ","element":"span"},{"href":"#id-122","text":"16","element":"a"}],[{"id":"id-53","text":"N. Halko, P.-G. Martinsson, and J. A. Tropp. Finding structure with randomness: proba- ","element":"span"},{"text":"bilistic algorithms for constructing approximate matrix decompositions. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"SIAM Review","element":"span"},{"text":", 53(2):217–288, 2011. ","element":"span"},{"text":"7","element":"span"},{"text":", ","element":"span"},{"text":"8","element":"span"},{"text":", ","element":"span"},{"href":"#id-126","text":"30","element":"a"}],[{"id":"id-16","text":"Y. P. Hong and C. T. Pan. Rank-revealing QR factorizations and the singular value de- ","element":"span"},{"text":"composition. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Mathematics of Computation","element":"span"},{"text":", 58(197):213–232, 1992. ","element":"span"},{"href":"#id-115","text":"2","element":"a"}],[{"id":"id-39","text":"R. Jin, T. Yang, and M. Mahdavi. ","element":"span"},{"text":"Improved bound for the Nystr¨om method and its application to kernel classification. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"CoRR","element":"span"},{"text":", abs/1111.2262, 2011. ","element":"span"},{"href":"#id-119","text":"5","element":"a"}],[{"id":"id-35","text":"S. Kumar, M. Mohri, and A. 
Talwalkar. Ensemble Nystr¨om method. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems (NIPS)","element":"span"},{"text":", 2009. ","element":"span"},{"href":"#id-124","text":"4","element":"a"},{"text":", ","element":"span"},{"href":"#id-119","text":"5","element":"a"},{"text":", ","element":"span"},{"href":"#id-127","text":"36","element":"a"}],[{"id":"id-40","text":"S. Kumar, M. Mohri, and A. Talwalkar. Sampling methods for the Nystr¨om method. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Machine Learning Research","element":"span"},{"text":", 13:981–1006, 2012. ","element":"span"},{"href":"#id-119","text":"5","element":"a"}],[{"id":"id-6","text":"F. G. Kuruvilla, P. J. Park, and S. L. Schreiber. Vector algebra in the analysis of genome- ","element":"span"},{"text":"wide expression data. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Genome Biology","element":"span"},{"text":", 3:research0011–research0011.1, 2002. ","element":"span"},{"href":"#id-115","text":"2","element":"a"}],[{"id":"id-38","text":"M. Li, J. T. Kwok, and B.-L. Lu. Making large-scale Nystr¨om approximation possible. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Machine Learning (ICML)","element":"span"},{"text":", 2010. ","element":"span"},{"href":"#id-119","text":"5","element":"a"}],[{"id":"id-27","text":"L. Mackey, A. Talwalkar, and M. I. Jordan. Divide-and-conquer matrix factorization. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems (NIPS)","element":"span"},{"text":". 2011. ","element":"span"},{"href":"#id-116","text":"3","element":"a"},{"text":", ","element":"span"},{"href":"#id-119","text":"5","element":"a"}],[{"id":"id-19","text":"M. W. Mahoney. Randomized algorithms for matrices and data. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Foundations and Trends in Machine Learning","element":"span"},{"text":", 3(2):123–224, 2011. ","element":"span"},{"href":"#id-115","text":"2","element":"a"},{"text":", ","element":"span"},{"href":"#id-119","text":"5","element":"a"}],[{"id":"id-5","text":"M. W. Mahoney and P. Drineas. CUR matrix decompositions for improved data analysis. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the National Academy of Sciences","element":"span"},{"text":", 106(3):697–702, 2009. ","element":"span"},{"href":"#id-115","text":"2","element":"a"},{"text":", ","element":"span"},{"href":"#id-116","text":"3","element":"a"}],[{"id":"id-24","text":"M. W. Mahoney, M. Maggioni, and P. Drineas. Tensor-CUR decompositions for tensor- ","element":"span"},{"text":"based data. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"SIAM Journal on Matrix Analysis and Applications","element":"span"},{"text":", 30(3):957–987, 2008. ","element":"span"},{"href":"#id-116","text":"3","element":"a"}],[{"text":"C. Mesterharm and M. J. Pazzani. ","element":"span"},{"text":"Active learning using on-line algorithms. ","element":"span"},{"text":"In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD)","element":"span"},{"text":", 2011. ","element":"span"},{"href":"#id-122","text":"16","element":"a"}],[{"text":"D. Michie, D. J. Spiegelhalter, and C. C. Taylor. Machine learning, neural and statistical ","element":"span"},{"text":"classification. 1994. ","element":"span"},{"href":"#id-118","text":"20","element":"a"}],[{"id":"id-0","text":"L. Sirovich and M. Kirby. Low-dimensional procedure for the characterization of human ","element":"span"},{"text":"faces. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of the Optical Society of America A","element":"span"},{"text":", 4(3):519–524, Mar 1987. 
","element":"span"},{"text":"1","element":"span"}],[{"id":"id-14","text":"G. W. Stewart. Four algorithms for the the efficient computation of truncated pivoted QR ","element":"span"},{"text":"approximations to a sparse matrix. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Numerische Mathematik","element":"span"},{"text":", 83(2):313–323, 1999. ","element":"span"},{"href":"#id-115","text":"2","element":"a"},{"text":", ","element":"span"},{"href":"#id-116","text":"3","element":"a"},{"text":", ","element":"span"},{"href":"#id-59","text":"9","element":"a"},{"text":", ","element":"span"},{"href":"#id-122","text":"16","element":"a"}],[{"id":"id-41","text":"A. Talwalkar and A. Rostamizadeh. Matrix coherence and the Nystr¨om method. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1004.2008","element":"span"},{"text":", 2010. ","element":"span"},{"href":"#id-119","text":"5","element":"a"}],[{"id":"id-33","text":"A. Talwalkar, S. Kumar, and H. Rowley. Large-scale manifold learning. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","element":"span"},{"text":", 2008. ","element":"span"},{"href":"#id-124","text":"4","element":"a"}],[{"id":"id-1","text":"M. Turk and A. Pentland. Eigenfaces for recognition. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Cognitive Neuroscience","element":"span"},{"text":", 3 (1):71–86, 1991. ","element":"span"},{"text":"1","element":"span"}],[{"id":"id-22","text":"E. E. Tyrtyshnikov. Incomplete cross approximation in the mosaic-skeleton method. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Computing","element":"span"},{"text":", 64:367–380, 2000. ","element":"span"},{"href":"#id-116","text":"3","element":"a"}],[{"id":"id-30","text":"C. Williams and M. Seeger. Using the Nystr¨om method to speed up kernel machines. 
In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems (NIPS)","element":"span"},{"text":", 2001. ","element":"span"},{"href":"#id-124","text":"4","element":"a"}],[{"id":"id-34","text":"K. Zhang and J. T. Kwok. Clustered Nystr¨om method for large scale manifold learning and ","element":"span"},{"text":"dimension reduction. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"IEEE Transactions on Neural Networks","element":"span"},{"text":", 21(10):1576–1587, 2010. ","element":"span"},{"href":"#id-124","text":"4","element":"a"}],[{"id":"id-31","text":"K. Zhang, I. W. Tsang, and J. T. Kwok. Improved Nystr¨om low-rank approximation and ","element":"span"},{"text":"error analysis. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Machine Learning (ICML)","element":"span"},{"text":", 2008. ","element":"span"},{"href":"#id-124","text":"4","element":"a"}]]}],"_version":"3.3.4"},"paperNode":"$1b:props:children:props:children:0:props:product"}]]]}]}]