1b:["$","$L29",null,{"isWhiteLabelled":false,"children":["$","$Lb",null,{"pt":{"compact":0,"expanded":3},"children":[["$","$L2a",null,{"noStar":true,"publisher":true,"task":true,"params":true,"size":"xl","product":{"id":"eyJwYXBlcklEIjoiMTkwNS4wNDM3NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","updated":"2021-02-05T16:44:25.000Z","paperID":"1905.04374","published":"2019-05-05T16:41:25.000Z","authors":"[\"El-Mahdi El-Mhamdi\",\"Rachid Guerraoui\",\"Sébastien Rouault\"]","title":"Fast and Robust Distributed Learning in High Dimension","scoreTrending":null,"summary":"Could a gradient aggregation rule (GAR) for distributed machine learning be\nboth robust and fast? This paper answers by the affirmative through\nmulti-Bulyan. Given $n$ workers, $f$ of which are arbitrary malicious\n(Byzantine) and $m=n-f$ are not, we prove that multi-Bulyan can ensure a strong\nform of Byzantine resilience, as well as an ${\\frac{m}{n}}$ slowdown, compared\nto averaging, the fastest (but non Byzantine resilient) rule for distributed\nmachine learning. When $m \\approx n$ (almost all workers are correct),\nmulti-Bulyan reaches the speed of averaging. 
We also prove that multi-Bulyan's\ncost in local computation is $O(d)$ (like averaging), an important feature for\nML where $d$ commonly reaches $10^9$, while robust alternatives have at least\nquadratic cost in $d$.\n Our theoretical findings are complemented with an experimental evaluation\nwhich, in addition to supporting the linear $O(d)$ complexity argument, conveys\nthe fact that multi-Bulyan's parallelisability further adds to its efficiency.","lastCheckedForCode":"2022-09-03T11:20:21.974Z","links":[{"id":"eyJ1cmwiOiJodHRwczovL3BhcGVyc3dpdGhjb2RlLmNvbS9wYXBlci8xOTA1MDQzNzQifQ==","type":"pwc","url":"https://paperswithcode.com/paper/190504374","data":null},{"id":"eyJ1cmwiOiJodHRwczovL2dpdGh1Yi5jb20vTFBELUVQRkwvQWdncmVnYVRob3IuIn0=","type":"code","url":"https://github.com/LPD-EPFL/AggregaThor.","data":null}],"reposConnection":{"edges":[]},"models":[],"tags":[],"summaries":[{"model":"gpt-4o-mini","header":"paper.summary.expertise.beginner","summary":"The paper \"Fast and Robust Distributed Learning in High Dimension\" discusses a new method for training machine learning models using multiple computers at once. This approach helps the models learn quickly and perform well, even when there is a lot of data and variables involved. 
The researchers show that their method is efficient and can handle problems that traditional methods struggle with, making it easier for people to use machine learning in real-world situations."}],"emailsConnection":{"edges":[{"author":"rachid guerraoui","node":{"id":"eyJhZGRyZXNzIjoicmFjaGlkLmd1ZXJyYW91aUBlcGZsLmNoIn0=","address":"rachid.guerraoui@epfl.ch","name":"Rachid Guerraoui","avatar":null,"linkedin":"https://www.linkedin.com/in/rachid-guerraoui-40587b15","bio":null,"site":null,"override":null,"membership":[{"name":"EPFL"}],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[],"scholar":[{"thirdPartyID":"_TR-7CEAAAAJ"}],"twitter":[],"location":[{"formatted":"Switzerland"}],"owner":[{"id":"eyJ1aWQiOiI3ZTBmYTZjMi01YTJjLTRkOTItYjM3Zi0wMzdiODNjMTc3NWQifQ==","name":"Rachid 
Guerraoui","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMjEwNi4wMjM5OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.02398"},{"id":"eyJwYXBlcklEIjoiMTkwNS4wMzg1MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1905.03853"},{"id":"eyJwYXBlcklEIjoiMjIwOS4xNTI1OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2209.15259"},{"id":"eyJwYXBlcklEIjoiMjAwNi4wNzI3MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.07273"},{"id":"eyJwYXBlcklEIjoiMjEwMi4wODE2NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.08166"},{"id":"eyJwYXBlcklEIjoiMTkwNS4wNDM3NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1905.04374"},{"id":"eyJwYXBlcklEIjoiMTcxMS4wNjEwMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1711.06100"},{"id":"eyJwYXBlcklEIjoiMjAwOC4wMDc0MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2008.00742"},{"id":"eyJwYXBlcklEIjoiMTcwNy4wODE2NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1707.08167"},{"id":"eyJwYXBlcklEIjoiMjAwNi4wNzI3MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.07272"},{"id":"eyJwYXBlcklEIjoiMjMwOS4wNTM5NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2309.05395"},{"id":"eyJwYXBlcklEIjoiMTcwNi4wODg4NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1706.08884"},{"id":"eyJwYXBlcklEIjoiMTkwMi4wMTY4NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1902.01686"},{"id":"eyJwYXBlcklEIjoiMTgwNS4xMTQ0NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1805.11447"},{"id":"eyJwYXBlcklEIjoiMTgwNi4wMjUxMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1806.02510"},{"id":"eyJwYXBlcklEIjoiMTgwMi4wNzgzNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1802.07834"},{"id":"eyJwYXBlcklEIjoiMTgwMS4xMDQzNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1801.10437"},{"id":"eyJwYXBlcklEIjoiMjMwNC4xMz
U0MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2304.13540"}]}]}},{"author":"el mahdi el mhamdi","node":{"id":"eyJhZGRyZXNzIjoiZWxtYWhkaS5lbG1oYW1kaUBlcGZsLmNoIn0=","address":"elmahdi.elmhamdi@epfl.ch","name":"El Mahdi El Mhamdi","avatar":"https://img.fullcontact.com/static/42cf7ed0e966916700e762106360d5e3_cefee546d5c177efe0624625a1ab802b641fb39050e7d8bfe6126d63f8fd11ec","linkedin":"https://www.linkedin.com/in/mahdielmhamdi","bio":null,"site":null,"override":null,"membership":[{"name":"Swiss Federal Institute of Technology"}],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[],"scholar":[{"thirdPartyID":"kNA-WLQAAAAJ"}],"twitter":[{"avatar":null,"username":"M_h_i_l_h"}],"location":[{"formatted":"91120 Palaiseau, France"}],"owner":[{"id":"eyJ1aWQiOiI0YWU3YWVlMS1jNmNiLTRmNDMtYWExOS00Y2FhMzczYWE1Y2IifQ==","name":"chayti el 
mahdi","github":[],"email":[{"avatar":"https://img.fullcontact.com/static/42cf7ed0e966916700e762106360d5e3_cefee546d5c177efe0624625a1ab802b641fb39050e7d8bfe6126d63f8fd11ec"}],"authored":[{"id":"eyJwYXBlcklEIjoiMTkwNS4wMzg1MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1905.03853"},{"id":"eyJwYXBlcklEIjoiMjExMS4wNTk2OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2111.05968"},{"id":"eyJwYXBlcklEIjoiMjIwOS4xNTI1OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2209.15259"},{"id":"eyJwYXBlcklEIjoiMTkwNS4wNDM3NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1905.04374"},{"id":"eyJwYXBlcklEIjoiMjIwNi4wMDM5NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2206.00395"},{"id":"eyJwYXBlcklEIjoiMjAwOC4wMDc0MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2008.00742"},{"id":"eyJwYXBlcklEIjoiMTcwNy4wODE2NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1707.08167"},{"id":"eyJwYXBlcklEIjoiMjQwOS4wMzY4MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2409.03682"},{"id":"eyJwYXBlcklEIjoiMjMwMi4xMTk2MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2302.11962"},{"id":"eyJwYXBlcklEIjoiMTcwNi4wODg4NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1706.08884"},{"id":"eyJwYXBlcklEIjoiMTkwMi4wMTY4NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1902.01686"},{"id":"eyJwYXBlcklEIjoiMTgwNS4xMTQ0NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1805.11447"},{"id":"eyJwYXBlcklEIjoiMTgwNi4wMjUxMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1806.02510"},{"id":"eyJwYXBlcklEIjoiMTgwMi4wNzgzNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1802.07834"},{"id":"eyJwYXBlcklEIjoiMjIxMi4wMDc4MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2212.00781"}]}]}}]},"__typename":"paper","authorArray":["El-Mahdi El-Mhamdi","Rachid Guerraoui","Sébastien 
Rouault"]}}],["$","$L18",null,{"container":true,"columns":100,"spacing":{"compact":0,"expanded":2,"large":3},"children":[["$","$L18",null,{"size":{"compact":100,"expanded":100,"large":68},"children":[["$","$7",null,{"children":["$","$L2b",null,{"publisher":"arxiv","paperID":"1905.04374","product":{"paper":"$1b:props:children:props:children:0:props:product","models":"$1b:props:children:props:children:0:props:product:models"},"isWhiteLabelled":false}]}],["$","$7",null,{"children":["$","$L2c",null,{"article":"$L2d","model":"$undefined"}]}]]}],["$","$L18",null,{"size":"grow","children":["$","$L2e",null,{}]}]]}],["$","$7",null,{"children":null}],[["$","audio",null,{"id":"tts"}],["$","$L2f",null,{"paperID":"1905.04374","publisher":"arxiv","paperJSON":{"title":"Fast and Robust Distributed Learning in High Dimension","paperID":"1905.04374","avgLineHeight":13.55,"imgScale":4,"sections":[{"heading":"Abstract","paragraphs":[[{"text":"Modern machine learning is distributed and the work of several machines is typically aggregated by ","element":"span"},{"style":{"fontStyle":"italic"},"text":"averaging ","element":"span"},{"text":"which is the optimal rule in terms of speed, offering a speedup of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n ","element":"span"},{"text":"(with respect to using a single machine) when ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n ","element":"span"},{"text":"processes are learning together.","element":"span"}],[{"text":"Distributing data and models poses however fundamental vulnerabilities, be they to software bugs, asynchrony, or worse, to malicious attackers controlling some machines or injecting misleading data in the network. 
Such behavior is best modeled as Byzantine failures, and averaging does not tolerate a single one from a worker.","element":"span"}],[{"text":"Krum, the first provably Byzantine resilient aggregation rule for SGD only uses one worker per step, which hampers its speed of convergence, especially in best case conditions when none of the workers is actually Byzantine. An idea, coined multi-Krum, of using ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"different workers per step was mentioned, without however any proof of either its Byzantine resilience or its slowdown. More recently, it was shown that in high dimensional machine learning, guaranteeing convergence is not a sufficient condition for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"strong ","element":"span"},{"text":"Byzantine resilience. An improvement on Krum, coined Bulyan, was proposed and proved to guarantee stronger resilience. However, Bulyan suffers from the same weakness of Krum: using only one worker per step. 
This adds up to the aforementioned open problem and leaves the crucial need for both fast and strong Byzantine resilience unfulfilled.","element":"span"}],[{"text":"The present paper tackles both open problems and proposes using Bulyan over Multi-Krum (we call it ","element":"span"},{"text":"MULTI","element":"span"},{"text":"-B","element":"span"},{"text":"ULYAN","element":"span"},{"text":"), a combination for which we provide proofs of strong Byzantine resilience, as well as an ","element":"span"},{"style":{"height":16.57},"width":28,"height":41.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/0-0.png","element":"img","alt":"m/n ","inline":true,"padRight":true},{"text":"slowdown, compared to averaging, the fastest (but non Byzantine resilient) rule for distributed ","element":"span"},{"text":"machine learning.","element":"span"}],[{"text":"Finally, modern machine learning involves data of unprecedentedly high dimension: some models are nowadays vectors of dimension ","element":"span"},{"style":{"height":13.38},"width":129.72,"height":33.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/0-1.png","element":"img","alt":" d = 10^9","inline":true},{"text":". In order to deliver results within a reasonable time, learning algorithms should be at most linear in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d ","element":"span"},{"text":"and avoid using classic security mechanisms, most of which are at least quadratic in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d ","element":"span"},{"text":"and hence impractical. 
A strength of ","element":"span"},{"text":"MULTI","element":"span"},{"text":"-B","element":"span"},{"text":"ULYAN ","element":"span"},{"text":"is that it inherits the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"O","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"d","element":"span"},{"text":") ","element":"span"},{"text":"merits of both multi-Krum and Bulyan.","element":"span"}]]},{"heading":"1 Introduction","paragraphs":[[{"text":"The ongoing data deluge has been both a blessing and burden for machine learning system designers. A blessing since machine learning provably performs better with more training data ","element":"span"},{"href":"#id-0","referenceIndex":10,"text":"[10]","element":"a"},{"text":", and a burden since the numbers are beyond previous orders of magnitude. For instance, machine learning set of parameters are now in the gigabyte ","element":"span"},{"href":"#id-1","referenceIndex":5,"text":"[5]","element":"a"},{"text":", training data is several orders of magnitude beyond that ","element":"span"},{"href":"#id-1","referenceIndex":5,"text":"[5]","element":"a"},{"text":". For the latter reason, distributed machine learning is not an option, it is the only way to deliver results in a reasonable time for the user. 
For instance, Stochastic Gradient Descent (SGD), an algorithm which is the workhorse of today’s machine learning.","element":"span"}],[{"text":"With the amounts of workload involved in today’s machine learning, distributed systems are the only option to deliver results in a reasonable time.","element":"span"}],[{"text":"This constraint is even more crucial since ML, given its workload, relies on large scale distributed systems for which communications costs are additional constraints to local computation costs.","element":"span"}],[{"text":"We prove the similar ","element":"span"},{"style":{"height":18.49},"width":30,"height":46.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/1-0.png","element":"img","alt":"m/n ","inline":true,"padRight":true},{"text":"slowdown of ","element":"span"},{"text":"MULTI","element":"span"},{"text":"-B","element":"span"},{"text":"ULYAN ","element":"span"},{"text":"and its (strong) Byzantine resilience. We deduce ","element":"span"},{"text":"that ","element":"span"},{"text":"MULTI","element":"span"},{"text":"-B","element":"span"},{"text":"ULYAN ","element":"span"},{"text":"ensures strong Byzantine resilience and the very fact that it is ","element":"span"},{"style":{"height":18.49},"width":30,"height":46.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/1-1.png","element":"img","alt":"m/n ","inline":true,"padRight":true},{"text":"times as fast as the ","element":"span"},{"text":"optimal algorithm (averaging) in the absence of Byzantine workers.","element":"span"}],[{"text":"MULTI","element":"span"},{"text":"-B","element":"span"},{"text":"ULYAN ","element":"span"},{"text":"can be viewed as a generalization (also using ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"different workers per step to leverage the fact that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":", possibly less than a minority can be faulty) of 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Bulyan","element":"span"},{"text":", the defense mechanism we present in ","element":"span"},{"href":"#id-2","referenceIndex":6,"text":"[6]","element":"a"},{"text":". Before presenting in Section ","element":"span"},{"text":"3, ","element":"span"},{"text":"our proofs of convergence and slow down of ","element":"span"},{"text":"MULTI","element":"span"},{"text":"-K","element":"span"},{"text":"RUM ","element":"span"},{"text":"and in Section ","element":"span"},{"text":"4 ","element":"span"},{"text":"our proofs of convergence and slow down of B","element":"span"},{"text":"ULYAN ","element":"span"},{"text":"and hence ","element":"span"},{"text":"MULTI","element":"span"},{"text":"-B","element":"span"},{"text":"ULYAN","element":"span"},{"text":", we introduce in Section ","element":"span"},{"text":"2 ","element":"span"},{"text":"a toolbox of formal definitions: weak, strong, and ","element":"span"},{"style":{"height":17.6},"width":107.5,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/1-2.png","element":"img","alt":" (α, f)","inline":true},{"text":"–Byzantine resilience. 
We also present a necessary context on non-convex optimization, as well as its interplay with the high dimensionality of machine learning together with the","element":"span"},{"style":{"height":17.6},"width":59.36,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/1-3.png","element":"img","alt":"√d","inline":true,"padRight":true},{"text":"leeway it provides to strong attackers.","element":"span"}]]},{"heading":"2 Model","paragraphs":[[{"style":{"fontWeight":"bold"},"text":"2.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Stochastic Gradient Descent","element":"span"}],[{"text":"The learning task consists in making accurate predictions for the labels of each data instance ","element":"span"},{"style":{"height":16.4},"width":31.09,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/1-4.png","element":"img","alt":" ξi","inline":true,"padRight":true},{"text":"using a high dimensional model (for example, a neural network); we denote the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d ","element":"span"},{"text":"parameters of that model by the vector ","element":"span"},{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"x","element":"span"},{"text":". Each data instance has a set of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"features ","element":"span"},{"text":"(image pixels), and a set of labels (e.g., ","element":"span"},{"style":{"fontStyle":"italic"},"text":"{","element":"span"},{"text":"cat, person","element":"span"},{"style":{"fontStyle":"italic"},"text":"}","element":"span"},{"text":"). The CNN is trained with the popular backpropagation algorithm based on SGD. 
Specifically, SGD addresses the following optimization problem.","element":"span"}],[{"style":{"width":"61%"},"width":1149,"height":72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/1-5.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":16.4},"width":20,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/1-6.png","element":"img","alt":" ξ","inline":true,"padRight":true},{"text":"is a random variable representing a total of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"data instances and ","element":"span"},{"style":{"height":17.6},"width":137.34,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/1-7.png","element":"img","alt":" F(x; ξ)","inline":true,"padRight":true},{"text":"is the loss function. The function ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Q","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"x","element":"span"},{"text":") ","element":"span"},{"text":"is smooth but not convex.","element":"span"}],[{"text":"SGD computes the gradient (","element":"span"},{"style":{"height":17.6},"width":397.98,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/1-8.png","element":"img","alt":"G(x, ξ) ≜ ∇F(x; ξ)","inline":true},{"text":") and then updates the model parameters (","element":"span"},{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"x","element":"span"},{"text":") in a direction opposite to that of the gradient (descent). 
The vanilla SGD update rule given a sequence of learning rates ","element":"span"},{"style":{"height":17.6},"width":86.89,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/1-9.png","element":"img","alt":" {γk}","inline":true,"padRight":true},{"text":"at any given step","element":"span"},{"href":"#id-3","text":"1 ","element":"a"},{"text":"is the following:","element":"span"}],[{"style":{"width":"65%"},"width":1226,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/1-10.png","element":"img"}],[{"text":"The popularity of SGD stems from its ability to employ noisy approximations of the actual gradient. In a distributed setup, SGD employs a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"mini-batch ","element":"span"},{"text":"of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"b < B ","element":"span"},{"text":"training instances for the gradient computation:","element":"span"}],[{"id":"id-3","style":{"width":"99%"},"width":1870,"height":158,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/1-11.png","element":"img"}],[{"style":{"width":"22%"},"width":428,"height":281,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/2-0.png","element":"img"}],[{"id":"id-5","style":{"fontWeight":"bold"},"text":"Figure 1: Correct workers (black dashed arrows) estimating the real gradient (blue full arrow) while ","element":"figcaption","subtype":"caption"},{"style":{"fontWeight":"bold"},"text":"a Byzantine worker (red dotted)","element":"figcaption","subtype":"caption"}],[{"text":"The size of the mini-batch (","element":"span"},{"style":{"fontStyle":"italic"},"text":"b","element":"span"},{"text":") induces a trade-off between the robustness of a given update (noise in the gradient approximation) and the time required to compute this update. 
The mini-batch also affects the amount of parallelism (Equation ","element":"span"},{"href":"#id-3","text":"3) ","element":"a"},{"text":"that modern computing clusters (multi-GPU etc.) largely benefit from. Scaling the mini-batch size to exploit additional parallelism requires however a non-trivial selection of the sequence of learning rates ","element":"span"},{"href":"#id-4","referenceIndex":7,"text":"[7]","element":"a"},{"text":". A very important assumption for the convergence properties of SGD is that each gradient is an unbiased estimation of the actual gradient, which is typically ensured through uniform random sampling, i.e., gradients that are on expectation equal to the actual gradient (Figure ","element":"span"},{"href":"#id-5","text":"1)","element":"a"},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"2.2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Algorithms","element":"span"}],[{"text":"MULTI","element":"span"},{"text":"-B","element":"span"},{"text":"ULYAN ","element":"span"},{"text":"relies on two algorithmic components: ","element":"span"},{"text":"MULTI","element":"span"},{"text":"-K","element":"span"},{"text":"RUM ","element":"span"},{"href":"#id-6","referenceIndex":1,"text":"[1] ","element":"a"},{"text":"and B","element":"span"},{"text":"ULYAN ","element":"span"},{"href":"#id-2","referenceIndex":6,"text":"[6]","element":"a"},{"text":". 
The former rule requires that ","element":"span"},{"style":{"height":16.4},"width":207.59,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/2-1.png","element":"img","alt":" n ≥ 2f + 3","inline":true,"padRight":true},{"text":"and the second requires that ","element":"span"},{"style":{"height":16.4},"width":218.4,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/2-2.png","element":"img","alt":" n ≥ 4f + 3.","inline":true}],[{"text":"Intuitively, the goal of ","element":"span"},{"text":"MULTI","element":"span"},{"text":"-K","element":"span"},{"text":"RUM ","element":"span"},{"text":"is to select the gradients that deviate less from the “majority” based on their relative distances. Given gradients ","element":"span"},{"style":{"height":14.62},"width":182.77,"height":36.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/2-3.png","element":"img","alt":" G1 . . . Gn","inline":true,"padRight":true},{"text":"proposed by workers 1 to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n ","element":"span"},{"text":"respectively, ","element":"span"},{"text":"MULTI","element":"span"},{"text":"-K","element":"span"},{"text":"RUM ","element":"span"},{"text":"selects the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"gradients with the smallest sum of scores (i.e., L2 norm from the other gradients) as follows:","element":"span"}],[{"style":{"width":"63%"},"width":1197,"height":101,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/2-4.png","element":"img"}],[{"text":"where given a function ","element":"span"},{"style":{"fontStyle":"italic"},"text":"X","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":")","element":"span"},{"text":", ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"m","element":"span"},{"text":") arg 
min(","element":"span"},{"style":{"fontStyle":"italic"},"text":"X","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":")) ","element":"span"},{"text":"denotes the indexes ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i ","element":"span"},{"text":"with the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"smallest ","element":"span"},{"style":{"fontStyle":"italic"},"text":"X","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":") ","element":"span"},{"text":"values, and ","element":"span"},{"style":{"height":16},"width":103.12,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/2-5.png","element":"img","alt":" i → j","inline":true,"padRight":true},{"text":"means that ","element":"span"},{"style":{"height":17.02},"width":53.7,"height":42.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/2-6.png","element":"img","alt":" Gj","inline":true,"padRight":true},{"text":"is among the ","element":"span"},{"style":{"height":16.4},"width":181.89,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/2-7.png","element":"img","alt":" n − f − 2","inline":true,"padRight":true},{"text":"closest gradients to ","element":"span"},{"style":{"height":14.62},"width":232.4,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/2-8.png","element":"img","alt":" Gi. 
BULYAN","inline":true,"padRight":true},{"text":"in turn takes the aforementioned ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"vectors, computes their coordinate-wise median and produces a gradient which coordinates are the average of the ","element":"span"},{"style":{"height":16.4},"width":138.46,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/2-9.png","element":"img","alt":" m − 2f","inline":true,"padRight":true},{"text":"closest values to the median.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"2.3 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Byzantine Resilience","element":"span"}],[{"text":"Intuitively, weak Byzantine resilience requires a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"GAR ","element":"span"},{"text":"to guarantee convergence despite the presence of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"Byzantine workers. It can be formally stated as follows.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Definition 1 ","element":"span"},{"text":"(Weak Byzantine resilience)","element":"span"},{"style":{"fontWeight":"bold"},"text":". 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"We say that a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"GAR ","element":"span"},{"style":{"fontStyle":"italic"},"text":"ensures weak ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"style":{"fontStyle":"italic"},"text":"-Byzantine resilience if the sequence ","element":"span"},{"style":{"height":15.93},"width":73.42,"height":39.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/2-10.png","element":"img","alt":" x(k) ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"(Equation 2 in the main paper) converges almost surely to some ","element":"span"},{"style":{"height":17.6},"width":448.18,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/2-11.png","element":"img","alt":" x∗ where ∇Q(x∗) = 0,","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"despite the presence of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Byzantine workers.","element":"span"}],[{"text":"On the other hand, strong Byzantine resilience requires that this convergence does not lead to ”bad” optimums, and is related to more intricate problem of non-convex optimization, which, in the presence of Byzantine workers, is highly aggravated by the dimension of the problem as explained in what follows.","element":"span"}],[{"id":"id-9","style":{"width":"40%"},"width":750,"height":658,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/3-0.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Figure 2: In a non-convex situation, two correct vectors (black arrows) are pointing towards the deep optimum located in area B, both vectors belong to the plane formed by lines L1 and L2. 
A Byzantine worker (magenta) is taking benefit from the third dimension, and the non-convex landscape, to place a vector that is heading towards one of the bad local optimums of area A. This Byzantine vector is located in the plane (L1,L3). Due to the variance of the correct workers on the plane (L1,L2), the Byzantine one has a budget of about","element":"figcaption","subtype":"caption"},{"style":{"height":17.6},"width":58.36,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/3-1.png","element":"img","alt":"√3","inline":true,"padRight":true},{"style":{"fontWeight":"bold"},"text":"times the disagreement of the correct workers, to put as a deviation towards A, on the line (L3), while still being selected by a weak Byzantine resilient ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"GAR","element":"figcaption","subtype":"caption"},{"style":{"fontWeight":"bold"},"text":", since its projection on the plane (L1,L2) lies exactly on the line (L1), unlike that of the correct workers. In very high dimensions, the situation is amplified by","element":"figcaption","subtype":"caption"},{"style":{"height":17.6},"width":70.07,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/3-2.png","element":"img","alt":"√d.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Specificity of non-convex optimization. ","element":"span"},{"text":"Non-convex optimization is one of the earliest established NP-hard problems ","element":"span"},{"href":"#id-7","referenceIndex":8,"text":"[8]","element":"a"},{"text":". 
In fact, many interesting but hard questions in machine learning boil down to one answer: ”because the cost function is not necessarily convex”.","element":"span"}],[{"text":"In distributed machine learning, the non-convexity of the cost function creates two non-intuitive behaviours that are important to highlight.","element":"span"}],[{"text":"(1) A ”mild” Byzantine worker can make the system converge faster. For instance, it has been reported several times in the literature that noise accelerates learning ","element":"span"},{"href":"#id-8","referenceIndex":2,"text":"[2, ","element":"a"},{"href":"#id-7","referenceIndex":8,"text":"8]","element":"a"},{"text":". This can be understood from the ”S” (stochasticity) of SGD: as (correct) workers cannot have a full picture of the surrounding landscape of the loss, they can only draw a sample at random and estimate the best direction based on that sample, which can be, and is probably biased compared to the true gradient. Moreover, due to non-convexity, even the true gradient might be leading to the local minima where the parameter server is. By providing a wrong direction (i.e. not the true gradient, or a correct stochastic estimation), a Byzantine worker whose resources cannot face the high-dimensional landscape of the loss, might end up providing a direction to get out of that local minima.","element":"span"}],[{"text":"(2) Combined with high dimensional issues, non-convexity explains the need for strong Byzantine resilience. Unlike the ”mild” Byzantine worker, a strong adversary with more resources than the workers and the server, can see a larger picture and provide an attack that requires a stronger requirement. Namely, a requirement that would cut the","element":"span"},{"style":{"height":17.6},"width":59.37,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/3-3.png","element":"img","alt":"√d","inline":true,"padRight":true},{"text":"leeway offered to an attacker in each dimension. 
Figure ","element":"span"},{"href":"#id-9","text":"2 ","element":"a"},{"text":"provides an illustration.","element":"span"}],[{"text":"This motivates the following formalization of strong Byzantine resilience.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Definition 2 ","element":"span"},{"text":"(Strong Byzantine resilience)","element":"span"},{"style":{"fontWeight":"bold"},"text":". ","element":"span"},{"style":{"fontStyle":"italic"},"text":"We say that a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"GAR ","element":"span"},{"style":{"fontStyle":"italic"},"text":"ensures strong ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"style":{"fontStyle":"italic"},"text":"-Byzantine resilience if for every ","element":"span"},{"style":{"height":17.6},"width":156.41,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/3-4.png","element":"img","alt":" i ∈ [1, d]","inline":true},{"style":{"fontStyle":"italic"},"text":", there exists a correct gradient ","element":"span"},{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"G ","element":"span"},{"style":{"fontStyle":"italic"},"text":"(i.e., computed by a non-Byzantine worker) s.t. ","element":"span"},{"style":{"height":17.6},"width":205.5,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/3-5.png","element":"img","alt":" E|GARi−","inline":true},{"style":{"height":25.5},"width":251.86,"height":63.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/3-6.png","element":"img","alt":"Gi| = O( 1√d)","inline":true},{"style":{"fontStyle":"italic"},"text":". 
The expectation is taken over the random samples (","element":"span"},{"style":{"height":16.4},"width":20,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/3-7.png","element":"img","alt":"ξ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"in Equation 1) and ","element":"span"},{"style":{"height":11.02},"width":36.73,"height":27.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/3-8.png","element":"img","alt":" vi","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"denotes the","element":"span"}],[{"style":{"height":15.53},"width":47.27,"height":38.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/4-0.png","element":"img","alt":"ith ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"coordinate of a vector ","element":"span"},{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"v","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Weak vs. strong Byzantine resilience. ","element":"span"},{"text":"To attack non-Byzantine resilient ","element":"span"},{"style":{"fontStyle":"italic"},"text":"GAR","element":"span"},{"text":"s such as averaging, it only takes the computation of an estimate of the gradient, which can be done in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"O","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"n.d","element":"span"},{"text":") ","element":"span"},{"text":"operations per round by a Byzantine worker. 
This attack is reasonably cheap: within the usual cost of the workload of other workers, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"O","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"d","element":"span"},{"text":")","element":"span"},{"text":", and the server, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"O","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"n.d","element":"span"},{"text":")","element":"span"},{"text":".","element":"span"}],[{"text":"To attack weakly Byzantine-resilient ","element":"span"},{"style":{"fontStyle":"italic"},"text":"GAR","element":"span"},{"text":"s however, one needs to find the ’most legitimate but harmful vector possible’, i.e one that will (1) be selected by a weakly Byzantine-resilient ","element":"span"},{"style":{"fontStyle":"italic"},"text":"GAR","element":"span"},{"text":", and (2) be misleading convergence (red arrow in Figure 1). To find this vector, an attacker has to first collect every correct worker’s vector (before they reach the server), and solve an optimization problem (by linear regression) to approximate this harmful but legitimate vector ","element":"span"},{"href":"#id-2","referenceIndex":6,"text":"[6]","element":"a"},{"text":". If the desired quality of the approximation is ","element":"span"},{"style":{"height":15.2},"width":96.53,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/4-1.png","element":"img","alt":" ϵ, the","inline":true,"padRight":true},{"text":"Byzantine worker would need at least ","element":"span"},{"style":{"height":21.69},"width":122.44,"height":54.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/4-2.png","element":"img","alt":" Ω( n.dϵ )","inline":true,"padRight":true},{"text":"operation to reach it with regression. 
This is a tight lower ","element":"span"},{"text":"bound for a regression problem in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d ","element":"span"},{"text":"dimensions with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n ","element":"span"},{"text":"vectors ","element":"span"},{"href":"#id-7","referenceIndex":8,"text":"[8]","element":"a"},{"text":". In practice, if the required precision is of order ","element":"span"},{"style":{"height":17.53},"width":273.22,"height":43.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/4-3.png","element":"img","alt":" 10−9, with 100","inline":true,"padRight":true},{"text":"workers and a neural network model of dimension ","element":"span"},{"style":{"height":15.13},"width":60.64,"height":37.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/4-4.png","element":"img","alt":" 109","inline":true},{"text":", the cost of the attack becomes quickly prohibitive (","element":"span"},{"style":{"height":15.13},"width":123.63,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/4-5.png","element":"img","alt":"≈ 1020 ","inline":true,"padRight":true},{"text":"operations to be done in each step by the attacker).","element":"span"}],[{"text":"To summarize, weak Byzantine resilience can be enough as a practical solution against attackers whose resources are comparable to the server’s. 
However, strong Byzantine resilience remains the only provable solution against attackers with significant resources.","element":"span"}],[{"text":"For the sake of our theoretical analysis, we also recall the definition of ","element":"span"},{"style":{"height":17.6},"width":107.5,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/4-6.png","element":"img","alt":" (α, f)","inline":true},{"text":"–Byzantine resilience ","element":"span"},{"href":"#id-6","referenceIndex":1,"text":"[1] ","element":"a"},{"text":"(Definition ","element":"span"},{"href":"#id-10","text":"3)","element":"a"},{"text":". This definition is a sufficient condition (as proved in ","element":"span"},{"href":"#id-6","referenceIndex":1,"text":"[1] ","element":"a"},{"text":"based on ","element":"span"},{"href":"#id-8","referenceIndex":2,"text":"[2]","element":"a"},{"text":") for weak Byzantine resilience. Even though the property of ","element":"span"},{"style":{"height":17.6},"width":107.5,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/4-7.png","element":"img","alt":" (α, f)","inline":true},{"text":"–Byzantine resilience is a sufficient, but not a necessary condition for (weak) Byzantine resilience, it has been so far used as the de facto standard ","element":"span"},{"href":"#id-6","referenceIndex":1,"text":"[1,","element":"a"},{"href":"#id-11","referenceIndex":4,"text":"4,","element":"a"},{"href":"#id-12","referenceIndex":11,"text":"11] ","element":"a"},{"text":"to guarantee (weak) Byzantine resilience for SGD. 
We will therefore follow this standard and require ","element":"span"},{"style":{"height":17.6},"width":306.86,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/4-8.png","element":"img","alt":" (α, f)–Byzantine","inline":true,"padRight":true},{"text":"resilience from any ","element":"span"},{"style":{"fontStyle":"italic"},"text":"GAR ","element":"span"},{"text":"that is plugged into ","element":"span"},{"text":"MULTI","element":"span"},{"text":"-B","element":"span"},{"text":"ULYAN","element":"span"},{"text":", in particular, we will require it from ","element":"span"},{"text":"MULTI","element":"span"},{"text":"-K","element":"span"},{"text":"RUM","element":"span"},{"text":". The theoretical analysis done in ","element":"span"},{"href":"#id-2","referenceIndex":6,"text":"[6] ","element":"a"},{"text":"guarantees that B","element":"span"},{"text":"ULYAN ","element":"span"},{"text":"inherits it.","element":"span"}],[{"text":"Intuitively, Definition ","element":"span"},{"href":"#id-10","text":"3 ","element":"a"},{"text":"states that the gradient aggregation rule ","element":"span"},{"style":{"fontStyle":"italic"},"text":"GAR ","element":"span"},{"text":"produces an output vector that lives, on average (over random samples used by SGD), in the cone of angle ","element":"span"},{"style":{"height":8.4},"width":28,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/4-9.png","element":"img","alt":" α","inline":true,"padRight":true},{"text":"around the true gradient. 
We simply call this the ”correct cone”.","element":"span"}],[{"id":"id-10","style":{"fontWeight":"bold"},"text":"Definition 3 ","element":"span"},{"text":"(","element":"span"},{"style":{"height":17.6},"width":107.5,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/4-10.png","element":"img","alt":"(α, f)","inline":true},{"text":"–Byzantine resilience (as in ","element":"span"},{"href":"#id-6","referenceIndex":1,"text":"[1]","element":"a"},{"text":"))","element":"span"},{"style":{"fontWeight":"bold"},"text":". ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Let ","element":"span"},{"style":{"height":17.6},"width":261.78,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/4-11.png","element":"img","alt":" 0 ≤ α < π/2","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"be any angular value, and any integer ","element":"span"},{"style":{"height":16.4},"width":535.5,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/4-12.png","element":"img","alt":" 0 ≤ f ≤ n. Let V1, . . . , Vn","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"be any independent identically distributed random vectors in ","element":"span"},{"style":{"height":17.53},"width":62.94,"height":43.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/4-13.png","element":"img","alt":" Rd,","inline":true},{"style":{"height":17.64},"width":701,"height":44.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/4-14.png","element":"img","alt":"Vi ∼ G, with EG = g. Let B1, . . . 
, Bf","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"be any random vectors in ","element":"span"},{"style":{"height":15.13},"width":49.52,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/4-15.png","element":"img","alt":" Rd","inline":true},{"style":{"fontStyle":"italic"},"text":", possibly dependent on the ","element":"span"},{"style":{"height":14.62},"width":147.42,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/4-16.png","element":"img","alt":" Vi’s. An","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"aggregation rule ","element":"span"},{"style":{"fontStyle":"italic"},"text":"GAR ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is said to be ","element":"span"},{"style":{"height":17.6},"width":107.5,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/4-17.png","element":"img","alt":" (α, f)","inline":true},{"style":{"fontStyle":"italic"},"text":"-Byzantine resilient if, for any ","element":"span"},{"style":{"height":17.24},"width":541.18,"height":43.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/4-18.png","element":"img","alt":" 1 ≤ j1 < · · · < jf ≤ n, vector","inline":true}],[{"style":{"width":"44%"},"width":828,"height":114,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/4-19.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"satisfies ","element":"span"},{"text":"(i) ","element":"span"},{"style":{"height":19.13},"width":1300.13,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/4-20.png","element":"img","alt":" ⟨EGAR, g⟩ ≥ (1 − sin α) · ∥g∥2 > 0 2 and (ii) for r = 2, 3, 4, E ∥GAR∥r ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is bounded above by a linear combination of terms 
","element":"span"},{"style":{"height":18.04},"width":872.05,"height":45.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/4-21.png","element":"img","alt":" E ∥G∥r1 . . . E ∥G∥rn−1 with r1 + · · · + rn−1 = r.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Choice of ","element":"span"},{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"f","element":"span"},{"style":{"fontWeight":"bold"},"text":". ","element":"span"},{"text":"The properties of the existing Byzantine-resilient SGD algorithms all depend on one important parameter, i.e., the number of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"potentially ","element":"span"},{"text":"Byzantine nodes ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":". It is important to notice that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"denotes a contract between the designer of the fault-tolerant solution and the user of the solution (who implements a service on top of the solution and deploys it in a specific setting). As long as the number of Byzantine workers is less than ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":", the solution is safe. Fixing an optimal value for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"is an orthogonal problem. 
For example, if daily failures in a data center are about 1%, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"= 0","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"},{"text":"01","element":"span"},{"style":{"fontStyle":"italic"},"text":".n ","element":"span"},{"text":"would be a suggested choice to tune the algorithm, and suffer from only a 99% slowdown.","element":"span"}],[{"text":"The performance (convergence time) of certain existing Byzantine-resilient SGD algorithms in a non-Byzantine environment is independent of the choice of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":". These algorithms do not exploit the full potential of the choice of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":". Modern large-scale systems are versatile and often undergo important structural changes while providing online services (e.g., addition or maintenance of certain worker nodes). 
Intuitively, there should be a fine granularity between the level of pessimism (i.e., value of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":") and the performance of the SGD algorithm in the setting with no Byzantine failures.","element":"span"}]]},{"heading":"3 MULTI-KRUM: Weak Byzantine Resilience and Slowdown","paragraphs":[[{"text":"Let ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n ","element":"span"},{"text":"be any integer greater than ","element":"span"},{"text":"2","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"any integer s.t ","element":"span"},{"style":{"height":21.29},"width":293.86,"height":53.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/5-0.png","element":"img","alt":" f ≤ n−22 and m","inline":true,"padRight":true},{"text":"an integer s.t ","element":"span"},{"style":{"height":16.4},"width":381.7,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/5-1.png","element":"img","alt":" m ≤ n − f − 2. 
Let","inline":true},{"style":{"height":16.4},"width":288.23,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/5-2.png","element":"img","alt":"˜m = n − f − 2.","inline":true}],[{"text":"We first prove the ","element":"span"},{"style":{"height":17.6},"width":107.5,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/5-3.png","element":"img","alt":" (α, f)","inline":true},{"text":"–Byzantine resilience of ","element":"span"},{"text":"MULTI","element":"span"},{"text":"-K","element":"span"},{"text":"RUM ","element":"span"},{"text":"(Lemma 1), then prove its almost sure convergence (Lemma 2) based on that, which proves the weak Byzantine resilience of ","element":"span"},{"text":"MULTI","element":"span"},{"text":"-K","element":"span"},{"text":"RUM ","element":"span"},{"text":"(Theorem 1).","element":"span"}],[{"text":"In all that follows, expectations are taken over random samples used by correct workers to estimate the gradient, i.e., the ”S” (stochasticity) that is inherent to SGD. It is worth noting that this analysis in expectation is not an average case analysis from the point of view of Byzantine fault tolerance. 
For instance, the Byzantine worker is always assumed to follow arbitrarily bad policies and the analysis is a worst-case one.","element":"span"}],[{"text":"The Byzantine resilience proof (Lemma 1) relies on the following observation: given ","element":"span"},{"style":{"height":16.4},"width":295.67,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/5-4.png","element":"img","alt":" m ≤ n − f − 2,","inline":true,"padRight":true},{"text":"and in particular ","element":"span"},{"href":"#id-13","style":{"height":18.73},"width":388.96,"height":46.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/5-5.png","element":"img","alt":" m = n − f − 2 3, m","inline":true},{"text":"-Krum averages ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"gradients that are all in the ”correct cone”, and a cone is a convex set, thus stable by averaging. The resulting vectors therefore also live in that cone. The angle of the cone will depend on a variable ","element":"span"},{"style":{"height":17.6},"width":121.58,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/5-6.png","element":"img","alt":" η(n.f)","inline":true,"padRight":true},{"text":"as in ","element":"span"},{"href":"#id-6","referenceIndex":1,"text":"[1]","element":"a"},{"text":", the value of ","element":"span"},{"style":{"height":17.6},"width":121.58,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/5-7.png","element":"img","alt":" η(n.f)","inline":true,"padRight":true},{"text":"itself depends on ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m","element":"span"},{"text":". 
This is what enables us to use multi-Krum as the basis of our ","element":"span"},{"text":"MULTI","element":"span"},{"text":"-K","element":"span"},{"text":"RUM","element":"span"},{"text":", unlike ","element":"span"},{"href":"#id-6","referenceIndex":1,"text":"[1] ","element":"a"},{"text":"where a restriction is made on ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"= 1","element":"span"},{"text":".","element":"span"}],[{"text":"The proof of Lemma 2 is the same as the one in ","element":"span"},{"href":"#id-6","referenceIndex":1,"text":"[1] ","element":"a"},{"text":"which itself draws on the rather classic analysis of SGD made by L. Bottou ","element":"span"},{"href":"#id-8","referenceIndex":2,"text":"[2]","element":"a"},{"text":". The key concepts are (1) a global confinement of the sequence of parameter vectors and (2) a bound on the statistical moments of the random sequence of estimators built by the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"GAR ","element":"span"},{"text":"of ","element":"span"},{"text":"MULTI","element":"span"},{"text":"-K","element":"span"},{"text":"RUM","element":"span"},{"text":". As in ","element":"span"},{"href":"#id-6","referenceIndex":1,"text":"[1,","element":"a"},{"href":"#id-8","referenceIndex":2,"text":"2]","element":"a"},{"text":", reasonable assumptions are made on the cost function ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Q","element":"span"},{"text":", those assumptions are not restrictive and are common in practical machine learning.","element":"span"}],[{"id":"id-18","style":{"fontWeight":"bold"},"text":"Theorem 1 ","element":"span"},{"text":"(Byzantine resilience and slowdown of ","element":"span"},{"text":"MULTI","element":"span"},{"text":"-K","element":"span"},{"text":"RUM","element":"span"},{"text":")","element":"span"},{"style":{"fontWeight":"bold"},"text":". 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Let ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"style":{"fontStyle":"italic"},"text":"be any integer s.t. ","element":"span"},{"style":{"height":16.4},"width":259.48,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/5-8.png","element":"img","alt":" m ≤ n−f −2.","inline":true,"padRight":true},{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"(i) ","element":"span"},{"text":"MULTI","element":"span"},{"text":"-K","element":"span"},{"text":"RUM ","element":"span"},{"style":{"fontStyle":"italic"},"text":"has weak Byzantine resilience against ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"style":{"fontStyle":"italic"},"text":"failures. ","element":"span"},{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"(ii) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"In the absence of Byzantine workers, ","element":"span"},{"text":"MULTI","element":"span"},{"text":"-K","element":"span"},{"text":"RUM ","element":"span"},{"style":{"fontStyle":"italic"},"text":"has a slowdown (expressed in ratio with averaging) of ","element":"span"},{"style":{"height":21.29},"width":115.98,"height":53.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/5-9.png","element":"img","alt":" Ω( ˜mn ).","inline":true}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Proof of (i). 
","element":"span"},{"text":"To prove ","element":"span"},{"style":{"fontWeight":"bold"},"text":"(i)","element":"span"},{"text":", we will require Lemma 1 and Lemma 2, then conclude by construction of ","element":"span"},{"text":"MULTI","element":"span"},{"text":"-K","element":"span"},{"text":"RUM ","element":"span"},{"text":"as a multi-Krum algorithm with ","element":"span"},{"style":{"height":16.4},"width":288.23,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/5-10.png","element":"img","alt":" m = n − f − 2.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Lemma 1. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Let ","element":"span"},{"style":{"height":15.2},"width":187.81,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/5-11.png","element":"img","alt":" V1, . . . , Vn","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"be any independent and identically distributed random ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d","element":"span"},{"style":{"fontStyle":"italic"},"text":"-dimensional vectors s.t ","element":"span"},{"style":{"height":21.69},"width":1355.12,"height":54.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/5-12.png","element":"img","alt":" Vi ∼ G, with EG = g and E ∥G − g∥2 = dσ2. Let B1, . . . , Bf be any f","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"random vectors, possibly dependent on the ","element":"span"},{"style":{"height":20.08},"width":923.01,"height":50.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/5-13.png","element":"img","alt":" Vi’s. 
If 2f + 2 < n and η(n, f)√d · σ < ∥g∥, where","inline":true}],[{"id":"id-13","style":{"width":"73%"},"width":1381,"height":162,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/5-14.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"then the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"GAR ","element":"span"},{"style":{"fontStyle":"italic"},"text":"function of ","element":"span"},{"text":"MULTI","element":"span"},{"text":"-K","element":"span"},{"text":"RUM ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is ","element":"span"},{"style":{"height":17.6},"width":107.5,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/6-0.png","element":"img","alt":" (α, f)","inline":true},{"style":{"fontStyle":"italic"},"text":"-Byzantine resilient where ","element":"span"},{"style":{"height":17.6},"width":236.52,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/6-1.png","element":"img","alt":" 0 ≤ α < π/2","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is defined by","element":"span"}],[{"style":{"width":"23%"},"width":442,"height":110,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/6-2.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"Without loss of generality, we assume that the Byzantine vectors ","element":"span"},{"style":{"height":17.24},"width":201.1,"height":43.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/6-3.png","element":"img","alt":" B1, . . . 
, Bf","inline":true,"padRight":true},{"text":"occupy the last ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"positions in the list of arguments of ","element":"span"},{"text":"MULTI","element":"span"},{"text":"-K","element":"span"},{"text":"RUM","element":"span"},{"text":", i.e., ","element":"span"},{"text":"MULTI","element":"span"},{"text":"-K","element":"span"},{"text":"RUM ","element":"span"},{"style":{"height":18.44},"width":802.88,"height":46.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/6-4.png","element":"img","alt":" = MULTI-KRUM(V1, . . . , Vn−f, B1, . . . , Bf).","inline":true,"padRight":true},{"text":"An index is ","element":"span"},{"style":{"fontStyle":"italic"},"text":"correct ","element":"span"},{"text":"if it refers to a vector among ","element":"span"},{"style":{"height":17.24},"width":232.71,"height":43.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/6-5.png","element":"img","alt":" V1, . . . , Vn−f","inline":true},{"text":". An index is ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Byzantine ","element":"span"},{"text":"if it refers to a vector among ","element":"span"},{"style":{"height":17.24},"width":201.1,"height":43.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/6-6.png","element":"img","alt":" B1, . . . , Bf","inline":true},{"text":". For each index (correct or Byzantine) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":", we denote by ","element":"span"},{"style":{"height":17.6},"width":293.56,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/6-7.png","element":"img","alt":" δc(i) (resp. δb(i)","inline":true},{"text":") the number of correct (resp. 
Byzantine) indices ","element":"span"},{"style":{"height":16.4},"width":306.28,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/6-8.png","element":"img","alt":" j such that i → j","inline":true,"padRight":true},{"text":"(the notation we introduced in Section 3 when defining ","element":"span"},{"text":"MULTI","element":"span"},{"text":"-K","element":"span"},{"text":"RUM","element":"span"},{"text":"), i.e the number of workers, among the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"neighbors of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i ","element":"span"},{"text":"that are correct (resp. Byzantine). We have","element":"span"}],[{"style":{"width":"22%"},"width":428,"height":177,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/6-9.png","element":"img"}],[{"text":"We focus first on the condition ","element":"span"},{"style":{"fontStyle":"italic"},"text":"(i) ","element":"span"},{"text":"of ","element":"span"},{"style":{"height":17.6},"width":107.5,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/6-10.png","element":"img","alt":" (α, f)","inline":true},{"text":"-Byzantine resilience. We determine an upper bound on the squared distance ","element":"span"},{"style":{"height":19.13},"width":418.68,"height":47.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/6-11.png","element":"img","alt":" ∥EMULTI-KRUM − g∥2","inline":true},{"text":". Note that, for any correct ","element":"span"},{"style":{"height":17.02},"width":193.92,"height":42.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/6-12.png","element":"img","alt":" j, EVj = g","inline":true},{"text":". 
We denote by ","element":"span"},{"style":{"height":15.02},"width":205.76,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/6-13.png","element":"img","alt":" i∗ the index","inline":true,"padRight":true},{"text":"of the worst scoring among the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"vectors chosen by the ","element":"span"},{"text":"MULTI","element":"span"},{"text":"-K","element":"span"},{"text":"RUM ","element":"span"},{"text":"function, i.e., one that ranks with the ","element":"span"},{"style":{"height":15.53},"width":70.56,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/6-14.png","element":"img","alt":"mth ","inline":true,"padRight":true},{"text":"smallest score in Equation 5 of the main paper (Section 3).","element":"span"}],[{"style":{"width":"85%"},"width":1596,"height":711,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/6-15.png","element":"img"}],[{"text":"where ","element":"span"},{"text":"I ","element":"span"},{"text":"denotes the indicator function","element":"span"},{"href":"#id-14","text":"4","element":"a"},{"text":". We examine the case ","element":"span"},{"style":{"height":14.62},"width":107.14,"height":36.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/6-16.png","element":"img","alt":" i∗ = i","inline":true,"padRight":true},{"text":"for some correct index ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":". 
","element":"span"},{"style":{"height":63.96},"width":206.39,"height":159.9,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/6-17.png","element":"img","alt":"��Vi − 1δc(i)","inline":true}],[{"id":"id-14","style":{"width":"87%"},"width":1636,"height":330,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/6-18.png","element":"img"}],[{"style":{"width":"7%"},"width":141,"height":46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/7-0.png","element":"img"}],[{"text":"We now examine the case ","element":"span"},{"style":{"height":15.02},"width":118.38,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/7-1.png","element":"img","alt":" i∗ = k","inline":true,"padRight":true},{"text":"for some Byzantine index ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k","element":"span"},{"text":". The fact that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"minimizes the score implies that for all correct indices ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"}],[{"style":{"width":"93%"},"width":1742,"height":599,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/7-2.png","element":"img"}],[{"text":"We focus on the term ","element":"span"},{"style":{"height":19.13},"width":105.27,"height":47.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/7-3.png","element":"img","alt":" D2(i)","inline":true},{"text":". Each correct process ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i ","element":"span"},{"text":"has ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"neighbors, and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"+ 1 ","element":"span"},{"text":"non-neighbors. 
Thus there exists a correct worker ","element":"span"},{"style":{"height":17.6},"width":71.31,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/7-4.png","element":"img","alt":" ζ(i)","inline":true,"padRight":true},{"text":"which is farther from ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i ","element":"span"},{"text":"than any of the neighbors of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":". In particular, for each Byzantine index ","element":"span"},{"style":{"height":24.94},"width":979.43,"height":62.35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/7-5.png","element":"img","alt":" l such that i → l, ∥Vi − Bl∥2 ≤ ∥Vi − Vζ(i)∥2. Whence","inline":true}],[{"style":{"width":"84%"},"width":1587,"height":589,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/7-6.png","element":"img"}],[{"text":"Putting everything back together, we obtain","element":"span"}],[{"style":{"width":"86%"},"width":1628,"height":296,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/7-7.png","element":"img"}],[{"text":"By assumption, ","element":"span"},{"style":{"height":20.08},"width":713.98,"height":50.19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/7-8.png","element":"img","alt":" η(n, f)√dσ < ∥g∥, i.e., EMULTI-KRUM","inline":true,"padRight":true},{"text":"belongs to a ball centered at ","element":"span"},{"style":{"fontStyle":"italic"},"text":"g ","element":"span"},{"text":"with radius ","element":"span"},{"style":{"height":17.6},"width":144.37,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/7-9.png","element":"img","alt":" η(n, f)·","inline":true},{"style":{"height":17.6},"width":115.6,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/7-10.png","element":"img","alt":"√d · σ","inline":true},{"text":". 
This implies","element":"span"}],[{"style":{"width":"71%"},"width":1339,"height":79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/7-11.png","element":"img"}],[{"text":"To sum up, condition ","element":"span"},{"style":{"fontStyle":"italic"},"text":"(i) ","element":"span"},{"text":"of the ","element":"span"},{"style":{"height":17.6},"width":107.5,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/7-12.png","element":"img","alt":" (α, f)","inline":true},{"text":"-Byzantine resilience property holds. We now focus on condition ","element":"span"},{"style":{"fontStyle":"italic"},"text":"(ii)","element":"span"},{"text":".","element":"span"}],[{"style":{"width":"68%"},"width":1275,"height":102,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/7-13.png","element":"img"}],[{"style":{"width":"93%"},"width":1753,"height":700,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/8-0.png","element":"img"}],[{"text":"The second inequality comes from the equivalence of norms in finite dimension. 
Now","element":"span"}],[{"style":{"width":"57%"},"width":1071,"height":407,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/8-1.png","element":"img"}],[{"text":"Since the ","element":"span"},{"style":{"height":14.62},"width":37.46,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/8-2.png","element":"img","alt":" Vi","inline":true},{"text":"’s are independent, we finally obtain that ","element":"span"},{"style":{"height":18.04},"width":348.13,"height":45.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/8-3.png","element":"img","alt":" E ∥MULTI-KRUM∥r ","inline":true,"padRight":true},{"text":"is bounded above by a linear combination of terms of the form ","element":"span"},{"style":{"height":18.89},"width":1256.89,"height":47.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/8-4.png","element":"img","alt":" E ∥V1∥r1 · · · E ∥Vn−f∥rn−f = E ∥G∥r1 · · · E ∥G∥rn−f with r1 + · · · +","inline":true},{"style":{"height":13.24},"width":166.55,"height":33.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/8-5.png","element":"img","alt":"rn−f = r","inline":true},{"text":". This completes the proof of condition ","element":"span"},{"style":{"fontStyle":"italic"},"text":"(ii)","element":"span"},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Lemma 2. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Assume that ","element":"span"},{"text":"(i) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"the cost function ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Q ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is three times differentiable with continuous derivatives, and is non-negative, ","element":"span"},{"style":{"height":17.6},"width":264.6,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/8-6.png","element":"img","alt":" Q(x) ≥ 0; (ii)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"the learning rates satisfy ","element":"span"},{"style":{"height":19.89},"width":623.28,"height":49.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/8-7.png","element":"img","alt":"�t γt = ∞ and �t γ2t < ∞; (iii)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"the gradient ","element":"span"},{"style":{"fontStyle":"italic"},"text":"estimator satisfies ","element":"span"},{"style":{"height":17.6},"width":1536.68,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/8-8.png","element":"img","alt":" EG(x, ξ) = ∇Q(x) and ∀r ∈ {2, . . . 
, 4}, E∥G(x, ξ)∥r ≤ Ar + Br∥x∥r for some","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"constants ","element":"span"},{"style":{"height":16},"width":209.49,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/8-9.png","element":"img","alt":" Ar, Br; (iv)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"there exists a constant ","element":"span"},{"style":{"height":17.6},"width":236.52,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/8-10.png","element":"img","alt":" 0 ≤ α < π/2","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"such that for all ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"}],[{"style":{"width":"37%"},"width":696,"height":53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/8-11.png","element":"img"}],[{"text":"(v) ","element":"span"},{"style":{"fontStyle":"italic"},"text":"finally, beyond a certain horizon, ","element":"span"},{"style":{"height":19.13},"width":181.68,"height":47.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/8-12.png","element":"img","alt":" ∥x∥2 ≥ D","inline":true},{"style":{"fontStyle":"italic"},"text":", there exist ","element":"span"},{"style":{"height":17.6},"width":672.55,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/8-13.png","element":"img","alt":" ϵ > 0 and 0 ≤ β < π/2 − α such that","inline":true}],[{"style":{"width":"23%"},"width":439,"height":169,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/8-14.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Then the sequence of gradients ","element":"span"},{"style":{"height":17.6},"width":143.99,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/8-15.png","element":"img","alt":" ∇Q(xt)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"converges almost 
surely to zero.","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"For the sake of simplicity, we write ","element":"span"},{"text":"MULTI","element":"span"},{"text":"-K","element":"span"},{"text":"RUM","element":"span"},{"style":{"height":19.01},"width":559.69,"height":47.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/8-16.png","element":"img","alt":"t = MULTI-KRUM(V t1 , . . . , V tn)","inline":true},{"text":". Before proving ","element":"span"},{"text":"the main claim of the proposition, we first show that the sequence ","element":"span"},{"style":{"height":10.62},"width":36.94,"height":26.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/8-17.png","element":"img","alt":" xt","inline":true,"padRight":true},{"text":"is almost surely globally confined within the region ","element":"span"},{"style":{"height":19.13},"width":194.02,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/8-18.png","element":"img","alt":" ∥x∥2 ≤ D.","inline":true}],[{"style":{"fontStyle":"italic"},"text":"(Global confinement). ","element":"span"},{"text":"Let ","element":"span"},{"style":{"height":19.13},"width":377.2,"height":47.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/9-0.png","element":"img","alt":" ut = φ(∥xt∥2) where","inline":true}],[{"id":"id-17","style":{"width":"99%"},"width":1870,"height":246,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/9-1.png","element":"img"}],[{"text":"This becomes an equality when ","element":"span"},{"style":{"height":15.6},"width":155.37,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/9-2.png","element":"img","alt":" a, b ≥ D","inline":true},{"text":". 
Applying this inequality to ","element":"span"},{"style":{"height":16.4},"width":290.2,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/9-3.png","element":"img","alt":" ut+1 − ut yields","inline":true}],[{"style":{"width":"101%"},"width":1905,"height":264,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/9-4.png","element":"img"}],[{"text":"Let ","element":"span"},{"style":{"height":14.62},"width":42.35,"height":36.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/9-5.png","element":"img","alt":" Pt","inline":true,"padRight":true},{"text":"denote the ","element":"span"},{"style":{"height":8},"width":25,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/9-6.png","element":"img","alt":" σ","inline":true},{"text":"-algebra encoding all the information up to round ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":". Taking the conditional expectation with respect to ","element":"span"},{"style":{"height":16.4},"width":159.76,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/9-7.png","element":"img","alt":" Pt yields","inline":true}],[{"style":{"width":"109%"},"width":2057,"height":126,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/9-8.png","element":"img"}],[{"text":"Thanks to condition ","element":"span"},{"style":{"fontStyle":"italic"},"text":"(ii) ","element":"span"},{"text":"of ","element":"span"},{"style":{"height":17.6},"width":107.5,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/9-9.png","element":"img","alt":" (α, f)","inline":true},{"text":"-Byzantine resilience, and the assumption on the first four moments of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"G","element":"span"},{"text":", there exist positive constants 
","element":"span"},{"style":{"height":16},"width":291.2,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/9-10.png","element":"img","alt":" A0, B0 such that","inline":true}],[{"style":{"width":"76%"},"width":1433,"height":56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/9-11.png","element":"img"}],[{"text":"Thus, there exist positive constant ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A, B ","element":"span"},{"text":"such that","element":"span"}],[{"style":{"width":"72%"},"width":1361,"height":51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/9-12.png","element":"img"}],[{"text":"When ","element":"span"},{"style":{"height":19.13},"width":202.93,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/9-13.png","element":"img","alt":" ∥xt∥2 < D","inline":true},{"text":", the first term of the right hand side is null because ","element":"span"},{"style":{"height":19.13},"width":622.26,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/9-14.png","element":"img","alt":" φ′(∥xt∥2) = 0. 
When ∥xt∥2 ≥ D,","inline":true,"padRight":true},{"text":"this first term is negative because (see Figure ","element":"span"},{"href":"#id-15","text":"3)","element":"a"}],[{"style":{"width":"82%"},"width":1546,"height":193,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/9-15.png","element":"img"}],[{"text":"We define two auxiliary sequences","element":"span"}],[{"style":{"width":"27%"},"width":514,"height":196,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/9-16.png","element":"img"}],[{"text":"Note that the sequence ","element":"span"},{"style":{"height":12},"width":38.3,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/9-17.png","element":"img","alt":" µt","inline":true,"padRight":true},{"text":"converges because ","element":"span"},{"style":{"height":19.9},"width":327.61,"height":49.74,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/9-18.png","element":"img","alt":"∑t γ2t < ∞. Then","inline":true}],[{"style":{"width":"26%"},"width":492,"height":55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/9-19.png","element":"img"}],[{"text":"Consider the indicator of the positive variations of the left-hand side","element":"span"}],[{"style":{"width":"34%"},"width":637,"height":106,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/9-20.png","element":"img"}],[{"style":{"width":"78%"},"width":1463,"height":100,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-0.png","element":"img"}],[{"text":"The right-hand side of the previous inequality is the summand of a convergent series. 
","element":"span"},{"text":"By the quasi-martingale convergence theorem ","element":"span"},{"href":"#id-16","referenceIndex":9,"text":"[9]","element":"a"},{"text":", this shows that the sequence ","element":"span"},{"style":{"height":12.32},"width":40.98,"height":30.79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-1.png","element":"img","alt":" u′t ","inline":true,"padRight":true},{"text":"converges almost surely, which in ","element":"span"},{"text":"turn shows that the sequence ","element":"span"},{"style":{"height":10.62},"width":36.98,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-2.png","element":"img","alt":" ut","inline":true,"padRight":true},{"text":"converges almost surely, ","element":"span"},{"style":{"height":14.22},"width":258.93,"height":35.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-3.png","element":"img","alt":" ut → u∞ ≥ 0.","inline":true},{"text":"Let us assume that ","element":"span"},{"style":{"height":15.02},"width":315.69,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-4.png","element":"img","alt":" u∞ > 0. When t","inline":true,"padRight":true},{"text":"is large enough, this implies that ","element":"span"},{"style":{"height":19.13},"width":334.65,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-5.png","element":"img","alt":" ∥xt∥2 and ∥xt+1∥2 ","inline":true,"padRight":true},{"text":"are greater than ","element":"span"},{"style":{"fontStyle":"italic"},"text":"D","element":"span"},{"text":". 
Inequality ","element":"span"},{"href":"#id-17","text":"5 ","element":"a"},{"text":"becomes an equality, which implies that the following infinite sum converges almost surely","element":"span"}],[{"style":{"width":"41%"},"width":778,"height":121,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-6.png","element":"img"}],[{"text":"Note that the sequence ","element":"span"},{"style":{"height":19.13},"width":172.88,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-7.png","element":"img","alt":" φ′(∥xt∥2)","inline":true,"padRight":true},{"text":"converges to a positive value. In the region ","element":"span"},{"style":{"height":19.13},"width":362.34,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-8.png","element":"img","alt":" ∥xt∥2 > D, we have","inline":true}],[{"style":{"width":"75%"},"width":1420,"height":222,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-9.png","element":"img"}],[{"text":"This contradicts the fact that ","element":"span"},{"style":{"height":19.2},"width":269.52,"height":48.01,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-10.png","element":"img","alt":"∑∞t=1 γt = ∞","inline":true},{"text":". Therefore, the sequence ","element":"span"},{"style":{"height":10.62},"width":36.98,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-11.png","element":"img","alt":" ut","inline":true,"padRight":true},{"text":"converges to zero. 
This con- ","element":"span"},{"text":"vergence implies that the sequence ","element":"span"},{"style":{"height":19.13},"width":99.8,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-12.png","element":"img","alt":" ∥xt∥2 ","inline":true,"padRight":true},{"text":"is bounded, i.e., the vector ","element":"span"},{"style":{"height":10.62},"width":36.94,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-13.png","element":"img","alt":" xt","inline":true,"padRight":true},{"text":"is confined in a bounded region containing the origin. As a consequence, any continuous function of ","element":"span"},{"style":{"height":10.62},"width":36.94,"height":26.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-14.png","element":"img","alt":" xt","inline":true,"padRight":true},{"text":"is also bounded, such as, e.g., ","element":"span"},{"style":{"height":19.14},"width":112.73,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-15.png","element":"img","alt":" ∥xt∥2,","inline":true},{"style":{"height":20.84},"width":244.9,"height":52.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-16.png","element":"img","alt":"E ∥G(xt, ξ)∥2 ","inline":true,"padRight":true},{"text":"and all the derivatives of the cost function ","element":"span"},{"style":{"height":17.6},"width":107.63,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-17.png","element":"img","alt":" Q(xt)","inline":true},{"text":". In the sequel, positive constants ","element":"span"},{"style":{"height":15.2},"width":143.38,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-18.png","element":"img","alt":" K1, K2,","inline":true,"padRight":true},{"text":"etc. . . are introduced whenever such a bound is used.","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"(Convergence). 
","element":"span"},{"text":"We proceed to show that the gradient ","element":"span"},{"style":{"height":17.6},"width":144,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-19.png","element":"img","alt":" ∇Q(xt)","inline":true,"padRight":true},{"text":"converges almost surely to zero. We define","element":"span"}],[{"style":{"width":"11%"},"width":212,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-20.png","element":"img"}],[{"text":"Using a first-order Taylor expansion and bounding the second derivative with ","element":"span"},{"style":{"height":14.62},"width":54.06,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-21.png","element":"img","alt":" K1","inline":true},{"text":", we obtain","element":"span"}],[{"style":{"width":"70%"},"width":1326,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-22.png","element":"img"}],[{"text":"Therefore","element":"span"}],[{"style":{"width":"92%"},"width":1725,"height":55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-23.png","element":"img"}],[{"text":"By the properties of ","element":"span"},{"style":{"height":17.6},"width":107.5,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-24.png","element":"img","alt":" (α, f)","inline":true},{"text":"-Byzantine resiliency, this implies","element":"span"}],[{"style":{"width":"28%"},"width":525,"height":51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-25.png","element":"img"}],[{"text":"which in turn implies that the positive variations of ","element":"span"},{"style":{"height":15.02},"width":37.14,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-26.png","element":"img","alt":" ht","inline":true,"padRight":true},{"text":"are also 
bounded","element":"span"}],[{"style":{"width":"30%"},"width":575,"height":51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-27.png","element":"img"}],[{"text":"The right-hand side is the summand of a convergent infinite sum. By the quasi-martingale convergence theorem, the sequence ","element":"span"},{"style":{"height":15.02},"width":37.14,"height":37.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-28.png","element":"img","alt":" ht","inline":true,"padRight":true},{"text":"converges almost surely, ","element":"span"},{"style":{"height":17.6},"width":256.84,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-29.png","element":"img","alt":" Q(xt) → Q∞.","inline":true}],[{"style":{"width":"96%"},"width":1803,"height":34,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-30.png","element":"img"}],[{"text":"that","element":"span"}],[{"style":{"width":"40%"},"width":767,"height":112,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/10-31.png","element":"img"}],[{"style":{"width":"37%"},"width":706,"height":274,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/11-0.png","element":"img"}],[{"id":"id-15","style":{"fontWeight":"bold"},"text":"Figure 3: Condition on the angles between ","element":"figcaption","subtype":"caption"},{"style":{"height":17.6},"width":211.56,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/11-1.png","element":"img","alt":" xt, ∇Q(xt)","inline":true,"padRight":true},{"style":{"fontWeight":"bold"},"text":"and the ","element":"figcaption","subtype":"caption"},{"style":{"height":13.2},"width":569.43,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/11-2.png","element":"img","alt":" GAR of MULTI-KRUM 
vector","inline":true},{"style":{"height":14.62},"width":308.67,"height":36.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/11-3.png","element":"img","alt":"EMULTI-KRUMt","inline":true},{"style":{"fontWeight":"bold"},"text":", in the region ","element":"figcaption","subtype":"caption"},{"style":{"height":19.13},"width":208.25,"height":47.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/11-4.png","element":"img","alt":" ∥xt∥2 > D.","inline":true}],[{"style":{"width":"58%"},"width":1093,"height":96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/11-5.png","element":"img"}],[{"text":"Using a Taylor expansion, as demonstrated for the variations of ","element":"span"},{"style":{"height":15.02},"width":37.14,"height":37.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/11-6.png","element":"img","alt":" ht","inline":true},{"text":", we obtain","element":"span"}],[{"style":{"width":"84%"},"width":1584,"height":58,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/11-7.png","element":"img"}],[{"text":"Taking the conditional expectation, and bounding the second derivatives by ","element":"span"},{"style":{"height":14.8},"width":66.99,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/11-8.png","element":"img","alt":" K4,","inline":true}],[{"style":{"width":"63%"},"width":1181,"height":51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/11-9.png","element":"img"}],[{"text":"The positive expected variations of ","element":"span"},{"style":{"height":12},"width":34.56,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/11-10.png","element":"img","alt":" ρt","inline":true,"padRight":true},{"text":"are bounded","element":"span"}],[{"style":{"width":"67%"},"width":1261,"height":51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/11-11.png","element":"img"}],[{"text":"The two 
terms on the right-hand side are the summands of convergent infinite series. By the quasi-martingale convergence theorem, this shows that ","element":"span"},{"style":{"height":12},"width":34.56,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/11-12.png","element":"img","alt":" ρt","inline":true,"padRight":true},{"text":"converges almost surely. We have","element":"span"}],[{"style":{"width":"75%"},"width":1405,"height":202,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/11-13.png","element":"img"}],[{"text":"This implies that the following infinite series converge almost surely","element":"span"}],[{"style":{"width":"15%"},"width":284,"height":121,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/11-14.png","element":"img"}],[{"text":"Since ","element":"span"},{"style":{"height":12},"width":34.56,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/11-15.png","element":"img","alt":" ρt","inline":true,"padRight":true},{"text":"converges almost surely, and the series ","element":"span"},{"style":{"height":19.2},"width":264.54,"height":48.01,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/11-16.png","element":"img","alt":"∑∞t=1 γt = ∞","inline":true,"padRight":true},{"text":"diverges, we conclude that the sequence ","element":"span"},{"style":{"height":17.6},"width":187.78,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/11-17.png","element":"img","alt":"∥∇Q(xt)∥","inline":true,"padRight":true},{"text":"converges almost surely to zero.","element":"span"}],[{"text":"We conclude the proof of ","element":"span"},{"style":{"fontWeight":"bold"},"text":"(i) ","element":"span"},{"text":"by recalling the definition of ","element":"span"},{"text":"MULTI","element":"span"},{"text":"-K","element":"span"},{"text":"RUM","element":"span"},{"text":", as the instance of 
","element":"span"},{"style":{"height":12.4},"width":212.66,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/11-18.png","element":"img","alt":" m − Krum","inline":true,"padRight":true},{"text":"with ","element":"span"},{"style":{"height":16.4},"width":288.23,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/11-19.png","element":"img","alt":" m = n − f − 2.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Proof of (ii). ","element":"span"},{"style":{"fontWeight":"bold"},"text":"(ii) ","element":"span"},{"text":"is a consequence of the fact that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m","element":"span"},{"text":"-Krum is the average of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"estimators of the gradient. In the absence of Byzantine workers, all those estimators will not only be from the ”correct cone”, but from correct workers (Byzantine workers can also be in the correct cone, but in this case there are none). 
As SGD converges in ","element":"span"},{"style":{"height":21.29},"width":292.12,"height":53.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/11-20.png","element":"img","alt":" O(1/m), where m","inline":true,"padRight":true},{"text":"is the number of used estimators of the gradient, the slowdown result ","element":"span"},{"text":"follows.","element":"span"}]]},{"heading":"4 MULTI-BULYAN: Strong Byzantine Resilience and Slowdown","paragraphs":[[{"text":"Let ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n ","element":"span"},{"text":"be any integer greater than ","element":"span"},{"text":"2","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"any integer s.t ","element":"span"},{"style":{"height":21.29},"width":289.94,"height":53.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/12-0.png","element":"img","alt":" f ≤ (n−3)/4 and m","inline":true,"padRight":true},{"text":"an integer s.t ","element":"span"},{"style":{"height":16.4},"width":396.86,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/12-1.png","element":"img","alt":" m ≤ n − 2f − 2. Let","inline":true},{"style":{"height":16.4},"width":310.05,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/12-2.png","element":"img","alt":"m̃ = n − 2f − 2.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Theorem 2 ","element":"span"},{"text":"(Byzantine resilience and slowdown of ","element":"span"},{"text":"MULTI","element":"span"},{"text":"-B","element":"span"},{"text":"ULYAN","element":"span"},{"text":")","element":"span"},{"style":{"fontWeight":"bold"},"text":". 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"(i) ","element":"span"},{"text":"MULTI","element":"span"},{"text":"-B","element":"span"},{"text":"ULYAN ","element":"span"},{"style":{"fontStyle":"italic"},"text":"provides strong Byzantine resilience against ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"style":{"fontStyle":"italic"},"text":"failures. (ii) In the absence of Byzantine workers, ","element":"span"},{"text":"MULTI","element":"span"},{"text":"-B","element":"span"},{"text":"ULYAN ","element":"span"},{"style":{"fontStyle":"italic"},"text":"has a slowdown (expressed in ratio with averaging) of ","element":"span"},{"style":{"height":21.29},"width":115.98,"height":53.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/12-3.png","element":"img","alt":" Ω(m̃/n).","inline":true}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"If the number of iterations over ","element":"span"},{"text":"MULTI","element":"span"},{"text":"-K","element":"span"},{"text":"RUM ","element":"span"},{"text":"is ","element":"span"},{"style":{"height":16.4},"width":121.04,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/12-4.png","element":"img","alt":" n − 2f","inline":true},{"text":", then the leeway, defined by the coordinate-wise distance between the output of B","element":"span"},{"text":"ULYAN ","element":"span"},{"text":"and a correct gradient is upper bounded by ","element":"span"},{"style":{"height":25.5},"width":117.2,"height":63.76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/12-5.png","element":"img","alt":" O(1/√d)","inline":true},{"text":". 
This is due ","element":"span"},{"text":"to the fact that B","element":"span"},{"text":"ULYAN ","element":"span"},{"text":"relies on a component-wise median, that, as proven in ","element":"span"},{"href":"#id-2","referenceIndex":6,"text":"[6] ","element":"a"},{"text":"guarantees this bound. The proof is then a direct consequence of Theorem ","element":"span"},{"href":"#id-18","text":"1 ","element":"a"},{"text":"and the properties of Bulyan ","element":"span"},{"href":"#id-2","referenceIndex":6,"text":"[6]","element":"a"},{"text":".","element":"span"}]]},{"heading":"References","paragraphs":[[{"id":"id-6","text":"[1] B","element":"span"},{"text":"LANCHARD","element":"span"},{"text":", P., E","element":"span"},{"text":"L ","element":"span"},{"text":"M","element":"span"},{"text":"HAMDI","element":"span"},{"text":", E. M., G","element":"span"},{"text":"UERRAOUI","element":"span"},{"text":", R., ","element":"span"},{"text":"AND ","element":"span"},{"text":"S","element":"span"},{"text":"TAINER","element":"span"},{"text":", J. Machine learning with adversaries: Byzantine tolerant gradient descent. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"NIPS ","element":"span"},{"text":"(2017), pp. 118–128.","element":"span"}],[{"id":"id-8","text":"[2] B","element":"span"},{"text":"OTTOU","element":"span"},{"text":", L. Online learning and stochastic approximations. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Online learning in neural networks 17","element":"span"},{"text":", 9 (1998), 142.","element":"span"}],[{"text":"[3] D","element":"span"},{"text":"AMASKINOS","element":"span"},{"text":", G., E","element":"span"},{"text":"L","element":"span"},{"text":"-M","element":"span"},{"text":"HAMDI","element":"span"},{"text":", E.-M., G","element":"span"},{"text":"UERRAOUI","element":"span"},{"text":", R., G","element":"span"},{"text":"UIRGUIS","element":"span"},{"text":", A., ","element":"span"},{"text":"AND ","element":"span"},{"text":"R","element":"span"},{"text":"OUAULT","element":"span"},{"text":", S. Aggregathor: Byzantine machine learning via robust gradient aggregation. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the ","element":"span"},{"style":{"height":14.34},"width":49.48,"height":35.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1905.04374/images/12-6.png","element":"img","alt":" 1st","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"SysML Conference ","element":"span"},{"text":"(2019).","element":"span"}],[{"id":"id-11","text":"[4] D","element":"span"},{"text":"AMASKINOS","element":"span"},{"text":", G., E","element":"span"},{"text":"L ","element":"span"},{"text":"M","element":"span"},{"text":"HAMDI","element":"span"},{"text":", E. M., G","element":"span"},{"text":"UERRAOUI","element":"span"},{"text":", R., P","element":"span"},{"text":"ATRA","element":"span"},{"text":", R., T","element":"span"},{"text":"AZIKI","element":"span"},{"text":", M., ","element":"span"},{"text":"ET AL","element":"span"},{"text":". Asynchronous Byzantine machine learning (the case of sgd). In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 35th International Conference on Machine Learning ","element":"span"},{"text":"(Stockholmsmässan, Stockholm Sweden, 10–15 Jul 2018), vol. 
80 of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of Machine Learning Research","element":"span"},{"text":", PMLR, pp. 1153–1162.","element":"span"}],[{"id":"id-1","text":"[5] D","element":"span"},{"text":"EAN","element":"span"},{"text":", J., C","element":"span"},{"text":"ORRADO","element":"span"},{"text":", G., M","element":"span"},{"text":"ONGA","element":"span"},{"text":", R., C","element":"span"},{"text":"HEN","element":"span"},{"text":", K., D","element":"span"},{"text":"EVIN","element":"span"},{"text":", M., M","element":"span"},{"text":"AO","element":"span"},{"text":", M., S","element":"span"},{"text":"ENIOR","element":"span"},{"text":", A., T","element":"span"},{"text":"UCKER","element":"span"},{"text":", P., Y","element":"span"},{"text":"ANG","element":"span"},{"text":", K., L","element":"span"},{"text":"E","element":"span"},{"text":", Q. V., ","element":"span"},{"text":"ET AL","element":"span"},{"text":". Large scale distributed deep networks. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"NIPS ","element":"span"},{"text":"(2012), pp. 1223–1231.","element":"span"}],[{"id":"id-2","text":"[6] E","element":"span"},{"text":"L ","element":"span"},{"text":"M","element":"span"},{"text":"HAMDI","element":"span"},{"text":", E. M., G","element":"span"},{"text":"UERRAOUI","element":"span"},{"text":", R., ","element":"span"},{"text":"AND ","element":"span"},{"text":"R","element":"span"},{"text":"OUAULT","element":"span"},{"text":", S. The hidden vulnerability of distributed learning in Byzantium. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 35th International Conference on Machine Learning ","element":"span"},{"text":"(Stockholmsmässan, Stockholm Sweden, 10–15 Jul 2018), vol. 80 of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of Machine Learning Research","element":"span"},{"text":", PMLR, pp. 
3521–3530.","element":"span"}],[{"id":"id-4","text":"[7] G","element":"span"},{"text":"OYAL","element":"span"},{"text":", P., D","element":"span"},{"text":"OLL","element":"span"},{"text":"Á","element":"span"},{"text":"R","element":"span"},{"text":", P., G","element":"span"},{"text":"IRSHICK","element":"span"},{"text":", R., N","element":"span"},{"text":"OORDHUIS","element":"span"},{"text":", P., W","element":"span"},{"text":"ESOLOWSKI","element":"span"},{"text":", L., K","element":"span"},{"text":"YROLA","element":"span"},{"text":", A., T","element":"span"},{"text":"UL","element":"span"},{"text":"LOCH","element":"span"},{"text":", A., J","element":"span"},{"text":"IA","element":"span"},{"text":", Y., ","element":"span"},{"text":"AND ","element":"span"},{"text":"H","element":"span"},{"text":"E","element":"span"},{"text":", K. Accurate, large minibatch sgd: training imagenet in 1 hour. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint ","element":"span"},{"href":"http://arxiv.org/abs/1706.02677","style":{"fontStyle":"italic"},"text":"arXiv:1706.02677 ","element":"a"},{"text":"(2017).","element":"span"}],[{"id":"id-7","text":"[8] H","element":"span"},{"text":"AYKIN","element":"span"},{"text":", S. S. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Neural networks and learning machines","element":"span"},{"text":", vol. 3. Pearson Upper Saddle River, NJ, USA, 2009.","element":"span"}],[{"id":"id-16","text":"[9] M","element":"span"},{"text":"ÉTIVIER","element":"span"},{"text":", M. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Semi-Martingales","element":"span"},{"text":". 
Walter de Gruyter, 1983.","element":"span"}],[{"id":"id-0","text":"[10] S","element":"span"},{"text":"HALEV","element":"span"},{"text":"-S","element":"span"},{"text":"HWARTZ","element":"span"},{"text":", S., ","element":"span"},{"text":"AND ","element":"span"},{"text":"B","element":"span"},{"text":"EN","element":"span"},{"text":"-D","element":"span"},{"text":"AVID","element":"span"},{"text":", S. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Understanding machine learning: From theory to algorithms","element":"span"},{"text":". Cambridge university press, 2014.","element":"span"}],[{"id":"id-12","text":"[11] X","element":"span"},{"text":"IE","element":"span"},{"text":", C., K","element":"span"},{"text":"OYEJO","element":"span"},{"text":", O., ","element":"span"},{"text":"AND ","element":"span"},{"text":"G","element":"span"},{"text":"UPTA","element":"span"},{"text":", I. ","element":"span"},{"text":"Generalized Byzantine-tolerant sgd. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint ","element":"span"},{"href":"http://arxiv.org/abs/1802.10116","style":{"fontStyle":"italic"},"text":"arXiv:1802.10116 ","element":"a"},{"text":"(2018).","element":"span"}]]}],"_version":"3.3.2"},"paperNode":"$1b:props:children:props:children:0:props:product"}]]]}]}]