1b:["$","$L29",null,{"isWhiteLabelled":false,"children":["$","$Lb",null,{"pt":{"compact":0,"expanded":3},"children":[["$","$L2a",null,{"noStar":true,"publisher":true,"task":true,"params":true,"size":"xl","product":{"id":"eyJwYXBlcklEIjoiMjAwMi4wOTc2NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","updated":"2020-02-22T20:19:53.000Z","paperID":"2002.09766","published":"2020-02-22T20:19:53.000Z","authors":"[\"Chen Zhu\",\"Renkun Ni\",\"Ping-yeh Chiang\",\"Hengduo Li\",\"Furong Huang\",\"Tom Goldstein\"]","title":"Improving the Tightness of Convex Relaxation Bounds for Training Certifiably Robust Classifiers","scoreTrending":null,"summary":"Convex relaxations are effective for training and certifying neural networks\nagainst norm-bounded adversarial attacks, but they leave a large gap between\ncertifiable and empirical robustness. In principle, convex relaxation can\nprovide tight bounds if the solution to the relaxed problem is feasible for the\noriginal non-convex problem. We propose two regularizers that can be used to\ntrain neural networks that yield tighter convex relaxation bounds for\nrobustness. 
In all of our experiments, the proposed regularizers result in\nhigher certified accuracy than non-regularized baselines.","lastCheckedForCode":"2022-09-03T22:39:24.486Z","links":[{"id":"eyJ1cmwiOiJodHRwczovL3BhcGVyc3dpdGhjb2RlLmNvbS9wYXBlci9pbXByb3ZpbmctdGhlLXRpZ2h0bmVzcy1vZi1jb252ZXgtcmVsYXhhdGlvbiJ9","type":"pwc","url":"https://paperswithcode.com/paper/improving-the-tightness-of-convex-relaxation","data":null}],"reposConnection":{"edges":[]},"models":[],"tags":[],"summaries":[],"emailsConnection":{"edges":[{"author":"tom goldstein","node":{"id":"eyJhZGRyZXNzIjoidG9tZ0Bjcy51bWQuZWR1In0=","address":"tomg@cs.umd.edu","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"
count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[{"avatar":"https://avatars.githubusercontent.com/u/31602207?v=4","username":"tomgoldstein"}],"scholar":[{"thirdPartyID":"KmSuVtgAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiIwN2YyYWQ0ZC04YWMwLTQyODQtODI0ZS1jYTRkNWY4ZDQwOWMifQ==","name":"tom 
goldstein","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMTcxMi4wOTkxMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1712.09913"},{"id":"eyJwYXBlcklEIjoiMTgwNC4wMDc5MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1804.00792"},{"id":"eyJwYXBlcklEIjoiMTkwNC4xMjg0MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1904.12843"},{"id":"eyJwYXBlcklEIjoiMTkwOS4xMTc2NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1909.11764"},{"id":"eyJwYXBlcklEIjoiMjEwNi4wMTM0MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.01342"},{"id":"eyJwYXBlcklEIjoiMjMwMi4wMzY2OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2302.03668"},{"id":"eyJwYXBlcklEIjoiMjAxMC4wOTg5MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.09891"},{"id":"eyJwYXBlcklEIjoiMTYwNS4wOTUyNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1605.09527"},{"id":"eyJwYXBlcklEIjoiMTcwNi4wMjM3OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1706.02379"},{"id":"eyJwYXBlcklEIjoiMjAwOS4wMjI3NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2009.02276"},{"id":"eyJwYXBlcklEIjoiMTYxMC4wNTc5MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1610.05792"},{"id":"eyJwYXBlcklEIjoiMjEwNC4wODg5NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2104.08894"},{"id":"eyJwYXBlcklEIjoiMTkwNS4wNTg5NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1905.05897"},{"id":"eyJwYXBlcklEIjoiMjIxMi4wMzg2MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2212.03860"},{"id":"eyJwYXBlcklEIjoiMTgxMS4xMTMwNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1811.11304"},{"id":"eyJwYXBlcklEIjoiMjMwNi4wMzA4MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2306.03082"},{"id":"eyJwYXBlcklEIjoiMTcwNS4wNzM2NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1705.07364"},{"id":"eyJwYXBlcklEIjoiMTkwNi4wMz
I5MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1906.03291"},{"id":"eyJwYXBlcklEIjoiMjAxMi4xMDU0NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2012.10544"},{"id":"eyJwYXBlcklEIjoiMjAwMy4wNjY5MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2003.06693"},{"id":"eyJwYXBlcklEIjoiMjQwMi4xNDAyMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2402.14020"},{"id":"eyJwYXBlcklEIjoiMTkwNS4wODIzMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1905.08232"},{"id":"eyJwYXBlcklEIjoiMjMwNS4yMDA4NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2305.20086"},{"id":"eyJwYXBlcklEIjoiMjAwNi4xMjU1NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.12557"},{"id":"eyJwYXBlcklEIjoiMTYwOC4wMjE2NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1608.02165"},{"id":"eyJwYXBlcklEIjoiMjEwMS4wNzkyMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2101.07922"},{"id":"eyJwYXBlcklEIjoiMTkwOC4wMjg3OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1908.02878"},{"id":"eyJwYXBlcklEIjoiMjEwOS4xNDExOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2109.14119"},{"id":"eyJwYXBlcklEIjoiMjAwNC4wMDIyNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2004.00225"},{"id":"eyJwYXBlcklEIjoiMTUxMC4wNDYwOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1510.04609"},{"id":"eyJwYXBlcklEIjoiMTgwNC4xMDM0MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1804.10343"},{"id":"eyJwYXBlcklEIjoiMjIwMi4wMDU4MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2202.00580"},{"id":"eyJwYXBlcklEIjoiMjExMS4xMjg4MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2111.12880"},{"id":"eyJwYXBlcklEIjoiMTgwNy4wNTI0NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1807.05247"},{"id":"eyJwYXBlcklEIjoiMjQwMS4wODU3MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2401.08573"}
,{"id":"eyJwYXBlcklEIjoiMjEwNi4wODk3MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.08970"},{"id":"eyJwYXBlcklEIjoiMjIwMS4xMjY3NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2201.12675"},{"id":"eyJwYXBlcklEIjoiMjAxMC4wNjM5OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.06398"},{"id":"eyJwYXBlcklEIjoiMjAwMy4wODkzNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2003.08937"},{"id":"eyJwYXBlcklEIjoiMTkxMC4wMDM1OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1910.00359"},{"id":"eyJwYXBlcklEIjoiMjQwNi4xMDMyOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2406.10328"},{"id":"eyJwYXBlcklEIjoiMjQwNi4xMDMyMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2406.10323"},{"id":"eyJwYXBlcklEIjoiMTkxMC4xMTU4NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1910.11585"},{"id":"eyJwYXBlcklEIjoiMTkxMC4wMDk4MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1910.00982"},{"id":"eyJwYXBlcklEIjoiMjExMS4wMDg2MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2111.00861"},{"id":"eyJwYXBlcklEIjoiMTkwOS4xMzM1NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1909.13355"},{"id":"eyJwYXBlcklEIjoiMjEwOS4wNDE3NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2109.04176"},{"id":"eyJwYXBlcklEIjoiMTkwNi4wNzE1MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1906.07153"},{"id":"eyJwYXBlcklEIjoiMjQwMi4wNjY1OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2402.06659"},{"id":"eyJwYXBlcklEIjoiMjIxMi4xNDAzNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2212.14034"},{"id":"eyJwYXBlcklEIjoiMjEwMi4wODA5OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.08098"},{"id":"eyJwYXBlcklEIjoiMTUxMi4wMjk3MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1512.02970"},{"id":"eyJwYXBlcklEIjoiMjAxMC4wNzA5MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publishe
r":"arxiv","paperID":"2010.07092"},{"id":"eyJwYXBlcklEIjoiMTkwMS4wMTQ5OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1901.01499"},{"id":"eyJwYXBlcklEIjoiMTYwNS4wMTgxMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1605.01813"},{"id":"eyJwYXBlcklEIjoiMjAxMC4wNTEzNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.05137"},{"id":"eyJwYXBlcklEIjoiMjAwNy4wMzczMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2007.03730"},{"id":"eyJwYXBlcklEIjoiMjAwNi4wODc0MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.08742"},{"id":"eyJwYXBlcklEIjoiMjEwOC4xMjUwOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2108.12508"},{"id":"eyJwYXBlcklEIjoiMjQwNC4wMzQ0MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2404.03441"},{"id":"eyJwYXBlcklEIjoiMjIwMy4wODEyNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2203.08124"},{"id":"eyJwYXBlcklEIjoiMjAwOS4wODk2NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2009.08965"},{"id":"eyJwYXBlcklEIjoiMjIwMS4xMjQ0MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2201.12440"},{"id":"eyJwYXBlcklEIjoiMjAwOS4wODA2MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2009.08061"},{"id":"eyJwYXBlcklEIjoiMjMwMi4wMzAxNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2302.03015"},{"id":"eyJwYXBlcklEIjoiMjMwNC4wMjIzNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2304.02234"},{"id":"eyJwYXBlcklEIjoiMjAwNy4xMzI0MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2007.13242"},{"id":"eyJwYXBlcklEIjoiMjAxMC4xMjk4OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.12989"},{"id":"eyJwYXBlcklEIjoiMjIwMS4wMDc2MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2201.00762"},{"id":"eyJwYXBlcklEIjoiMjMwMS4wMjY1MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2301.02650"},{"id":"eyJwYXBlcklEIjoiMjEwMy4wNDQzNiIsInB1Y
mxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2103.04436"},{"id":"eyJwYXBlcklEIjoiMjAwMi4wOTc2NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.09766"},{"id":"eyJwYXBlcklEIjoiMjIwNC4wODYxNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2204.08615"},{"id":"eyJwYXBlcklEIjoiMjMxMi4wOTMyMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2312.09323"},{"id":"eyJwYXBlcklEIjoiMjEwOC4wMTMzNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2108.01335"},{"id":"eyJwYXBlcklEIjoiMjMwNS4xOTI1NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2305.19254"},{"id":"eyJwYXBlcklEIjoiMjEwMi4xMzI2MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.13262"},{"id":"eyJwYXBlcklEIjoiMjEwNi4wNzg3NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.07877"},{"id":"eyJwYXBlcklEIjoiMjEwMi4wOTcwMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.09701"},{"id":"eyJwYXBlcklEIjoiMjEwMy4wODExNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2103.08116"},{"id":"eyJwYXBlcklEIjoiMjIwMS4xMDA0NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2201.10047"},{"id":"eyJwYXBlcklEIjoiMjEwNi4wOTY0MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.09643"},{"id":"eyJwYXBlcklEIjoiMjAwNi4wMDM4NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.00387"},{"id":"eyJwYXBlcklEIjoiMjMxMS4wMzM4NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2311.03386"},{"id":"eyJwYXBlcklEIjoiMjMwNy4wMDAyOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2307.00028"},{"id":"eyJwYXBlcklEIjoiMjAxMC4wNzMzNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.07334"},{"id":"eyJwYXBlcklEIjoiMjIwNi4wMzY5MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2206.03693"},{"id":"eyJwYXBlcklEIjoiMTUxMi4wMTcwOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1512.01708"},{"id":"eyJ
wYXBlcklEIjoiMjAxMC4wNjcxNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.06715"},{"id":"eyJwYXBlcklEIjoiMjAwMS4xMDUwOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2001.10509"},{"id":"eyJwYXBlcklEIjoiMjAwNi4xMTkxOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2006.11918"},{"id":"eyJwYXBlcklEIjoiMjAxMS4xMjkxOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2011.12919"},{"id":"eyJwYXBlcklEIjoiMjIxMS4xNTkzNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2211.15937"},{"id":"eyJwYXBlcklEIjoiMjExMC4xNDM2MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2110.14363"},{"id":"eyJwYXBlcklEIjoiMjEwODIiLCJwdWJsaXNoZXIiOiJjdnByIn0=","publisher":"cvpr","paperID":"21082"},{"id":"eyJwYXBlcklEIjoiNTU3NjUiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"55765"},{"id":"eyJwYXBlcklEIjoiNTM0ODYiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"53486"},{"id":"eyJwYXBlcklEIjoiNTI4MDEiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"52801"},{"id":"eyJwYXBlcklEIjoiNzEyNTkiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"71259"},{"id":"eyJwYXBlcklEIjoiNzE0NTgiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"71458"},{"id":"eyJwYXBlcklEIjoiNjk5NTYiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"69956"},{"id":"eyJwYXBlcklEIjoiNzIxNDgiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"72148"}]}]}},{"author":"chen zhu","node":{"id":"eyJhZGRyZXNzIjoiemh1QGNzLnVtZC5lZHUifQ==","address":"zhu@cs.umd.edu","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}}],"github":[],"scholar":[{"thirdPartyID":"m-om5O8AAAAJ"}],"twitter":[],"location":[],"owner":[]}}]},"__typename":"paper","authorArray":["Chen Zhu","Renkun Ni","Ping-yeh Chiang","Hengduo Li","Furong Huang","Tom 
Goldstein"]}}],["$","$L18",null,{"container":true,"columns":100,"spacing":{"compact":0,"expanded":2,"large":3},"children":[["$","$L18",null,{"size":{"compact":100,"expanded":100,"large":68},"children":[["$","$7",null,{"children":["$","$L2b",null,{"publisher":"arxiv","paperID":"2002.09766","product":{"paper":"$1b:props:children:props:children:0:props:product","models":"$1b:props:children:props:children:0:props:product:models"},"isWhiteLabelled":false}]}],["$","$7",null,{"children":["$","$L2c",null,{"article":"$L2d","model":"$undefined"}]}]]}],["$","$L18",null,{"size":"grow","children":["$","$L2e",null,{}]}]]}],["$","$7",null,{"children":null}],[["$","audio",null,{"id":"tts"}],["$","$L2f",null,{"paperID":"2002.09766","publisher":"arxiv","paperJSON":{"title":"Improving the Tightness of Convex Relaxation Bounds for Training Certifiably Robust Classifiers","paperID":"2002.09766","avgLineHeight":11.96,"imgScale":4,"sections":[{"heading":"Abstract","paragraphs":[[{"text":"Convex relaxations are effective for training and certifying neural networks against norm-bounded adversarial attacks, but they leave a large gap between certifiable and empirical robustness. In principle, convex relaxation can provide tight bounds if the solution to the relaxed problem is feasible for the original non-convex problem. We propose two regularizers that can be used to train neural networks that yield tighter convex relaxation bounds for robustness. In all of our experiments, the proposed regularizers result in higher certified accuracy than non-regularized baselines.","element":"span"}]]},{"heading":"1. 
Introduction","paragraphs":[[{"text":"Neural networks have achieved excellent performances on many computer vision tasks, but they are often vulnerable to small, adversarially chosen perturbations that are barely perceptible to humans while having a catastrophic impact on model performance (","element":"span"},{"href":"#id-0","referenceIndex":29,"text":"Szegedy et al.","element":"a"},{"href":"#id-0","referenceIndex":29,"text":", ","element":"a"},{"href":"#id-0","referenceIndex":29,"text":"2013","element":"a"},{"text":"; ","element":"span"},{"href":"#id-1","referenceIndex":11,"text":"Goodfellow ","element":"a"},{"href":"#id-1","referenceIndex":11,"text":"et al.","element":"a"},{"href":"#id-1","referenceIndex":11,"text":", ","element":"a"},{"href":"#id-1","referenceIndex":11,"text":"2014","element":"a"},{"text":"). Making classifiers robust to these adversarial perturbations is of great interest, especially when neural networks are applied to safety-critical applications. Several heuristic methods exist for obtaining robust classifiers, however powerful adversarial examples can be found against most of these defenses (","element":"span"},{"href":"#id-2","referenceIndex":1,"text":"Carlini & Wagner","element":"a"},{"href":"#id-2","referenceIndex":1,"text":", ","element":"a"},{"href":"#id-2","referenceIndex":1,"text":"2017","element":"a"},{"text":"; ","element":"span"},{"href":"#id-3","referenceIndex":31,"text":"Uesato ","element":"a"},{"href":"#id-3","referenceIndex":31,"text":"et al.","element":"a"},{"href":"#id-3","referenceIndex":31,"text":", ","element":"a"},{"href":"#id-3","referenceIndex":31,"text":"2018","element":"a"},{"text":").","element":"span"}],[{"text":"Recent studies focus on verifying or enforcing the certified accuracy of deep classifiers, especially for networks with ReLU activations. 
They provide guarantees of a network’s robustness to any perturbation ","element":"span"},{"style":{"height":11.6},"width":19,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/0-0.png","element":"img","alt":" δ","inline":true,"padRight":true},{"text":"with norm bounded by ","element":"span"},{"style":{"height":16.79},"width":169.74,"height":41.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/0-1.png","element":"img","alt":"∥δ∥p ≤ ϵ (","inline":true},{"href":"#id-4","referenceIndex":33,"text":"Wong & Kolter","element":"a"},{"href":"#id-4","referenceIndex":33,"text":", ","element":"a"},{"href":"#id-4","referenceIndex":33,"text":"2017","element":"a"},{"text":"; ","element":"span"},{"href":"#id-5","referenceIndex":34,"text":"Wong et al.","element":"a"},{"href":"#id-5","referenceIndex":34,"text":", ","element":"a"},{"href":"#id-5","referenceIndex":34,"text":"2018","element":"a"},{"text":"; ","element":"span"},{"href":"#id-6","referenceIndex":22,"text":"Raghu- ","element":"a"},{"href":"#id-6","referenceIndex":22,"text":"nathan et al.","element":"a"},{"href":"#id-6","referenceIndex":22,"text":", ","element":"a"},{"href":"#id-6","referenceIndex":22,"text":"2018","element":"a"},{"text":"; ","element":"span"},{"href":"#id-7","referenceIndex":7,"text":"Dvijotham et al.","element":"a"},{"href":"#id-7","referenceIndex":7,"text":", ","element":"a"},{"href":"#id-7","referenceIndex":7,"text":"2018b","element":"a"},{"text":"; ","element":"span"},{"href":"#id-8","referenceIndex":37,"text":"Zhang et al.","element":"a"},{"href":"#id-8","referenceIndex":37,"text":", ","element":"a"},{"href":"#id-8","referenceIndex":37,"text":"2018","element":"a"},{"text":"; ","element":"span"},{"href":"#id-9","referenceIndex":23,"text":"Salman et al.","element":"a"},{"href":"#id-9","referenceIndex":23,"text":", ","element":"a"},{"href":"#id-9","referenceIndex":23,"text":"2019","element":"a"},{"text":"). 
Formal verification methods can find the exact minimum adversarial distortions needed to fool a classifier (","element":"span"},{"href":"#id-10","referenceIndex":8,"text":"Ehlers","element":"a"},{"href":"#id-10","referenceIndex":8,"text":", ","element":"a"},{"href":"#id-10","referenceIndex":8,"text":"2017","element":"a"},{"text":"; ","element":"span"},{"href":"#id-11","referenceIndex":16,"text":"Katz et al.","element":"a"},{"href":"#id-11","referenceIndex":16,"text":", ","element":"a"},{"href":"#id-11","referenceIndex":16,"text":"2017","element":"a"},{"text":"; ","element":"span"},{"href":"#id-12","referenceIndex":30,"text":"Tjeng et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-12","referenceIndex":30,"text":"2017","element":"a"},{"text":"), but require solving an NP-hard problem. To make verification efficient and scalable, convex relaxations are adopted, resulting in a lower bound on the norm of adversarial perturbations (","element":"span"},{"href":"#id-8","referenceIndex":37,"text":"Zhang et al.","element":"a"},{"href":"#id-8","referenceIndex":37,"text":", ","element":"a"},{"href":"#id-8","referenceIndex":37,"text":"2018","element":"a"},{"text":"; ","element":"span"},{"href":"#id-13","referenceIndex":32,"text":"Weng et al.","element":"a"},{"href":"#id-13","referenceIndex":32,"text":", ","element":"a"},{"href":"#id-13","referenceIndex":32,"text":"2018","element":"a"},{"text":"), or an upper bound on the robust error (","element":"span"},{"href":"#id-7","referenceIndex":7,"text":"Dvijotham et al.","element":"a"},{"href":"#id-7","referenceIndex":7,"text":", ","element":"a"},{"href":"#id-7","referenceIndex":7,"text":"2018b","element":"a"},{"text":"; ","element":"span"},{"href":"#id-14","referenceIndex":10,"text":"Gehr et al.","element":"a"},{"href":"#id-14","referenceIndex":10,"text":", ","element":"a"},{"href":"#id-14","referenceIndex":10,"text":"2018","element":"a"},{"text":"; 
","element":"span"},{"href":"#id-15","referenceIndex":28,"text":"Singh et al.","element":"a"},{"href":"#id-15","referenceIndex":28,"text":", ","element":"a"},{"href":"#id-15","referenceIndex":28,"text":"2018","element":"a"},{"text":"). Linear programming (LP) relaxations (","element":"span"},{"href":"#id-5","referenceIndex":34,"text":"Wong et al.","element":"a"},{"href":"#id-5","referenceIndex":34,"text":", ","element":"a"},{"href":"#id-5","referenceIndex":34,"text":"2018","element":"a"},{"text":") are efficient enough to estimate the lower bound of the margin in each iteration for training certifiably robust networks. However, due to the relaxation of the underlying problem, a wide gap remains between the optimal values from the original and relaxed problems (","element":"span"},{"href":"#id-9","referenceIndex":23,"text":"Salman et al.","element":"a"},{"href":"#id-9","referenceIndex":23,"text":", ","element":"a"},{"href":"#id-9","referenceIndex":23,"text":"2019","element":"a"},{"text":").","element":"span"}],[{"text":"In this paper, we focus on improving the certified robustness of neural networks trained with convex relaxation bounds. To achieve this, we first give a more interpretable explanation for the bounds achieved in (","element":"span"},{"href":"#id-13","referenceIndex":32,"text":"Weng et al.","element":"a"},{"href":"#id-13","referenceIndex":32,"text":", ","element":"a"},{"href":"#id-13","referenceIndex":32,"text":"2018","element":"a"},{"text":"; ","element":"span"},{"href":"#id-5","referenceIndex":34,"text":"Wong ","element":"a"},{"href":"#id-5","referenceIndex":34,"text":"et al.","element":"a"},{"href":"#id-5","referenceIndex":34,"text":", ","element":"a"},{"href":"#id-5","referenceIndex":34,"text":"2018","element":"a"},{"text":"). Namely, the constraints of the relaxed problem are defined by a simple linear network with adversaries injecting bounded perturbations to both the input of the network and the pre-activations of intermediate layers. 
The optimal solution of the relaxed problem can be written as a forward pass of the clean image through the linear network, plus the cumulative adversarial effects of all the perturbations added to the linear transforms, which makes it easier to identify the optimality conditions and serves as a bridge between the relaxed problem and the original non-convex problem. We further identify conditions for the bound to be tight, and we propose two indicators for the gap between the original non-convex problem and the relaxed problem. Adding the proposed indicators into the loss function results in classifiers with better certified accuracy.","element":"span"}]]},{"heading":"2. Background and Related Work","paragraphs":[[{"text":"Adversarial defenses roughly fall into two categories: heuristic defenses and verifiable defenses. The heuristic defenses either try to identify adversarial examples and remove adversarial perturbations from images, or make the network invariant to small perturbations through training (","element":"span"},{"href":"#id-16","referenceIndex":21,"text":"Papernot ","element":"a"},{"href":"#id-16","referenceIndex":21,"text":"& McDaniel","element":"a"},{"href":"#id-16","referenceIndex":21,"text":", ","element":"a"},{"href":"#id-16","referenceIndex":21,"text":"2018","element":"a"},{"text":"; ","element":"span"},{"href":"#id-17","referenceIndex":27,"text":"Shan et al.","element":"a"},{"href":"#id-17","referenceIndex":27,"text":", ","element":"a"},{"href":"#id-17","referenceIndex":27,"text":"2019","element":"a"},{"text":"; ","element":"span"},{"href":"#id-18","referenceIndex":24,"text":"Samangouei et al.","element":"a"},{"href":"#id-18","referenceIndex":24,"text":", ","element":"a"},{"href":"#id-18","referenceIndex":24,"text":"2018","element":"a"},{"text":"; ","element":"span"},{"href":"#id-19","referenceIndex":15,"text":"Hwang et al.","element":"a"},{"href":"#id-19","referenceIndex":15,"text":", 
","element":"a"},{"href":"#id-19","referenceIndex":15,"text":"2019","element":"a"},{"text":"). In addition, adversarial training uses adversarial examples as opposed to clean examples during training, so that the network can learn how to classify","element":"span"}],[{"text":"adversarial examples directly (","element":"span"},{"href":"#id-20","referenceIndex":19,"text":"Madry et al.","element":"a"},{"href":"#id-20","referenceIndex":19,"text":", ","element":"a"},{"href":"#id-20","referenceIndex":19,"text":"2017","element":"a"},{"text":"; ","element":"span"},{"href":"#id-21","referenceIndex":26,"text":"Shafahi ","element":"a"},{"href":"#id-21","referenceIndex":26,"text":"et al.","element":"a"},{"href":"#id-21","referenceIndex":26,"text":", ","element":"a"},{"href":"#id-21","referenceIndex":26,"text":"2019","element":"a"},{"text":"; ","element":"span"},{"href":"#id-22","referenceIndex":36,"text":"Zhang et al.","element":"a"},{"href":"#id-22","referenceIndex":36,"text":", ","element":"a"},{"href":"#id-22","referenceIndex":36,"text":"2019","element":"a"},{"text":").","element":"span"}],[{"text":"In response, a line of works have proposed to verify the robustness of neural nets. 
Exact methods obtain the perturbation ","element":"span"},{"style":{"height":11.6},"width":19,"height":29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/1-0.png","element":"img","alt":" δ","inline":true,"padRight":true},{"text":"with minimum ","element":"span"},{"style":{"height":16.79},"width":76.07,"height":41.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/1-1.png","element":"img","alt":" ∥δ∥p","inline":true,"padRight":true},{"text":"such that ","element":"span"},{"style":{"height":16},"width":285.51,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/1-2.png","element":"img","alt":" f(x) ̸= f(x + δ)","inline":true},{"text":", where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f ","element":"span"},{"text":"is a classifier and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x ","element":"span"},{"text":"is the data point. Nevertheless, the problem itself is NP-hard and the methods can hardly scale (","element":"span"},{"href":"#id-23","referenceIndex":3,"text":"Cheng et al.","element":"a"},{"href":"#id-23","referenceIndex":3,"text":", ","element":"a"},{"href":"#id-23","referenceIndex":3,"text":"2017","element":"a"},{"text":"; ","element":"span"},{"href":"#id-24","referenceIndex":18,"text":"Lomuscio & Maganti","element":"a"},{"href":"#id-24","referenceIndex":18,"text":", ","element":"a"},{"href":"#id-24","referenceIndex":18,"text":"2017","element":"a"},{"href":"#id-24","referenceIndex":18,"text":"; ","element":"a"},{"href":"#id-24","referenceIndex":18,"text":"Dutta et al.","element":"a"},{"href":"#id-24","referenceIndex":18,"text":", ","element":"a"},{"href":"#id-24","referenceIndex":18,"text":"2018","element":"a"},{"text":"; ","element":"span"},{"href":"#id-25","referenceIndex":9,"text":"Fischetti & Jo","element":"a"},{"href":"#id-25","referenceIndex":9,"text":", ","element":"a"},{"href":"#id-25","referenceIndex":9,"text":"2017","element":"a"},{"text":"; 
","element":"span"},{"href":"#id-12","referenceIndex":30,"text":"Tjeng et al.","element":"a"},{"href":"#id-12","referenceIndex":30,"text":", ","element":"a"},{"href":"#id-12","referenceIndex":30,"text":"2017","element":"a"},{"text":"; ","element":"span"},{"href":"#id-26","referenceIndex":25,"text":"Scheibler et al.","element":"a"},{"href":"#id-26","referenceIndex":25,"text":", ","element":"a"},{"href":"#id-26","referenceIndex":25,"text":"2015","element":"a"},{"text":"; ","element":"span"},{"href":"#id-11","referenceIndex":16,"text":"Katz et al.","element":"a"},{"href":"#id-11","referenceIndex":16,"text":", ","element":"a"},{"href":"#id-11","referenceIndex":16,"text":"2017","element":"a"},{"text":"; ","element":"span"},{"href":"#id-27","referenceIndex":2,"text":"Carlini et al.","element":"a"},{"href":"#id-27","referenceIndex":2,"text":", ","element":"a"},{"href":"#id-27","referenceIndex":2,"text":"2017","element":"a"},{"text":"; ","element":"span"},{"href":"#id-10","referenceIndex":8,"text":"Ehlers","element":"a"},{"href":"#id-10","referenceIndex":8,"text":", ","element":"a"},{"href":"#id-10","referenceIndex":8,"text":"2017","element":"a"},{"text":").","element":"span"}],[{"text":"A body of work focuses on relaxing the non-linearities in the original problem into linear inequality constraints (","element":"span"},{"href":"#id-15","referenceIndex":28,"text":"Singh ","element":"a"},{"href":"#id-15","referenceIndex":28,"text":"et al.","element":"a"},{"href":"#id-15","referenceIndex":28,"text":", ","element":"a"},{"href":"#id-15","referenceIndex":28,"text":"2018","element":"a"},{"text":"; ","element":"span"},{"href":"#id-14","referenceIndex":10,"text":"Gehr et al.","element":"a"},{"href":"#id-14","referenceIndex":10,"text":", ","element":"a"},{"href":"#id-14","referenceIndex":10,"text":"2018","element":"a"},{"text":"; ","element":"span"},{"href":"#id-8","referenceIndex":37,"text":"Zhang et al.","element":"a"},{"href":"#id-8","referenceIndex":37,"text":", 
","element":"a"},{"href":"#id-8","referenceIndex":37,"text":"2018","element":"a"},{"text":"; ","element":"span"},{"href":"#id-28","referenceIndex":20,"text":"Mirman ","element":"a"},{"href":"#id-28","referenceIndex":20,"text":"et al.","element":"a"},{"href":"#id-28","referenceIndex":20,"text":", ","element":"a"},{"href":"#id-28","referenceIndex":20,"text":"2018","element":"a"},{"text":"), sometimes using the dual of the relaxed problem (","element":"span"},{"href":"#id-4","referenceIndex":33,"text":"Wong & Kolter","element":"a"},{"href":"#id-4","referenceIndex":33,"text":", ","element":"a"},{"href":"#id-4","referenceIndex":33,"text":"2017","element":"a"},{"text":"; ","element":"span"},{"href":"#id-5","referenceIndex":34,"text":"Wong et al.","element":"a"},{"href":"#id-5","referenceIndex":34,"text":", ","element":"a"},{"href":"#id-5","referenceIndex":34,"text":"2018","element":"a"},{"text":"; ","element":"span"},{"href":"#id-7","referenceIndex":7,"text":"Dvijotham ","element":"a"},{"href":"#id-7","referenceIndex":7,"text":"et al.","element":"a"},{"href":"#id-7","referenceIndex":7,"text":", ","element":"a"},{"href":"#id-7","referenceIndex":7,"text":"2018b","element":"a"},{"text":"). Recently, (","element":"span"},{"href":"#id-9","referenceIndex":23,"text":"Salman et al.","element":"a"},{"href":"#id-9","referenceIndex":23,"text":", ","element":"a"},{"href":"#id-9","referenceIndex":23,"text":"2019","element":"a"},{"text":") unified the primal and dual views into a common convex relaxation framework, and suggested there is an inherent gap between the actual and the lower bound of robustness given by ver-ifiers based on LP relaxations, which they called a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"convex relaxation barrier","element":"span"},{"text":".","element":"span"}],[{"text":"Some defense approaches integrate the verification methods into the training of a network to minimize robust loss directly. 
(","element":"span"},{"href":"#id-29","referenceIndex":13,"text":"Hein & Andriushchenko","element":"a"},{"href":"#id-29","referenceIndex":13,"text":", ","element":"a"},{"href":"#id-29","referenceIndex":13,"text":"2017","element":"a"},{"text":") uses a local Lipschitz regularization to improve certified robustness. In addition, a bound based on semi-definite programming (SDP) relaxation was developed and minimized as the objective (","element":"span"},{"href":"#id-6","referenceIndex":22,"text":"Raghu","element":"a"},{"href":"#id-6","referenceIndex":22,"text":"nathan et al.","element":"a"},{"href":"#id-6","referenceIndex":22,"text":", ","element":"a"},{"href":"#id-6","referenceIndex":22,"text":"2018","element":"a"},{"text":"). (","element":"span"},{"href":"#id-4","referenceIndex":33,"text":"Wong & Kolter","element":"a"},{"href":"#id-4","referenceIndex":33,"text":", ","element":"a"},{"href":"#id-4","referenceIndex":33,"text":"2017","element":"a"},{"text":") presents an upper bound on the robust loss caused by norm-bounded perturbation via LP relaxation, and minimizes this upper bound during training. (","element":"span"},{"href":"#id-5","referenceIndex":34,"text":"Wong et al.","element":"a"},{"href":"#id-5","referenceIndex":34,"text":", ","element":"a"},{"href":"#id-5","referenceIndex":34,"text":"2018","element":"a"},{"text":") further extend this method to much more general network structures with skip connections and general non-linearities, and provide a memory-friendly training strategy using random projections. 
Since LP relaxation is adopted, the aforementioned convex relaxation barrier exists for their methods.","element":"span"}],[{"text":"While another line of work (IBP) has shown that an intuitively looser interval bound can be used to train much more robust networks than convex relaxation for large ","element":"span"},{"style":{"height":5.2},"width":48.61,"height":13,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/1-3.png","element":"img","alt":" ℓ∞","inline":true,"padRight":true},{"text":"perturbations (","element":"span"},{"href":"#id-30","referenceIndex":12,"text":"Gowal et al.","element":"a"},{"href":"#id-30","referenceIndex":12,"text":", ","element":"a"},{"href":"#id-30","referenceIndex":12,"text":"2018","element":"a"},{"text":"; ","element":"span"},{"href":"#id-31","referenceIndex":38,"text":"Zhang et al.","element":"a"},{"href":"#id-31","referenceIndex":38,"text":", ","element":"a"},{"href":"#id-31","referenceIndex":38,"text":"2020","element":"a"},{"text":"), it is still important to study convex relaxation bounds since they can provide better certificates against a broader class of adversaries that IBP struggles to certify in some cases, such as ","element":"span"},{"style":{"height":7.6},"width":32.61,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/1-4.png","element":"img","alt":" ℓ2","inline":true,"padRight":true},{"text":"adversaries for convolutional networks. We discuss these motivations in more detail in Appendix ","element":"span"},{"text":"F","element":"span"},{"text":".","element":"span"}],[{"text":"We seek to enforce the tightness of the convex relaxation certificate during training. We reduce the optimality gap between the original and the relaxed problem by using various tightness indicators as regularizers during training. 
Compared with previous approaches, we have the following contributions: First, based upon the same relaxation in (","element":"span"},{"href":"#id-13","referenceIndex":32,"text":"Weng et al.","element":"a"},{"href":"#id-13","referenceIndex":32,"text":", ","element":"a"},{"href":"#id-13","referenceIndex":32,"text":"2018","element":"a"},{"text":"), we illustrate a more intuitive view for the bounds on intermediate ReLU activations achieved by (","element":"span"},{"href":"#id-5","referenceIndex":34,"text":"Wong et al.","element":"a"},{"href":"#id-5","referenceIndex":34,"text":", ","element":"a"},{"href":"#id-5","referenceIndex":34,"text":"2018","element":"a"},{"text":"), which can be viewed as a linear network facing adversaries that make bounded perturbations to both the input and the intermediate layers. Second, starting from this view, we identify conditions where the bound from the relaxed problem is tight for the original non-convex problem. Third, based on the conditions, we propose regularizers that encourage the bound to be tight for the obtained network, which improves the certificate on both MNIST and CIFAR-10.","element":"span"}]]},{"heading":"3. Problem Formulation","paragraphs":[[{"text":"In general, to train an adversarially robust network, we solve a constrained minimax problem where the adversary tries to maximize the loss given the norm constraint, and the parameters of the network are trained to minimize this maximal loss. Due to nonconvexity and the complexity of neural networks, it is expensive to solve the inner max problem exactly. 
To obtain certified robustness, like many related works (","element":"span"},{"href":"#id-5","referenceIndex":34,"text":"Wong et al.","element":"a"},{"href":"#id-5","referenceIndex":34,"text":", ","element":"a"},{"href":"#id-5","referenceIndex":34,"text":"2018","element":"a"},{"text":"; ","element":"span"},{"href":"#id-30","referenceIndex":12,"text":"Gowal et al.","element":"a"},{"href":"#id-30","referenceIndex":12,"text":", ","element":"a"},{"href":"#id-30","referenceIndex":12,"text":"2018","element":"a"},{"text":"), we minimize an upper bound of the inner max problem, which is a cross entropy loss on the negation of the lower bounds of margins over each other class, as shown in Eq. ","element":"span"},{"href":"#id-32","text":"3","element":"a"},{"text":". Without loss of generality, in this section we analyze the original and relaxed problems for minimizing the margin between the ground truth class ","element":"span"},{"style":{"fontStyle":"italic"},"text":"y ","element":"span"},{"text":"and some other class ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"under norm-bounded adversaries, which can be adapted directly to compute the loss in Eq. 
","element":"span"},{"href":"#id-32","text":"3","element":"a"},{"text":".","element":"span"}],[{"text":"The original nonconvex constrained optimization problem for finding the norm-bounded adversary that minimizes the margin can be formulated as","element":"span"}],[{"id":"id-33","style":{"width":"81%"},"width":761,"height":194,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/1-5.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":11.59},"width":290.3,"height":28.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/1-6.png","element":"img","alt":" ct = ey − et, ey","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":9.19},"width":30.56,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/1-7.png","element":"img","alt":" et","inline":true,"padRight":true},{"text":"are one-hot vectors corresponding to the label ","element":"span"},{"style":{"fontStyle":"italic"},"text":"y ","element":"span"},{"text":"and some other class ","element":"span"},{"style":{"height":16},"width":105.67,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/1-8.png","element":"img","alt":" t, σ(·)","inline":true,"padRight":true},{"text":"is the ReLU activation, and ","element":"span"},{"style":{"height":14},"width":30.51,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/1-9.png","element":"img","alt":" fi","inline":true,"padRight":true},{"text":"is one functional block of the neural network. 
This can be a linear layer (","element":"span"},{"style":{"height":16},"width":143.36,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/1-10.png","element":"img","alt":"fi(zi) =","inline":true},{"style":{"height":13.19},"width":167.44,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/1-11.png","element":"img","alt":"Wizi + bi","inline":true},{"text":"), or even a residual block. We use ","element":"span"},{"style":{"height":16},"width":142,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/1-12.png","element":"img","alt":" hi(x) =","inline":true},{"style":{"height":16},"width":368.34,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/1-13.png","element":"img","alt":"fi(σ(fi−1(· · · f1(x))))","inline":true,"padRight":true},{"text":"to denote the ReLU network up to the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":"-th layer, and ","element":"span"},{"style":{"height":15.38},"width":45.05,"height":38.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/1-14.png","element":"img","alt":" p∗O ","inline":true,"padRight":true},{"text":"to denote the optimal solution to ","element":"span"},{"href":"#id-33","style":{"fontStyle":"italic"},"text":"O","element":"a"},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"3.1. Efficient Convex Relaxations","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Grouping of ReLU Activations ","element":"span"},{"text":"The nonconvexity of ","element":"span"},{"href":"#id-33","style":{"fontStyle":"italic"},"text":"O ","element":"a"},{"text":"stems from the nonconvex feasible set given by the ReLU activations. 
Since the network is a continuous function, the pre-activations ","element":"span"},{"style":{"height":9.19},"width":33.78,"height":22.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/1-15.png","element":"img","alt":" xi","inline":true,"padRight":true},{"text":"have lower and upper bounds ","element":"span"},{"style":{"height":10.78},"width":33.78,"height":26.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/1-16.png","element":"img","alt":" xi","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":11.99},"width":33.77,"height":29.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/1-17.png","element":"img","alt":" ¯xi","inline":true,"padRight":true},{"text":"when the input ","element":"span"},{"style":{"height":16.79},"width":220.76,"height":41.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/1-18.png","element":"img","alt":" z1 ∈ Bp,ϵ(x)","inline":true},{"text":". If a certain pre-activation","element":"span"}],[{"style":{"width":"74%"},"width":1459,"height":341,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-0.png","element":"img"}],[{"id":"id-35","style":{"fontStyle":"italic"},"text":"Figure 1. ","element":"figcaption","subtype":"caption"},{"text":"The feasible sets (blue regions/lines) given by the bounded ReLU constraints (Eq. ","element":"figcaption","subtype":"caption"},{"href":"#id-33","style":{"fontStyle":"italic"},"text":"O","element":"a","subtype":"caption"},{"text":"), convex hull (","element":"figcaption","subtype":"caption"},{"style":{"height":10.39},"width":97.9,"height":25.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-1.png","element":"img","alt":"convij","inline":true},{"text":") and the relaxation (Fast-Lin) discussed in this paper (specific choice for Eq. 
","element":"figcaption","subtype":"caption"},{"href":"#id-34","style":{"height":12.8},"width":183.78,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-2.png","element":"img","alt":" C) for j ∈ Ii","inline":true},{"text":". The red lines and dots are the intersections between the boundaries of the convex feasible sets and the ReLU constraints.","element":"figcaption","subtype":"caption"}],[{"style":{"height":11.59},"width":47.05,"height":28.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-3.png","element":"img","alt":"xij","inline":true,"padRight":true},{"text":"has ","element":"span"},{"style":{"height":17.18},"width":225.46,"height":42.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-4.png","element":"img","alt":" xij < 0 < ¯xij","inline":true},{"text":", its corresponding ReLU constraint ","element":"span"},{"style":{"height":16.79},"width":260.4,"height":41.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-5.png","element":"img","alt":"zi+1,j = σ(xij)","inline":true,"padRight":true},{"text":"gives rise to a non-convex feasible set as shown in the left of Figure ","element":"span"},{"href":"#id-35","text":"1","element":"a"},{"text":", making Eq. ","element":"span"},{"href":"#id-33","style":{"fontStyle":"italic"},"text":"O ","element":"a"},{"text":"a non-convex optimization problem. 
On the other hand, if ","element":"span"},{"style":{"height":15.59},"width":136.92,"height":38.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-6.png","element":"img","alt":" ¯xij ≤ 0","inline":true,"padRight":true},{"text":"or ","element":"span"},{"style":{"height":17.19},"width":127.32,"height":42.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-7.png","element":"img","alt":"xij ≥ 0","inline":true},{"text":", the constraints degenerate into linear constraints ","element":"span"},{"style":{"height":15.59},"width":184.01,"height":38.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-8.png","element":"img","alt":"zi+1,j = 0","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":11.59},"width":211.06,"height":28.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-9.png","element":"img","alt":" zi+1,j = xij","inline":true,"padRight":true},{"text":"respectively, which do not affect convexity. 
Based on ","element":"span"},{"style":{"height":10.78},"width":33.78,"height":26.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-10.png","element":"img","alt":" xi","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":11.99},"width":33.78,"height":29.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-11.png","element":"img","alt":" ¯xi","inline":true},{"text":", we divide the ReLU activations into three disjoint subsets","element":"span"}],[{"style":{"width":"84%"},"width":795,"height":116,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-12.png","element":"img"}],[{"text":"If ","element":"span"},{"style":{"height":14},"width":110.12,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-13.png","element":"img","alt":" j ∈ Ii","inline":true},{"text":", we call the corresponding ReLU activation an ","element":"span"},{"style":{"fontStyle":"italic"},"text":"unstable neuron","element":"span"},{"text":".","element":"span"}],[{"text":"Convex relaxation expands the non-convex feasible sets into convex ones and solves a convex optimization problem ","element":"span"},{"style":{"fontStyle":"italic"},"text":"C","element":"span"},{"text":". The feasible set of ","element":"span"},{"href":"#id-33","style":{"fontStyle":"italic"},"text":"O ","element":"a"},{"text":"is a subset of the feasible set of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"C","element":"span"},{"text":", so the optimal value of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"C ","element":"span"},{"text":"lower bounds the optimal value of Eq. ","element":"span"},{"href":"#id-33","style":{"fontStyle":"italic"},"text":"O","element":"a"},{"text":". 
Moreover, we want problem ","element":"span"},{"style":{"fontStyle":"italic"},"text":"C ","element":"span"},{"text":"to be solved efficiently, better with a closed form solution, so that it can be integrated into the training process.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Computational Challenge for the “optimal” Relaxation ","element":"span"},{"text":"As pointed out by (","element":"span"},{"href":"#id-9","referenceIndex":23,"text":"Salman et al.","element":"a"},{"href":"#id-9","referenceIndex":23,"text":", ","element":"a"},{"href":"#id-9","referenceIndex":23,"text":"2019","element":"a"},{"text":"), the optimal layer-wise convex relaxation, i.e., the optimal convex relaxation for the nonlinear constraint ","element":"span"},{"style":{"height":16},"width":217.06,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-14.png","element":"img","alt":" zi+1 = σ(xi)","inline":true,"padRight":true},{"text":"of a single layer, can be obtained independently for each neuron. 
For each ","element":"span"},{"style":{"height":14},"width":116.82,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-15.png","element":"img","alt":"j ∈ Ii","inline":true,"padRight":true},{"text":"in a ReLU network, the optimal layer-wise convex relaxation is the closed convex hull ","element":"span"},{"style":{"height":11.59},"width":105.5,"height":28.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-16.png","element":"img","alt":" convij","inline":true,"padRight":true},{"text":"of ","element":"span"},{"style":{"height":15.59},"width":99.62,"height":38.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-17.png","element":"img","alt":" Sij =","inline":true}],[{"style":{"width":"99%"},"width":931,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-18.png","element":"img"}],[{"style":{"height":16.79},"width":70.85,"height":41.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-19.png","element":"img","alt":"xij}","inline":true},{"text":", which is just ","element":"span"},{"style":{"height":16.79},"width":631.76,"height":41.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-20.png","element":"img","alt":" convij = {(xij, zi+1,j)| max(0, xij) ≤","inline":true},{"style":{"height":23.49},"width":467.98,"height":58.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-21.png","element":"img","alt":"zi+1,j ≤ xijxij−xij (xij − xij)}","inline":true},{"text":", corresponding to the triangle ","element":"span"},{"text":"region in the middle of Figure ","element":"span"},{"href":"#id-35","text":"1","element":"a"},{"text":". Despite being relatively tight, there is no closed-form solution to this relaxed problem. LP solvers are typically adopted to solve a linear programming problem for each neuron. 
Therefore, such a relaxation is hardly scalable to verify larger networks without any additional trick (like (","element":"span"},{"href":"#id-36","referenceIndex":35,"text":"Xiao et al.","element":"a"},{"href":"#id-36","referenceIndex":35,"text":", ","element":"a"},{"href":"#id-36","referenceIndex":35,"text":"2018","element":"a"},{"text":")). (","element":"span"},{"href":"#id-13","referenceIndex":32,"text":"Weng ","element":"a"},{"href":"#id-13","referenceIndex":32,"text":"et al.","element":"a"},{"href":"#id-13","referenceIndex":32,"text":", ","element":"a"},{"href":"#id-13","referenceIndex":32,"text":"2018","element":"a"},{"text":") find it to be 34 to 1523 times slower than FastLin, and it has difficulty verifying MLPs with more than 3 layers on MNIST. In (","element":"span"},{"href":"#id-9","referenceIndex":23,"text":"Salman et al.","element":"a"},{"href":"#id-9","referenceIndex":23,"text":", ","element":"a"},{"href":"#id-9","referenceIndex":23,"text":"2019","element":"a"},{"text":"), it takes 10,000 ","element":"span"},{"text":"CPU cores to parallelize the LP solvers for bounding the activations of every neuron in a two-hidden-layer MLP with 100 neurons per layer. 
Since solving LP problems for all neurons is usually impractical, it is even more difficult to optimize the network to maximize the lower bounds of margin found by solving this relaxation problem, as differentiating through the LP optimization process is even more expensive.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Computationally Efficient Relaxations ","element":"span"},{"text":"In the layer-wise convex relaxation, instead of using a boundary nonlinear in ","element":"span"},{"style":{"height":11.59},"width":47.04,"height":28.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-22.png","element":"img","alt":" xij","inline":true},{"text":", (","element":"span"},{"href":"#id-8","referenceIndex":37,"text":"Zhang et al.","element":"a"},{"href":"#id-8","referenceIndex":37,"text":", ","element":"a"},{"href":"#id-8","referenceIndex":37,"text":"2018","element":"a"},{"text":") has shown that for any nonlinearity, when both the lower and upper boundaries are linear in ","element":"span"},{"style":{"height":11.59},"width":47.05,"height":28.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-23.png","element":"img","alt":" xij","inline":true},{"text":", there exist closed-form solutions to the relaxed problem, which avoids using LP solvers and improves efficiency. 
Specifically, the following relaxation of ","element":"span"},{"href":"#id-33","style":{"fontStyle":"italic"},"text":"O ","element":"a"},{"text":"has closed-form solutions:","element":"span"}],[{"id":"id-34","style":{"width":"88%"},"width":829,"height":287,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-24.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":4.8},"width":11,"height":12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-25.png","element":"img","alt":" ·","inline":true,"padRight":true},{"text":"denotes element-wise product, and for simplicity, we have only considered networks with no skip connections, and represent both Fully Connected and Convolutional Layers as a linear transform ","element":"span"},{"style":{"height":16},"width":320.48,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-26.png","element":"img","alt":" fi(zi) = Wizi + bi.","inline":true}],[{"text":"Before we can solve ","element":"span"},{"href":"#id-34","style":{"fontStyle":"italic"},"text":"C ","element":"a"},{"text":"to get the lower bound of margin, we need to know the range ","element":"span"},{"style":{"height":16},"width":111.86,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-27.png","element":"img","alt":" [xi, ¯xi]","inline":true,"padRight":true},{"text":"for the pre-activations ","element":"span"},{"style":{"height":14},"width":166.76,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-28.png","element":"img","alt":"xi. 
As in (","inline":true},{"href":"#id-4","referenceIndex":33,"text":"Wong & Kolter","element":"a"},{"href":"#id-4","referenceIndex":33,"text":", ","element":"a"},{"href":"#id-4","referenceIndex":33,"text":"2017","element":"a"},{"text":"; ","element":"span"},{"href":"#id-13","referenceIndex":32,"text":"Weng et al.","element":"a"},{"href":"#id-13","referenceIndex":32,"text":", ","element":"a"},{"href":"#id-13","referenceIndex":32,"text":"2018","element":"a"},{"text":"; ","element":"span"},{"href":"#id-8","referenceIndex":37,"text":"Zhang ","element":"a"},{"href":"#id-8","referenceIndex":37,"text":"et al.","element":"a"},{"href":"#id-8","referenceIndex":37,"text":", ","element":"a"},{"href":"#id-8","referenceIndex":37,"text":"2018","element":"a"},{"text":"), we can solve the same optimization problem for each neuron ","element":"span"},{"style":{"height":11.59},"width":47.05,"height":28.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-29.png","element":"img","alt":" xij","inline":true,"padRight":true},{"text":"starting from layer 1 to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L","element":"span"},{"text":", by replacing ","element":"span"},{"style":{"height":17.58},"width":494.26,"height":43.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-30.png","element":"img","alt":"ct with ej or −ej for xij or ¯xij","inline":true,"padRight":true},{"text":"respectively.","element":"span"},{"text":"1","element":"span"}],[{"text":"The most efficient approach in this category is FastLin (","element":"span"},{"href":"#id-13","referenceIndex":32,"text":"Weng et al.","element":"a"},{"href":"#id-13","referenceIndex":32,"text":", ","element":"a"},{"href":"#id-13","referenceIndex":32,"text":"2018","element":"a"},{"text":"), which sets ","element":"span"},{"style":{"height":15.99},"width":147.62,"height":39.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-31.png","element":"img","alt":" aij = 
¯aij","inline":true},{"text":", as shown in the right of Figure ","element":"span"},{"href":"#id-35","text":"1","element":"a"},{"text":". A tighter choice is CROWN (","element":"span"},{"href":"#id-8","referenceIndex":37,"text":"Zhang ","element":"a"},{"href":"#id-8","referenceIndex":37,"text":"et al.","element":"a"},{"href":"#id-8","referenceIndex":37,"text":", ","element":"a"},{"href":"#id-8","referenceIndex":37,"text":"2018","element":"a"},{"text":"), which chooses different ","element":"span"},{"style":{"height":13.18},"width":45.33,"height":32.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-32.png","element":"img","alt":" aij","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":14.39},"width":45.33,"height":35.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/2-33.png","element":"img","alt":" ¯aij","inline":true,"padRight":true},{"text":"such that the convex feasible set is minimized. However, CROWN ","element":"span"},{"text":"has much higher complexity than Fast-Lin due to its varying slopes. We give detailed analysis of the closed-form solutions of both bounds and their complexities in Appendix ","element":"span"},{"href":"#id-37","text":"D","element":"a"},{"text":". Recently, CROWN-IBP (","element":"span"},{"href":"#id-31","referenceIndex":38,"text":"Zhang et al.","element":"a"},{"href":"#id-31","referenceIndex":38,"text":", ","element":"a"},{"href":"#id-31","referenceIndex":38,"text":"2020","element":"a"},{"text":") has been proposed to provide a better initialization to IBP, which uses IBP to estimate range ","element":"span"},{"style":{"height":16},"width":111.87,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-0.png","element":"img","alt":" [xi, ¯xi]","inline":true,"padRight":true},{"text":"for CROWN. 
In this case, both CROWN and Fast-Lin have the same complexity and CROWN is a better choice.","element":"span"}],[{"style":{"width":"59%"},"width":560,"height":390,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-1.png","element":"img"}],[{"id":"id-38","style":{"fontStyle":"italic"},"text":"Figure 2. ","element":"figcaption","subtype":"caption"},{"text":"Illustration of the data distribution and the decision boundary of the network ","element":"figcaption","subtype":"caption"},{"style":{"height":14.4},"width":99.93,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-2.png","element":"img","alt":" h2(z1)","inline":true},{"text":", where Fast-Lin gives the exact lower bound of the margin for every sample in ","element":"figcaption","subtype":"caption"},{"style":{"height":11.59},"width":37.52,"height":28.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-3.png","element":"img","alt":" S1","inline":true},{"text":". We assume ","element":"figcaption","subtype":"caption"},{"style":{"height":11.19},"width":68.32,"height":27.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-4.png","element":"img","alt":" x0 is","inline":true,"padRight":true},{"text":"uniformly distributed in ","element":"figcaption","subtype":"caption"},{"style":{"height":14.4},"width":592.87,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-5.png","element":"img","alt":" S0 ∪ S1, where S0 = {x0|x02 ≤ |x01| −","inline":true}],[{"style":{"width":"99%"},"width":935,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-6.png","element":"img"}],[{"text":"0 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"< b < ","element":"span"},{"text":"1","element":"span"},{"text":". 
The ground-truth label for ","element":"span"},{"style":{"height":12.4},"width":396.05,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-7.png","element":"img","alt":" x0 ∈ S0 and x0 ∈ S1 are 0,","inline":true,"padRight":true},{"text":"1 respectively. In this case, ","element":"span"},{"style":{"height":16.49},"width":524.48,"height":41.24,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-8.png","element":"img","alt":" b = 0.3, ϵ = 0.2, x0 = [0.1, 0.42]T .","inline":true}]]},{"heading":"4. Tighter Bounds via Regularization","paragraphs":[[{"text":"Despite being relatively efficient to compute, Fast-Lin and CROWN are not even the tightest layer-wise convex relaxation. Using tighter bounds to train the networks could potentially lead to higher certified robustness by preventing such bounds from over-regularizing the networks.","element":"span"}],[{"text":"Nevertheless, there exist certain parameters and inputs such that the seemingly looser Fast-Lin is tight for ","element":"span"},{"href":"#id-33","style":{"fontStyle":"italic"},"text":"O","element":"a"},{"text":", i.e., the optimal value of Fast-Lin is the same as ","element":"span"},{"href":"#id-33","style":{"fontStyle":"italic"},"text":"O","element":"a"},{"text":". The immediate trivial case one can think of is where no unstable neuron exists for the samples inside the allowed perturbation interval.","element":"span"}],[{"text":"In fact, even when unstable neurons exist, the optimal solution to the relaxed problem can still be a feasible solution to the original non-convex problem for a significant portion of input samples. We give an illustrative example where Fast-Lin is tight for a significant portion of the samples even when unstable neurons exist, as shown in Figure ","element":"span"},{"href":"#id-38","text":"2","element":"a"},{"text":". 
In this figure, Fast-Lin is tight at every sample of ","element":"span"},{"style":{"height":13.19},"width":40.44,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-9.png","element":"img","alt":" S1","inline":true,"padRight":true},{"text":"for the network ","element":"span"},{"style":{"height":16},"width":108.75,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-10.png","element":"img","alt":" h1(z2)","inline":true},{"text":". Please refer to Appendix ","element":"span"},{"text":"E ","element":"span"},{"text":"for more details of this example.","element":"span"}],[{"text":"It is therefore interesting to check the conditions for Fast-Lin or CROWN to be tight for ","element":"span"},{"href":"#id-33","style":{"fontStyle":"italic"},"text":"O","element":"a"},{"text":", and enforcing such conditions during training so that the network can be better verified by efficient verifiers like Fast-Lin, CROWN, and even IBP.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"4.1. Conditions for Tightness","element":"span"}],[{"text":"Here we look into conditions that make the optimal value ","element":"span"},{"style":{"height":15.38},"width":37.05,"height":38.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-11.png","element":"img","alt":" p∗C","inline":true,"padRight":true},{"text":"of the convex problem ","element":"span"},{"style":{"fontStyle":"italic"},"text":"C ","element":"span"},{"text":"to be equal to ","element":"span"},{"style":{"height":18.18},"width":310.43,"height":45.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-12.png","element":"img","alt":" p∗O. 
Let {zi, xi}Li=1","inline":true,"padRight":true},{"text":"be some feasible solution of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"C","element":"span"},{"text":", from which the objective value of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"C ","element":"span"},{"text":"can be determined as ","element":"span"},{"style":{"height":17.93},"width":442.84,"height":44.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-13.png","element":"img","alt":" pC = c⊤t xL. Let {z′i, x′i}Li=1","inline":true,"padRight":true},{"text":"be some feasible solution of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"O ","element":"span"},{"text":"computed by passing ","element":"span"},{"style":{"height":10.76},"width":34.53,"height":26.89,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-14.png","element":"img","alt":" z′1","inline":true,"padRight":true},{"text":"through the ReLU sub-networks ","element":"span"},{"style":{"height":16},"width":104.14,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-15.png","element":"img","alt":" hi(z′1)","inline":true,"padRight":true},{"text":"defined in ","element":"span"},{"href":"#id-33","style":{"fontStyle":"italic"},"text":"O","element":"a"},{"text":", and ","element":"span"},{"text":"denote the resulting feasible objective value as ","element":"span"},{"style":{"height":11.19},"width":202.08,"height":27.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-16.png","element":"img","alt":" p′O = c⊤t x′L.","inline":true}],[{"text":"Generally, for a given network with the set of weights ","element":"span"},{"style":{"height":17.93},"width":190.57,"height":44.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-17.png","element":"img","alt":"{Wi, bi}Li=1","inline":true},{"text":", as long as the optimal solution 
","element":"span"},{"style":{"height":17.93},"width":189,"height":44.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-18.png","element":"img","alt":" {z∗i , x∗i }Li=1","inline":true,"padRight":true},{"text":"of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"C ","element":"span"},{"text":"is equal to a feasible solution ","element":"span"},{"style":{"height":17.93},"width":177.14,"height":44.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-19.png","element":"img","alt":" {z′i, x′i}Li=1","inline":true,"padRight":true},{"text":"of ","element":"span"},{"href":"#id-33","style":{"fontStyle":"italic"},"text":"O","element":"a"},{"text":", we ","element":"span"},{"text":"will have ","element":"span"},{"style":{"height":15.37},"width":151.31,"height":38.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-20.png","element":"img","alt":" p∗O = p∗C","inline":true},{"text":", since any feasible ","element":"span"},{"style":{"height":11.19},"width":45.05,"height":27.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-21.png","element":"img","alt":" p′O","inline":true,"padRight":true},{"text":"of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"O ","element":"span"},{"text":"satisfies ","element":"span"},{"style":{"height":15.37},"width":146.35,"height":38.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-22.png","element":"img","alt":"p′O ≥ p∗O","inline":true},{"text":", and by the nature of relaxation ","element":"span"},{"style":{"height":15.37},"width":152.04,"height":38.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-23.png","element":"img","alt":" p∗C ≤ p∗O.","inline":true}],[{"text":"Therefore, for a given network and input ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":", to check the tightness of the convex relaxation, we can check whether its optimal solution 
","element":"span"},{"style":{"height":17.93},"width":189,"height":44.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-24.png","element":"img","alt":" {z∗i , x∗i }Li=1 ","inline":true,"padRight":true},{"text":"is feasible for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"O","element":"span"},{"text":". This can ","element":"span"},{"text":"be achieved by passing ","element":"span"},{"style":{"height":14.94},"width":36.29,"height":37.35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-25.png","element":"img","alt":" z∗1 ","inline":true,"padRight":true},{"text":"through the ReLU network, and ","element":"span"},{"text":"either directly check the resultant objective value ","element":"span"},{"style":{"height":11.19},"width":45.05,"height":27.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-26.png","element":"img","alt":" p′O","inline":true},{"text":", or ","element":"span"},{"text":"compare ","element":"span"},{"style":{"height":17.93},"width":189.01,"height":44.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-27.png","element":"img","alt":" {z∗i , x∗i }Li=1","inline":true,"padRight":true},{"text":"with the resultant feasible solution ","element":"span"},{"style":{"height":17.93},"width":177.14,"height":44.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-28.png","element":"img","alt":"{z′i, x′i}Li=1","inline":true},{"text":". Further, we can encourage such conditions to ","element":"span"},{"text":"happen during the training process to improve the tightness of the bound. Based on such mechanisms, we propose two regularizers to enforce the tightness. 
Notice such regularizers are different from the RS Loss (","element":"span"},{"href":"#id-36","referenceIndex":35,"text":"Xiao et al.","element":"a"},{"href":"#id-36","referenceIndex":35,"text":", ","element":"a"},{"href":"#id-36","referenceIndex":35,"text":"2018","element":"a"},{"text":") introduced to reduce the number of unstable neurons, since we have shown with Appendix ","element":"span"},{"text":"E ","element":"span"},{"text":"that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"C ","element":"span"},{"text":"can be tight even when unstable neurons exist.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"4.2. An Intuitive Indicator of Tightness: Difference in Output Bounds","element":"span"}],[{"text":"The observation above motivates us to consider the non-negative value","element":"span"}],[{"id":"id-41","style":{"width":"77%"},"width":724,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-29.png","element":"img"}],[{"text":"as an indicator of the difference between ","element":"span"},{"style":{"height":17.93},"width":189,"height":44.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-30.png","element":"img","alt":" {z∗i , x∗i }Li=1","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":17.93},"width":177.14,"height":44.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-31.png","element":"img","alt":"{z′i, x′i}Li=1","inline":true},{"text":", where ","element":"span"},{"style":{"height":16.39},"width":446.92,"height":40.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-32.png","element":"img","alt":" p′O(x, δ∗0) = c⊤t hL(x + δ∗0)","inline":true,"padRight":true},{"text":"is the ","element":"span"},{"text":"margin over class ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"computed by passing the optimal perturbation 
","element":"span"},{"style":{"height":15.56},"width":35.22,"height":38.9,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-33.png","element":"img","alt":" δ∗0","inline":true,"padRight":true},{"text":"for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"C ","element":"span"},{"text":"through the original network. ","element":"span"},{"style":{"height":15.56},"width":35.21,"height":38.9,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-34.png","element":"img","alt":" δ∗0","inline":true,"padRight":true},{"text":"can be ","element":"span"},{"text":"computed efficiently from the optimality condition of FastLin or CROWN, as demonstrated in Eq. ","element":"span"},{"href":"#id-39","text":"8","element":"a"},{"text":". ","element":"span"},{"text":"For example, when ","element":"span"},{"style":{"height":10.4},"width":130.69,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-35.png","element":"img","alt":" p = ∞","inline":true},{"text":", the optimal input perturbation ","element":"span"},{"style":{"height":15.56},"width":35.22,"height":38.89,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-36.png","element":"img","alt":" δ∗0","inline":true,"padRight":true},{"text":"of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"C ","element":"span"},{"text":"is ","element":"span"},{"style":{"height":16},"width":368.06,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-37.png","element":"img","alt":" δ∗0 = −ϵsign(c⊤t WL:1)","inline":true},{"text":", which corresponds to sending ","element":"span"},{"style":{"height":16},"width":497.66,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-38.png","element":"img","alt":"z′1 = z∗1 = x − ϵsign(c⊤t WL:1)","inline":true,"padRight":true},{"text":"through the ReLU network;","element":"span"}],[{"text":"when 
","element":"span"},{"style":{"height":25.3},"width":448.86,"height":63.26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-39.png","element":"img","alt":" p = 2, δ∗0 = −ϵ c⊤t WL:1∥c⊤t WL:1∥2","inline":true,"padRight":true},{"text":", which corresponds to ","element":"span"},{"text":"sending ","element":"span"},{"style":{"height":25.31},"width":448.81,"height":63.27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-40.png","element":"img","alt":" z′1 = z∗1 = x − ϵ c⊤t WL:1∥c⊤t WL:1∥2 .","inline":true}],[{"text":"The larger ","element":"span"},{"style":{"height":16},"width":219.32,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-41.png","element":"img","alt":" d(x, δ∗0, W, b)","inline":true,"padRight":true},{"text":"is, the more relaxed ","element":"span"},{"style":{"fontStyle":"italic"},"text":"C ","element":"span"},{"text":"is, and the ","element":"span"},{"text":"higher ","element":"span"},{"style":{"height":15.37},"width":140.47,"height":38.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-42.png","element":"img","alt":" p∗O − p∗C","inline":true,"padRight":true},{"text":"could be. 
Therefore, we can regularize ","element":"span"},{"text":"the network to minimize ","element":"span"},{"style":{"height":16},"width":219.32,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-43.png","element":"img","alt":" d(x, δ∗0, W, b)","inline":true,"padRight":true},{"text":"during training and ","element":"span"},{"text":"maximize the lower-bound of the margin ","element":"span"},{"style":{"height":15.38},"width":37.05,"height":38.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-44.png","element":"img","alt":" p∗C","inline":true},{"text":", so that we can ","element":"span"},{"text":"obtain a network where ","element":"span"},{"style":{"height":15.37},"width":37.05,"height":38.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-45.png","element":"img","alt":" p∗C","inline":true,"padRight":true},{"text":"is a better estimate of ","element":"span"},{"style":{"height":15.37},"width":45.05,"height":38.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/3-46.png","element":"img","alt":" p∗O","inline":true,"padRight":true},{"text":"and","element":"span"}],[{"text":"the robustness is better represented by ","element":"span"},{"style":{"height":15.37},"width":37.05,"height":38.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-0.png","element":"img","alt":" p∗C","inline":true},{"text":". Such an indicator ","element":"span"},{"text":"avoids comparing the intermediate variables, which gives more flexibility for adjustment. It bears some similarities to knowledge distillation (","element":"span"},{"href":"#id-40","referenceIndex":14,"text":"Hinton et al.","element":"a"},{"href":"#id-40","referenceIndex":14,"text":", ","element":"a"},{"href":"#id-40","referenceIndex":14,"text":"2015","element":"a"},{"text":"), in that it encourages learning a network whose relaxed lower bound gives similar outputs of the corresponding ReLU network. 
It is worth noting that minimizing ","element":"span"},{"style":{"height":16},"width":219.32,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-1.png","element":"img","alt":" d(x, δ∗0, W, b)","inline":true,"padRight":true},{"text":"does not ","element":"span"},{"text":"necessarily lead to decreasing ","element":"span"},{"style":{"height":16.39},"width":157.69,"height":40.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-2.png","element":"img","alt":" p′O(x, δ∗0)","inline":true,"padRight":true},{"text":"or increasing ","element":"span"},{"style":{"height":15.38},"width":37.05,"height":38.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-3.png","element":"img","alt":" p∗C","inline":true},{"text":". ","element":"span"},{"text":"In fact, both ","element":"span"},{"style":{"height":16.39},"width":267.94,"height":40.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-4.png","element":"img","alt":" p′O(x, δ∗0) and p∗C ","inline":true,"padRight":true},{"text":"can be increased or decreased ","element":"span"},{"text":"at the same time with their difference decreasing.","element":"span"}],[{"text":"The tightest indicator should give the minimum gap ","element":"span"},{"style":{"height":15.38},"width":135.56,"height":38.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-5.png","element":"img","alt":" p∗O−p∗C,","inline":true,"padRight":true},{"text":"where we need to find the optimal perturbation for ","element":"span"},{"href":"#id-33","style":{"fontStyle":"italic"},"text":"O","element":"a"},{"text":". However, the minimum gap cannot be found in polynomial time, due to the non-convex nature of ","element":"span"},{"href":"#id-33","style":{"fontStyle":"italic"},"text":"O","element":"a"},{"text":". 
(","element":"span"},{"href":"#id-13","referenceIndex":32,"text":"Weng et al.","element":"a"},{"href":"#id-13","referenceIndex":32,"text":", ","element":"a"},{"href":"#id-13","referenceIndex":32,"text":"2018","element":"a"},{"text":") also proved that there is no polynomial time algorithm to find the minimum ","element":"span"},{"style":{"height":7.6},"width":32.61,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-6.png","element":"img","alt":" ℓ1","inline":true},{"text":"-norm adversarial distortion with ","element":"span"},{"text":"0","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"},{"text":"99 ln ","element":"span"},{"style":{"fontStyle":"italic"},"text":"n ","element":"span"},{"text":"approximation ratio unless NP=P, a problem equivalent to finding the minimum margin here.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"4.3. A Better Indicator for Regularization: Difference in Optimal Pre-activations","element":"span"}],[{"text":"Despite being intuitive and able to achieve improvements, Eq. ","element":"span"},{"href":"#id-41","text":"2 ","element":"a"},{"text":"which enforces similarity between objective values does not work as well as enforcing similarity between the solutions ","element":"span"},{"style":{"height":17.93},"width":189,"height":44.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-7.png","element":"img","alt":" {z∗i , x∗i }Li=1","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":17.93},"width":177.15,"height":44.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-8.png","element":"img","alt":" {z′i, x′i}Li=1","inline":true,"padRight":true},{"text":"in practice, an ","element":"span"},{"text":"approach we will elaborate below. 
For both CROWN and Fast-Lin, unless ","element":"span"},{"style":{"height":17.93},"width":500.99,"height":44.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-9.png","element":"img","alt":" d(x, δ∗0, W, b) = 0, {z∗i , x∗i }Li=1","inline":true,"padRight":true},{"text":"may devi- ","element":"span"},{"text":"ate a lot from ","element":"span"},{"style":{"height":17.93},"width":177.14,"height":44.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-10.png","element":"img","alt":" {z′i, x′i}Li=1","inline":true,"padRight":true},{"text":"and does not correspond to any ","element":"span"},{"text":"ReLU network, even if ","element":"span"},{"style":{"height":16},"width":219.32,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-11.png","element":"img","alt":" d(x, δ∗0, W, b)","inline":true,"padRight":true},{"text":"may seem small. For ","element":"span"},{"text":"example, it is possible that ","element":"span"},{"style":{"height":17.53},"width":130.3,"height":43.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-12.png","element":"img","alt":" z∗ij < 0","inline":true,"padRight":true},{"text":"for a given ","element":"span"},{"style":{"height":14.94},"width":36.29,"height":37.35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-13.png","element":"img","alt":" z∗1","inline":true},{"text":", but a ","element":"span"},{"text":"ReLU network will always have ","element":"span"},{"style":{"height":17.35},"width":129.66,"height":43.37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-14.png","element":"img","alt":" z′ij ≥ 0.","inline":true}],[{"text":"We find an alternative regularizer more effective at improving verifiable accuracy. 
The regularizer encourages the feasible solution ","element":"span"},{"href":"#id-33","style":{"height":17.93},"width":423.98,"height":44.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-15.png","element":"img","alt":" {z′i, x′i}Li=1 of O to exactly","inline":true,"padRight":true},{"text":"match the feasible ","element":"span"},{"text":"optimal solution ","element":"span"},{"style":{"height":17.93},"width":189,"height":44.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-16.png","element":"img","alt":" {z∗i , x∗i }Li=1","inline":true,"padRight":true},{"text":"of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"C","element":"span"},{"text":". Since we are adopting ","element":"span"},{"text":"the layer-wise convex relaxation, the optimal solutions of the unstable neurons can be considered independently.","element":"span"}],[{"text":"Here we derive a sufficient condition for tightness for FastLin, which also serves as a sufficient condition for CROWN. For linear programming, the optimal solution occurs on the boundaries of the feasible set. 
Since Fast-Lin is a layer-wise convex relaxation, the solution to each of its neurons in ","element":"span"},{"style":{"height":9.19},"width":29.53,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-17.png","element":"img","alt":" zi","inline":true,"padRight":true},{"text":"can be considered independently, and therefore for a specific layer ","element":"span"},{"style":{"height":14},"width":188.63,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-18.png","element":"img","alt":" i and j ∈ Ii","inline":true},{"text":", the pair of optimal solutions ","element":"span"},{"style":{"height":18.55},"width":196.46,"height":46.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-19.png","element":"img","alt":" (x∗ij, z∗i+1,j)","inline":true,"padRight":true},{"text":"should occur on the boundary in the right of Figure ","element":"span"},{"href":"#id-35","text":"1","element":"a"},{"text":". It follows that the only 3 optimal solutions ","element":"span"},{"style":{"height":18.55},"width":196.46,"height":46.37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-20.png","element":"img","alt":" (x∗ij, z∗i+1,j)","inline":true,"padRight":true},{"text":"of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"C ","element":"span"},{"text":"that are also feasible for ","element":"span"},{"href":"#id-33","style":{"fontStyle":"italic"},"text":"O ","element":"a"},{"text":"are ","element":"span"},{"style":{"height":18.39},"width":288.07,"height":45.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-21.png","element":"img","alt":" (xij, 0), (xij, xij)","inline":true,"padRight":true},{"text":"and ","element":"span"},{"text":"(0","element":"span"},{"style":{"fontStyle":"italic"},"text":", ","element":"span"},{"text":"0)","element":"span"},{"text":". 
Notice they are also in the intersection between the boundary of CROWN and ","element":"span"},{"href":"#id-33","style":{"fontStyle":"italic"},"text":"O","element":"a"},{"text":".","element":"span"}],[{"id":"id-32","text":"In practice, out of efficiency concerns, both Fast-Lin and ","element":"span"},{"text":"CROWN identify the boundaries that the optimal solution ","element":"span"},{"text":"lies on and computes the optimal value by accumulating the contribution of each layer in a backward pass, without explicitly computing ","element":"span"},{"style":{"height":17.93},"width":189,"height":44.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-22.png","element":"img","alt":" {z∗i , x∗i }Li=1","inline":true,"padRight":true},{"text":"for each layer with ","element":"span"},{"text":"a forward pass (see Appendix ","element":"span"},{"href":"#id-37","text":"D ","element":"a"},{"text":"for more details). It is therefore beneficial to link the feasible solutions of ","element":"span"},{"href":"#id-33","style":{"fontStyle":"italic"},"text":"O ","element":"a"},{"text":"to the parameters of the boundaries. Specifically, let ","element":"span"},{"style":{"height":18.15},"width":90.56,"height":45.37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-23.png","element":"img","alt":" δ∗ij ∈","inline":true},{"style":{"height":20.19},"width":147.97,"height":50.48,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-24.png","element":"img","alt":"{bij,¯bij}","inline":true,"padRight":true},{"text":"be the intercept of the line that the optimal solution ","element":"span"},{"style":{"height":18.55},"width":196.46,"height":46.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-25.png","element":"img","alt":" (x∗ij, z∗i+1,j)","inline":true,"padRight":true},{"text":"lies on. 
We want to find a rule based ","element":"span"},{"text":"on ","element":"span"},{"style":{"height":17.93},"width":129.13,"height":44.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-26.png","element":"img","alt":" {δ∗i }Li=1","inline":true,"padRight":true},{"text":"to determine whether the bound is tight from ","element":"span"},{"text":"the values of ","element":"span"},{"style":{"height":17.93},"width":127.63,"height":44.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-27.png","element":"img","alt":" {x′i}Li=1","inline":true},{"text":". For both Fast-Lin and CROWN, ","element":"span"},{"style":{"height":25.4},"width":403.22,"height":63.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-28.png","element":"img","alt":"bij = 0,¯bij = −xijxijxij−xij","inline":true,"padRight":true},{"text":". For Fast-Lin, when ","element":"span"},{"style":{"height":20.36},"width":152.28,"height":50.89,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-29.png","element":"img","alt":" δ∗ij = ¯bij","inline":true},{"text":", ","element":"span"},{"text":"only ","element":"span"},{"style":{"height":18.55},"width":572.72,"height":46.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-30.png","element":"img","alt":" (x∗ij, z∗i+1,j) = (xij, 0) or (xij, xij)","inline":true,"padRight":true},{"text":"are feasible for ","element":"span"},{"href":"#id-33","style":{"fontStyle":"italic"},"text":"O","element":"a"},{"text":"; ","element":"span"},{"text":"when ","element":"span"},{"style":{"height":18.55},"width":579.64,"height":46.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-31.png","element":"img","alt":" δ∗ij = bij, only (x∗ij, z∗i+1,j) = (0, 0)","inline":true,"padRight":true},{"text":"is feasible for ","element":"span"},{"href":"#id-33","style":{"fontStyle":"italic"},"text":"O","element":"a"},{"text":". 
","element":"span"},{"text":"Meanwhile, ","element":"span"},{"style":{"height":18.55},"width":358.14,"height":46.37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-32.png","element":"img","alt":" z′i+1,j = max(x′ij, 0)","inline":true,"padRight":true},{"text":"is deterministic if ","element":"span"},{"style":{"height":13.35},"width":47.05,"height":33.37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-33.png","element":"img","alt":" x′ij","inline":true,"padRight":true},{"text":"is given. Therefore, when the bound is tight for Fast-Lin, if ","element":"span"},{"style":{"height":18.15},"width":151.95,"height":45.37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-34.png","element":"img","alt":" δ∗ij = bij","inline":true},{"text":", then ","element":"span"},{"style":{"height":17.35},"width":135.65,"height":43.37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-35.png","element":"img","alt":" x′ij = 0","inline":true},{"text":". Otherwise, if ","element":"span"},{"style":{"height":20.36},"width":151.96,"height":50.89,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-36.png","element":"img","alt":" δ∗ij = ¯bij","inline":true},{"text":", and ","element":"span"},{"style":{"height":13.35},"width":155.68,"height":33.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-37.png","element":"img","alt":"x′ij = xij","inline":true,"padRight":true},{"text":"or ","element":"span"},{"style":{"height":11.59},"width":47.04,"height":28.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-38.png","element":"img","alt":" xij","inline":true},{"text":". 
For CROWN, this condition is also fea- ","element":"span"},{"text":"sible, though it could be either ","element":"span"},{"style":{"height":17.35},"width":131.3,"height":43.37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-39.png","element":"img","alt":" x′ij ≤ 0","inline":true,"padRight":true},{"text":"or ","element":"span"},{"style":{"height":17.35},"width":131.3,"height":43.37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-40.png","element":"img","alt":" x′ij ≥ 0","inline":true,"padRight":true},{"text":"when ","element":"span"},{"style":{"height":18.15},"width":118.93,"height":45.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-41.png","element":"img","alt":"δ∗ij = 0","inline":true},{"text":", depending on the optimal slope ","element":"span"},{"style":{"height":23.52},"width":92.87,"height":58.8,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-42.png","element":"img","alt":" D(L)ij .","inline":true}],[{"text":"Indeed, we achieve optimal tightness (","element":"span"},{"style":{"height":15.37},"width":145.01,"height":38.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-43.png","element":"img","alt":"p∗C = p∗O","inline":true},{"text":") for both ","element":"span"},{"text":"Fast-Lin and CROWN if ","element":"span"},{"style":{"height":13.35},"width":47.05,"height":33.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-44.png","element":"img","alt":" x′ij","inline":true,"padRight":true},{"text":"satisfy these conditions at ","element":"span"},{"style":{"fontStyle":"italic"},"text":"all ","element":"span"},{"text":"unstable neurons. Specifically,","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Proposition 1. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Assume ","element":"span"},{"style":{"height":17.93},"width":177.15,"height":44.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-45.png","element":"img","alt":" {z′i, x′i}Li=1 ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is obtained by the ReLU ","element":"span"},{"style":{"fontStyle":"italic"},"text":"network ","element":"span"},{"style":{"height":13.19},"width":44.96,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-46.png","element":"img","alt":" hL","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"with input ","element":"span"},{"style":{"height":19.26},"width":265.35,"height":48.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-47.png","element":"img","alt":" z′1, and {δ∗i }L−1i=0 ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is the optimal solu- ","element":"span"},{"style":{"fontStyle":"italic"},"text":"tion of Fast-Lin or CROWN. 
If ","element":"span"},{"style":{"height":18.55},"width":470.48,"height":46.37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-48.png","element":"img","alt":" z′1 = x+δ∗0, and x′ij ∈ S(δ∗ij)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"for all ","element":"span"},{"style":{"height":14},"width":373.14,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-49.png","element":"img","alt":" i = 1, ..., L − 1, j ∈ Ii","inline":true},{"style":{"fontStyle":"italic"},"text":", then ","element":"span"},{"style":{"height":17.93},"width":177.14,"height":44.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-50.png","element":"img","alt":" {z′i, x′i}Li=1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is an opti- ","element":"span"},{"style":{"fontStyle":"italic"},"text":"mal solution of ","element":"span"},{"href":"#id-33","style":{"fontStyle":"italic"},"text":"O","element":"a"},{"style":{"fontStyle":"italic"},"text":", Fast-Lin and CROWN. Here","element":"span"}],[{"style":{"width":"72%"},"width":681,"height":126,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-51.png","element":"img"}],[{"text":"We provide the proof of this simple proposition in the Appendix.","element":"span"}],[{"text":"It remains to be discussed how to best enforce the similarity between the optimal solutions of ","element":"span"},{"href":"#id-33","style":{"fontStyle":"italic"},"text":"O ","element":"a"},{"text":"and Fast-Lin or CROWN. 
Like before, we choose to enforce the similarity between ","element":"span"},{"style":{"height":17.93},"width":127.63,"height":44.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-52.png","element":"img","alt":" {x′i}Li=1","inline":true,"padRight":true},{"text":"and the closest optimal solution of Fast- ","element":"span"},{"text":"Lin, where ","element":"span"},{"style":{"height":17.93},"width":127.63,"height":44.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-53.png","element":"img","alt":" {x′i}Li=1","inline":true,"padRight":true},{"text":"is constructed by setting ","element":"span"},{"style":{"height":14.94},"width":190.12,"height":37.35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-54.png","element":"img","alt":" x′1 = x∗1 =","inline":true},{"style":{"height":16},"width":280.44,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-55.png","element":"img","alt":"W1(x + δ∗0) + b1","inline":true,"padRight":true},{"text":"and pass ","element":"span"},{"style":{"height":10.75},"width":38.78,"height":26.88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-56.png","element":"img","alt":" x′1","inline":true,"padRight":true},{"text":"through the ReLU network ","element":"span"},{"text":"to obtain ","element":"span"},{"style":{"height":16.15},"width":268.36,"height":40.37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-57.png","element":"img","alt":" x′i = hi(x + δ∗0)","inline":true},{"text":". 
By Proposition ","element":"span"},{"href":"#id-42","text":"1","element":"a"},{"text":", the distance ","element":"span"},{"text":"can be computed by considering the values of the intercepts","element":"span"}],[{"style":{"width":"97%"},"width":916,"height":495,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/4-58.png","element":"img"}],[{"style":{"width":"99%"},"width":1945,"height":765,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/5-0.png","element":"img"}],[{"id":"id-43","text":"where the first term corresponds to ","element":"span"},{"style":{"height":18.15},"width":130.65,"height":45.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/5-1.png","element":"img","alt":" δ∗ij = 0","inline":true,"padRight":true},{"text":"and the con- ","element":"span"},{"text":"dition ","element":"span"},{"style":{"height":18.55},"width":185.82,"height":46.37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/5-2.png","element":"img","alt":" x′ij ∈ {0}","inline":true},{"text":", and the second term corresponds to ","element":"span"},{"style":{"height":25.4},"width":259.17,"height":63.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/5-3.png","element":"img","alt":"δ∗ij = −(x̲ij x̄ij)/(x̄ij − x̲ij)","inline":true,"padRight":true},{"text":"and the condition ","element":"span"},{"style":{"height":18.55},"width":274.14,"height":46.37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/5-4.png","element":"img","alt":" x′ij ∈ {x̲ij, x̄ij}","inline":true},{"text":". To ","element":"span"},{"text":"minimize the second term, the original ReLU network only needs to be optimized towards the nearest feasible optimal solution. 
It is easy to see from Proposition ","element":"span"},{"href":"#id-42","text":"1 ","element":"a"},{"text":"that if ","element":"span"},{"style":{"height":16},"width":295.31,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/5-5.png","element":"img","alt":"r(x, δ∗0, W, b) = 0","inline":true},{"text":", then ","element":"span"},{"style":{"height":15.38},"width":143.35,"height":38.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/5-6.png","element":"img","alt":" p∗O = p∗C","inline":true},{"text":", where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"C ","element":"span"},{"text":"could be both ","element":"span"},{"text":"Fast-Lin or CROWN.","element":"span"}],[{"text":"Compared with ","element":"span"},{"style":{"height":16},"width":454.85,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/5-7.png","element":"img","alt":" d(x, δ∗0, W, b), r(x, δ∗0, W, b)","inline":true,"padRight":true},{"text":"puts more con- ","element":"span"},{"text":"straints on the parameters ","element":"span"},{"style":{"fontStyle":"italic"},"text":"W, b ","element":"span"},{"text":", since it requires all unstable neurons of the ReLU network to match the optimal solutions of Fast-Lin, instead of only matching the objective values ","element":"span"},{"style":{"height":11.19},"width":45.05,"height":27.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/5-8.png","element":"img","alt":"p′O","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":15.37},"width":37.05,"height":38.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/5-9.png","element":"img","alt":" p∗C","inline":true},{"text":". 
In this way, it provides stronger guidance to- ","element":"span"},{"text":"wards a network whose optimal solution for ","element":"span"},{"href":"#id-33","style":{"fontStyle":"italic"},"text":"O ","element":"a"},{"text":"and Fast-Lin or CROWN agree. However, again, this is not equivalent to trying to kill all unstable neurons, since Fast-Lin can be tight even when unstable neurons exist.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"4.4. Certified Robust Training in Practice","element":"span"}],[{"text":"In practice, for classification problems with more than two classes, we will compute the lower bound of the margins with respect to multiple classes. Denote ","element":"span"},{"style":{"height":15.38},"width":42.46,"height":38.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/5-10.png","element":"img","alt":" p∗C","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":15.38},"width":50.46,"height":38.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/5-11.png","element":"img","alt":" p∗O","inline":true,"padRight":true},{"text":"as the ","element":"span"},{"text":"concatenated vector of lower bounds of the relaxed problem and original problem for multiple classes, and ","element":"span"},{"style":{"height":14},"width":82.46,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/5-12.png","element":"img","alt":" dt, rt","inline":true,"padRight":true},{"text":"as the regularizers for the margins with respect to class ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":".","element":"span"}],[{"text":"Together with the regularizers, we optimize the following objective","element":"span"}],[{"style":{"width":"96%"},"width":910,"height":192,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/5-13.png","element":"img"}],[{"text":"where 
","element":"span"},{"style":{"height":16.39},"width":226.17,"height":40.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/5-14.png","element":"img","alt":" LCE(−p∗C, y)","inline":true,"padRight":true},{"text":"is the cross entropy loss with label ","element":"span"},{"style":{"fontStyle":"italic"},"text":"y","element":"span"},{"text":", as adopted by many related works (","element":"span"},{"href":"#id-5","referenceIndex":34,"text":"Wong et al.","element":"a"},{"href":"#id-5","referenceIndex":34,"text":", ","element":"a"},{"href":"#id-5","referenceIndex":34,"text":"2018","element":"a"},{"text":"; ","element":"span"},{"href":"#id-30","referenceIndex":12,"text":"Gowal et al.","element":"a"},{"href":"#id-30","referenceIndex":12,"text":", ","element":"a"},{"href":"#id-30","referenceIndex":12,"text":"2018","element":"a"},{"text":"), and we have implicitly abbreviated the inner maximization problem w.r.t. ","element":"span"},{"style":{"height":19.26},"width":133.59,"height":48.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/5-15.png","element":"img","alt":" {δi}L−1i=0 ","inline":true,"padRight":true},{"text":"into the optimal ","element":"span"},{"text":"values ","element":"span"},{"style":{"height":15.37},"width":42.46,"height":38.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/5-16.png","element":"img","alt":" p∗C","inline":true,"padRight":true},{"text":"and solution ","element":"span"},{"style":{"height":15.56},"width":35.22,"height":38.9,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/5-17.png","element":"img","alt":" δ∗0","inline":true},{"text":". 
More details for computing the ","element":"span"},{"text":"intermediate and output bounds can be found in Algorithm ","element":"span"},{"href":"#id-43","text":"1","element":"a"},{"text":", where we have used ","element":"span"},{"style":{"height":16.79},"width":130.64,"height":41.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/5-18.png","element":"img","alt":" ∥·∥1,row","inline":true,"padRight":true},{"text":"to denote row-wise ","element":"span"},{"style":{"height":7.6},"width":32.6,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/5-19.png","element":"img","alt":" ℓ1","inline":true,"padRight":true},{"text":"norm, and ","element":"span"},{"style":{"height":16.79},"width":73.54,"height":41.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/5-20.png","element":"img","alt":" (·):,j","inline":true,"padRight":true},{"text":"for taking the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j","element":"span"},{"text":"-th column.","element":"span"}],[{"text":"One major challenge of the convex relaxation approach is the high memory consumption. To compute the bounds ","element":"span"},{"style":{"height":10.79},"width":87.53,"height":26.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/5-21.png","element":"img","alt":"xi, xi","inline":true},{"text":", we need to pass an identity matrix with the same number of diagonal entries as the total dimensions of the input images, which can make the batch size thousands of times larger than usual. 
To mitigate this, one can adopt the random projection from (","element":"span"},{"href":"#id-5","referenceIndex":34,"text":"Wong et al.","element":"a"},{"href":"#id-5","referenceIndex":34,"text":", ","element":"a"},{"href":"#id-5","referenceIndex":34,"text":"2018","element":"a"},{"text":"), which projects identity matrices into lower dimensions as ","element":"span"},{"style":{"height":13.19},"width":108.54,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/5-22.png","element":"img","alt":" Wi:1R","inline":true,"padRight":true},{"text":"to estimate the norm of ","element":"span"},{"style":{"height":13.19},"width":75.65,"height":32.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/5-23.png","element":"img","alt":" Wi:1","inline":true},{"text":". Such projections add noise/variance to ","element":"span"},{"style":{"height":10.79},"width":87.52,"height":26.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/5-24.png","element":"img","alt":" xi, xi","inline":true},{"text":", and the regularizers are affected as well.","element":"span"}]]},{"heading":"5. Experiments","paragraphs":[[{"text":"We evaluate the proposed regularizer on two datasets (MNIST and CIFAR10) with two different ","element":"span"},{"style":{"height":0},"width":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/5-25.png","element":"img","alt":" ϵ","inline":true,"padRight":true},{"text":"each. We consider only ","element":"span"},{"style":{"height":5.2},"width":48.6,"height":13,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/5-26.png","element":"img","alt":" ℓ∞","inline":true,"padRight":true},{"text":"adversaries. 
Our implementation is based on the code released by (","element":"span"},{"href":"#id-5","referenceIndex":34,"text":"Wong et al.","element":"a"},{"href":"#id-5","referenceIndex":34,"text":", ","element":"a"},{"href":"#id-5","referenceIndex":34,"text":"2018","element":"a"},{"text":") for Convex Outer Adversarial Polytope (CP), and (","element":"span"},{"href":"#id-31","referenceIndex":38,"text":"Zhang et al.","element":"a"},{"href":"#id-31","referenceIndex":38,"text":", ","element":"a"},{"href":"#id-31","referenceIndex":38,"text":"2020","element":"a"},{"text":") for CROWN-IBP, so when ","element":"span"},{"style":{"height":14.4},"width":172.51,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/5-27.png","element":"img","alt":" λ = γ = 0","inline":true},{"text":", we obtain the same results as CP or CROWN-IBP. We use up to 4 GTX 1080Ti or 2080Ti for all our experiments.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Architectures: ","element":"span"},{"text":"We experiment with a variety of different network structures, including a MLP (2x100) with two 100-neuron hidden layers as (","element":"span"},{"href":"#id-9","referenceIndex":23,"text":"Salman et al.","element":"a"},{"href":"#id-9","referenceIndex":23,"text":", ","element":"a"},{"href":"#id-9","referenceIndex":23,"text":"2019","element":"a"},{"text":"), two Conv Nets (","element":"span"},{"text":"Small ","element":"span"},{"text":"and ","element":"span"},{"text":"Large","element":"span"},{"text":") that are the same as (","element":"span"},{"href":"#id-5","referenceIndex":34,"text":"Wong et al.","element":"a"},{"href":"#id-5","referenceIndex":34,"text":", ","element":"a"},{"href":"#id-5","referenceIndex":34,"text":"2018","element":"a"},{"text":"), a family of 10 small conv nets and a family of 8 larger conv nets, all the same as (","element":"span"},{"href":"#id-31","referenceIndex":38,"text":"Zhang et al.","element":"a"},{"href":"#id-31","referenceIndex":38,"text":", 
","element":"a"},{"href":"#id-31","referenceIndex":38,"text":"2020","element":"a"},{"text":"), and also the same 5-layer convolutional network (","element":"span"},{"text":"XLarge","element":"span"},{"text":") as in the latest version of CROWN-IBP (","element":"span"},{"href":"#id-31","referenceIndex":38,"text":"Zhang et al.","element":"a"},{"href":"#id-31","referenceIndex":38,"text":", ","element":"a"},{"href":"#id-31","referenceIndex":38,"text":"2020","element":"a"},{"text":").","element":"span"}],[{"id":"id-46","style":{"width":"79%"},"width":1547,"height":1426,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/6-0.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Table 1. ","element":"figcaption","subtype":"caption"},{"text":"Results on MNIST, and CIFAR10 with small networks, large networks, and different coefficients of ","element":"figcaption","subtype":"caption"},{"style":{"height":14.4},"width":428.96,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/6-1.png","element":"img","alt":" d(x, δ∗0, W, b), r(x, δ∗0, W, b).","inline":true,"padRight":true},{"text":"All entries with positive ","element":"figcaption","subtype":"caption"},{"style":{"height":13.2},"width":88.1,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/6-2.png","element":"img","alt":" λ or γ","inline":true,"padRight":true},{"text":"are using our regularizers. 
For all models not marked as “Exact”, we have projected the input dimension of ","element":"figcaption","subtype":"caption"},{"style":{"height":11.59},"width":70.57,"height":28.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/6-3.png","element":"img","alt":"Wi:1","inline":true,"padRight":true},{"text":"to 50, the same as (","element":"figcaption","subtype":"caption"},{"href":"#id-5","referenceIndex":34,"text":"Wong et al.","element":"a","subtype":"caption"},{"href":"#id-5","referenceIndex":34,"text":", ","element":"a","subtype":"caption"},{"href":"#id-5","referenceIndex":34,"text":"2018","element":"a","subtype":"caption"},{"text":"). For ","element":"figcaption","subtype":"caption"},{"style":{"height":0},"width":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/6-4.png","element":"img","alt":" ϵ","inline":true,"padRight":true},{"text":"values with ","element":"figcaption","subtype":"caption"},{"style":{"height":13.7},"width":144.08,"height":34.24,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/6-5.png","element":"img","alt":"∗, larger ϵ","inline":true,"padRight":true},{"text":"is used for training. ","element":"figcaption","subtype":"caption"},{"style":{"height":14.4},"width":327.9,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/6-6.png","element":"img","alt":" ϵ = 0.3, 2/255, 8/255","inline":true,"padRight":true},{"text":"correspond to using ","element":"figcaption","subtype":"caption"},{"style":{"height":14.4},"width":387.11,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/6-7.png","element":"img","alt":"ϵ = 0.4, 2.2/255, 8.8/255","inline":true,"padRight":true},{"text":"for training respectively. 
For the methods: ","element":"figcaption","subtype":"caption"},{"style":{"height":15.29},"width":48.25,"height":38.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/6-8.png","element":"img","alt":"1: (","inline":true},{"href":"#id-44","referenceIndex":6,"text":"Dvijotham et al.","element":"a","subtype":"caption"},{"href":"#id-44","referenceIndex":6,"text":", ","element":"a","subtype":"caption"},{"href":"#id-44","referenceIndex":6,"text":"2018a","element":"a","subtype":"caption"},{"text":"); ","element":"figcaption","subtype":"caption"},{"style":{"height":15.29},"width":48.25,"height":38.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/6-9.png","element":"img","alt":"2: (","inline":true},{"href":"#id-36","referenceIndex":35,"text":"Xiao et al.","element":"a","subtype":"caption"},{"href":"#id-36","referenceIndex":35,"text":", ","element":"a","subtype":"caption"},{"href":"#id-36","referenceIndex":35,"text":"2018","element":"a","subtype":"caption"},{"text":"); ","element":"figcaption","subtype":"caption"},{"href":"#id-28","referenceIndex":20,"style":{"height":15.29},"width":166.01,"height":38.23,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/6-10.png","element":"img","alt":"3: (Mirman","inline":true,"padRight":true},{"href":"#id-28","referenceIndex":20,"text":"et al.","element":"a","subtype":"caption"},{"href":"#id-28","referenceIndex":20,"text":", ","element":"a","subtype":"caption"},{"href":"#id-28","referenceIndex":20,"text":"2018","element":"a","subtype":"caption"},{"text":"); ","element":"figcaption","subtype":"caption"},{"style":{"height":15.29},"width":37.57,"height":38.24,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/6-11.png","element":"img","alt":"4: (","inline":true},{"href":"#id-30","referenceIndex":12,"text":"Gowal et al.","element":"a","subtype":"caption"},{"href":"#id-30","referenceIndex":12,"text":", 
","element":"a","subtype":"caption"},{"href":"#id-30","referenceIndex":12,"text":"2018","element":"a","subtype":"caption"},{"text":").","element":"figcaption","subtype":"caption"}],[{"text":"The ","element":"span"},{"text":"Small ","element":"span"},{"text":"convnet has two convolutional layers of 16, 32 output channels each and two FC layers with 100 hidden neurons. The ","element":"span"},{"text":"Large ","element":"span"},{"text":"convnet has four Conv layers with 32, 32, 64 and 64 output channels each, plus three FC layers of 512 neurons. The ","element":"span"},{"text":"XLarge ","element":"span"},{"text":"convnet has five conv layers with 64, 64, 128, 128, 128 output channels each, with two FC layers of 512 neurons.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Hyper-parameters: ","element":"span"},{"text":"For experiments on CP, we use Adam (","element":"span"},{"href":"#id-45","referenceIndex":17,"text":"Kingma & Ba","element":"a"},{"href":"#id-45","referenceIndex":17,"text":", ","element":"a"},{"href":"#id-45","referenceIndex":17,"text":"2014","element":"a"},{"text":") with a learning rate of ","element":"span"},{"style":{"height":13.39},"width":80.76,"height":33.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/6-12.png","element":"img","alt":" 10−3","inline":true,"padRight":true},{"text":"and no weight decay. 
Like (","element":"span"},{"href":"#id-5","referenceIndex":34,"text":"Wong et al.","element":"a"},{"href":"#id-5","referenceIndex":34,"text":", ","element":"a"},{"href":"#id-5","referenceIndex":34,"text":"2018","element":"a"},{"text":"), we train the models for 80 epochs, where in the first 20 epochs the learning rate is fixed but the ","element":"span"},{"style":{"height":0},"width":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/6-13.png","element":"img","alt":" ϵ","inline":true,"padRight":true},{"text":"increases from 0.01/0.001 to its maximum value for MNIST/CIFAR10, and in the following epochs, we reduce learning rate by half every 10 epochs. Unless labelled with “Exact” in the model names of Table ","element":"span"},{"href":"#id-46","text":"1","element":"a"},{"text":", we use random projection as in (","element":"span"},{"href":"#id-5","referenceIndex":34,"text":"Wong et al.","element":"a"},{"href":"#id-5","referenceIndex":34,"text":", ","element":"a"},{"href":"#id-5","referenceIndex":34,"text":"2018","element":"a"},{"text":") for CP ","element":"span"},{"text":"experiments to reduce the memory consumption. 
Due to the noisy estimation of the optimal solutions from these random projections, we also adopt a warm-up schedule for the regularizers in all CP experiments to prevent over-regularization, where ","element":"span"},{"style":{"height":14.4},"width":62.96,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/6-14.png","element":"img","alt":" λ, γ","inline":true,"padRight":true},{"text":"increase from 0 to the preset values in the first 20 epochs.","element":"span"}],[{"text":"For CROWN-IBP, we use the updated expensive training schedule as (","element":"span"},{"href":"#id-31","referenceIndex":38,"text":"Zhang et al.","element":"a"},{"href":"#id-31","referenceIndex":38,"text":", ","element":"a"},{"href":"#id-31","referenceIndex":38,"text":"2020","element":"a"},{"text":"), which uses 200 epochs with batch size 256 for MNIST and 3200 epochs with batch size 1024 for CIFAR10. We also use the afore-mentioned warm-up schedule for ","element":"span"},{"style":{"height":14.4},"width":73.8,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/6-15.png","element":"img","alt":" λ, γ.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"5.1. Improving Convex Outer Adversarial Polytope","element":"span"}],[{"text":"Table ","element":"span"},{"href":"#id-46","text":"1 ","element":"a"},{"text":"shows comparisons with various approaches. All of our baseline implementations of CP have already improved upon (","element":"span"},{"href":"#id-5","referenceIndex":34,"text":"Wong et al.","element":"a"},{"href":"#id-5","referenceIndex":34,"text":", ","element":"a"},{"href":"#id-5","referenceIndex":34,"text":"2018","element":"a"},{"text":"). After adding the proposed regular-","element":"span"}],[{"id":"id-48","style":{"width":"95%"},"width":1854,"height":268,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/7-0.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Table 2. 
","element":"figcaption","subtype":"caption"},{"text":"Mean and standard deviation of the family of 10small models on MNIST with ","element":"figcaption","subtype":"caption"},{"style":{"height":9.6},"width":110.75,"height":24,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/7-1.png","element":"img","alt":" ϵ = 0.3","inline":true},{"text":". Here we use a cheaper training schedule with a total of 100 epochs, all in the same setting as the IBP baseline results of (","element":"figcaption","subtype":"caption"},{"href":"#id-31","referenceIndex":38,"text":"Zhang et al.","element":"a","subtype":"caption"},{"href":"#id-31","referenceIndex":38,"text":", ","element":"a","subtype":"caption"},{"href":"#id-31","referenceIndex":38,"text":"2020","element":"a","subtype":"caption"},{"text":"). Baseline is CROWN-IBP with epoch=140 and lr decay step=20. Like in CROWN-IBP, we run each model 5 times to compute the mean and standard deviation. “Copied” are results from (","element":"figcaption","subtype":"caption"},{"href":"#id-31","referenceIndex":38,"text":"Zhang et al.","element":"a","subtype":"caption"},{"href":"#id-31","referenceIndex":38,"text":", ","element":"a","subtype":"caption"},{"href":"#id-31","referenceIndex":38,"text":"2020","element":"a","subtype":"caption"},{"text":").","element":"figcaption","subtype":"caption"}],[{"text":"izers, the certified robust accuracy is further improved upon our baseline in all cases. We also provide results against a 100-step PGD adversary for our CP models. 
Since both PGD errors and standard errors are reduced in most cases, the regularizer should have improved not only the certified upper bound but also the actual robust error.","element":"span"}],[{"text":"Despite the fact that we start from a stronger baseline, the relative improvements on 2x100 with our regularizer (10.3%/8.7%) are comparable to the improvements (5.9%/10.0%) under the same setting from (","element":"span"},{"href":"#id-9","referenceIndex":23,"text":"Salman et al.","element":"a"},{"href":"#id-9","referenceIndex":23,"text":", ","element":"a"},{"href":"#id-9","referenceIndex":23,"text":"2019","element":"a"},{"text":"), which solves for the lower and upper bounds of all intermediate layers via the tightest layer-wise LP relaxation (Figure ","element":"span"},{"href":"#id-35","text":"1","element":"a"},{"text":"). This indicates that the improvement brought by using our regularizer during training and efficient verifiers (Fast-Lin in this case) for verification is comparable with using the expensive and unstable optimal layer-wise convex relaxation.","element":"span"}],[{"text":"Our results with ","element":"span"},{"text":"Small ","element":"span"},{"text":"are better than the best results of (","element":"span"},{"href":"#id-44","referenceIndex":6,"text":"Dvijotham et al.","element":"a"},{"href":"#id-44","referenceIndex":6,"text":", ","element":"a"},{"href":"#id-44","referenceIndex":6,"text":"2018a","element":"a"},{"text":"; ","element":"span"},{"href":"#id-36","referenceIndex":35,"text":"Xiao et al.","element":"a"},{"href":"#id-36","referenceIndex":35,"text":", ","element":"a"},{"href":"#id-36","referenceIndex":35,"text":"2018","element":"a"},{"text":") on MNIST with ","element":"span"},{"style":{"height":10.8},"width":124.71,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/7-2.png","element":"img","alt":" ϵ = 0.1","inline":true},{"text":", though not as good as the best of 
(","element":"span"},{"href":"#id-28","referenceIndex":20,"text":"Mirman ","element":"a"},{"href":"#id-28","referenceIndex":20,"text":"et al.","element":"a"},{"href":"#id-28","referenceIndex":20,"text":", ","element":"a"},{"href":"#id-28","referenceIndex":20,"text":"2018","element":"a"},{"text":"), which uses a larger model. When applying the same model on CIFAR10, we achieve better robust error than (","element":"span"},{"href":"#id-28","referenceIndex":20,"text":"Mirman et al.","element":"a"},{"href":"#id-28","referenceIndex":20,"text":", ","element":"a"},{"href":"#id-28","referenceIndex":20,"text":"2018","element":"a"},{"text":").","element":"span"}],[{"text":"The relative improvements in certified robust error for ","element":"span"},{"style":{"height":10.8},"width":127.47,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/7-3.png","element":"img","alt":"ϵ = 0.1","inline":true,"padRight":true},{"text":"and 0.3 are 18%/3.4% for the small exact model on MNIST, compared with 0.03%/3.13% for the random projection counterparts. This is mainly because in the exact models, we have better estimates of ","element":"span"},{"style":{"height":10.78},"width":87.53,"height":26.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/7-4.png","element":"img","alt":" xi, xi","inline":true},{"text":". Still, these consistent improvements validate that our proposed regularizers improve the performance.","element":"span"}],[{"text":"We also give ablation studies of the two regularizers in Appendix ","element":"span"},{"text":"B ","element":"span"},{"text":"and Table ","element":"span"},{"href":"#id-47","text":"3","element":"a"},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"5.2. 
Improving CROWN-IBP","element":"span"}],[{"text":"In its first stage of training, CROWN-IBP (","element":"span"},{"href":"#id-31","referenceIndex":38,"text":"Zhang et al.","element":"a"},{"href":"#id-31","referenceIndex":38,"text":", ","element":"a"},{"href":"#id-31","referenceIndex":38,"text":"2020","element":"a"},{"text":") trains the network with CROWN (","element":"span"},{"href":"#id-8","referenceIndex":37,"text":"Zhang et al.","element":"a"},{"href":"#id-8","referenceIndex":37,"text":", ","element":"a"},{"href":"#id-8","referenceIndex":37,"text":"2018","element":"a"},{"text":") to compute the bounds of the final outputs based on the interval bounds of intermediate activations, and in its second stage, CROWN-IBP uses only IBP. We apply our regularizer to the first stage of CROWN-IBP, using interval bounds ","element":"span"},{"text":"to over-approximate ","element":"span"},{"style":{"height":10.79},"width":33.78,"height":26.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/7-5.png","element":"img","alt":" xi","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":11.99},"width":33.78,"height":29.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/7-6.png","element":"img","alt":" ¯xi","inline":true,"padRight":true},{"text":"required by our second regularizer on the optimal pre-activations, to obtain a better initialization for its second stage, and demonstrate improvements in Table ","element":"span"},{"href":"#id-46","text":"1","element":"a"},{"text":".","element":"span"}],[{"text":"Methods based on interval bounds, including IBP, CROWN-IBP and DAI (","element":"span"},{"href":"#id-28","referenceIndex":20,"text":"Mirman et al.","element":"a"},{"href":"#id-28","referenceIndex":20,"text":", ","element":"a"},{"href":"#id-28","referenceIndex":20,"text":"2018","element":"a"},{"text":"), tend not to perform as well as CP when 
","element":"span"},{"style":{"height":0},"width":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/7-7.png","element":"img","alt":" ϵ","inline":true,"padRight":true},{"text":"is small. Our regularizers are able to further improve CP on CIFAR10 (","element":"span"},{"style":{"height":16},"width":169.01,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/7-8.png","element":"img","alt":"ϵ = 2/255","inline":true},{"text":"), and demonstrate the best result among all approaches compared in this setting, as shown in Table ","element":"span"},{"href":"#id-46","text":"1","element":"a"},{"text":". To our knowledge, these are the best results for CIFAR10 (","element":"span"},{"style":{"height":16},"width":171.7,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/7-9.png","element":"img","alt":"ϵ = 2/255","inline":true},{"text":") reported on comparable sized models. By using our regularizers on CROWN-IBP to provide a better initialization for the later training stage of pure IBP, our method also achieves the best certified accuracy on MNIST (","element":"span"},{"style":{"height":10.8},"width":120.3,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/7-10.png","element":"img","alt":"ϵ = 0.3","inline":true},{"text":") and CIFAR10 ( ","element":"span"},{"style":{"height":16},"width":192.2,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/7-11.png","element":"img","alt":"ϵ = 8/255).","inline":true}],[{"text":"To verify the significance of the regularizers, Table ","element":"span"},{"href":"#id-48","text":"2 ","element":"a"},{"text":"shows the mean and variance of the results with the family smaller models on MNIST, demonstrating consistent improvements of our model, while Table ","element":"span"},{"href":"#id-49","text":"4 ","element":"a"},{"text":"(in the appendix) gives the best, median and worst case results with the large models on 
the MNIST dataset and compares with both IBP and CROWN-IBP.","element":"span"}]]},{"heading":"6. Conclusions","paragraphs":[[{"text":"We propose two regularizers based on the convex relaxation bounds for training robust neural networks that can be better verified by efficient verifiers including Fast-Lin and IBP for certifiable robustness. Extensive experiments validate that the regularizers improve robust accuracy over non-regularized baselines, and outperform state-of-the-art approaches. This work is a step towards closing the gap between certified and empirical robustness. Future directions include methods to improve computational efficiency for LP relaxations (and certified methods in general), and better ways to leverage random projections for acceleration.","element":"span"}]]},{"heading":"References","paragraphs":[[{"id":"id-2","text":"Carlini, N. and Wagner, D. Adversarial examples are not ","element":"span"},{"text":"easily detected: Bypassing ten detection methods. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 10th ACM Workshop on Artificial Intelligence and Security","element":"span"},{"text":", pp. 3–14. ACM, 2017.","element":"span"}],[{"id":"id-27","text":"Carlini, N., Katz, G., Barrett, C., and Dill, D. L. Provably ","element":"span"},{"text":"minimally-distorted adversarial examples. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1709.10207","element":"span"},{"text":", 2017.","element":"span"}],[{"id":"id-23","text":"Cheng, C.-H., N","element":"span"},{"text":"ührenberg, G., and Ruess, H. Maximum resilience of artificial neural networks. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Symposium on Automated Technology for Verification and Analysis","element":"span"},{"text":", pp. 251–268. Springer, 2017.","element":"span"}],[{"id":"id-56","text":"Cohen, J. M., Rosenfeld, E., and Kolter, J. Z. 
Certified ","element":"span"},{"text":"adversarial robustness via randomized smoothing. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1902.02918","element":"span"},{"text":", 2019.","element":"span"}],[{"text":"Dutta, S., Jha, S., Sankaranarayanan, S., and Tiwari, A. Output range analysis for deep feedforward neural networks. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"NASA Formal Methods Symposium","element":"span"},{"text":", pp. 121–138. Springer, 2018.","element":"span"}],[{"id":"id-44","text":"Dvijotham, K., Gowal, S., Stanforth, R., Arandjelovic, R., ","element":"span"},{"text":"O’Donoghue, B., Uesato, J., and Kohli, P. ","element":"span"},{"text":"Training verified learners with learned verifiers. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1805.10265","element":"span"},{"text":", 2018a.","element":"span"}],[{"id":"id-7","text":"Dvijotham, K., Stanforth, R., Gowal, S., Mann, T., and ","element":"span"},{"text":"Kohli, P. A dual approach to scalable verification of deep networks. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1803.06567","element":"span"},{"text":", 104, 2018b.","element":"span"}],[{"id":"id-10","text":"Ehlers, R. Formal verification of piece-wise linear feed-","element":"span"},{"text":"forward neural networks. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Symposium on Automated Technology for Verification and Analysis","element":"span"},{"text":", pp. 269–286. Springer, 2017.","element":"span"}],[{"id":"id-25","text":"Fischetti, M. and Jo, J. ","element":"span"},{"text":"Deep neural networks as 0-1 mixed integer linear programs: A feasibility study. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1712.06174","element":"span"},{"text":", 2017.","element":"span"}],[{"id":"id-14","text":"Gehr, T., Mirman, M., Drachsler-Cohen, D., Tsankov, P., ","element":"span"},{"text":"Chaudhuri, S., and Vechev, M. Ai2: Safety and robustness certification of neural networks with abstract interpretation. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"2018 IEEE Symposium on Security and Privacy (SP)","element":"span"},{"text":", pp. 3–18. IEEE, 2018.","element":"span"}],[{"id":"id-1","text":"Goodfellow, I. J., Shlens, J., and Szegedy, C. Explain- ","element":"span"},{"text":"ing and harnessing adversarial examples. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1412.6572","element":"span"},{"text":", 2014.","element":"span"}],[{"id":"id-30","text":"Gowal, S., Dvijotham, K., Stanforth, R., Bunel, R., Qin, C., ","element":"span"},{"text":"Uesato, J., Mann, T., and Kohli, P. On the effectiveness of interval bound propagation for training verifiably robust models. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1810.12715","element":"span"},{"text":", 2018.","element":"span"}],[{"id":"id-29","text":"Hein, M. and Andriushchenko, M. Formal guarantees on the ","element":"span"},{"text":"robustness of a classifier against adversarial manipulation. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":", pp. 2266–2276, 2017.","element":"span"}],[{"id":"id-40","text":"Hinton, G., Vinyals, O., and Dean, J. ","element":"span"},{"text":"Distilling the knowledge in a neural network. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1503.02531","element":"span"},{"text":", 2015.","element":"span"}],[{"id":"id-19","text":"Hwang, U., Park, J., Jang, H., Yoon, S., and Cho, N. I. 
","element":"span"},{"text":"Puvae: A variational autoencoder to purify adversarial examples. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1903.00585","element":"span"},{"text":", 2019.","element":"span"}],[{"id":"id-11","text":"Katz, G., Barrett, C., Dill, D. L., Julian, K., and Kochender- ","element":"span"},{"text":"fer, M. J. Reluplex: An efficient smt solver for verifying deep neural networks. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Computer Aided Verification","element":"span"},{"text":", pp. 97–117. Springer, 2017.","element":"span"}],[{"id":"id-45","text":"Kingma, D. P. and Ba, J. Adam: A method for stochastic ","element":"span"},{"text":"optimization. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1412.6980","element":"span"},{"text":", 2014.","element":"span"}],[{"id":"id-24","text":"Lomuscio, A. and Maganti, L. An approach to reachability ","element":"span"},{"text":"analysis for feed-forward relu neural networks. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1706.07351","element":"span"},{"text":", 2017.","element":"span"}],[{"id":"id-20","text":"Madry, A., Makelov, A., Schmidt, L., Tsipras, D., and ","element":"span"},{"text":"Vladu, A. Towards deep learning models resistant to adversarial attacks. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1706.06083","element":"span"},{"text":", 2017.","element":"span"}],[{"id":"id-28","text":"Mirman, M., Gehr, T., and Vechev, M. Differentiable ab- ","element":"span"},{"text":"stract interpretation for provably robust neural networks. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Machine Learning","element":"span"},{"text":", pp. 3575–3583, 2018.","element":"span"}],[{"id":"id-16","text":"Papernot, N. and McDaniel, P. 
Deep k-nearest neighbors: ","element":"span"},{"text":"Towards confident, interpretable and robust deep learning. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1803.04765","element":"span"},{"text":", 2018.","element":"span"}],[{"id":"id-6","text":"Raghunathan, A., Steinhardt, J., and Liang, P. Certified ","element":"span"},{"text":"defenses against adversarial examples. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1801.09344","element":"span"},{"text":", 2018.","element":"span"}],[{"id":"id-9","text":"Salman, H., Yang, G., Zhang, H., Hsieh, C.-J., and Zhang, ","element":"span"},{"text":"P. A convex relaxation barrier to tight robust verification of neural networks. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"NeurIPS","element":"span"},{"text":", 2019.","element":"span"}],[{"id":"id-18","text":"Samangouei, P., Kabkab, M., and Chellappa, R. Defense- ","element":"span"},{"text":"gan: Protecting classifiers against adversarial attacks using generative models. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1805.06605","element":"span"},{"text":", 2018.","element":"span"}],[{"id":"id-26","text":"Scheibler, K., Winterer, L., Wimmer, R., and Becker, B. ","element":"span"},{"text":"Towards verification of artificial neural networks. ","element":"span"},{"text":"In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"MBMV","element":"span"},{"text":", pp. 30–40, 2015.","element":"span"}],[{"id":"id-21","text":"Shafahi, A., Najibi, M., Ghiasi, A., Xu, Z., Dickerson, ","element":"span"},{"text":"J., Studer, C., Davis, L. S., Taylor, G., and Goldstein, T. ","element":"span"},{"text":"Adversarial training for free! 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1904.12843","element":"span"},{"text":", 2019.","element":"span"}],[{"id":"id-17","text":"Shan, S., Willson, E., Wang, B., Li, B., Zheng, H., and Zhao, ","element":"span"},{"text":"B. Y. Gotta catch ’em all: Using concealed trapdoors to detect adversarial attacks on neural networks, 2019.","element":"span"}],[{"id":"id-15","text":"Singh, G., Gehr, T., Mirman, M., P","element":"span"},{"text":"¨uschel, M., and Vechev, M. Fast and effective robustness certification. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":", pp. 10802–10813, 2018.","element":"span"}],[{"id":"id-0","text":"Szegedy, C., Zaremba, W., Sutskever, I., Bruna, J., Erhan, ","element":"span"},{"text":"D., Goodfellow, I., and Fergus, R. Intriguing properties of neural networks. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1312.6199","element":"span"},{"text":", 2013.","element":"span"}],[{"id":"id-12","text":"Tjeng, V., Xiao, K., and Tedrake, R. Evaluating robust- ","element":"span"},{"text":"ness of neural networks with mixed integer programming. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1711.07356","element":"span"},{"text":", 2017.","element":"span"}],[{"id":"id-3","text":"Uesato, J., O’Donoghue, B., Oord, A. v. d., and Kohli, P. ","element":"span"},{"text":"Adversarial risk and the dangers of evaluating against weak attacks. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1802.05666","element":"span"},{"text":", 2018.","element":"span"}],[{"id":"id-13","text":"Weng, T.-W., Zhang, H., Chen, H., Song, Z., Hsieh, C.- ","element":"span"},{"text":"J., Boning, D., Dhillon, I. S., and Daniel, L. Towards fast computation of certified robustness for relu networks. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"ICML","element":"span"},{"text":", 2018.","element":"span"}],[{"id":"id-4","text":"Wong, E. and Kolter, J. Z. Provable defenses against adver- ","element":"span"},{"text":"sarial examples via the convex outer adversarial polytope. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1711.00851","element":"span"},{"text":", 2017.","element":"span"}],[{"id":"id-5","text":"Wong, E., Schmidt, F., Metzen, J. H., and Kolter, J. Z. ","element":"span"},{"text":"Scaling provable adversarial defenses. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":", pp. 8400–8409, 2018.","element":"span"}],[{"id":"id-36","text":"Xiao, K. Y., Tjeng, V., Shafiullah, N. M., and Madry, A. ","element":"span"},{"text":"Training for faster adversarial robustness verification via inducing relu stability. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1809.03008","element":"span"},{"text":", 2018.","element":"span"}],[{"id":"id-22","text":"Zhang, D., Zhang, T., Lu, Y., Zhu, Z., and Dong, B. You ","element":"span"},{"text":"only propagate once: Painless adversarial training using maximal principle. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1905.00877","element":"span"},{"text":", 2019.","element":"span"}],[{"id":"id-8","text":"Zhang, H., Weng, T.-W., Chen, P.-Y., Hsieh, C.-J., and ","element":"span"},{"text":"Daniel, L. Efficient neural network robustness certifica-tion with general activation functions. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":", pp. 4939–4948, 2018.","element":"span"}],[{"id":"id-31","text":"Zhang, H., Chen, H., Xiao, C., Li, B., Boning, D., and Hsieh, ","element":"span"},{"text":"C.-J. 
Towards stable and efficient training of verifiably robust neural networks. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International Conference on Learning Representations","element":"span"},{"text":", 2020.","element":"span"}],[{"style":{"width":"100%"},"width":939,"height":331,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-0.png","element":"img"}]]},{"heading":"A. Proof of Proposition 1","paragraphs":[[{"id":"id-42","style":{"fontWeight":"bold"},"text":"Proposition 1. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Assume ","element":"span"},{"style":{"height":17.93},"width":177.15,"height":44.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-1.png","element":"img","alt":" {z′i, x′i}Li=1 ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is obtained by the ReLU ","element":"span"},{"style":{"fontStyle":"italic"},"text":"network ","element":"span"},{"style":{"height":13.19},"width":44.96,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-2.png","element":"img","alt":" hL","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"with input ","element":"span"},{"style":{"height":19.26},"width":265.36,"height":48.15,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-3.png","element":"img","alt":" z′1, and {δ∗i }L−1i=0 ","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is the optimal solu- ","element":"span"},{"style":{"fontStyle":"italic"},"text":"tion of Fast-Lin or CROWN. 
If ","element":"span"},{"style":{"height":18.55},"width":470.48,"height":46.37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-4.png","element":"img","alt":" z′1 = x+δ∗0, and x′ij ∈ S(δ∗ij)","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"for all ","element":"span"},{"style":{"height":14},"width":373.14,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-5.png","element":"img","alt":" i = 1, ..., L − 1, j ∈ Ii","inline":true},{"style":{"fontStyle":"italic"},"text":", then ","element":"span"},{"style":{"height":17.93},"width":177.15,"height":44.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-6.png","element":"img","alt":" {z′i, x′i}Li=1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"is an opti- ","element":"span"},{"style":{"fontStyle":"italic"},"text":"mal solution of ","element":"span"},{"href":"#id-33","style":{"fontStyle":"italic"},"text":"O","element":"a"},{"style":{"fontStyle":"italic"},"text":", Fast-Lin and CROWN. Here","element":"span"}],[{"style":{"width":"72%"},"width":681,"height":126,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-7.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Proof. ","element":"span"},{"text":"We only need to prove ","element":"span"},{"style":{"height":17.93},"width":177.15,"height":44.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-8.png","element":"img","alt":" {z′i, x′i}Li=1 ","inline":true,"padRight":true},{"text":"is an optimal solu- ","element":"span"},{"text":"tion of both Fast-Lin and CROWN. 
After that, ","element":"span"},{"style":{"height":17.93},"width":213.42,"height":44.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-9.png","element":"img","alt":" {z′i, x′i}Li=1 is","inline":true,"padRight":true},{"text":"both a lower bound and feasible solution of ","element":"span"},{"href":"#id-33","style":{"fontStyle":"italic"},"text":"O","element":"a"},{"text":", and therefore is the optimal solution of ","element":"span"},{"href":"#id-33","style":{"fontStyle":"italic"},"text":"O","element":"a"},{"text":".","element":"span"}],[{"text":"Here we define ","element":"span"},{"style":{"height":15.13},"width":270.73,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-10.png","element":"img","alt":" x∗i = Wiz∗i + bi","inline":true,"padRight":true},{"text":"for ","element":"span"},{"style":{"height":16.73},"width":330.25,"height":41.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-11.png","element":"img","alt":" i = 1, ..., L, z∗i+1 =","inline":true},{"style":{"height":21.12},"width":211.7,"height":52.81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-12.png","element":"img","alt":"D(L)i x∗i + δ∗i","inline":true,"padRight":true},{"text":"for ","element":"span"},{"style":{"height":14},"width":264.6,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-13.png","element":"img","alt":" i = 1, ..., L − 1","inline":true},{"text":", and ","element":"span"},{"style":{"height":15.56},"width":211.8,"height":38.89,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-14.png","element":"img","alt":" z∗1 = x + δ∗0","inline":true},{"text":". 
By ","element":"span"},{"text":"definition, ","element":"span"},{"style":{"height":17.93},"width":189,"height":44.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-15.png","element":"img","alt":" {x∗i , z∗i }Li=1 ","inline":true,"padRight":true},{"text":"is an optimal solution of Fast-Lin or ","element":"span"},{"text":"CROWN. Also, since ","element":"span"},{"style":{"height":14.94},"width":592.17,"height":37.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-16.png","element":"img","alt":" z∗1 = z′1, we have x′1 = W1z∗1 + b1 =","inline":true},{"style":{"height":14.94},"width":38.78,"height":37.35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-17.png","element":"img","alt":"x∗1","inline":true},{"text":". Next, we will prove if the assumption holds, we will ","element":"span"},{"text":"have ","element":"span"},{"style":{"height":14.94},"width":125.83,"height":37.35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-18.png","element":"img","alt":" z′2 = z∗2 ","inline":true,"padRight":true},{"text":"for both Fast-Lin and CROWN.","element":"span"}],[{"style":{"width":"99%"},"width":934,"height":168,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-19.png","element":"img"}],[{"text":"For ","element":"span"},{"style":{"height":15.05},"width":128.7,"height":37.63,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-20.png","element":"img","alt":" j ∈ I−1","inline":true,"padRight":true},{"text":", again, by definition, ","element":"span"},{"style":{"height":23.52},"width":167.66,"height":58.81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-21.png","element":"img","alt":" D(L)1j = 0","inline":true},{"text":", and ","element":"span"},{"style":{"height":17.53},"width":103.36,"height":43.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-22.png","element":"img","alt":" x∗1j 
≤","inline":true},{"style":{"height":15.59},"width":148.18,"height":38.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-23.png","element":"img","alt":"¯x1j < 0","inline":true},{"text":", so ","element":"span"},{"style":{"height":23.52},"width":709.78,"height":58.81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-24.png","element":"img","alt":" x′1j = x∗1j < 0, z∗2j = D(L)1j x∗1j = 0 =","inline":true},{"style":{"height":18.55},"width":312.59,"height":46.37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-25.png","element":"img","alt":"max(x′1j, 0) = z′2j.","inline":true}],[{"text":"For ","element":"span"},{"style":{"height":14},"width":117.97,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-26.png","element":"img","alt":" j ∈ I1:","inline":true}],[{"id":"id-37","style":{"fontStyle":"italic"},"text":"• ","element":"span"},{"text":"If ","element":"span"},{"style":{"height":18.15},"width":125.72,"height":45.37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-27.png","element":"img","alt":" δ∗1j = 0","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":17.35},"width":130.78,"height":43.37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-28.png","element":"img","alt":" x′1j = 0","inline":true,"padRight":true},{"text":"as assumed in the conditions, ","element":"span"},{"text":"since ","element":"span"},{"style":{"height":15.16},"width":292.28,"height":37.89,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-29.png","element":"img","alt":" z∗1 = z′1, we know","inline":true}],[{"style":{"width":"81%"},"width":764,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-30.png","element":"img"}],[{"text":"where 
","element":"span"},{"style":{"height":15.59},"width":400.01,"height":38.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-31.png","element":"img","alt":" W1j is the j-th row of W1","inline":true},{"text":". No matter what value ","element":"span"},{"style":{"height":23.52},"width":80.42,"height":58.8,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-32.png","element":"img","alt":"D(L)1j","inline":true,"padRight":true},{"text":"is, ","element":"span"},{"style":{"height":23.52},"width":712.66,"height":58.8,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-33.png","element":"img","alt":" z∗2,j = D(L)1j x∗1j = 0, z′2j = max(x′1j, 0) =","inline":true,"padRight":true},{"text":"0","element":"span"},{"text":", the equality still holds.","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"• ","element":"span"},{"text":"If ","element":"span"},{"style":{"height":27.4},"width":270.82,"height":68.5,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-34.png","element":"img","alt":" δ∗1,j = −¯x1jx1j¯x1j−x1j","inline":true,"padRight":true},{"text":", for both Fast-Lin and CROWN, ","element":"span"},{"style":{"height":27.4},"width":502.17,"height":68.51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-35.png","element":"img","alt":"z∗2j = −¯x1jx1j¯x1j−x1j (x∗1j − x1j)","inline":true},{"text":". 
","element":"span"},{"text":"Further, if ","element":"span"},{"style":{"height":15.35},"width":110.74,"height":38.37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-36.png","element":"img","alt":" x′1j ∈","inline":true},{"style":{"height":18.39},"width":168.54,"height":45.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-37.png","element":"img","alt":"{x1j, ¯x1j}","inline":true,"padRight":true},{"text":"as assumed: if ","element":"span"},{"style":{"height":17.53},"width":308.35,"height":43.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-38.png","element":"img","alt":" x∗1j = x′1j = x1j","inline":true},{"text":", then","element":"span"}],[{"style":{"width":"100%"},"width":939,"height":662,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-39.png","element":"img"}],[{"text":"Now we have proved ","element":"span"},{"style":{"height":14.94},"width":157.53,"height":37.36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-40.png","element":"img","alt":" z′2 = z∗2","inline":true,"padRight":true},{"text":"for both Fast-Lin and ","element":"span"},{"text":"CROWN if the assumption is satisfied. Starting from this layer, using the same argument as above, we can prove ","element":"span"},{"style":{"height":14.94},"width":132.02,"height":37.35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-41.png","element":"img","alt":"z′3 = z∗3","inline":true},{"text":",...,","element":"span"},{"style":{"height":15.37},"width":223.81,"height":38.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-42.png","element":"img","alt":"z′L−1 = z∗L−1","inline":true,"padRight":true},{"text":"for both Fast-Lin and CROWN. 
","element":"span"},{"text":"As a result, ","element":"span"},{"style":{"height":15.38},"width":152.74,"height":38.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-43.png","element":"img","alt":" x′L = x∗L","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":17.78},"width":337.67,"height":44.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-44.png","element":"img","alt":" cTt x′L = cTt x∗L = p∗C","inline":true},{"text":", where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"C ","element":"span"},{"text":"can be both Fast-lin and CROWN. Therefore, ","element":"span"},{"style":{"height":17.93},"width":214.8,"height":44.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-45.png","element":"img","alt":" {z′i, x′i}Li=1 is","inline":true,"padRight":true},{"text":"an optimal solution of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"C","element":"span"},{"text":".","element":"span"}]]},{"heading":"B. Ablation Studies of the Two Regularizers","paragraphs":[[{"text":"In this section, we give the detailed results with either ","element":"span"},{"style":{"height":14.4},"width":95.26,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-46.png","element":"img","alt":" λ or γ","inline":true,"padRight":true},{"text":"set to 0, i.e., we use only one regularizer in each experiment, in order to compare the effectiveness of the two regularizers. All the results are with the small model on CIFAR10 with ","element":"span"},{"style":{"height":16},"width":169.01,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-47.png","element":"img","alt":"ϵ = 2/255","inline":true},{"text":". 
The best results are achieved with ","element":"span"},{"style":{"height":16},"width":227.16,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-48.png","element":"img","alt":" r(x, δ∗0, W, b).","inline":true,"padRight":true},{"text":"We reasoned in ","element":"span"},{"text":"4.2 ","element":"span"},{"text":"that ","element":"span"},{"style":{"height":16},"width":219.32,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-49.png","element":"img","alt":" d(x, δ∗0, W, b)","inline":true,"padRight":true},{"text":"may not perform ","element":"span"},{"text":"well when random projection is adopted. As shown in the supplementary, the best robust error achieved under the same setting when fixing ","element":"span"},{"style":{"height":14.4},"width":96,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-50.png","element":"img","alt":" γ = 0","inline":true,"padRight":true},{"text":"is higher than when fixing ","element":"span"},{"style":{"height":13.2},"width":106.3,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-51.png","element":"img","alt":" λ = 0,","inline":true,"padRight":true},{"text":"which means ","element":"span"},{"style":{"height":16},"width":217.66,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-52.png","element":"img","alt":" r(x, δ∗0, W, b)","inline":true,"padRight":true},{"text":"is more resistant to the noise ","element":"span"},{"text":"introduced by random projection. Still, random projections offer a huge efficiency boost when they are used. How to improve the bounds while maintaining efficiency is an important future work.","element":"span"}]]},{"heading":"C. Additional Results on MNIST","paragraphs":[[{"text":"See Table ","element":"span"},{"href":"#id-49","text":"4","element":"a"},{"text":".","element":"span"}]]},{"heading":"D. 
Solutions to the Relaxed Problems","paragraphs":[[{"text":"In this section, we give more details about the optimal solutions of Fast-Lin (","element":"span"},{"href":"#id-13","referenceIndex":32,"text":"Weng et al.","element":"a"},{"href":"#id-13","referenceIndex":32,"text":", ","element":"a"},{"href":"#id-13","referenceIndex":32,"text":"2018","element":"a"},{"text":") and CROWN (","element":"span"},{"href":"#id-8","referenceIndex":37,"text":"Zhang ","element":"a"},{"href":"#id-8","referenceIndex":37,"text":"et al.","element":"a"},{"href":"#id-8","referenceIndex":37,"text":", ","element":"a"},{"href":"#id-8","referenceIndex":37,"text":"2018","element":"a"},{"text":"), to make this paper self-contained. Recall that for layer-wise convex relaxations, each neuron in the activation layer is independent. ","element":"span"},{"style":{"height":18.38},"width":419.2,"height":45.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-53.png","element":"img","alt":" {aij, bij, aij, bij} are cho-","inline":true,"padRight":true},{"text":"sen to bound the activations assuming the lower bound ","element":"span"},{"style":{"height":13.18},"width":47.05,"height":32.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-54.png","element":"img","alt":" xij","inline":true,"padRight":true},{"text":"and upper bound ","element":"span"},{"style":{"height":14.39},"width":47.04,"height":35.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-55.png","element":"img","alt":" ¯xij","inline":true,"padRight":true},{"text":"of the preactivation ","element":"span"},{"style":{"height":11.59},"width":47.05,"height":28.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-56.png","element":"img","alt":" xij","inline":true,"padRight":true},{"text":"are known. 
For","element":"span"}],[{"style":{"width":"99%"},"width":934,"height":59,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/10-57.png","element":"img"}],[{"style":{"width":"66%"},"width":1285,"height":502,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-0.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Table 3. ","element":"figcaption","subtype":"caption"},{"text":"Ablation results on CIFAR10 with the small model, where ","element":"figcaption","subtype":"caption"},{"style":{"height":14.4},"width":165.24,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-1.png","element":"img","alt":" ϵ = 2/255.","inline":true}],[{"id":"id-47","style":{"width":"74%"},"width":1451,"height":264,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-2.png","element":"img"}],[{"id":"id-49","style":{"fontStyle":"italic"},"text":"Table 4. ","element":"figcaption","subtype":"caption"},{"text":"Our results on the MNIST dataset, with CROWN-IBP. Here we use a cheaper training schedule with a total of 100 epochs, all in the same setting as the IBP baseline results of (","element":"figcaption","subtype":"caption"},{"href":"#id-31","referenceIndex":38,"text":"Zhang et al.","element":"a","subtype":"caption"},{"href":"#id-31","referenceIndex":38,"text":", ","element":"a","subtype":"caption"},{"href":"#id-31","referenceIndex":38,"text":"2020","element":"a","subtype":"caption"},{"text":"). 
CI Orig are results copied from the paper, CI ReImp are results of our implementation of CROWN-IBP, and CI Reg is with regularizer ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"r","element":"figcaption","subtype":"caption"},{"text":".","element":"figcaption","subtype":"caption"}],[{"style":{"height":15.59},"width":191.93,"height":38.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-3.png","element":"img","alt":"aijxij + bij","inline":true},{"text":"; for ","element":"span"},{"style":{"height":18.46},"width":121.72,"height":46.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-4.png","element":"img","alt":" j ∈ I+i","inline":true,"padRight":true},{"text":", ","element":"span"},{"style":{"height":17.19},"width":443.81,"height":42.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-5.png","element":"img","alt":" aij = aij = aij = bij = 1","inline":true},{"text":"; for ","element":"span"},{"style":{"height":17.19},"width":569.92,"height":42.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-6.png","element":"img","alt":"j ∈ I−i , aij = aij = bij = bij = 0.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Optimal Solutions of Fast-Lin ","element":"span"},{"text":"In Fast-Lin (","element":"span"},{"href":"#id-13","referenceIndex":32,"text":"Weng et al.","element":"a"},{"href":"#id-13","referenceIndex":32,"text":", ","element":"a"},{"href":"#id-13","referenceIndex":32,"text":"2018","element":"a"},{"text":"), for ","element":"span"},{"style":{"height":23.49},"width":478.81,"height":58.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-7.png","element":"img","alt":" j ∈ Ii, aij = aij = xijxij−xij","inline":true,"padRight":true},{"text":", 
","element":"span"},{"style":{"height":17.19},"width":246.4,"height":42.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-8.png","element":"img","alt":" bij = 0, bij =","inline":true}],[{"id":"id-53","style":{"width":"99%"},"width":935,"height":53,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-9.png","element":"img"}],[{"text":"To compute the lower and upper bound ","element":"span"},{"style":{"height":13.18},"width":47.04,"height":32.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-10.png","element":"img","alt":" xij","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":11.59},"width":47.05,"height":28.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-11.png","element":"img","alt":" xij","inline":true,"padRight":true},{"text":"f","element":"span"},{"href":"#id-35","text":"or ","element":"a"},{"style":{"height":11.59},"width":47.05,"height":28.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-12.png","element":"img","alt":"xij","inline":true},{"text":", we just need to replace the objective of Eq. 
","element":"span"},{"href":"#id-34","style":{"fontStyle":"italic"},"text":"C ","element":"a"},{"text":"with ","element":"span"},{"style":{"height":13.35},"width":79.09,"height":33.37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-13.png","element":"img","alt":"c⊤ijxi","inline":true},{"text":", where ","element":"span"},{"style":{"height":11.59},"width":41.51,"height":28.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-14.png","element":"img","alt":" cij","inline":true,"padRight":true},{"text":"is a one-hot vector with the same number ","element":"span"},{"text":"of entries as ","element":"span"},{"style":{"height":14.4},"width":188.33,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-15.png","element":"img","alt":" xi and the j","inline":true},{"text":"-th entry being 1 for ","element":"span"},{"style":{"height":13.18},"width":47.04,"height":32.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-16.png","element":"img","alt":" xij","inline":true,"padRight":true},{"text":"and -1 for ","element":"span"},{"style":{"height":11.59},"width":47.05,"height":28.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-17.png","element":"img","alt":"xij","inline":true,"padRight":true},{"text":"(an extra negation is applied to the minimum to get ","element":"span"},{"style":{"height":15.19},"width":73.85,"height":37.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-18.png","element":"img","alt":" xij).","inline":true}],[{"text":"Such constraints allow each intermediate ReLU activation to reach their upper or lower bounds independently. 
As a result, ","element":"span"},{"id":"id-54","text":"each intermediate ","element":"span"},{"style":{"fontStyle":"italic"},"text":"unstable neuron ","element":"span"},{"text":"can be seen as an adversary adding a perturbation ","element":"span"},{"style":{"height":16.39},"width":41.98,"height":40.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-19.png","element":"img","alt":" δij","inline":true,"padRight":true},{"text":"in the range ","element":"span"},{"style":{"height":25.4},"width":253.48,"height":63.51,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-20.png","element":"img","alt":" [0, −xijxijxij−xij ] to","inline":true}],[{"text":"a linear transform, represented as ","element":"span"},{"style":{"height":23.49},"width":371.01,"height":58.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-21.png","element":"img","alt":" zij = xijxij−xij xij + δij","inline":true},{"text":". ","element":"span"},{"text":"Such a point of view gives rise to a more interpretable explanation for Fast-Lin. If we construct a network from the relaxed constraints, then the problem becomes how to choose the perturbations for both the input and intermediate unstable neurons to minimize ","element":"span"},{"style":{"height":10.74},"width":88.92,"height":26.85,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-22.png","element":"img","alt":" c⊤t xL","inline":true,"padRight":true},{"text":"of a multi-layer linear ","element":"span"},{"text":"network. 
Such a linear network under the perturbations is defined as","element":"span"}],[{"id":"id-50","style":{"fontStyle":"italic"},"text":"z","element":"span"},{"style":{"height":15.59},"width":670.58,"height":38.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-23.png","element":"img","alt":"i+1 = Dixi+δi, xi = Wizi+bi, for i = 1","inline":true},{"style":{"fontStyle":"italic"},"text":", ..., L, z","element":"span"},{"style":{"height":14.8},"width":171.37,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-24.png","element":"img","alt":"1 = x+δ0,","inline":true}],[{"text":"(4) where ","element":"span"},{"style":{"height":13.19},"width":43.99,"height":32.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-25.png","element":"img","alt":" Di","inline":true,"padRight":true},{"text":"is a diagonal matrix and ","element":"span"},{"style":{"height":13.99},"width":28.71,"height":34.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-26.png","element":"img","alt":" δi","inline":true,"padRight":true},{"text":"is a vector. The input perturbation satisfies ","element":"span"},{"style":{"height":16.79},"width":174.56,"height":41.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-27.png","element":"img","alt":" ∥δ0∥p ≤ ϵ","inline":true},{"text":". 
The ","element":"span"},{"style":{"fontStyle":"italic"},"text":"j","element":"span"},{"text":"-th diagonal entry","element":"span"}],[{"style":{"height":16.39},"width":561,"height":40.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-28.png","element":"img","alt":"Dij and the j-th entry δij for i > 0","inline":true,"padRight":true},{"text":"are defined as","element":"span"}],[{"style":{"width":"87%"},"width":824,"height":316,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-29.png","element":"img"}],[{"text":"With such an observation, we can further unfold the objective in Eq. ","element":"span"},{"href":"#id-34","style":{"fontStyle":"italic"},"text":"C ","element":"a"},{"href":"#id-50","text":"4 ","element":"a"},{"text":"into a more interpretable form as","element":"span"}],[{"style":{"width":"85%"},"width":800,"height":191,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-30.png","element":"img"}],[{"text":"where the first term of RHS is a forward pass of the clean image ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x ","element":"span"},{"text":"through a linear network interleaving between a linear layer ","element":"span"},{"style":{"height":13.19},"width":220.05,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-31.png","element":"img","alt":" x = Wiz + bi","inline":true,"padRight":true},{"text":"and a scaling layer ","element":"span"},{"style":{"height":13.2},"width":218.61,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-32.png","element":"img","alt":" z = Dix, and","inline":true,"padRight":true},{"text":"the second term is the sum of the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":"-th perturbation passing through all the weight matrices 
","element":"span"},{"style":{"height":13.19},"width":48.63,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-33.png","element":"img","alt":" Wi","inline":true,"padRight":true},{"text":"of the linear operation layers and scaling layers ","element":"span"},{"style":{"height":13.19},"width":171.35,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-34.png","element":"img","alt":" Di after it.","inline":true}],[{"text":"Therefore, under such a relaxation, only the second term is affected by the variables ","element":"span"},{"style":{"height":19.26},"width":133.6,"height":48.16,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-35.png","element":"img","alt":" {δi}L−1i=0","inline":true,"padRight":true},{"text":"for optimizing Eq. ","element":"span"},{"href":"#id-34","style":{"fontStyle":"italic"},"text":"C","element":"a"},{"text":". ","element":"span"},{"text":"Denote the linear network up to the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":"-th layer as ","element":"span"},{"style":{"height":16},"width":163.45,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-36.png","element":"img","alt":" gi(x), and","inline":true},{"style":{"height":20.4},"width":421.66,"height":50.99,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-37.png","element":"img","alt":"Wi:i′ = Wi∏i−1k=i′ DkWk","inline":true},{"text":". We can transform Eq. 
","element":"span"},{"href":"#id-34","style":{"fontStyle":"italic"},"text":"C ","element":"a"},{"text":"with ","element":"span"},{"style":{"height":23.49},"width":344.42,"height":58.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-38.png","element":"img","alt":"aij = aij = xijxij−xij","inline":true,"padRight":true},{"text":", ","element":"span"},{"style":{"height":17.19},"width":297.94,"height":42.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/11-39.png","element":"img","alt":" bij = 0, bij = −","inline":true}],[{"text":"following constrained optimization problem","element":"span"}],[{"id":"id-51","style":{"width":"100%"},"width":938,"height":821,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-0.png","element":"img"}],[{"id":"id-39","text":"where ","element":"span"},{"style":{"height":16},"width":66.92,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-1.png","element":"img","alt":" ∥·∥∗","inline":true,"padRight":true},{"text":"is the dual norm of the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p ","element":"span"},{"text":"norm. In this way, the optimal value ","element":"span"},{"style":{"height":15.37},"width":87.71,"height":38.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-2.png","element":"img","alt":" p∗C pC","inline":true,"padRight":true},{"text":"of the relaxed problem (Eq. ","element":"span"},{"href":"#id-51","text":"7","element":"a"},{"text":") can be ","element":"span"},{"text":"found efficiently without any gradient step. 
The optimal value can be achieved by just treating the input perturbations and intermediate relaxed ReLU activations as adversaries against a linear network after them. The resulting expression for the lower bound is","element":"span"}],[{"style":{"width":"97%"},"width":917,"height":241,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-3.png","element":"img"}],[{"id":"id-52","text":"Though starting from different points of view, it can be ","element":"span"},{"text":"easily proved that the objective derived from a dual view in (","element":"span"},{"href":"#id-5","referenceIndex":34,"text":"Wong et al.","element":"a"},{"href":"#id-5","referenceIndex":34,"text":", ","element":"a"},{"href":"#id-5","referenceIndex":34,"text":"2018","element":"a"},{"text":") is the same as Fast-Lin.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Optimal Solution of CROWN ","element":"span"},{"text":"The only difference between CROWN and Fast-Lin is in the choice of ","element":"span"},{"style":{"height":13.18},"width":45.34,"height":32.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-4.png","element":"img","alt":" aij","inline":true,"padRight":true},{"text":"for ","element":"span"},{"style":{"height":14},"width":106.4,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-5.png","element":"img","alt":"j ∈ Ii","inline":true},{"text":". 
For ReLU activations, CROWN chooses ","element":"span"},{"style":{"height":17.19},"width":128.58,"height":42.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-6.png","element":"img","alt":" aij = 1","inline":true,"padRight":true},{"text":"if ","element":"span"},{"style":{"height":16.79},"width":202.79,"height":41.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-7.png","element":"img","alt":" ¯xij ≥ −xij","inline":true},{"text":", or ","element":"span"},{"style":{"height":17.19},"width":143.03,"height":42.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-8.png","element":"img","alt":" aij = 0","inline":true,"padRight":true},{"text":"otherwise. This makes the relaxation tighter than Fast-Lin, but also introduces extra complexity due to the varying ","element":"span"},{"style":{"height":13.19},"width":44,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-9.png","element":"img","alt":" Di","inline":true},{"text":". In Fast-Lin, ","element":"span"},{"style":{"height":13.19},"width":43.99,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-10.png","element":"img","alt":" Di","inline":true,"padRight":true},{"text":"is a constant once the upper and lower bounds ","element":"span"},{"style":{"height":11.99},"width":33.78,"height":29.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-11.png","element":"img","alt":" ¯xi","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":10.78},"width":33.78,"height":26.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-12.png","element":"img","alt":" xi","inline":true,"padRight":true},{"text":"are given. 
For CROWN, since ","element":"span"},{"style":{"height":17.99},"width":510.04,"height":44.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-13.png","element":"img","alt":" 0 < ¯aij < 1, ¯aij ≠ aij, Di now","inline":true,"padRight":true},{"text":"changes with the optimality condition of ","element":"span"},{"style":{"height":13.99},"width":28.71,"height":34.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-14.png","element":"img","alt":" δi","inline":true},{"text":", which depends on the layer ","element":"span"},{"style":{"fontStyle":"italic"},"text":"l ","element":"span"},{"text":"and the index ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"of the neuron/logit of interest. Specifically, for ","element":"span"},{"style":{"height":5.2},"width":48.6,"height":13,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-15.png","element":"img","alt":" ℓ∞","inline":true,"padRight":true},{"text":"adversaries, the optimality condition of ","element":"span"},{"style":{"height":13.99},"width":28.72,"height":34.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-16.png","element":"img","alt":"δi","inline":true,"padRight":true},{"text":"is determined by ","element":"span"},{"style":{"height":15.31},"width":116.72,"height":38.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-17.png","element":"img","alt":" c⊤lkWl:i","inline":true},{"text":", so now we have to apply an extra ","element":"span"},{"text":"index to the slope as ","element":"span"},{"style":{"height":21.12},"width":95.88,"height":52.81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-18.png","element":"img","alt":" D(l,k)i","inline":true,"padRight":true},{"text":", as well as the equivalent linear ","element":"span"},{"text":"operator as 
","element":"span"},{"style":{"height":21.49},"width":104.41,"height":53.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-19.png","element":"img","alt":" W(l,k)l:1","inline":true,"padRight":true},{"text":". As a result, the optimal solution is now","element":"span"}],[{"style":{"width":"96%"},"width":905,"height":246,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-20.png","element":"img"}],[{"text":"This drastically increases the number of computations, especially when computing the intermediate bounds ","element":"span"},{"style":{"height":10.78},"width":33.78,"height":26.96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-21.png","element":"img","alt":" xi","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":11.99},"width":33.77,"height":29.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-22.png","element":"img","alt":" ¯xi","inline":true},{"text":", where we can no longer just compute a single ","element":"span"},{"style":{"height":13.19},"width":74.61,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-23.png","element":"img","alt":" Wl:1","inline":true,"padRight":true},{"text":"to get the bound, but have to compute number-of-neuron copies of it for the different values of ","element":"span"},{"style":{"height":21.12},"width":95.87,"height":52.81,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-24.png","element":"img","alt":" D(l,k)i","inline":true,"padRight":true},{"text":"in the intermediate layers.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Practical Implementations of the Bounds ","element":"span"},{"text":"In practice, the final output bound (also the intermediate bounds) is computed in a backward pass, since we need to determine the value 
","element":"span"},{"style":{"height":23.09},"width":203.33,"height":57.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-25.png","element":"img","alt":" (c⊤lkW(l,k)l:i+1)j","inline":true,"padRight":true},{"text":"to choose the optimal ","element":"span"},{"style":{"height":18.15},"width":41.98,"height":45.38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-26.png","element":"img","alt":" δ∗ij","inline":true},{"text":", which ","element":"span"},{"text":"is the multiplication of all linear operators after layer ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":". Computing ","element":"span"},{"style":{"height":23.09},"width":157.46,"height":57.72,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-27.png","element":"img","alt":" c⊤lkW(l,k)l:i+1","inline":true,"padRight":true},{"text":"in a backward pass avoids repeated ","element":"span"},{"text":"computation. It proceeds as","element":"span"}],[{"style":{"width":"77%"},"width":728,"height":58,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-28.png","element":"img"}]]},{"heading":"E. A Toy Example for Tight Relaxation","paragraphs":[[{"style":{"width":"59%"},"width":560,"height":390,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-29.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"Figure 3. ","element":"figcaption","subtype":"caption"},{"text":"Illustration of the data distribution and the decision boundary of the network. 
In this case, ","element":"figcaption","subtype":"caption"},{"style":{"height":12.8},"width":353.64,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-30.png","element":"img","alt":" b = 0.3, ϵ = 0.2, x0 =","inline":true},{"style":{"height":16.49},"width":181.84,"height":41.24,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-31.png","element":"img","alt":"[0.1, 0.42]T .","inline":true}],[{"text":"We give an illustrative example where the optimal solution to the relaxed problem is a feasible solution to the original non-convex problem for certain input samples even when unstable neurons exist. ","element":"span"},{"text":"It is a binary classification problem for samples ","element":"span"},{"style":{"height":17.38},"width":406.14,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-32.png","element":"img","alt":" x0 = [x01, x02]T ∈ R2","inline":true},{"text":". We assume ","element":"span"},{"style":{"height":9.19},"width":38.77,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-33.png","element":"img","alt":" x0","inline":true,"padRight":true},{"text":"is uniformly distributed in ","element":"span"},{"style":{"height":13.19},"width":128.83,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-34.png","element":"img","alt":" S0 ∪ S1","inline":true},{"text":", where","element":"span"}],[{"style":{"width":"99%"},"width":931,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-35.png","element":"img"}],[{"style":{"height":16},"width":396.16,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-36.png","element":"img","alt":"|x01| + b, ∥x0∥∞ ≤ 1}","inline":true},{"text":", and ","element":"span"},{"text":"0 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"< b < ","element":"span"},{"text":"1","element":"span"},{"text":". 
The ground-truth labels for ","element":"span"},{"style":{"height":13.19},"width":135.41,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-37.png","element":"img","alt":" x0 ∈ S0","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":13.19},"width":135.4,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-38.png","element":"img","alt":" x0 ∈ S1","inline":true,"padRight":true},{"text":"are 0 and 1, respectively. The maximal-margin classifier for such a data distribution is ","element":"span"},{"style":{"height":21.62},"width":350.19,"height":54.04,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/12-39.png","element":"img","alt":"1{z12≥|z12|}, where z1","inline":true,"padRight":true},{"text":"is the input to the classifier. The data distribution and the associated maximal-margin classifier are shown in Figure ","element":"span"},{"href":"#id-52","text":"3","element":"a"},{"text":".","element":"span"}],[{"text":"This maximal-margin classifier can be represented by a ReLU network with a single hidden layer as ","element":"span"},{"style":{"height":21.62},"width":203.35,"height":54.04,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-0.png","element":"img","alt":" 1{h2(z1)≥0}","inline":true},{"text":", where ","element":"span"},{"style":{"height":16},"width":441.55,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-1.png","element":"img","alt":" h2(z1) = W2σ(W1z1), and","inline":true}],[{"style":{"width":"94%"},"width":887,"height":193,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-2.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"Claim 1 ","element":"span"},{"text":"(Convex relaxation can be tight when unstable neurons exist)","element":"span"},{"style":{"fontWeight":"bold"},"text":". 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"The solution to the relaxed problem ","element":"span"},{"href":"#id-51","style":{"fontStyle":"italic"},"text":"7 ","element":"a"},{"style":{"fontStyle":"italic"},"text":"is feasible for the original non-convex problem ","element":"span"},{"href":"#id-33","style":{"fontStyle":"italic"},"text":"O ","element":"a"},{"style":{"fontStyle":"italic"},"text":"of the aforementioned ReLU network ","element":"span"},{"style":{"height":16},"width":580.36,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-3.png","element":"img","alt":" h2(x0 + δ0) = W2σ(W1(x0 + δ0))","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"for any ","element":"span"},{"style":{"height":13.19},"width":132.22,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-4.png","element":"img","alt":" x0 ∈ S1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"under any perturbation ","element":"span"},{"style":{"height":16},"width":273.04,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-5.png","element":"img","alt":" δ0 ∈ {δ|∥δ∥∞ ≤","inline":true},{"style":{"height":16},"width":223.28,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-6.png","element":"img","alt":"ϵ, 0 < ϵ < b}.","inline":true}],[{"style":{"fontStyle":"italic"},"text":"Proof. 
","element":"span"},{"text":"Both the network and ","element":"span"},{"style":{"height":13.19},"width":40.44,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-7.png","element":"img","alt":" S1","inline":true,"padRight":true},{"text":"are symmetric in ","element":"span"},{"style":{"height":13.2},"width":165.84,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-8.png","element":"img","alt":" x01, there-","inline":true,"padRight":true},{"text":"fore it is sufficient to prove the result for ","element":"span"},{"style":{"height":13.2},"width":139.6,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-9.png","element":"img","alt":" x01 ≥ 0.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"(1) ","element":"span"},{"text":"If ","element":"span"},{"style":{"height":12.8},"width":151.31,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-10.png","element":"img","alt":" x01 ≥ ϵ","inline":true},{"text":", since ","element":"span"},{"style":{"height":13.2},"width":422.38,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-11.png","element":"img","alt":" x01 ∈ S1, x02 ≥ b + ϵ","inline":true},{"text":". 
The 4 neurons in ","element":"span"},{"style":{"height":16},"width":317.55,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-12.png","element":"img","alt":" x1 = W1(x0 + δ0)","inline":true,"padRight":true},{"text":"are either non-negative or non-positive for any ","element":"span"},{"style":{"height":16},"width":522.62,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-13.png","element":"img","alt":" δ0 ∈ {δ|∥δ∥∞ ≤ ϵ, 0 < ϵ < b}","inline":true,"padRight":true},{"text":"and the convex relaxation is tight.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"(2) ","element":"span"},{"text":"If ","element":"span"},{"style":{"height":11.19},"width":162.39,"height":27.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-14.png","element":"img","alt":" x01 < ϵ","inline":true},{"text":", for any input sample ","element":"span"},{"style":{"height":16},"width":257.06,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-15.png","element":"img","alt":" x0 ∈ {x|x =","inline":true},{"style":{"height":17.38},"width":717.92,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-16.png","element":"img","alt":"[a, c]T , 0 < a < b < b + a ≤ c} ⊆ S1","inline":true},{"text":", the optimal perturbation ","element":"span"},{"style":{"height":16.39},"width":652.98,"height":40.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-17.png","element":"img","alt":" δ∗O ∈ {δ|∥δ∥∞ ≤ ϵ, 0 < a < ϵ < b}","inline":true,"padRight":true},{"text":"can ","element":"span"},{"text":"be inferred from Figure ","element":"span"},{"href":"#id-52","text":"3 ","element":"a"},{"text":"as ","element":"span"},{"style":{"height":17.38},"width":126.2,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-18.png","element":"img","alt":" [ϵ, −ϵ]T","inline":true,"padRight":true},{"text":". 
The corresponding ReLU activations and the optimal solution are","element":"span"}],[{"id":"id-55","style":{"width":"89%"},"width":844,"height":46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-19.png","element":"img"}],[{"text":"Meanwhile, for the relaxed problem, the lower and upper bounds of the hidden neurons ","element":"span"},{"style":{"height":13.19},"width":244.64,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-20.png","element":"img","alt":" x1 = W1z1 are","inline":true}],[{"style":{"width":"64%"},"width":601,"height":112,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-21.png","element":"img"}],[{"text":"Therefore, the first 2 hidden neurons are unstable neurons, and the convex relaxation we are using will relax the ReLU operation ","element":"span"},{"style":{"height":16},"width":191.1,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-22.png","element":"img","alt":" z2 = σ(x1)","inline":true,"padRight":true},{"text":"into ","element":"span"},{"style":{"height":13.99},"width":270.76,"height":34.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-23.png","element":"img","alt":" z2 = D1x1 + δ1","inline":true},{"text":", where ","element":"span"},{"style":{"height":13.19},"width":49,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-24.png","element":"img","alt":" D1","inline":true,"padRight":true},{"text":"is a diagonal matrix, and ","element":"span"},{"style":{"height":13.99},"width":33.71,"height":34.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-25.png","element":"img","alt":" δ1","inline":true,"padRight":true},{"text":"are slack variables bounded by ","element":"span"},{"style":{"height":16.21},"width":216.71,"height":40.52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-26.png","element":"img","alt":"0 ≤ δ1 ≤ 
¯δ1","inline":true},{"text":". The diagonal entries of ","element":"span"},{"style":{"height":13.19},"width":49,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-27.png","element":"img","alt":" D1","inline":true,"padRight":true},{"text":"and the upper bounds ","element":"span"},{"style":{"height":16.2},"width":33.71,"height":40.49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-28.png","element":"img","alt":"¯δ1","inline":true,"padRight":true},{"text":"are defined by Eq. ","element":"span"},{"href":"#id-53","text":"5 ","element":"a"},{"text":"as","element":"span"}],[{"style":{"width":"98%"},"width":928,"height":217,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-29.png","element":"img"}],[{"text":"i.e., ","element":"span"},{"style":{"height":13.99},"width":49.62,"height":34.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-30.png","element":"img","alt":" δ13","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":13.99},"width":49.62,"height":34.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-31.png","element":"img","alt":" δ14","inline":true,"padRight":true},{"text":"are always 0. ","element":"span"},{"text":"The relaxed linear network, as defined by the constraints in Eq. ","element":"span"},{"href":"#id-34","style":{"fontStyle":"italic"},"text":"C ","element":"a"},{"text":"with our specific relaxation, is now determined as ","element":"span"},{"style":{"height":16},"width":280.13,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-32.png","element":"img","alt":" x2 = h2(z1) =","inline":true},{"style":{"height":16},"width":417.55,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-33.png","element":"img","alt":"W2(D1W1(x + δ0) + δ1)","inline":true},{"text":". It can be written in the same","element":"span"}],[{"text":"form as Eq. 
","element":"span"},{"href":"#id-54","text":"6 ","element":"a"},{"text":"as","element":"span"}],[{"style":{"width":"99%"},"width":937,"height":529,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-34.png","element":"img"}],[{"style":{"fontStyle":"italic"},"text":"z","element":"span"},{"style":{"height":18.18},"width":955.79,"height":45.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-35.png","element":"img","alt":"∗2 = D1W1(x+δ∗0)+δ∗1 = [a+ϵ, 0, c−ϵ, 0]T , x∗2 = c−a−2ϵ,","inline":true}],[{"text":"the same as the optimal solution of the original non-convex problem given in Eq. ","element":"span"},{"href":"#id-55","text":"14","element":"a"},{"text":". This shows both of the regularizers, in this case instantiated as","element":"span"}],[{"style":{"width":"88%"},"width":827,"height":143,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-36.png","element":"img"}],[{"text":"are able to reach 0 for certain networks and samples when non-stable neurons exist.","element":"span"}],[{"text":"It might seem that adding ","element":"span"},{"style":{"height":16.39},"width":514.95,"height":40.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-37.png","element":"img","alt":" d(x, δ∗0, W, b) = p′O(x, δ∗0) − p∗C","inline":true,"padRight":true},{"text":"as a regularizer into the loss function will undesirably minimize the margin ","element":"span"},{"style":{"height":16.39},"width":157.69,"height":40.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-38.png","element":"img","alt":" p′O(x, δ∗0)","inline":true,"padRight":true},{"text":"for the ReLU network. 
The- ","element":"span"},{"text":"oretically, however, it is not the case, since ","element":"span"},{"style":{"height":15.37},"width":210.41,"height":38.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-39.png","element":"img","alt":" min p′O − p∗C","inline":true,"padRight":true},{"text":"is a different optimization problem from neither ","element":"span"},{"style":{"height":15.19},"width":118.12,"height":37.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-40.png","element":"img","alt":" min p′O","inline":true,"padRight":true},{"text":"nor ","element":"span"},{"style":{"height":15.37},"width":117.86,"height":38.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-41.png","element":"img","alt":" max p∗C","inline":true},{"text":". In fact, the non-negative ","element":"span"},{"style":{"height":16},"width":219.32,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-42.png","element":"img","alt":" d(x, δ∗0, W, b)","inline":true,"padRight":true},{"text":"could ","element":"span"},{"text":"be minimized to 0 with both ","element":"span"},{"style":{"height":11.19},"width":45.05,"height":27.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-43.png","element":"img","alt":" p′O","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":15.38},"width":37.05,"height":38.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-44.png","element":"img","alt":" p∗C","inline":true,"padRight":true},{"text":"taking large val- ","element":"span"},{"text":"ues. 
In the illustrative example, it is easy to see that for any ","element":"span"},{"style":{"height":17.38},"width":857.65,"height":43.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-45.png","element":"img","alt":" x0 ∈ {x|x = [a, c]T , 0 < a < b < b + a ≤ c}","inline":true},{"text":", ","element":"span"},{"style":{"height":16},"width":608.58,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-46.png","element":"img","alt":"d(x, δ∗0, W, b) = ct(x′2 − x∗2) = 0","inline":true},{"text":", but ","element":"span"},{"style":{"height":15.37},"width":228.35,"height":38.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-47.png","element":"img","alt":" p′O = p∗C =","inline":true},{"style":{"height":19.37},"width":455.88,"height":48.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-48.png","element":"img","alt":"c − a − 2ϵ > 0 when ϵ < b2.","inline":true}],[{"text":"Moreover, since we are maximizing ","element":"span"},{"style":{"height":15.37},"width":37.05,"height":38.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-49.png","element":"img","alt":" p∗C ","inline":true,"padRight":true},{"text":"via the robust cross ","element":"span"},{"text":"entropy loss","element":"span"},{"text":"2 ","element":"span"},{"text":"while minimizing the non-negative difference ","element":"span"},{"style":{"height":15.38},"width":134.22,"height":38.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-50.png","element":"img","alt":"p′O − p∗C","inline":true},{"text":", the overall objective tends to converge to a state ","element":"span"},{"text":"where both ","element":"span"},{"style":{"height":15.59},"width":323.9,"height":38.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/13-51.png","element":"img","alt":" p′O and p∗C are large.","inline":true}]]},{"heading":"F. 
Difficulties in Adapting IBP for ℓ2 Adversary","paragraphs":[[{"text":"The Interval Bound Propagation (IBP) method discussed here is defined in the same way as (","element":"span"},{"href":"#id-30","referenceIndex":12,"text":"Gowal et al.","element":"a"},{"href":"#id-30","referenceIndex":12,"text":", ","element":"a"},{"href":"#id-30","referenceIndex":12,"text":"2018","element":"a"},{"text":"), where the bounds of the margins are computed layer-wise from the input layer to the final layer, and the bound of each neuron is considered independently for both bounding that neuron and using its interval to bound other neurons.","element":"span"}],[{"text":"It is natural to apply IBP against ","element":"span"},{"style":{"height":5.2},"width":48.6,"height":13,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-0.png","element":"img","alt":" ℓ∞","inline":true,"padRight":true},{"text":"adversaries, since each neuron is allowed to change independently within its interval, which is similar to the ","element":"span"},{"style":{"height":5.2},"width":48.6,"height":13,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-1.png","element":"img","alt":" ℓ∞","inline":true,"padRight":true},{"text":"ball. 
One way to generalize IBP to other ","element":"span"},{"style":{"height":7.2},"width":33.61,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-2.png","element":"img","alt":" ℓp","inline":true,"padRight":true},{"text":"norms is to modify the bound propagation in the first layer, such that any of its output neuron (","element":"span"},{"style":{"fontStyle":"italic"},"text":"i","element":"span"},{"text":") is bounded by an interval centered at ","element":"span"},{"style":{"height":15.59},"width":306.19,"height":38.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-3.png","element":"img","alt":" x1i = W1,ix + b1,i","inline":true,"padRight":true},{"text":"with a radius of ","element":"span"},{"style":{"height":16.79},"width":359.23,"height":41.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-4.png","element":"img","alt":" ϵp∥W1,i∥p∗, where x1i","inline":true,"padRight":true},{"text":"the clean image ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":"’s response, and ","element":"span"},{"style":{"height":15.59},"width":73.98,"height":38.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-5.png","element":"img","alt":"W1,i","inline":true,"padRight":true},{"text":"is the first layer’s linear transform corresponding to the neuron, and ","element":"span"},{"style":{"height":16.79},"width":82.35,"height":41.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-6.png","element":"img","alt":" ∥·∥p∗","inline":true,"padRight":true},{"text":"is the dual norm of ","element":"span"},{"style":{"height":21.77},"width":356.99,"height":54.43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-7.png","element":"img","alt":" ∥·∥p, with 1p + 1p∗ = 1.","inline":true,"padRight":true},{"text":"We refer to this approach 
IBP(","element":"span"},{"style":{"height":7.2},"width":88.07,"height":18,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-8.png","element":"img","alt":"ℓp, ϵp","inline":true},{"text":"). Here by the example of ","element":"span"},{"style":{"height":7.6},"width":32.61,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-9.png","element":"img","alt":" ℓ2","inline":true,"padRight":true},{"text":"norm, we show such an adaptation may not be able to obtain a robust ","element":"span"},{"style":{"fontStyle":"italic"},"text":"convolutional ","element":"span"},{"text":"neural network compared with established results, such as reaching 61% certified accuracy on CIFAR10 with ","element":"span"},{"style":{"height":14.4},"width":181,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-10.png","element":"img","alt":" ϵ2 = 0.25 (","inline":true},{"href":"#id-56","referenceIndex":4,"text":"Cohen et al.","element":"a"},{"href":"#id-56","referenceIndex":4,"text":", ","element":"a"},{"href":"#id-56","referenceIndex":4,"text":"2019","element":"a"},{"text":").","element":"span"}],[{"text":"Specifically, for adversaries within the ","element":"span"},{"style":{"height":7.6},"width":32.6,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-11.png","element":"img","alt":" ℓ2","inline":true},{"text":"-ball ","element":"span"},{"style":{"height":16.79},"width":136.62,"height":41.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-12.png","element":"img","alt":" B2,ϵ2(x)","inline":true},{"text":", IBP(","element":"span"},{"style":{"height":7.61},"width":86.58,"height":19.02,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-13.png","element":"img","alt":"ℓ2, ϵ2","inline":true},{"text":") computes the upper and lower bounds 
as","element":"span"}],[{"id":"id-58","style":{"width":"78%"},"width":735,"height":112,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-14.png","element":"img"}],[{"text":"By comparison, for some adversary within the ","element":"span"},{"style":{"height":5.2},"width":48.6,"height":13,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-15.png","element":"img","alt":" ℓ∞","inline":true},{"text":"-ball ","element":"span"},{"style":{"height":16.79},"width":166.07,"height":41.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-16.png","element":"img","alt":"B∞,ϵ∞(x)","inline":true},{"text":", IBP(","element":"span"},{"style":{"height":5.21},"width":125.69,"height":13.02,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-17.png","element":"img","alt":"ℓ∞, ϵ∞","inline":true},{"text":") computes the upper and lower bounds as","element":"span"}],[{"id":"id-57","style":{"width":"79%"},"width":745,"height":102,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-18.png","element":"img"}],[{"text":"Since the two approaches are identical in the following layers, to analyze the best-case results of IBP(","element":"span"},{"style":{"height":14.4},"width":202.38,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-19.png","element":"img","alt":"ℓ2, ϵ2) based","inline":true,"padRight":true},{"text":"on established results of IBP(","element":"span"},{"style":{"height":5.21},"width":117.13,"height":13.02,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-20.png","element":"img","alt":"ℓ∞, ϵ∞","inline":true},{"text":"), it suffices to compare the results of IBP(","element":"span"},{"style":{"height":14.4},"width":272.86,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-21.png","element":"img","alt":"ℓ∞, ϵ∞) with ϵ∞","inline":true,"padRight":true},{"text":"set to some value such that the range 
","element":"span"},{"href":"#id-57","style":{"height":14.97},"width":311.88,"height":37.42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-22.png","element":"img","alt":" ¯x∞1 − x∞1 of Eq. 18","inline":true,"padRight":true},{"text":"is majorized by the range ","element":"span"},{"style":{"height":17.37},"width":128.3,"height":43.42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-23.png","element":"img","alt":"¯x21 − x21","inline":true,"padRight":true},{"text":"of Eq. ","element":"span"},{"href":"#id-58","text":"17","element":"a"},{"text":". In this way, we are assuming a weaker ","element":"span"},{"text":"adversary for IBP(","element":"span"},{"style":{"height":5.21},"width":123.14,"height":13.02,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-24.png","element":"img","alt":"ℓ∞, ϵ∞","inline":true},{"text":") than the original IBP(","element":"span"},{"style":{"height":7.61},"width":91.25,"height":19.02,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-25.png","element":"img","alt":"ℓ2, ϵ2","inline":true},{"text":"), so its certified accuracy is an upper bound of IBP(","element":"span"},{"style":{"height":7.61},"width":89.11,"height":19.02,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-26.png","element":"img","alt":"ℓ2, ϵ2","inline":true},{"text":"). 
Therefore, it suffices to let","element":"span"}],[{"id":"id-59","style":{"width":"90%"},"width":849,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-27.png","element":"img"}],[{"text":"For any ","element":"span"},{"style":{"height":18.18},"width":177.24,"height":45.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-28.png","element":"img","alt":" W1,i ∈ Rd","inline":true},{"text":", we have ","element":"span"},{"style":{"height":19.18},"width":379.66,"height":47.94,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-29.png","element":"img","alt":" ∥W1,i∥1 ≤√d∥W1,i∥2","inline":true},{"text":". To make Eq. ","element":"span"},{"href":"#id-59","text":"19 ","element":"a"},{"text":"hold for any ","element":"span"},{"style":{"height":18.17},"width":170.74,"height":45.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-30.png","element":"img","alt":" W1,i ∈ Rd","inline":true},{"text":", we can set","element":"span"}],[{"style":{"width":"22%"},"width":208,"height":92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-31.png","element":"img"}],[{"text":"In general, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d ","element":"span"},{"text":"is equal to the data dimension, such as 3072 for the CIFAR10 dataset. 
However, for convolutional neural networks, the first layer is usually a convolutional layer and ","element":"span"},{"style":{"height":15.59},"width":73.98,"height":38.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-32.png","element":"img","alt":" W1,i","inline":true,"padRight":true},{"text":"is a 3072-dimensional sparse vector with at most ","element":"span"},{"style":{"height":10.8},"width":165.09,"height":27,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-33.png","element":"img","alt":"k × k × 3","inline":true,"padRight":true},{"text":"non-zero entries at fixed positions for convolution kernels with size ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"and input images with 3 channels. In (","element":"span"},{"href":"#id-31","referenceIndex":38,"text":"Zhang et al.","element":"a"},{"href":"#id-31","referenceIndex":38,"text":", ","element":"a"},{"href":"#id-31","referenceIndex":38,"text":"2020","element":"a"},{"text":"; ","element":"span"},{"href":"#id-30","referenceIndex":12,"text":"Gowal et al.","element":"a"},{"href":"#id-30","referenceIndex":12,"text":", ","element":"a"},{"href":"#id-30","referenceIndex":12,"text":"2018","element":"a"},{"text":"; ","element":"span"},{"href":"#id-5","referenceIndex":34,"text":"Wong et al.","element":"a"},{"href":"#id-5","referenceIndex":34,"text":", ","element":"a"},{"href":"#id-5","referenceIndex":34,"text":"2018","element":"a"},{"text":"), ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k ","element":"span"},{"text":"= 3 ","element":"span"},{"text":"for their major results. 
In this case,","element":"span"}],[{"style":{"width":"61%"},"width":580,"height":92,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-34.png","element":"img"}],[{"text":"Under such assumptions, for ","element":"span"},{"style":{"height":13.59},"width":158.11,"height":33.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-35.png","element":"img","alt":" ϵ2 = 0.25","inline":true},{"text":", the certified accuracy of IBP(","element":"span"},{"style":{"height":7.6},"width":32.6,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-36.png","element":"img","alt":"ℓ2","inline":true},{"text":", 0.25) on CIFAR10 should be upper bounded by IBP(","element":"span"},{"style":{"height":5.2},"width":48.6,"height":13,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-37.png","element":"img","alt":"ℓ∞","inline":true},{"text":", 0.04811), unless changing the first layer bounds into ","element":"span"},{"style":{"height":5.2},"width":48.6,"height":13,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-38.png","element":"img","alt":" ℓ∞","inline":true,"padRight":true},{"text":"norm based bounds significantly harms the performance.","element":"span"},{"text":"3 ","element":"span"},{"text":"The best available results of certified accuracies are 33.06% for IBP(","element":"span"},{"style":{"height":5.2},"width":48.6,"height":13,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-39.png","element":"img","alt":"ℓ∞","inline":true},{"text":", 0.03137) and 23.20% for IBP(","element":"span"},{"style":{"height":5.21},"width":60.36,"height":13.02,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-40.png","element":"img","alt":"ℓ∞,","inline":true,"padRight":true},{"text":"0.06275) (","element":"span"},{"href":"#id-31","referenceIndex":38,"text":"Zhang et al.","element":"a"},{"href":"#id-31","referenceIndex":38,"text":", 
","element":"a"},{"href":"#id-31","referenceIndex":38,"text":"2020","element":"a"},{"text":"). Comparing with the established results from (","element":"span"},{"href":"#id-56","referenceIndex":4,"text":"Cohen et al.","element":"a"},{"href":"#id-56","referenceIndex":4,"text":", ","element":"a"},{"href":"#id-56","referenceIndex":4,"text":"2019","element":"a"},{"text":") (61%), we can conclude the certified accuracy of IBP(","element":"span"},{"style":{"height":7.6},"width":32.6,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-41.png","element":"img","alt":"ℓ2","inline":true},{"text":", 0.25) is at least 27.93% to 37.80% lower than the best available results, since we are assuming a weaker adversary.","element":"span"}],[{"text":"IBP(","element":"span"},{"style":{"height":7.61},"width":89.25,"height":19.02,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-42.png","element":"img","alt":"ℓ2, ϵ2","inline":true},{"text":") is also not as good as the results with convex relaxation from (","element":"span"},{"href":"#id-5","referenceIndex":34,"text":"Wong et al.","element":"a"},{"href":"#id-5","referenceIndex":34,"text":", ","element":"a"},{"href":"#id-5","referenceIndex":34,"text":"2018","element":"a"},{"text":"), where the best singlemodel (with projection as approximation) certified accuracy with ","element":"span"},{"style":{"height":16},"width":206.85,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-43.png","element":"img","alt":" ϵ2 = 36/255","inline":true,"padRight":true},{"text":"is 51.09%. For IBP, this adversary is no weaker than ","element":"span"},{"style":{"height":16},"width":295.1,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-44.png","element":"img","alt":" ϵ∞ = 6.9282/255","inline":true},{"text":". 
The best available results for IBP(","element":"span"},{"style":{"height":5.2},"width":48.6,"height":13,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-45.png","element":"img","alt":"ℓ∞","inline":true},{"text":", 2/255) and IBP(","element":"span"},{"style":{"height":5.2},"width":48.6,"height":13,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-46.png","element":"img","alt":"ℓ∞","inline":true},{"text":", 8/255) are 50.02% (","element":"span"},{"href":"#id-30","referenceIndex":12,"text":"Gowal ","element":"a"},{"href":"#id-30","referenceIndex":12,"text":"et al.","element":"a"},{"href":"#id-30","referenceIndex":12,"text":", ","element":"a"},{"href":"#id-30","referenceIndex":12,"text":"2018","element":"a"},{"text":") and 33.06% (","element":"span"},{"href":"#id-31","referenceIndex":38,"text":"Zhang et al.","element":"a"},{"href":"#id-31","referenceIndex":38,"text":", ","element":"a"},{"href":"#id-31","referenceIndex":38,"text":"2020","element":"a"},{"text":") respectively, which indicates the certified accuracy of IBP(","element":"span"},{"style":{"height":7.6},"width":32.6,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-47.png","element":"img","alt":"ℓ2","inline":true},{"text":", 36/255) is at least 1.07% to 18.03% worse (much closer to 18.03%) than the approximated version of convex relaxation under the same ","element":"span"},{"style":{"height":7.6},"width":32.6,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2002.09766/images/14-48.png","element":"img","alt":" ℓ2","inline":true,"padRight":true},{"text":"adversary.","element":"span"}]]}],"_version":"3.3.2"},"paperNode":"$1b:props:children:props:children:0:props:product"}]]]}]}]