1b:["$","$L29",null,{"isWhiteLabelled":false,"children":["$","$Lb",null,{"pt":{"compact":0,"expanded":3},"children":[["$","$L2a",null,{"noStar":true,"publisher":true,"task":true,"params":true,"size":"xl","product":{"id":"eyJwYXBlcklEIjoiMjAwMS4xMDkyOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","updated":"2020-09-17T09:34:56.000Z","paperID":"2001.10929","published":"2020-01-29T16:19:44.000Z","authors":"[\"Juri Opitz\",\"Letitia Parcalabescu\",\"Anette Frank\"]","title":"AMR Similarity Metrics from Principles","scoreTrending":null,"summary":"$2b","lastCheckedForCode":"2022-09-01T17:54:03.653Z","links":[{"id":"eyJ1cmwiOiJodHRwczovL3BhcGVyc3dpdGhjb2RlLmNvbS9wYXBlci9hbXItc2ltaWxhcml0eS1tZXRyaWNzLWZyb20tcHJpbmNpcGxlcyJ9","type":"pwc","url":"https://paperswithcode.com/paper/amr-similarity-metrics-from-principles","data":null}],"reposConnection":{"edges":[{"official":null,"node":{"id":"eyJyZXBvSUQiOiIyMzQwNjA3NjQiLCJzb3VyY2UiOiJnaXRodWIifQ==","source":"github","repoID":"234060764","url":"https://github.com/flipz357/amr-metric-suite","title":"amr-metric-suite","language":"python","stars":16,"forks":5,"framework":null,"scoreTrending":null,"updated":null,"created":null,"downloads":null,"likes":null,"owner":[{"username":"flipz357","avatar":"https://avatars.githubusercontent.com/u/59831412?v=4"}]}}]},"models":[],"tags":[{"id":"eyJuYW1lIjoibWFjaGluZSB0cmFuc2xhdGlvbiIsInR5cGUiOiJ0YXNrIn0=","name":"machine translation","description":"Machine translation takes text in one language as input and outputs the same text translated into another language. 
This task is commonly used in real-world applications like Google Translate, enabling communication between different language speakers.","scoreTrending":0.13431401164552068,"count":{"stars":8516,"papers":4925,"models":7710},"__typename":"Tag"}],"summaries":[],"emailsConnection":{"edges":[{"author":"anette frank","node":{"id":"eyJhZGRyZXNzIjoiZnJhbmtAY2wudW5pLWhlaWRlbGJlcmcuZGUifQ==","address":"frank@cl.uni-heidelberg.de","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[{"avatar":"https://avatars.githubusercontent.com/u/66028562?v=4","username":"AnetteFrank"}],"scholar":[{"thirdPartyID":"9FP2fokAAAAJ"},{"thirdPartyID":"B_KpQlEAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiJjMDcyY2FiYS05ODg2LTRlOWEtODM2YS0yNTU4YzQ1MDBiODIifQ==","name":"anette 
frank","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMTgwNS4wNzg1OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1805.07858"},{"id":"eyJwYXBlcklEIjoiMjExMi4wNTI1MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2112.05253"},{"id":"eyJwYXBlcklEIjoiMTYwOC4wNTI0MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1608.05243"},{"id":"eyJwYXBlcklEIjoiMTcxMS4wMDc2OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1711.00768"},{"id":"eyJwYXBlcklEIjoiMTkwNC4wMDY3NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1904.00676"},{"id":"eyJwYXBlcklEIjoiMTkwOC4xMTMyNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1908.11326"},{"id":"eyJwYXBlcklEIjoiMjAxMi4xMjM1MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2012.12352"},{"id":"eyJwYXBlcklEIjoiMjAwOC4wODg5NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2008.08896"},{"id":"eyJwYXBlcklEIjoiMjAxMC4wNTU4NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.05587"},{"id":"eyJwYXBlcklEIjoiMjAwMS4xMDkyOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2001.10929"},{"id":"eyJwYXBlcklEIjoiMTkwNi4wMzMzOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1906.03338"},{"id":"eyJwYXBlcklEIjoiMTcxMS4wMzc1NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1711.03754"},{"id":"eyJwYXBlcklEIjoiMTcwMy4wNDMzMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1703.04330"},{"id":"eyJwYXBlcklEIjoiMjEwNi4wMjQ5NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.02497"},{"id":"eyJwYXBlcklEIjoiMjEwOC4xMTk0OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2108.11949"},{"id":"eyJwYXBlcklEIjoiMjMwNS4xMzY5OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2305.13698"},{"id":"eyJwYXBlcklEIjoiMjEwNS4wMzE1NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2105.03157"},{"id":"eyJwYXBlcklEIjoiMTcwNi4wMjI1Ni
IsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1706.02256"},{"id":"eyJwYXBlcklEIjoiMjAxMC4wMTk5OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.01998"},{"id":"eyJwYXBlcklEIjoiMTkwOC4xMDcyMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1908.10721"},{"id":"eyJwYXBlcklEIjoiMjIwMy4xMzIyNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2203.13226"},{"id":"eyJwYXBlcklEIjoiMTkwNC4wODMwMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1904.08301"},{"id":"eyJwYXBlcklEIjoiMjIwNi4wNzAyMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2206.07023"},{"id":"eyJwYXBlcklEIjoiMjMwNi4wMDkzNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2306.00936"},{"id":"eyJwYXBlcklEIjoiMjEwNi4wMzk3MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2106.03973"},{"id":"eyJwYXBlcklEIjoiMjEwMy4wNjMwNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2103.06304"},{"id":"eyJwYXBlcklEIjoiMjQwMS4wNzEwNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2401.07105"},{"id":"eyJwYXBlcklEIjoiMTgwNy4wMzAwNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1807.03006"},{"id":"eyJwYXBlcklEIjoiMTkwMi4wMTM0OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1902.01349"},{"id":"eyJwYXBlcklEIjoiMjMwNS4wODQ5NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2305.08495"},{"id":"eyJwYXBlcklEIjoiMjMwNS4xNTA0NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2305.15045"},{"id":"eyJwYXBlcklEIjoiMjIxMC4wNjQ2MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2210.06461"},{"id":"eyJwYXBlcklEIjoiMjMwOS4wNzYyNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2309.07624"},{"id":"eyJwYXBlcklEIjoiMjQwMy4wNDQwMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2403.04400"}]}]}},{"author":"juri 
opitz","node":{"id":"eyJhZGRyZXNzIjoib3BpdHpAY2wudW5pLWhlaWRlbGJlcmcuZGUifQ==","address":"opitz@cl.uni-heidelberg.de","name":"Juri Opitz","avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[],"scholar":[{"thirdPartyID":"2XZi2RkAAAAJ"},{"thirdPartyID":"DzxugZIAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiI1OTZmNmM3NC0yMjUyLTRjYWQtYTY3ZC01Y2JiZmM4NDQ3ZjUifQ==","name":"juri opitz","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMjAwOC4wODg5NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2008.08896"},{"id":"eyJwYXBlcklEIjoiMjAwMS4xMDkyOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2001.10929"},{"id":"eyJwYXBlcklEIjoiMTkwNi4wMzMzOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1906.03338"},{"id":"eyJwYXBlcklEIjoiMjMwNS4wNjk5MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2305.06993"},{"id":"eyJwYXBlcklEIjoiMTkwOS4wOTAzMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1909.09031"},{"id":"eyJwYXBlcklEIjoiMjEwOC4xMTk0OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2108.11949"},{"id":"eyJwYXBlcklEIjoiMjMxMC4xOTc5MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2310.19792"},{"id":"eyJwYXBlcklEIjoiMTcwNi4wMjI1NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1706.02256"},{"id":"eyJwYXBlcklEIjoiMjQwNS4wNTk2NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2405.05966"},{"id":"eyJwYXBlcklEIjoiMjQwNC4wMzM0NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2404.03344"},{
"id":"eyJwYXBlcklEIjoiMjIwMy4xMzIyNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2203.13226"},{"id":"eyJwYXBlcklEIjoiMTkwNC4wODMwMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1904.08301"},{"id":"eyJwYXBlcklEIjoiMjAwNS4xMjE4NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2005.12187"},{"id":"eyJwYXBlcklEIjoiMjIwNi4wNzAyMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2206.07023"},{"id":"eyJwYXBlcklEIjoiMjMwNi4wMDkzNiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2306.00936"},{"id":"eyJwYXBlcklEIjoiMTkwMi4wMTM0OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1902.01349"},{"id":"eyJwYXBlcklEIjoiMjMwNS4wODQ5NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2305.08495"},{"id":"eyJwYXBlcklEIjoiMjMwNy4xNTAwMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2307.15002"},{"id":"eyJwYXBlcklEIjoiMjIxMC4wNjQ2MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2210.06461"},{"id":"eyJwYXBlcklEIjoiMjQwNC4xNjk1OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2404.16958"}]}]}},{"author":"letitia parcalabescu","node":{"id":"eyJhZGRyZXNzIjoicGFyY2FsYWJlc2N1QGNsLnVuaS1oZWlkZWxiZXJnLmRlIn0=","address":"parcalabescu@cl.uni-heidelberg.de","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[{"avatar":"https://avatars.githubusercontent.com/u/16118202?v=4","username":"LetiP"}],"scholar":[{"thirdPartyID":"EeIGHM0AAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiIwODcyMTAzNS05MTZiLTRkYTYtYjc3Mi0yNDBiMjhiNmUyMGUifQ==","name":"letitia 
parcalabescu","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMjExMi4wNTI1MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2112.05253"},{"id":"eyJwYXBlcklEIjoiMjMxMS4wNzQ2NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2311.07466"},{"id":"eyJwYXBlcklEIjoiMjAxMi4xMjM1MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2012.12352"},{"id":"eyJwYXBlcklEIjoiMjAwMS4xMDkyOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2001.10929"},{"id":"eyJwYXBlcklEIjoiMjQwNC4xODYyNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2404.18624"},{"id":"eyJwYXBlcklEIjoiMjEwMy4wNjMwNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2103.06304"}]}]}}]},"__typename":"paper","authorArray":["Juri Opitz","Letitia Parcalabescu","Anette Frank"]}}],["$","$L18",null,{"container":true,"columns":100,"spacing":{"compact":0,"expanded":2,"large":3},"children":[["$","$L18",null,{"size":{"compact":100,"expanded":100,"large":68},"children":[["$","$7",null,{"children":["$","$L2c",null,{"publisher":"arxiv","paperID":"2001.10929","product":{"paper":"$1b:props:children:props:children:0:props:product","models":"$1b:props:children:props:children:0:props:product:models"},"isWhiteLabelled":false}]}],["$","$7",null,{"children":["$","$L2d",null,{"article":"$L2e","model":"$undefined"}]}]]}],["$","$L18",null,{"size":"grow","children":["$","$L2f",null,{}]}]]}],["$","$7",null,{"children":null}],[["$","audio",null,{"id":"tts"}],["$","$L30",null,{"paperID":"2001.10929","publisher":"arxiv","paperJSON":{"title":"AMR Similarity Metrics from Principles","paperID":"2001.10929","avgLineHeight":13.55,"imgScale":4,"sections":[{"heading":"Abstract","paragraphs":[[{"text":"Different metrics have been proposed to compare ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Abstract Meaning Representation (AMR) ","element":"span"},{"text":"graphs. 
","element":"span"},{"text":"The canonical S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"metric (","element":"span"},{"href":"#id-0","referenceIndex":8,"text":"Cai and Knight","element":"a"},{"text":", ","element":"span"},{"href":"#id-0","referenceIndex":8,"text":"2013","element":"a"},{"text":") aligns the variables of two graphs and assesses triple matches. ","element":"span"},{"text":"The recent S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"metric (","element":"span"},{"href":"#id-1","referenceIndex":34,"text":"Song and Gildea","element":"a"},{"text":", ","element":"span"},{"href":"#id-1","referenceIndex":34,"text":"2019","element":"a"},{"text":") is based on the machine-translation metric B","element":"span"},{"text":"LEU ","element":"span"},{"text":"(","element":"span"},{"href":"#id-2","referenceIndex":29,"text":"Papineni ","element":"a"},{"href":"#id-2","referenceIndex":29,"text":"et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-2","referenceIndex":29,"text":"2002","element":"a"},{"text":") and increases computational ef-ficiency by ablating the variable-alignment.","element":"span"}],[{"text":"In this paper, i) we establish criteria that enable researchers to perform a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"principled assessment of metrics ","element":"span"},{"text":"comparing meaning representations like AMR; ii) we undertake a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"thorough analysis ","element":"span"},{"text":"of S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"and S","element":"span"},{"text":"EM","element":"span"},{"text":"-B","element":"span"},{"text":"LEU ","element":"span"},{"text":"where we show that the latter exhibits some undesirable properties. 
For example, it does not conform to the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"identity of indiscernibles ","element":"span"},{"text":"rule and introduces biases that are hard to control; iii) we propose a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"novel metric ","element":"span"},{"text":"S","element":"span"},{"style":{"height":13.38},"width":138.31,"height":33.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/0-0.png","element":"img","alt":"2MATCH","inline":true,"padRight":true},{"text":"that is more benevolent to only very slight meaning deviations and targets the fulfilment of all established criteria. We assess its suitability and show its advantages over S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"and S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU","element":"span"},{"text":".","element":"span"}]]},{"heading":"1 Introduction","paragraphs":[[{"text":"Proposed in 2013, the aim of Abstract Meaning Representation (AMR) is to represent a sentence’s meaning in a machine-readable graph format (","element":"span"},{"href":"#id-3","referenceIndex":3,"text":"Banarescu et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-3","referenceIndex":3,"text":"2013","element":"a"},{"text":"). ","element":"span"},{"text":"AMR graphs are rooted, acyclic, directed and edge-labeled. Entities, events, properties and states are represented as ","element":"span"},{"style":{"fontStyle":"italic"},"text":"variables ","element":"span"},{"text":"that are linked to corresponding ","element":"span"},{"style":{"fontStyle":"italic"},"text":"concepts ","element":"span"},{"text":"(encoded as leaf nodes) via ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is-instance ","element":"span"},{"text":"relations (cf. Figure ","element":"span"},{"href":"#id-4","text":"1","element":"a"},{"text":", left). 
This structure allows us to capture complex linguistic phenomena such as coreference, semantic roles or polarity.","element":"span"}],[{"text":"When measuring the similarity between two AMR graphs ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B","element":"span"},{"text":", for instance for the purpose of AMR parse quality evaluation, the metric of choice is usually S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"(","element":"span"},{"href":"#id-0","referenceIndex":8,"text":"Cai and Knight","element":"a"},{"text":",","element":"span"}],[{"id":"id-4","style":{"width":"89%"},"width":784,"height":268,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/0-1.png","element":"img"}],[{"text":"Figure 1: ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"A cat drinks water. ","element":"figcaption","subtype":"caption"},{"text":"Simplified AMR graph and underlying deep form with ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"is-instance ","element":"figcaption","subtype":"caption"},{"text":"relations ( ","element":"figcaption","subtype":"caption"},{"text":") from variables (solid) to concepts (dashed).","element":"figcaption","subtype":"caption"}],[{"href":"#id-0","referenceIndex":8,"text":"2013","element":"a"},{"text":"). Its backbone is an alignment-search between the graphs’ variables. 
Recently, the S","element":"span"},{"text":"EM","element":"span"},{"text":"-B","element":"span"},{"text":"LEU ","element":"span"},{"text":"metric (","element":"span"},{"href":"#id-1","referenceIndex":34,"text":"Song and Gildea","element":"a"},{"text":", ","element":"span"},{"href":"#id-1","referenceIndex":34,"text":"2019","element":"a"},{"text":") has been proposed that operates on the basis of a variable-free AMR (Figure ","element":"span"},{"href":"#id-4","text":"1","element":"a"},{"text":", right)","element":"span"},{"text":"1","element":"span"},{"text":", converting it to a bag of k-grams. Circumventing a variable alignment search reduces computational cost and ensures full determinacy. Also, grounding the metric in B","element":"span"},{"text":"LEU ","element":"span"},{"text":"(","element":"span"},{"href":"#id-2","referenceIndex":29,"text":"Papineni et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-2","referenceIndex":29,"text":"2002","element":"a"},{"text":") has a certain appeal, since B","element":"span"},{"text":"LEU ","element":"span"},{"text":"is quite popular in Machine Translation.","element":"span"}],[{"text":"However, we find that we are lacking a principled in-depth comparison of the properties of different AMR metrics which would help informing researchers to answer questions such as: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Which metric should I use to assess the similarity of two AMR graphs, e.g., in AMR parser evaluation? What are the trade-offs when choosing one metric over the other? ","element":"span"},{"text":"Besides providing criteria for such a principled comparison, we discuss a property that none of the existing AMR metrics currently satisfies: they do not measure graded meaning differences. 
Such differences may emerge due to near-synonyms such as ","element":"span"},{"style":{"fontStyle":"italic"},"text":"ruin – annihilate; skinny – thin – slim; enemy – foe ","element":"span"},{"text":"(","element":"span"},{"href":"#id-5","referenceIndex":18,"text":"Inkpen and Hirst","element":"a"},{"text":", ","element":"span"},{"href":"#id-5","referenceIndex":18,"text":"2006","element":"a"},{"text":"; ","element":"span"},{"href":"#id-6","referenceIndex":13,"text":"Edmonds and Hirst","element":"a"},{"text":", ","element":"span"},{"href":"#id-6","referenceIndex":13,"text":"2002","element":"a"},{"text":") or paraphrases such as ","element":"span"},{"style":{"fontStyle":"italic"},"text":"be able to – can; unclear – not clear","element":"span"},{"text":". In a classical syntactic parsing task, metrics do not need to address this issue since input tokens are typically projected to lexical concepts by lemmatization, hence two graphs for the same sentence tend not to disagree on the concepts projected from the input. This is different in semantic parsing where the projected concepts are often more abstract.","element":"span"}],[{"text":"This article is structured as follows: We first establish ","element":"span"},{"style":{"fontStyle":"italic"},"text":"seven principles ","element":"span"},{"text":"that one may expect a metric for comparing meaning representations to satisfy, in order to obtain meaningful and appropriate scores for the given purpose (§","element":"span"},{"text":"2","element":"span"},{"text":"). Based on these principles we provide an ","element":"span"},{"style":{"fontStyle":"italic"},"text":"in-depth analysis ","element":"span"},{"text":"of the properties of the AMR metrics S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"and S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"(§","element":"span"},{"text":"3","element":"span"},{"text":"). 
We then ","element":"span"},{"style":{"fontStyle":"italic"},"text":"develop ","element":"span"},{"text":"S","element":"span"},{"style":{"height":17.13},"width":163.89,"height":42.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/1-0.png","element":"img","alt":"2MATCH,","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"an extension of ","element":"span"},{"text":"S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"that abstracts away from a purely symbolic level, allowing for a graded semantic comparison of atomic graph-elements (§","element":"span"},{"text":"4","element":"span"},{"text":"). By this move, we enable S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"to take into account fine-grained meaning differences. We show that our proposed metric retains valuable bene-fits of S","element":"span"},{"text":"MATCH","element":"span"},{"text":", but at the same time is more benevolent to slight meaning deviations. Our code is available online ","element":"span"},{"href":"https://github.com/Heidelberg-NLP/amr-metric-suite","text":"https://github.com/ ","element":"a"},{"href":"https://github.com/Heidelberg-NLP/amr-metric-suite","text":"Heidelberg-NLP/amr-metric-suite","element":"a"},{"text":".","element":"span"}]]},{"heading":"2 From principles to AMR metrics","paragraphs":[[{"text":"The problem of comparing AMR graphs ","element":"span"},{"style":{"height":16},"width":134.9,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/1-1.png","element":"img","alt":" A, B ∈","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"D ","element":"span"},{"text":"with respect to the meaning they express occurs in several scenarios, for example, parser evaluation or inter-annotator agreement calculation (IAA). 
To measure the extent to which ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"agree with each other, we need a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"metric","element":"span"},{"text":": ","element":"span"},{"style":{"height":12.4},"width":207.02,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/1-2.png","element":"img","alt":"D×D → R","inline":true,"padRight":true},{"text":"that returns a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"score ","element":"span"},{"text":"reflecting ","element":"span"},{"style":{"fontStyle":"italic"},"text":"meaning distance ","element":"span"},{"text":"or ","element":"span"},{"style":{"fontStyle":"italic"},"text":"meaning similarity ","element":"span"},{"text":"(for convenience, we use similarity). Below we establish seven principles that seem desirable for this metric.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"2.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Seven metric principles","element":"span"}],[{"text":"The first four metric principles are ","element":"span"},{"style":{"fontWeight":"bold"},"text":"mathematically motivated","element":"span"},{"text":":","element":"span"}],[{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"I. continuity, non-negativity and upper-bound ","element":"span"},{"text":"A similarity function should be continuous, with two natural edge cases: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A, B ","element":"span"},{"text":"are equivalent (maximum similarity) or unrelated (minimum similarity). 
By choosing 1 as upper bound, we obtain the following constraint on ","element":"span"},{"style":{"height":19.13},"width":455.69,"height":47.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/1-3.png","element":"img","alt":" metric: D × D → [0, 1].2","inline":true}],[{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"II. identity of indiscernibles ","element":"span"},{"text":"This focal principle is formalized by ","element":"span"},{"style":{"height":17.6},"width":503.2,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/1-4.png","element":"img","alt":" metric(A, B) = 1 ⇔ A =","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"B","element":"span"},{"text":". It is violated if a metric assigns a value indicating equivalence to inputs that are not equivalent or","element":"span"}],[{"text":"if it considers equivalent inputs as different.","element":"span"}],[{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"III. symmetry ","element":"span"},{"text":"In many cases, we want a metric to be symmetric: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"metric","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"A, B","element":"span"},{"text":") ","element":"span"},{"text":"= ","element":"span"},{"style":{"fontStyle":"italic"},"text":"metric","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"B, A","element":"span"},{"text":")","element":"span"},{"text":". A metric violates this principle if it assigns a pair of objects different scores when argument order is inverted. Together with principles I and II, it extends the scope of the metric to usages beyond parser evaluation, as it also enables sound IAA calculation, clustering and classification of AMR graphs when we use the metric as a kernel (e.g., SVM). 
In parser evaluation, one may dispense with any (strong) requirements for symmetry— however, the metric must then be applied in a standardized way, with a fixed order of arguments.","element":"span"}],[{"text":"In cases where there is no defined reference, the asymmetry could be handled by aggregating ","element":"span"},{"style":{"fontStyle":"italic"},"text":"metric","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"A, B","element":"span"},{"text":") ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"metric","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"B, A","element":"span"},{"text":")","element":"span"},{"text":", e.g., using the mean. However, it is open what aggregation is best suited and how to interpret results, e.g. for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"metric","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"A, B","element":"span"},{"text":") = 0","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"},{"text":"1 ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"metric","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"B, A","element":"span"},{"text":") = 0","element":"span"},{"style":{"fontStyle":"italic"},"text":".","element":"span"},{"text":"9","element":"span"},{"text":".","element":"span"}],[{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"IV. determinacy ","element":"span"},{"text":"Repeated calculation over the same inputs should yield the same score. 
","element":"span"},{"text":"This principle is clearly desirable as it ensures reproducibility (a very small deviation may be tolerable).","element":"span"}],[{"text":"The next three principles we believe to be desirable specifically when comparing meaning representation graphs such as AMR (","element":"span"},{"href":"#id-3","referenceIndex":3,"text":"Banarescu et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-3","referenceIndex":3,"text":"2013","element":"a"},{"text":"). The first two of the following principles are ","element":"span"},{"style":{"fontWeight":"bold"},"text":"motivated by computer science and linguistics","element":"span"},{"text":", whereas the last one is ","element":"span"},{"style":{"fontWeight":"bold"},"text":"motivated from a linguistic and an engineering perspective","element":"span"},{"text":".","element":"span"}],[{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"V. no bias","element":"span"},{"text":": Meaning representations consist of nodes and edges encoding specific information types. Unless explicitly justified, a metric should not unjustifiably or in unintended ways favor correctness or penalize errors for specific substructures (e.g., leaf nodes). In case a metric favors or penalizes certain substructures more than others, in the interest of transparency, this should be made clear and explicit, and should be easily verifiable and consistent. E.g., if we wish to give negation of the main predicate of a sentence a two times higher weight compared to negation in an embedded sentence, we want this to be made transparent. A concrete example for a transparent bias is found in ","element":"span"},{"href":"#id-7","referenceIndex":7,"text":"Cai ","element":"a"},{"href":"#id-7","referenceIndex":7,"text":"and Lam ","element":"a"},{"text":"(","element":"span"},{"href":"#id-7","referenceIndex":7,"text":"2019","element":"a"},{"text":"). 
They analyze the impact of their novel top-down AMR parsing strategy by integrating a root-distance bias into S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"to focus on structures situated at the top of a graph.","element":"span"}],[{"text":"We now turn to properties that focus on the nature of the objects we aim to compare: graph-based compositional meaning representations. These graphs consist of atomic conditions that determine the circumstances under which a sentence is true. Hence, our ","element":"span"},{"style":{"fontStyle":"italic"},"text":"metric ","element":"span"},{"text":"score should increase with increasing overlap of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B","element":"span"},{"text":", which we denote ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"A, B","element":"span"},{"text":")","element":"span"},{"text":", the number of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"matching ","element":"span"},{"text":"conditions. This overlap can be viewed from a ","element":"span"},{"style":{"fontWeight":"bold"},"text":"symbolic ","element":"span"},{"text":"or/and a ","element":"span"},{"style":{"fontWeight":"bold"},"text":"graded ","element":"span"},{"text":"perspective (cf., e.g., ","element":"span"},{"href":"#id-8","referenceIndex":33,"text":"Schenker et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-8","referenceIndex":33,"text":"2005","element":"a"},{"text":") who denote these perspectives as ‘syntactic’ vs. ‘semantic’). From the symbolic perspective, we compare the nodes and edges of two graphs on a symbolic level, while from the graded perspective, we take into account the degree to which nodes and edges differ. 
Both types of matching involve a precondition: If ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"contain variables, we need a variable-mapping in order to match conditions from ","element":"span"},{"style":{"height":15.13},"width":179.75,"height":37.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/2-0.png","element":"img","alt":" A and B.3","inline":true}],[{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"VI. matching (graph-based) meaning representations – symbolic match ","element":"span"},{"text":"A natural symbolic overlap-objective can be found in the Jaccard index ","element":"span"},{"style":{"fontStyle":"italic"},"text":"J ","element":"span"},{"text":"(","element":"span"},{"href":"#id-9","referenceIndex":19,"text":"Jaccard","element":"a"},{"text":", ","element":"span"},{"href":"#id-9","referenceIndex":19,"text":"1912","element":"a"},{"text":"; ","element":"span"},{"href":"#id-10","referenceIndex":31,"text":"Real and Vargas","element":"a"},{"text":", ","element":"span"},{"href":"#id-10","referenceIndex":31,"text":"1996","element":"a"},{"text":"; ","element":"span"},{"href":"#id-11","referenceIndex":28,"text":"Pa","element":"a"},{"href":"#id-11","referenceIndex":28,"text":"padimitriou et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-11","referenceIndex":28,"text":"2010","element":"a"},{"text":"): Let ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"G","element":"span"},{"text":") ","element":"span"},{"text":"be the set of triples of graph ","element":"span"},{"style":{"height":17.6},"width":565.63,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/2-1.png","element":"img","alt":" G, f(A, B) = |t(A) ∩ 
t(B)|","inline":true,"padRight":true},{"text":"the size of the overlap of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A, B","element":"span"},{"text":", and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"z","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"A, B","element":"span"},{"text":") ","element":"span"},{"text":"= ","element":"span"},{"style":{"height":17.6},"width":247.56,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/2-2.png","element":"img","alt":"|t(A) ∪ t(B)|","inline":true,"padRight":true},{"text":"the size of their union. Then, we wish that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"are considered more similar to each other than ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"C ","element":"span"},{"text":"iff ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"exhibit a greater relative agreement in their (symbolic) conditions: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"metric","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"A, B","element":"span"},{"text":") ","element":"span"},{"style":{"fontStyle":"italic"},"text":"> metric","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"A, C","element":"span"},{"text":")","element":"span"}],[{"style":{"width":"99%"},"width":872,"height":55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/2-3.png","element":"img"}],[{"text":"allowed exception to this monotonic relationship can occur if we want to take into account a graded 
semantic match of atomic graph elements or substructures, which we will now elaborate on.","element":"span"}],[{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"VII. matching (graph-based) meaning representations – graded semantic match","element":"span"},{"text":": One motivation for this principle can be found in engineering, e.g., when assessing the quality of produced parts. Here, small deviations from a reference may be tolerable within certain limits. Similarly, two AMR graphs may match almost perfectly – except for two small divergent components. The extent of divergence can be measured by the degree of similarity of the two divergent components. ","element":"span"},{"text":"In our case, we need linguistic knowledge to judge what degree of divergence we are dealing with and whether it is tolerable.","element":"span"}],[{"text":"For example, consider that graph ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A ","element":"span"},{"text":"contains a triple ","element":"span"},{"style":{"height":17.6},"width":715.04,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/2-4.png","element":"img","alt":" ⟨x, instance, conceptA⟩ and graph B a","inline":true,"padRight":true},{"text":"triple ","element":"span"},{"style":{"height":17.6},"width":401.08,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/2-5.png","element":"img","alt":" ⟨y, instance, conceptB⟩","inline":true},{"text":", while otherwise the graphs are equivalent, and the alignment has set ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":"=","element":"span"},{"style":{"fontStyle":"italic"},"text":"y","element":"span"},{"text":". 
Then ","element":"span"},{"style":{"fontStyle":"italic"},"text":"f","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"A, B","element":"span"},{"text":") ","element":"span"},{"text":"should be higher when ","element":"span"},{"style":{"fontStyle":"italic"},"text":"conceptA ","element":"span"},{"text":"is similar to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"conceptB ","element":"span"},{"text":"compared to the case where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"conceptA ","element":"span"},{"text":"is dissimilar to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"conceptB","element":"span"},{"text":". In AMR, concepts are often abstract, so near-synonyms may even be fully admissible (","element":"span"},{"style":{"fontStyle":"italic"},"text":"enemy–foe","element":"span"},{"text":"). While such (near-)synonyms are bound to occur frequently when we compare AMR graphs of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"different sentences ","element":"span"},{"text":"that may contain paraphrases, we will see, in Section §4, that this can also occur in parser evaluation, where two different graphs represent the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"same sentence","element":"span"},{"text":". By defining ","element":"span"},{"style":{"fontStyle":"italic"},"text":"metric ","element":"span"},{"text":"to map to a range [0,1] we already defined it to be globally graded. 
Here, we desire that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"graded similarity ","element":"span"},{"text":"may also hold of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"minimal units ","element":"span"},{"text":"of AMR graphs, such as atomic concepts or even subgraphs, e.g., to reflect that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"injustice","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":") ","element":"span"},{"text":"is very similar to ","element":"span"},{"style":{"height":17.6},"width":521.17,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/2-6.png","element":"img","alt":" justice(x) ∧ polarity(x, −).","inline":true}],[{"style":{"fontWeight":"bold"},"text":"2.2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"AMR metrics: S","element":"span"},{"style":{"fontWeight":"bold"},"text":"MATCH ","element":"span"},{"style":{"fontWeight":"bold"},"text":"and S","element":"span"},{"style":{"fontWeight":"bold"},"text":"EM","element":"span"},{"style":{"fontWeight":"bold"},"text":"B","element":"span"},{"style":{"fontWeight":"bold"},"text":"LEU","element":"span"}],[{"text":"With our seven principles for AMR similarity metrics in place, we now introduce S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"and S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU","element":"span"},{"text":", two metrics that differ in their design and assumptions. 
","element":"span"},{"text":"We describe each of them in detail and summarize their differences, setting the stage for our in-depth metric analysis (§","element":"span"},{"text":"3","element":"span"},{"text":").","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Align and match – S","element":"span"},{"style":{"fontWeight":"bold"},"text":"MATCH ","element":"span"},{"text":"The S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"metric operates in two steps. First, (i) we align the variables in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"in the best possible way, by finding a mapping ","element":"span"},{"style":{"height":17.6},"width":359.37,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/2-7.png","element":"img","alt":" map⋆: vars(A) →","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"vars","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"B","element":"span"},{"text":") ","element":"span"},{"text":"that yields a maximal set of matching triples between ","element":"span"},{"style":{"height":18.22},"width":589.57,"height":45.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/2-8.png","element":"img","alt":" A and B. E.g., if ⟨xi, rel, xj⟩ ∈","inline":true}],[{"style":{"width":"99%"},"width":870,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/2-9.png","element":"img"}],[{"style":{"height":17.6},"width":212.27,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/2-10.png","element":"img","alt":"ym⟩ ∈ t(B)","inline":true},{"text":", we obtain one triple match. (ii) We compute Precision, Recall and F1 score based on the set of triples returned by the alignment search. 
The NP-hard alignment search problem of step (i) is solved with a greedy hill-climber: Let ","element":"span"},{"style":{"height":18.22},"width":209.74,"height":45.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/2-11.png","element":"img","alt":" fmap(A, B)","inline":true,"padRight":true},{"text":"be the count of matching triples under any mapping function ","element":"span"},{"style":{"fontStyle":"italic"},"text":"map","element":"span"},{"text":". Then,","element":"span"}],[{"style":{"width":"80%"},"width":699,"height":76,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/2-12.png","element":"img"}],[{"text":"Multiple restarts with different seeds increase the likelihood of finding better optima.","element":"span"}],[{"id":"id-27","style":{"fontWeight":"bold"},"text":"Simplify and match – S","element":"span"},{"style":{"fontWeight":"bold"},"text":"EM","element":"span"},{"style":{"fontWeight":"bold"},"text":"B","element":"span"},{"style":{"fontWeight":"bold"},"text":"LEU ","element":"span"},{"text":"The S","element":"span"},{"text":"EM","element":"span"},{"text":"-B","element":"span"},{"text":"LEU ","element":"span"},{"text":"metric in ","element":"span"},{"href":"#id-1","referenceIndex":34,"text":"Song and Gildea ","element":"a"},{"text":"(","element":"span"},{"href":"#id-1","referenceIndex":34,"text":"2019","element":"a"},{"text":") can also be described as a two-step procedure. 
But unlike S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"it operates on a ","element":"span"},{"style":{"fontWeight":"bold"},"text":"variable-free reduction ","element":"span"},{"text":"of an AMR graph ","element":"span"},{"style":{"fontStyle":"italic"},"text":"G","element":"span"},{"text":", which we denote by ","element":"span"},{"style":{"height":19.13},"width":157.48,"height":47.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/3-0.png","element":"img","alt":" Gvf (vf:","inline":true,"padRight":true},{"text":"variable-free, Figure ","element":"span"},{"href":"#id-4","text":"1","element":"a"},{"text":", right-hand side).","element":"span"}],[{"text":"In a first step, (i) S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"performs k-gram extraction from ","element":"span"},{"style":{"height":15.53},"width":246.48,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/3-1.png","element":"img","alt":" Avf and Bvf ","inline":true,"padRight":true},{"text":"in a breadth-first traversal (path extraction). 
It then (ii) adopts the B","element":"span"},{"text":"LEU ","element":"span"},{"text":"score from MT (","element":"span"},{"href":"#id-2","referenceIndex":29,"text":"Papineni et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-2","referenceIndex":29,"text":"2002","element":"a"},{"text":") to calculate an overlap score based on the extracted bags of k-grams:","element":"span"}],[{"style":{"width":"94%"},"width":824,"height":230,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/3-2.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":15.6},"width":252.39,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/3-3.png","element":"img","alt":" pk is BLEU’s","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"modified k-gram precision ","element":"span"},{"text":"that measures ","element":"span"},{"style":{"fontStyle":"italic"},"text":"k","element":"span"},{"text":"-gram overlap of a candidate against a reference: ","element":"span"},{"style":{"height":30.06},"width":523.43,"height":75.14,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/3-4.png","element":"img","alt":" pk = |kgram(Avf)∩kgram(Bvf)||kgram(Avf)| .","inline":true},{"style":{"height":10.84},"width":49.24,"height":27.11,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/3-5.png","element":"img","alt":"wk","inline":true,"padRight":true},{"text":"is the (typically uniform) weight over chosen k-gram sizes. ","element":"span"},{"text":"S","element":"span"},{"text":"EMBLEU ","element":"span"},{"text":"uses NIST geometric probability smoothing (","element":"span"},{"href":"#id-12","referenceIndex":10,"text":"Chen and Cherry","element":"a"},{"text":", ","element":"span"},{"href":"#id-12","referenceIndex":10,"text":"2014","element":"a"},{"text":"). 
The recall-focused ‘brevity penalty’ ","element":"span"},{"style":{"fontStyle":"italic"},"text":"BP ","element":"span"},{"text":"returns a value smaller than 1 when the candidate length ","element":"span"},{"style":{"height":19.53},"width":96.26,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/3-6.png","element":"img","alt":"|Avf|","inline":true,"padRight":true},{"text":"is smaller than the reference length ","element":"span"},{"style":{"height":19.53},"width":109.95,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/3-7.png","element":"img","alt":" |Bvf|.","inline":true}],[{"text":"The graph traversal performed in S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"starts at the root node. During this traversal it simplifies the graph by replacing variables with their corresponding concepts (see Figure ","element":"span"},{"href":"#id-4","text":"1","element":"a"},{"text":": the node ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"text":"becomes D","element":"span"},{"text":"RINK","element":"span"},{"text":"-01) and collects visited nodes and edges in uni-, bi- and tri-grams (k=3 is recommended). Here, a source node together with a relation and its target node counts as a bi-gram. 
For the graph in Figure ","element":"span"},{"href":"#id-4","text":"1","element":"a"},{"text":", the extracted unigrams are ","element":"span"},{"style":{"fontStyle":"italic"},"text":"{","element":"span"},{"style":{"fontStyle":"italic"},"text":"cat, water, drink","element":"span"},{"text":"-","element":"span"},{"text":"01","element":"span"},{"style":{"fontStyle":"italic"},"text":"}","element":"span"},{"text":"; the extracted bi-grams are ","element":"span"},{"style":{"fontStyle":"italic"},"text":"{","element":"span"},{"style":{"fontStyle":"italic"},"text":"drink","element":"span"},{"text":"-","element":"span"},{"text":"01 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arg","element":"span"},{"text":"1 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"cat, drink","element":"span"},{"text":"-","element":"span"},{"text":"01 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arg","element":"span"},{"text":"2 ","element":"span"},{"style":{"fontStyle":"italic"},"text":"water","element":"span"},{"style":{"fontStyle":"italic"},"text":"}","element":"span"},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"S","element":"span"},{"style":{"fontWeight":"bold"},"text":"MATCH ","element":"span"},{"style":{"fontWeight":"bold"},"text":"vs. S","element":"span"},{"style":{"fontWeight":"bold"},"text":"EM","element":"span"},{"style":{"fontWeight":"bold"},"text":"B","element":"span"},{"style":{"fontWeight":"bold"},"text":"LEU ","element":"span"},{"style":{"fontWeight":"bold"},"text":"in a nutshell ","element":"span"},{"text":"S","element":"span"},{"text":"EM","element":"span"},{"text":"-B","element":"span"},{"text":"LEU ","element":"span"},{"text":"differs significantly from S","element":"span"},{"text":"MATCH","element":"span"},{"text":". 
A key difference is that S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"operates on reduced variable-free AMR graphs (","element":"span"},{"style":{"height":15.53},"width":70.94,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/3-8.png","element":"img","alt":"Gvf","inline":true},{"text":") – instead of full-fledged AMR graphs. ","element":"span"},{"text":"By eliminating variables, S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"bypasses an alignment search. This makes the calculation faster and alleviates a weakness of S","element":"span"},{"text":"MATCH","element":"span"},{"text":": the hill-climbing search is slightly imprecise. However, S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"is not guided by aligned variables as anchors. 
Instead, S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"uses an n-gram statistic (B","element":"span"},{"text":"LEU","element":"span"},{"text":") to compute an overlap score for graphs, based on ","element":"span"},{"style":{"fontStyle":"italic"},"text":"khop paths ","element":"span"},{"text":"extracted from ","element":"span"},{"style":{"height":15.53},"width":70.94,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/3-9.png","element":"img","alt":" Gvf","inline":true},{"text":", using the root node","element":"span"}],[{"id":"id-13","style":{"fontWeight":"bold"},"text":"----------A------------Input-------------B---------- ","element":"span"},{"style":{"fontWeight":"bold"},"text":"( p / predicate-01 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"( p / predicate-01 :ARG0 ( ","element":"span"},{"style":{"fontWeight":"bold"},"text":"x1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"/ man ) ","element":"span"},{"style":{"fontWeight":"bold"},"text":":ARG0 ( ","element":"span"},{"style":{"fontWeight":"bold"},"text":"x1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"/ man )","element":"span"}],[{"style":{"width":"93%"},"width":812,"height":49,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/3-10.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":":ARG2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"x2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":") ","element":"span"},{"style":{"fontWeight":"bold"},"text":":ARG2 ( ","element":"span"},{"style":{"fontWeight":"bold"},"text":"x2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"/ man )) -----------------------Scores-----------------------S","element":"span"},{"style":{"fontWeight":"bold"},"text":"MATCH ","element":"span"},{"style":{"fontWeight":"bold"},"text":"-> 
","element":"span"},{"style":{"fontWeight":"bold"},"text":"0.667 S","element":"span"},{"style":{"fontWeight":"bold"},"text":"EM","element":"span"},{"style":{"fontWeight":"bold"},"text":"B","element":"span"},{"style":{"fontWeight":"bold"},"text":"LEU ","element":"span"},{"style":{"fontWeight":"bold"},"text":"-> ","element":"span"},{"style":{"fontWeight":"bold"},"text":"1.0 ----------------------------------------------------","element":"span"}],[{"text":"Figure 2: Two AMRs with semantic roles filled differently, S","element":"figcaption","subtype":"caption"},{"text":"EM","element":"figcaption","subtype":"caption"},{"text":"B","element":"figcaption","subtype":"caption"},{"text":"LEU ","element":"figcaption","subtype":"caption"},{"text":"considers them as equivalent.","element":"figcaption","subtype":"caption"}],[{"text":"as the start for the extraction process. S","element":"span"},{"text":"MATCH","element":"span"},{"text":", by contrast, acts directly on variable-bound graphs matching triples based on a selected alignment. If in some application we wanted it, both metrics allow the capturing of more ‘global’ graph properties: S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"can increase its k-parameter and S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"may match conjunctions of (interconnected) triples. 
In the following analysis, however, we will adhere to their default configurations since this is how they are used in most applications.","element":"span"}]]},{"heading":"3 Assessing AMR metrics with principles","paragraphs":[[{"text":"This section evaluates S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"and S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"against the seven principles we established above by asking: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Why does a metric satisfy or violate a given principle? ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"What does this imply? ","element":"span"},{"text":"We start with principles from mathematics.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"I. Continuity, non-negativity and upper-bound ","element":"span"},{"text":"This principle is fulfilled by both metrics as they are functions of the form ","element":"span"},{"style":{"height":17.6},"width":439.05,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/3-11.png","element":"img","alt":" metric : D×D → [0, 1].","inline":true,"padRight":true},{"style":{"fontWeight":"bold"},"text":"II. Identity of indiscernibles ","element":"span"},{"text":"This principle is fundamental: An AMR metric must return maximum score if and only if the graphs are equivalent in meaning. Yet, there are cases where S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU","element":"span"},{"text":", in contrast to S","element":"span"},{"text":"MATCH","element":"span"},{"text":", does not satisfy this principle. 
Figure ","element":"span"},{"href":"#id-13","text":"2 ","element":"a"},{"text":"shows an example.","element":"span"}],[{"text":"Here, S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"yields a perfect score for two AMRs that differ in a single but crucial aspect: two of its ","element":"span"},{"text":"ARG","element":"span"},{"style":{"height":5.6},"width":19,"height":14,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/3-12.png","element":"img","alt":"x","inline":true,"padRight":true},{"text":"roles are filled with arguments that are meant to refer to distinct individuals that share the same concept. The graph on the left is an abstraction of, e.g. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"The man","element":"span"},{"style":{"height":8.4},"width":17,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/3-13.png","element":"img","alt":"1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"sees the other man","element":"span"},{"style":{"height":14.22},"width":64.9,"height":35.54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/3-14.png","element":"img","alt":"2 in","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"the other man","element":"span"},{"style":{"height":8.4},"width":17,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/3-15.png","element":"img","alt":"2","inline":true},{"text":", while the graph on the right is an abstraction of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"The man","element":"span"},{"style":{"height":8.4},"width":17,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/3-16.png","element":"img","alt":"1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"sees 
himself","element":"span"},{"style":{"height":8.4},"width":17,"height":21,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/3-17.png","element":"img","alt":"1","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"in the other man","element":"span"},{"style":{"height":14.62},"width":230.82,"height":36.55,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/3-18.png","element":"img","alt":"2. SEMBLEU","inline":true,"padRight":true},{"text":"does not recognize the difference in meaning between a reflexive and a non-reflexive relation, assigning maximum similarity score, whereas S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"reflects such differences appropriately since it accounts for variables.","element":"span"}],[{"text":"In sum, S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"does not satisfy principle II because it operates on a variable-free reduction of","element":"span"}],[{"id":"id-14","style":{"fontWeight":"bold"},"text":"----------","element":"span"},{"style":{"fontWeight":"bold"},"text":"A","element":"span"},{"style":{"fontWeight":"bold"},"text":"------------Input-------------","element":"span"},{"style":{"fontWeight":"bold"},"text":"B","element":"span"},{"style":{"fontWeight":"bold"},"text":"----------(a / and ","element":"span"},{"style":{"fontWeight":"bold"},"text":"(k7 / know-01 :op1 (h / heat-01 ","element":"span"},{"style":{"fontWeight":"bold"},"text":":ARG0 (i / i :ARG1 (t / thing) ","element":"span"},{"style":{"fontWeight":"bold"},"text":":ARG0-of (d9 / do-02 :loc (b / between ","element":"span"},{"style":{"fontWeight":"bold"},"text":":ARG1 t8 :op1 (w / we)) ","element":"span"},{"style":{"fontWeight":"bold"},"text":":ARG1 (t0 / thing :degree (s / so)) ","element":"span"},{"style":{"fontWeight":"bold"},"text":":ARG1-of (h2 / heat-01 :op2 (k / know-01 
","element":"span"},{"style":{"fontWeight":"bold"},"text":":degree (s1 / so) :polarity - ","element":"span"},{"style":{"fontWeight":"bold"},"text":":loc (b3 / between :ARG0 (i / i) ","element":"span"},{"style":{"fontWeight":"bold"},"text":":op1 (w4 / we)))))) :ARG1 (t2 / thing ","element":"span"},{"style":{"fontWeight":"bold"},"text":":ARG1 (t8 / thing) :ARG1-of (d / do-02)))) :polarity -) -----------------------Scores-----------------------S","element":"span"},{"style":{"fontWeight":"bold"},"text":"E M","element":"span"},{"style":{"fontWeight":"bold"},"text":"B","element":"span"},{"style":{"fontWeight":"bold"},"text":"L E U ","element":"span"},{"style":{"fontWeight":"bold"},"text":"(","element":"span"},{"style":{"fontWeight":"bold"},"text":"A","element":"span"},{"style":{"fontWeight":"bold"},"text":",","element":"span"},{"style":{"fontWeight":"bold"},"text":"B","element":"span"},{"style":{"fontWeight":"bold"},"text":") = 0.422 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"<< ","element":"span"},{"style":{"fontWeight":"bold"},"text":"S","element":"span"},{"style":{"fontWeight":"bold"},"text":"E M","element":"span"},{"style":{"fontWeight":"bold"},"text":"B","element":"span"},{"style":{"fontWeight":"bold"},"text":"L E U ","element":"span"},{"style":{"fontWeight":"bold"},"text":"(","element":"span"},{"style":{"fontWeight":"bold"},"text":"B","element":"span"},{"style":{"fontWeight":"bold"},"text":",","element":"span"},{"style":{"fontWeight":"bold"},"text":"A","element":"span"},{"style":{"fontWeight":"bold"},"text":") = 0.803) S","element":"span"},{"style":{"fontWeight":"bold"},"text":"M A T C H ","element":"span"},{"style":{"fontWeight":"bold"},"text":"(","element":"span"},{"style":{"fontWeight":"bold"},"text":"A","element":"span"},{"style":{"fontWeight":"bold"},"text":",","element":"span"},{"style":{"fontWeight":"bold"},"text":"B","element":"span"},{"style":{"fontWeight":"bold"},"text":") = 0.829 
","element":"span"},{"style":{"fontWeight":"bold"},"text":"== ","element":"span"},{"style":{"fontWeight":"bold"},"text":"S","element":"span"},{"style":{"fontWeight":"bold"},"text":"M A T C H ","element":"span"},{"style":{"fontWeight":"bold"},"text":"(","element":"span"},{"style":{"fontWeight":"bold"},"text":"B","element":"span"},{"style":{"fontWeight":"bold"},"text":",","element":"span"},{"style":{"fontWeight":"bold"},"text":"A","element":"span"},{"style":{"fontWeight":"bold"},"text":") = 0.829) -----------------------------------------------------","element":"span"}],[{"text":"Figure 3: Symmetry violation for two parses of ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"Things are so heated between us, I don’t know what to do","element":"figcaption","subtype":"caption"},{"text":".","element":"figcaption","subtype":"caption"}],[{"text":"AMRs (","element":"span"},{"style":{"height":15.53},"width":70.94,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/4-0.png","element":"img","alt":"Gvf","inline":true},{"text":"). One could address this problem by reverting to canonical AMR graphs and adopting variable alignment in S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU","element":"span"},{"text":". But this would adversely affect the advertised efficiency advantages over S","element":"span"},{"text":"MATCH","element":"span"},{"text":". 
","element":"span"},{"text":"Re-integrating the alignment step would make S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"style":{"fontStyle":"italic"},"text":"less ","element":"span"},{"text":"efficient than S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"since it would add the complexity of breadth-first traversal, yielding a total complexity of ","element":"span"},{"style":{"height":17.6},"width":576.44,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/4-1.png","element":"img","alt":" O(SMATCH) plus O(|V | + |E|).","inline":true}],[{"style":{"fontWeight":"bold"},"text":"III. Symmetry ","element":"span"},{"text":"This principle is fulfilled if ","element":"span"},{"style":{"height":17.6},"width":873.16,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/4-2.png","element":"img","alt":"∀A, B ∈ D : metric(A, B) = metric(B, A).","inline":true,"padRight":true},{"text":"Figure ","element":"span"},{"href":"#id-14","text":"3 ","element":"a"},{"text":"shows an example where S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"does not comply with this principle, to a significant extent: when comparing AMR graph ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A ","element":"span"},{"text":"against ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B","element":"span"},{"text":", it yields a score greater than 0.8, yet, when comparing ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A ","element":"span"},{"text":"the score is smaller than 0.5. We perform an experiment that quantifies this effect on a larger scale by assessing the frequency and the extent of such divergences. 
To this end, we parse 1368 development sentences from an AMR corpus (LDC2017T10) with an AMR parser (obtaining graph bank ","element":"span"},{"style":{"fontStyle":"italic"},"text":"A","element":"span"},{"text":") and evaluate it against another graph bank ","element":"span"},{"style":{"fontStyle":"italic"},"text":"B ","element":"span"},{"text":"(gold graphs or another parseroutput). We quantify the symmetry violation by the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"symmetry violation ratio ","element":"span"},{"text":"(Eq. ","element":"span"},{"href":"#id-15","text":"4","element":"a"},{"text":") and the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"mean symmetry violation ","element":"span"},{"text":"(Eq. ","element":"span"},{"href":"#id-15","text":"5","element":"a"},{"text":") given some metric ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m","element":"span"},{"text":":","element":"span"}],[{"id":"id-15","style":{"width":"92%"},"width":811,"height":252,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/4-3.png","element":"img"}],[{"text":"We conduct the experiment with three AMR systems, CAMR (","element":"span"},{"href":"#id-16","referenceIndex":38,"text":"Wang et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-16","referenceIndex":38,"text":"2016","element":"a"},{"text":"), GPLA (","element":"span"},{"href":"#id-17","referenceIndex":23,"text":"Lyu ","element":"a"},{"href":"#id-17","referenceIndex":23,"text":"and Titov","element":"a"},{"text":", ","element":"span"},{"href":"#id-17","referenceIndex":23,"text":"2018","element":"a"},{"text":") and JAMR (","element":"span"},{"href":"#id-18","referenceIndex":15,"text":"Flanigan et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-18","referenceIndex":15,"text":"2014","element":"a"},{"text":"), and the gold graphs. 
Moreover, to provide a baseline that allows us to better put the results","element":"span"}],[{"id":"id-20","style":{"width":"95%"},"width":830,"height":54,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/4-4.png","element":"img"}],[{"text":"svr (%, ","element":"span"},{"style":{"height":11.2},"width":133.06,"height":28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/4-5.png","element":"img","alt":" ∆>0.0001)","inline":true,"padRight":true},{"text":"msv (in points) Graph banks ","element":"span"},{"text":"S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU","element":"span"}],[{"style":{"width":"95%"},"width":830,"height":276,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/4-6.png","element":"img"}],[{"text":"Table 1: svr (Eq. ","element":"figcaption","subtype":"caption"},{"href":"#id-15","text":"4","element":"a","subtype":"caption"},{"text":"), msv (Eq. ","element":"figcaption","subtype":"caption"},{"href":"#id-15","text":"5","element":"a","subtype":"caption"},{"text":") of AMR metrics.","element":"figcaption","subtype":"caption"}],[{"style":{"width":"99%"},"width":866,"height":112,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/4-7.png","element":"img"}],[{"text":"worst-case ","element":"span"},{"text":"81.3 ","element":"span"},{"text":"0.2 avg-case ","element":"span"},{"text":"72.7 ","element":"span"},{"text":"0.2","element":"span"}],[{"text":"Table 2: svr (Eq. ","element":"figcaption","subtype":"caption"},{"href":"#id-15","text":"4","element":"a","subtype":"caption"},{"text":"), msv (Eq. 
","element":"figcaption","subtype":"caption"},{"href":"#id-15","text":"5","element":"a","subtype":"caption"},{"text":") of B","element":"figcaption","subtype":"caption"},{"text":"LEU","element":"figcaption","subtype":"caption"},{"text":", MT setting.","element":"figcaption","subtype":"caption"}],[{"text":"into perspective, we also estimate the symmetry violation of B","element":"span"},{"text":"LEU ","element":"span"},{"text":"(S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU","element":"span"},{"text":"’s MT ancestor) in an MT setting. Specifically, we fetch 16 system outputs of the WMT 2018 EN-DE metrics task (","element":"span"},{"href":"#id-19","referenceIndex":24,"text":"Ma et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-19","referenceIndex":24,"text":"2018","element":"a"},{"text":") and calculate B","element":"span"},{"text":"LEU","element":"span"},{"text":"(A,B) and B","element":"span"},{"text":"LEU","element":"span"},{"text":"(B,A) of each sentence-pair (A,B) from the MT system’s output and the reference (using the same smoothing method as S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU","element":"span"},{"text":"). 
As ","element":"span"},{"style":{"fontStyle":"italic"},"text":"worst-case","element":"span"},{"text":"/","element":"span"},{"style":{"fontStyle":"italic"},"text":"avg.-case","element":"span"},{"text":", we use the outputs from the team where B","element":"span"},{"text":"LEU ","element":"span"},{"text":"exhibits maximum/median ","element":"span"},{"style":{"height":15.13},"width":108.4,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/4-8.png","element":"img","alt":" msv.4","inline":true}],[{"text":"Table ","element":"span"},{"href":"#id-20","text":"1 ","element":"a"},{"text":"shows that more than 80% of the evaluated AMR graph pairs lead to a symmetry violation with S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"(as opposed to less than 10% for S","element":"span"},{"text":"MATCH","element":"span"},{"text":"). The ","element":"span"},{"style":{"height":12.4},"width":302.83,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/4-9.png","element":"img","alt":" msv of SMATCH","inline":true,"padRight":true},{"text":"is considerably smaller compared to S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU","element":"span"},{"text":": 0.1 vs. 3.2 points F1 score. Even though the B","element":"span"},{"text":"LEU ","element":"span"},{"text":"metric is inherently asymmetric, most of the symmetry violations are negligible when applied in MT (high ","element":"span"},{"style":{"fontStyle":"italic"},"text":"svr","element":"span"},{"text":", low ","element":"span"},{"style":{"fontStyle":"italic"},"text":"msv","element":"span"},{"text":", Table ","element":"span"},{"href":"#id-20","text":"2","element":"a"},{"text":"). 
However, when applied to AMR graphs ‘via’ S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"the asymmetry is amplified by a factor of approximately 16 (0.2 vs. 3.2 points). Figure ","element":"span"},{"href":"#id-21","text":"4 ","element":"a"},{"text":"visualizes the symmetry violations of S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"(left), S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"(middle) and B","element":"span"},{"text":"LEU ","element":"span"},{"text":"(right). The S","element":"span"},{"text":"EMBLEU","element":"span"},{"text":"-plots show that the effect is widespread, some cases are extreme, many others are less extreme but still considerable. This stands in contrast to S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"but also to B","element":"span"},{"text":"LEU","element":"span"},{"text":", which itself appears well calibrated and does not suffer from any major asymmetry.","element":"span"}],[{"text":"In sum, symmetry violations with S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"are much fewer and less pronounced than those observed with S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU","element":"span"},{"text":". ","element":"span"},{"text":"In theory, S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"is fully symmetric, however, violations can occur due to alignment errors from the greedy variable-","element":"span"}],[{"id":"id-21","style":{"width":"98%"},"width":1786,"height":797,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/5-0.png","element":"img"}],[{"text":"Figure 4: Symmetry evaluations of metrics. 
S","element":"figcaption","subtype":"caption"},{"text":"EM","element":"figcaption","subtype":"caption"},{"text":"B","element":"figcaption","subtype":"caption"},{"text":"LEU ","element":"figcaption","subtype":"caption"},{"text":"(left column) and S","element":"figcaption","subtype":"caption"},{"text":"MATCH ","element":"figcaption","subtype":"caption"},{"text":"(middle column) and B","element":"figcaption","subtype":"caption"},{"text":"LEU ","element":"figcaption","subtype":"caption"},{"text":"as a ‘baseline’ in an MT task setting on newstest2018. S","element":"figcaption","subtype":"caption"},{"text":"EM","element":"figcaption","subtype":"caption"},{"text":"B","element":"figcaption","subtype":"caption"},{"text":"LEU","element":"figcaption","subtype":"caption"},{"text":": large divergence, strong outliers. S","element":"figcaption","subtype":"caption"},{"text":"MATCH","element":"figcaption","subtype":"caption"},{"text":": few divergences, few outliers; B","element":"figcaption","subtype":"caption"},{"text":"LEU","element":"figcaption","subtype":"caption"},{"text":": many small divergences, zero outliers. (a) marks the case in Figure ","element":"figcaption","subtype":"caption"},{"href":"#id-14","text":"3","element":"a","subtype":"caption"},{"text":".","element":"figcaption","subtype":"caption"}],[{"id":"id-22","style":{"width":"100%"},"width":875,"height":637,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/5-1.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"IV. Determinacy ","element":"span"},{"text":"This principle states that repeated calculations of a metric should yield identical results. Since there is no randomness in S","element":"span"},{"text":"EM","element":"span"},{"text":"-B","element":"span"},{"text":"LEU","element":"span"},{"text":", it fully complies with this principle. 
The reference implementation of S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"does not fully guarantee deterministic variable alignment results, since it aligns the variables by means of greedy hill-climbing. However, multiple random initializations together with the small set of AMR variables imply that the deviation will be ","element":"span"},{"style":{"height":15.2},"width":120.98,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/5-2.png","element":"img","alt":" ≤ ϵ (a","inline":true,"padRight":true},{"text":"small number close to 0).","element":"span"},{"text":"6 ","element":"span"},{"text":"In Table ","element":"span"},{"href":"#id-22","text":"3 ","element":"a"},{"text":"we measure the expected ","element":"span"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/5-3.png","element":"img","alt":" ϵ","inline":true},{"text":": it displays the S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"F1 standard deviation with respect to 10 independent runs, on a corpus level and on a graph-pair level (arithmetic mean).","element":"span"},{"text":"7 ","element":"span"},{"text":"We see that ","element":"span"},{"style":{"height":8},"width":18,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/5-4.png","element":"img","alt":" ϵ","inline":true,"padRight":true},{"text":"is small, even when only one random start is performed (corpus level: ","element":"span"},{"style":{"height":12},"width":163.72,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/5-5.png","element":"img","alt":" ϵ=0.0003","inline":true},{"text":", graph level: ","element":"span"},{"style":{"height":15.2},"width":353.17,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/5-6.png","element":"img","alt":" ϵ=0.0013). 
We con-","inline":true,"padRight":true},{"text":"clude that the hill-climbing in S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"is unlikely to have any significant effects on the final score.","element":"span"}],[{"id":"id-40","style":{"fontWeight":"bold"},"text":"V. No bias ","element":"span"},{"text":"A similarity metric of (A)MRs should not unjustifiably or unintentionally favor the correctness or penalize errors pertaining to any (sub-)structures of the graphs. However, we find that S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"is affected by a bias that affects (some) leaf nodes attached to high-degree nodes. The bias arises from two related factors: (i) when transforming ","element":"span"},{"style":{"height":17.93},"width":381.57,"height":44.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/5-7.png","element":"img","alt":" G to Gvf, SEMBLEU","inline":true,"padRight":true},{"text":"replaces variable nodes with concept nodes. Thus, nodes which were leaf nodes in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"G ","element":"span"},{"text":"can be raised to highly connected nodes in ","element":"span"},{"style":{"height":15.53},"width":70.94,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/5-8.png","element":"img","alt":" Gvf","inline":true},{"text":". (ii) breadth-first k-gram extraction starts from the root node. 
During graph traversal, concept leaves – now occupying the position of (former) variable nodes with a high number of outgoing (and incoming) edges – will be visited and extracted more frequently than others.","element":"span"}],[{"text":"The two factors in combination make S","element":"span"},{"text":"EM","element":"span"},{"text":"-B","element":"span"},{"text":"LEU ","element":"span"},{"text":"penalize a wrong concept node harshly when it is attached to a high-degree variable node (the leaf is raised to high-degree when transforming ","element":"span"},{"style":{"height":15.53},"width":161.36,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/5-9.png","element":"img","alt":" G to Gvf","inline":true},{"text":"). Conversely, correct or wrongly assigned concepts attached to nodes with low degree are only weakly considered.","element":"span"},{"text":"8 ","element":"span"},{"text":"E.g., consider Figure ","element":"span"},{"href":"#id-23","text":"5","element":"a"},{"text":". 
S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"considers two graphs that express quite distinct meanings (left and right) as more","element":"span"}],[{"id":"id-23","style":{"width":"79%"},"width":698,"height":317,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/6-0.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"-----------------------Scores-----------------------metric (leftA,leftB) ","element":"span"},{"style":{"fontWeight":"bold"},"text":"metric (leftA,right) --------------- ","element":"span"},{"style":{"fontWeight":"bold"},"text":"---------------S","element":"span"},{"style":{"fontWeight":"bold"},"text":"E M","element":"span"},{"style":{"fontWeight":"bold"},"text":"B","element":"span"},{"style":{"fontWeight":"bold"},"text":"L E U ","element":"span"},{"style":{"fontWeight":"bold"},"text":"-> 0.38 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"< ","element":"span"},{"style":{"fontWeight":"bold"},"text":"S","element":"span"},{"style":{"fontWeight":"bold"},"text":"E M","element":"span"},{"style":{"fontWeight":"bold"},"text":"B","element":"span"},{"style":{"fontWeight":"bold"},"text":"L E U ","element":"span"},{"style":{"fontWeight":"bold"},"text":"-> 0.46 S","element":"span"},{"style":{"fontWeight":"bold"},"text":"M A T C H ","element":"span"},{"style":{"fontWeight":"bold"},"text":"-> 0.87 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"> ","element":"span"},{"style":{"fontWeight":"bold"},"text":"S","element":"span"},{"style":{"fontWeight":"bold"},"text":"M A T C H ","element":"span"},{"style":{"fontWeight":"bold"},"text":"-> 0.73 -----------------------------------------------------","element":"span"}],[{"text":"Figure 5: Left: ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"In April, a woman rides a car from Rome to Pisa. 
","element":"figcaption","subtype":"caption"},{"text":"root nodes A: ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"travel-01 ","element":"figcaption","subtype":"caption"},{"text":"vs. B: ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"drive-01","element":"figcaption","subtype":"caption"},{"text":". Right: ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"In Apr., a sailor travels with a ship from P. to N.","element":"figcaption","subtype":"caption"}],[{"style":{"width":"49%"},"width":433,"height":218,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/6-1.png","element":"img"}],[{"text":"Figure 6: # of ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"k","element":"figcaption","subtype":"caption"},{"text":"-grams entered by a node in S","element":"figcaption","subtype":"caption"},{"text":"EM","element":"figcaption","subtype":"caption"},{"text":"B","element":"figcaption","subtype":"caption"},{"text":"LEU","element":"figcaption","subtype":"caption"},{"text":".","element":"figcaption","subtype":"caption"}],[{"id":"id-24","style":{"width":"97%"},"width":848,"height":211,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/6-2.png","element":"img"}],[{"text":"Table 4: Error impact depending on error location in a tree with node degree ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"d","element":"figcaption","subtype":"caption"},{"text":".","element":"figcaption","subtype":"caption"}],[{"text":"similar than graphs that are almost equivalent in meaning (left, variant A vs. B). 
This is because the leaf that is attached to the root is raised to a highly connected node in ","element":"span"},{"style":{"height":15.53},"width":70.94,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/6-3.png","element":"img","alt":" Gvf ","inline":true,"padRight":true},{"text":"and thus is over-frequently contained in the extracted k-grams, whereas the other leaves will remain leaves in ","element":"span"},{"style":{"height":15.53},"width":84.73,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/6-4.png","element":"img","alt":" Gvf.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Analyzing and quantifying S","element":"span"},{"style":{"fontWeight":"bold"},"text":"EM","element":"span"},{"style":{"fontWeight":"bold"},"text":"B","element":"span"},{"style":{"fontWeight":"bold"},"text":"LEU","element":"span"},{"style":{"fontWeight":"bold"},"text":"’s bias ","element":"span"},{"text":"To better understand the bias, we study three limiting cases: (i) the root is wrong (","element":"span"},{"style":{"height":15.6},"width":244.77,"height":39,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/6-5.png","element":"img","alt":"√√√ ) (ii) d leaf","inline":true,"padRight":true},{"text":"nodes are wrong ( ","element":"span"},{"text":") and (iii) one branching node is wrong ( ). Depending on a specific node and its position in the graph, we would like to know onto how many k-grams (S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU","element":"span"},{"text":") or triples (S","element":"span"},{"text":"MATCH","element":"span"},{"text":") the errors are projected. 
For the sake of simplicity, we assume that the graph always comes in its simplified form ","element":"span"},{"style":{"height":15.53},"width":70.94,"height":38.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/6-6.png","element":"img","alt":" Gvf","inline":true},{"text":", that it is a tree, and that every non-leaf node has the same out-degree ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d","element":"span"},{"text":".","element":"span"}],[{"text":"The result of our analysis is given in Table ","element":"span"},{"href":"#id-24","text":"4","element":"a"},{"text":"9 ","element":"span"},{"text":"and exemplified in Figure ","element":"span"},{"href":"#id-23","text":"6","element":"a"},{"text":". Both show that the number of times k-gram extraction visits a node heavily depends on its position and that with growing ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d","element":"span"},{"text":", the bias gets amplified (Table ","element":"span"},{"href":"#id-24","text":"4","element":"a"},{"text":").","element":"span"},{"text":"10 ","element":"span"},{"text":"E.g., when ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d","element":"span"},{"text":"=3, 3 wrong leaves yield 9 wrong k-grams, and 1 wrong branching node can already yield 18 wrong k-grams. 
By contrast, in S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"the weight of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d ","element":"span"},{"text":"leaves always approximates the weight of 1 branching node of degree ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d","element":"span"},{"text":".","element":"span"}],[{"text":"In sum, in S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"the impact of a wrong node is constant for all node types and rises linearly with ","element":"span"},{"style":{"height":12.8},"width":297.63,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/6-7.png","element":"img","alt":" d. In SEMBLEU","inline":true,"padRight":true},{"text":"the impact of a node rises approximately quadratically with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d ","element":"span"},{"text":"and it also depends on the node type, since it raises some (but not all) leaves in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"G ","element":"span"},{"text":"to connected nodes in ","element":"span"},{"style":{"height":15.53},"width":84.73,"height":38.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/6-8.png","element":"img","alt":" Gvf.","inline":true}],[{"style":{"fontWeight":"bold"},"text":"Eliminating biases ","element":"span"},{"text":"A possible approach to reduce S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU","element":"span"},{"text":"’s biases could be to weigh the extracted k-gram matches according to the degree of the contained nodes. However, this would imply that we assume some k-grams (and thus also some nodes and edges) to be of greater importance than others – in other words, we would eliminate one bias by introducing another. Since the breadth-first traversal is the metric’s backbone, this issue may be hard to address well. 
When B","element":"span"},{"text":"LEU ","element":"span"},{"text":"is used for MT evaluation, there is no such bias because the k-grams in a sentence appear linearly.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"VI. Graph matching: ","element":"span"},{"style":{"fontWeight":"bold"},"text":"symbolic perspective ","element":"span"},{"text":"This principle requires that a metric’s score grows with increasing overlap of the conditions that are simultaneously contained in ","element":"span"},{"style":{"height":13.2},"width":357.79,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/6-9.png","element":"img","alt":" A and B. SMATCH","inline":true,"padRight":true},{"text":"fulfills this principle since it matches two AMR graphs inexactly (","element":"span"},{"href":"#id-25","referenceIndex":40,"text":"Yan et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-25","referenceIndex":40,"text":"2016","element":"a"},{"text":"; ","element":"span"},{"href":"#id-26","referenceIndex":32,"text":"Riesen et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-26","referenceIndex":32,"text":"2010","element":"a"},{"text":") by aligning variables s.t. that the triple matches are maximized. Hence, S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"can be seen as a graph matching algorithm that works on any pair of graphs that contain (some) nodes that are variables. 
It fulfills the Jaccard-based overlap objective which symmetrically measures the amount of triples on which two graphs agree, normalized by their respective sizes (since S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"F1 = ","element":"span"},{"text":"2","element":"span"},{"style":{"fontStyle":"italic"},"text":"J/","element":"span"},{"text":"(1 + ","element":"span"},{"style":{"fontStyle":"italic"},"text":"J","element":"span"},{"text":") ","element":"span"},{"text":"is a monotonic relation).","element":"span"}],[{"text":"Since S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"does not satisfy principles II and III (id. of indescernibles and symmetry), it is a corollary that it cannot fulfill the overlap objective.","element":"span"},{"text":"11 ","element":"span"},{"text":"Generally, S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"does not com-","element":"span"}],[{"id":"id-28","style":{"width":"99%"},"width":872,"height":270,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-0.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"-----------------------Scores-----------------------","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"metric (A,B) ","element":"span"},{"style":{"fontWeight":"bold"},"text":"metric (B,C) ","element":"span"},{"style":{"fontWeight":"bold"},"text":"metric (A,C) --------------- ","element":"span"},{"style":{"fontWeight":"bold"},"text":"--------------- ","element":"span"},{"style":{"fontWeight":"bold"},"text":"---------------S","element":"span"},{"style":{"fontWeight":"bold"},"text":"E M","element":"span"},{"style":{"fontWeight":"bold"},"text":"B","element":"span"},{"style":{"fontWeight":"bold"},"text":"L E U ","element":"span"},{"style":{"fontWeight":"bold"},"text":"-> 0.00 
","element":"span"},{"style":{"fontWeight":"bold"},"text":"S","element":"span"},{"style":{"fontWeight":"bold"},"text":"E M","element":"span"},{"style":{"fontWeight":"bold"},"text":"B","element":"span"},{"style":{"fontWeight":"bold"},"text":"L E U ","element":"span"},{"style":{"fontWeight":"bold"},"text":"-> 0.00 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"S","element":"span"},{"style":{"fontWeight":"bold"},"text":"E M","element":"span"},{"style":{"fontWeight":"bold"},"text":"B","element":"span"},{"style":{"fontWeight":"bold"},"text":"L EU ","element":"span"},{"style":{"fontWeight":"bold"},"text":"-> 0.00 S","element":"span"},{"style":{"fontWeight":"bold"},"text":"M A T C H ","element":"span"},{"style":{"fontWeight":"bold"},"text":"-> 0.25 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"S","element":"span"},{"style":{"fontWeight":"bold"},"text":"M A T C H ","element":"span"},{"style":{"fontWeight":"bold"},"text":"-> 0.25 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"S","element":"span"},{"style":{"fontWeight":"bold"},"text":"M A T C H ","element":"span"},{"style":{"fontWeight":"bold"},"text":"-> 0.25 S","element":"span"},{"style":{"height":10.41},"width":867.04,"height":26.02,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-1.png","element":"img","alt":"2 M A T C H -> 0.39 S2 M A T C H -> 0.25 S2 M A T C H -> 0.25","inline":true,"padRight":true},{"style":{"fontWeight":"bold"},"text":"-----------------------------------------------------","element":"span"}],[{"text":"Figure 7: Three different AMR graphs representing ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"The cat sprints; The kitten runs; The giraffe sleeps ","element":"figcaption","subtype":"caption"},{"text":"and pairwise similarity scores from 
S","element":"figcaption","subtype":"caption"},{"text":"EM","element":"figcaption","subtype":"caption"},{"text":"B","element":"figcaption","subtype":"caption"},{"text":"LEU","element":"figcaption","subtype":"caption"},{"text":", S","element":"figcaption","subtype":"caption"},{"text":"MATCH ","element":"figcaption","subtype":"caption"},{"text":"and S","element":"figcaption","subtype":"caption"},{"style":{"height":16.58},"width":528.38,"height":41.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-2.png","element":"img","alt":"2MATCH (see (§4) for S2Match).","inline":true}],[{"text":"pare and match two AMR graphs per se, instead it matches the results of a graph-to-bag-of-paths projection function (§","element":"span"},{"href":"#id-27","text":"2.2","element":"a"},{"text":") and the input may not be recoverable from the output (surjective-only). Thus, matching the outputs of this function cannot be equated to matching the inputs on a graph-level.","element":"span"}]]},{"heading":"4 Towards a more semantic metric for semantic graphs: S2MATCH","paragraphs":[[{"text":"This section focuses on principle VII, semantically graded graph matching, a principle that none of the AMR metrics considered so-far satisfies. A fulfil-ment of this principle also increases the capacity of a metric to assess the semantic similarity of two AMR graphs from ","element":"span"},{"style":{"fontStyle":"italic"},"text":"different sentences","element":"span"},{"text":". E.g., when clustering AMR graphs or detecting paraphrases in AMR-parsed texts, the ability to abstract away from concrete lexicalizations is clearly desirable. Consider Figure ","element":"span"},{"href":"#id-28","text":"7 ","element":"a"},{"text":"with three different graphs. 
Two of them ","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"A, B","element":"span"},{"text":") ","element":"span"},{"text":"are similar in meaning and differ significantly from ","element":"span"},{"style":{"fontStyle":"italic"},"text":"C","element":"span"},{"text":". However, both S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"and S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"yield the same result in the sense that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"metric","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"A, B","element":"span"},{"text":") = ","element":"span"},{"style":{"fontStyle":"italic"},"text":"metric","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"A, C","element":"span"},{"text":")","element":"span"},{"text":". Put differently, neither metric takes into account that ","element":"span"},{"style":{"fontStyle":"italic"},"text":"giraffe ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"kitten ","element":"span"},{"text":"are two quite different concepts, while ","element":"span"},{"style":{"fontStyle":"italic"},"text":"cat ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"kitten ","element":"span"},{"text":"are more similar. 
However, we would like this to be reflected by our metric and obtain ","element":"span"},{"style":{"fontStyle":"italic"},"text":"metric","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"A, B","element":"span"},{"text":") ","element":"span"},{"style":{"fontStyle":"italic"},"text":"> metric","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"A, C","element":"span"},{"text":") ","element":"span"},{"text":"in such a case.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"S","element":"span"},{"style":{"height":14.74},"width":158.97,"height":36.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-3.png","element":"img","alt":"2MATCH","inline":true,"padRight":true},{"text":"We propose the S","element":"span"},{"style":{"height":15.14},"width":290.1,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-4.png","element":"img","alt":"2MATCH metric","inline":true,"padRight":true},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"Soft Semantic match","element":"span"},{"text":", pronounced: ","element":"span"},{"style":{"height":16.4},"width":217.3,"height":41,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-5.png","element":"img","alt":" [estu:mætS])","inline":true,"padRight":true},{"text":"that builds on S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"but differs from it in one important aspect: instead of maximizing the number of (hard) triple matches between two graphs during alignment search, we maximize the (soft) triple matches by taking into account the semantic","element":"span"}],[{"id":"id-33","style":{"width":"99%"},"width":866,"height":229,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-6.png","element":"img"}],[{"text":"Table 5: 
S","element":"figcaption","subtype":"caption"},{"style":{"height":13.39},"width":138.31,"height":33.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-7.png","element":"img","alt":"2MATCH","inline":true,"padRight":true},{"text":"improves upon S","element":"figcaption","subtype":"caption"},{"text":"MATCH ","element":"figcaption","subtype":"caption"},{"text":"by reducing the extent of its non-determinacy.","element":"figcaption","subtype":"caption"}],[{"text":"similarity of concepts. Recall that an AMR graph contains two types of triples: instance and relation triples (e.g., Figure ","element":"span"},{"href":"#id-28","text":"7","element":"a"},{"text":", left: ","element":"span"},{"style":{"height":17.6},"width":39.97,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-8.png","element":"img","alt":" ⟨a","inline":true},{"text":", instance, cat","element":"span"},{"style":{"height":17.6},"width":94.37,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-9.png","element":"img","alt":"⟩ and","inline":true},{"style":{"height":17.6},"width":479.55,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-10.png","element":"img","alt":"⟨c, arg0, a⟩). In SMATCH","inline":true},{"text":", two triples can only be matched if they are identical. In S","element":"span"},{"style":{"height":17.53},"width":162.89,"height":43.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-11.png","element":"img","alt":"2MATCH,","inline":true,"padRight":true},{"text":"we relax this constraint, which has also the potential to yield a different, and possibly, a better variable alignment. 
More precisely, in S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"we match two instance triples ","element":"span"},{"style":{"height":17.6},"width":39.97,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-12.png","element":"img","alt":" ⟨a","inline":true},{"text":", instance, x","element":"span"},{"style":{"height":17.6},"width":101.25,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-13.png","element":"img","alt":"⟩ ∈ A","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":17.6},"width":157.34,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-14.png","element":"img","alt":" ⟨map(a)","inline":true},{"text":", instance, y","element":"span"},{"style":{"height":17.6},"width":103.09,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-15.png","element":"img","alt":"⟩ ∈ B","inline":true,"padRight":true},{"text":"as follows:","element":"span"}],[{"id":"id-29","style":{"width":"73%"},"width":644,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-16.png","element":"img"}],[{"text":"where ","element":"span"},{"text":"I","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"c","element":"span"},{"text":") ","element":"span"},{"text":"equals 1 if ","element":"span"},{"style":{"fontStyle":"italic"},"text":"c ","element":"span"},{"text":"is true and ","element":"span"},{"text":"0 ","element":"span"},{"text":"otherwise. 
S","element":"span"},{"style":{"height":14.73},"width":150.62,"height":36.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-17.png","element":"img","alt":"2MATCH","inline":true,"padRight":true},{"text":"relaxes this condition:","element":"span"}],[{"id":"id-30","style":{"width":"76%"},"width":672,"height":45,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-18.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d ","element":"span"},{"text":"is an arbitrary distance function ","element":"span"},{"style":{"height":12.8},"width":132.83,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-19.png","element":"img","alt":" d : X×","inline":true},{"style":{"height":17.6},"width":204,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-20.png","element":"img","alt":"X → [0, 1]","inline":true},{"text":". E.g., in practice, if we represent the concepts as vectors ","element":"span"},{"style":{"height":16},"width":173.14,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-21.png","element":"img","alt":" x, y ∈ Rn","inline":true},{"text":", we can use","element":"span"}],[{"style":{"width":"86%"},"width":756,"height":110,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-22.png","element":"img"}],[{"text":"When plugged into Eq. ","element":"span"},{"href":"#id-29","text":"7","element":"a"},{"text":", this results in the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"cosine similarity ","element":"span"},{"style":{"height":17.6},"width":135.4,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-23.png","element":"img","alt":" ∈ [0, 1]","inline":true},{"text":". 
It may be suitable to set a threshold ","element":"span"},{"style":{"height":16},"width":279.96,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-24.png","element":"img","alt":" τ (e.g., τ = 0.5","inline":true},{"text":"), to only consider the similarity between two concepts if it is above ","element":"span"},{"style":{"height":8},"width":23,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-25.png","element":"img","alt":" τ","inline":true,"padRight":true},{"text":"(","element":"span"},{"style":{"height":17.6},"width":637.15,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-26.png","element":"img","alt":"softMatch = 0 if 1 − d(x, y) < τ","inline":true},{"text":"). In the following pilot experiments, we use cosine (Eq. ","element":"span"},{"href":"#id-30","text":"8","element":"a"},{"text":") and ","element":"span"},{"style":{"height":12.4},"width":143.78,"height":31,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-27.png","element":"img","alt":" τ = 0.5","inline":true,"padRight":true},{"text":"over 100 dimensional GloVe vectors (","element":"span"},{"href":"#id-31","referenceIndex":30,"text":"Pennington et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-31","referenceIndex":30,"text":"2014","element":"a"},{"text":").","element":"span"}],[{"text":"To summarize, S","element":"span"},{"style":{"height":14.73},"width":150.62,"height":36.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-28.png","element":"img","alt":"2MATCH","inline":true,"padRight":true},{"text":"is designed to either yield the same score as S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"– or a slightly increased score when it aligns concepts that are symbolically distinct but semantically similar. An example, from parser evaluation, is shown in Figure ","element":"span"},{"href":"#id-32","text":"8","element":"a"},{"text":". 
Here, S","element":"span"},{"style":{"height":14.73},"width":150.62,"height":36.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-29.png","element":"img","alt":"2MATCH","inline":true,"padRight":true},{"text":"increases the score to 63 F1 (+10 points) by detecting a more adequate alignment that accounts for the graded similarity of two related AMR concept pairs. We believe that this is justified: The two graphs are very similar and an F1 of 53 is too low, doing the parser injustice.","element":"span"}],[{"text":"On a technical note, the changes in alignments also have the outcome that S","element":"span"},{"style":{"height":14.73},"width":150.62,"height":36.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/7-30.png","element":"img","alt":"2MATCH","inline":true,"padRight":true},{"text":"mends some","element":"span"}],[{"id":"id-34","style":{"width":"97%"},"width":1765,"height":272,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/8-0.png","element":"img"}],[{"id":"id-32","text":"Table 6: Examples where S","element":"figcaption","subtype":"caption"},{"style":{"height":13.38},"width":138.31,"height":33.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/8-1.png","element":"img","alt":"2MATCH","inline":true,"padRight":true},{"text":"assigns a higher score, accounting for the similarity of aligned concepts.","element":"figcaption","subtype":"caption"}],[{"style":{"width":"64%"},"width":567,"height":790,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/8-2.png","element":"img"}],[{"text":"Figure 8: ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"‘6 Abu Sayyaf suspects were captured last week in a raid in Metro Manila.’ ","element":"figcaption","subtype":"caption"},{"text":"gold ","element":"figcaption","subtype":"caption"},{"text":"(top) vs. parsed AMR (bottom). 
S","element":"figcaption","subtype":"caption"},{"text":"MATCH ","element":"figcaption","subtype":"caption"},{"text":"aligns ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"criminal-organization ","element":"figcaption","subtype":"caption"},{"text":"to ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"city ","element":"figcaption","subtype":"caption"},{"text":"(","element":"figcaption","subtype":"caption"},{"text":"red","element":"figcaption","subtype":"caption"},{"text":"); S","element":"figcaption","subtype":"caption"},{"style":{"height":16.98},"width":244.26,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/8-3.png","element":"img","alt":"2MATCH aligns","inline":true,"padRight":true},{"style":{"fontStyle":"italic"},"text":"criminal-organization ","element":"figcaption","subtype":"caption"},{"text":"to ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"suspect-01","element":"figcaption","subtype":"caption"},{"text":", ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"city ","element":"figcaption","subtype":"caption"},{"text":"to ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"country-region ","element":"figcaption","subtype":"caption"},{"text":"(","element":"figcaption","subtype":"caption"},{"text":"blue","element":"figcaption","subtype":"caption"},{"text":").","element":"figcaption","subtype":"caption"}],[{"text":"of S","element":"span"},{"text":"MATCH","element":"span"},{"text":"’s flaws: It better addresses principles III and IV, reducing the symmetry violation and determinacy error (Table ","element":"span"},{"href":"#id-33","text":"5","element":"a"},{"text":").","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Qualitative study: Probing 
S","element":"span"},{"style":{"height":15.13},"width":335.06,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/8-4.png","element":"img","alt":"2MATCH’s choices","inline":true,"padRight":true},{"text":"This study randomly samples 100 graph pairs from those where S","element":"span"},{"style":{"height":14.73},"width":150.62,"height":36.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/8-5.png","element":"img","alt":"2MATCH","inline":true,"padRight":true},{"text":"assigned higher scores than S","element":"span"},{"text":"MATCH","element":"span"},{"text":".","element":"span"},{"text":"12 ","element":"span"},{"text":"Two annotators were asked to judge the similarity of all aligned concepts with similarity score ","element":"span"},{"style":{"fontStyle":"italic"},"text":"<","element":"span"},{"text":"1.0: Are the concepts dissimilar, similar or extremely similar? For concepts judged dissimilar, we conclude that S","element":"span"},{"style":{"height":14.73},"width":150.62,"height":36.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/8-6.png","element":"img","alt":"2MATCH","inline":true,"padRight":true},{"text":"erroneously increased the score; if judged as (extremely) similar, we conclude that the decision was justified. 
We calculate three agreement statistics that all show large consensus among our annotators (Cohen’s kappa equals 0.79, squared kappa: 0.87, Pearson’s ","element":"span"},{"style":{"height":12},"width":23,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/8-7.png","element":"img","alt":" ρ","inline":true},{"text":": 0.91). According to the annotations, the decision to increase the score is mostly justified: in 56% and 12% of cases both annotators voted that the newly aligned concepts are ","element":"span"},{"style":{"fontStyle":"italic"},"text":"extremely similar ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"similar","element":"span"},{"text":", respectively, while the","element":"span"}],[{"text":"agreed ","element":"span"},{"style":{"fontStyle":"italic"},"text":"dissimilar ","element":"span"},{"text":"label makes up 25% of cases.","element":"span"}],[{"text":"Table ","element":"span"},{"href":"#id-34","text":"6 ","element":"a"},{"text":"lists examples of good or ill-founded score increases. We observe, e.g., that S","element":"span"},{"style":{"height":14.73},"width":150.62,"height":36.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/8-8.png","element":"img","alt":"2MATCH","inline":true,"padRight":true},{"text":"accounts for the similarity of two concepts of different number: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"bacterium ","element":"span"},{"text":"(gold) vs. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"bacteria ","element":"span"},{"text":"(parser) (line 3). It also captures abbreviations (","element":"span"},{"style":{"fontStyle":"italic"},"text":"km – kilometer","element":"span"},{"text":") and closely related concepts (","element":"span"},{"style":{"fontStyle":"italic"},"text":"farming – agriculture","element":"span"},{"text":"). 
","element":"span"},{"text":"S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"and S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"would penalize the corresponding triples in exactly the same way as predicting a truly dissimilar concept.","element":"span"}],[{"text":"An interesting case is seen in line 7. Here, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"usual ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"unusual ","element":"span"},{"text":"are correctly annotated as dissimilar, since they are opposite concepts. S","element":"span"},{"style":{"height":17.53},"width":162.89,"height":43.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/8-9.png","element":"img","alt":"2MATCH,","inline":true,"padRight":true},{"text":"equipped with GloVe embeddings, measures a cosine of 0.6, above the chosen threshold, which results in an increase of the score by 14 points (the increase is large as these two graphs are tiny). It is well-known that synonyms and antonyms are difficult to distinguish with distributional word representations, since they often share similar contexts. However, the case at hand is orthogonal to this problem: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"usual ","element":"span"},{"text":"in the gold graph is modified with the polarity ‘","element":"span"},{"style":{"height":4.8},"width":34,"height":12,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/8-10.png","element":"img","alt":"−","inline":true},{"text":"’, whereas the predicted graph assigned the (non-negated) opposite concept ","element":"span"},{"style":{"fontStyle":"italic"},"text":"unusual","element":"span"},{"text":". Hence, given the context in the gold graph, the prediction is semantically almost equivalent. 
This points to an aspect of principle VII that is not yet covered by S","element":"span"},{"style":{"height":14.73},"width":150.62,"height":36.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/8-11.png","element":"img","alt":"2MATCH","inline":true},{"text":": it assesses graded similarity at the lexical, but not at the phrasal level, and hence cannot account for compositional phenomena. In future work, we aim at alleviating this issue by developing extensions that measure semantic similarity for larger graph contexts, in order to fully satisfy all seven principles.","element":"span"},{"text":"13","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Quantitative study: metrics vs. human raters ","element":"span"},{"text":"This study investigates to what extent the judgments of the three metrics under discussion resemble human judgements, based on the following ","element":"span"},{"style":{"fontWeight":"bold"},"text":"two expectations","element":"span"},{"text":". First, the more a human rates two sentences to be semantically ","element":"span"},{"style":{"fontStyle":"italic"},"text":"similar ","element":"span"},{"text":"in their ","element":"span"},{"style":{"fontStyle":"italic"},"text":"meaning","element":"span"},{"text":", the higher the metric should rate","element":"span"}],[{"id":"id-38","style":{"width":"99%"},"width":868,"height":144,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/9-0.png","element":"img"}],[{"text":"Table 7: RMSE (lower is better) and correlation results of our metrics in our ","element":"figcaption","subtype":"caption"},{"style":{"fontWeight":"bold"},"text":"STS ","element":"figcaption","subtype":"caption"},{"text":"and ","element":"figcaption","subtype":"caption"},{"style":{"fontWeight":"bold"},"text":"SICK ","element":"figcaption","subtype":"caption"},{"text":"investigations. 
RMSE (quant): RMSE on empirical quantile distribution with quantiles 0.1,0.2,...,0.9.","element":"figcaption","subtype":"caption"}],[{"id":"id-39","style":{"width":"99%"},"width":867,"height":312,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/9-1.png","element":"img"}],[{"text":"Figure 9: Sentence meaning similarity distributions.","element":"figcaption","subtype":"caption"}],[{"text":"the corresponding AMR graphs (","element":"span"},{"style":{"fontWeight":"bold"},"text":"meaning similarity","element":"span"},{"text":"). Second, the more a human rates two sentences to be ","element":"span"},{"style":{"fontStyle":"italic"},"text":"related ","element":"span"},{"text":"in their ","element":"span"},{"style":{"fontStyle":"italic"},"text":"meaning ","element":"span"},{"text":"(maximum: equivalence), the higher the score of our metric of the corresponding AMR graphs should tend to be (","element":"span"},{"style":{"fontWeight":"bold"},"text":"meaning relatedness","element":"span"},{"text":"). 
Albeit not the exact same (","element":"span"},{"href":"#id-35","referenceIndex":6,"text":"Budanitsky and Hirst","element":"a"},{"text":", ","element":"span"},{"href":"#id-35","referenceIndex":6,"text":"2006","element":"a"},{"text":"), the tasks are closely related and both in conjunction should allow us to better assess the performance of our AMR metrics.","element":"span"}],[{"text":"As ground truth for the ","element":"span"},{"style":{"fontWeight":"bold"},"text":"meaning similarity ","element":"span"},{"text":"rating task we use test data of the Semantic Textual Similarity ","element":"span"},{"style":{"fontWeight":"bold"},"text":"(STS) ","element":"span"},{"text":"shared task (","element":"span"},{"href":"#id-36","referenceIndex":9,"text":"Cer ","element":"a"},{"href":"#id-36","referenceIndex":9,"text":"et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-36","referenceIndex":9,"text":"2017","element":"a"},{"text":"), with 1,379 sentence pairs annotated for meaning similarity. For the ","element":"span"},{"style":{"fontWeight":"bold"},"text":"meaning relatedness ","element":"span"},{"text":"task we use ","element":"span"},{"style":{"fontWeight":"bold"},"text":"SICK ","element":"span"},{"text":"(","element":"span"},{"href":"#id-37","referenceIndex":25,"text":"Marelli et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-37","referenceIndex":25,"text":"2014","element":"a"},{"text":") with 9,840 sentence pairs that have been additionally annotated for semantic relatedness.","element":"span"},{"text":"14 ","element":"span"},{"text":"We proceed as follows: We normalize the human ratings to [0,1]. 
","element":"span"},{"text":"Then we apply GPLA to parse the sentence tuples ","element":"span"},{"style":{"height":18.09},"width":121.32,"height":45.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/9-2.png","element":"img","alt":" (si, s′i)","inline":true},{"text":", obtaining tuples ","element":"span"},{"style":{"height":18.09},"width":402.59,"height":45.22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/9-3.png","element":"img","alt":"(parse(si), parse(s′i))","inline":true,"padRight":true},{"text":"and score the graph pairs ","element":"span"},{"text":"with the metrics: S","element":"span"},{"text":"MATCH","element":"span"},{"text":"(i), S","element":"span"},{"style":{"height":17.93},"width":314.62,"height":44.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/9-4.png","element":"img","alt":"2MATCH(i), SEM-","inline":true,"padRight":true},{"text":"B","element":"span"},{"text":"LEU","element":"span"},{"text":"(i) and H(i), where H(i) is the human score. For both tasks S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"and S","element":"span"},{"style":{"height":14.73},"width":150.62,"height":36.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/9-5.png","element":"img","alt":"2MATCH","inline":true,"padRight":true},{"text":"yield better or equal correlations with human raters than S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"(Table ","element":"span"},{"href":"#id-38","text":"7","element":"a"},{"text":"). When considering the RMS error","element":"span"},{"style":{"height":20.8},"width":772.38,"height":52,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/9-6.png","element":"img","alt":" $\\sqrt{n^{-1}\\sum_{i=1}^{n}(H(i) - \\mathrm{metric}(i))^{2}}$, the dif-","inline":true,"padRight":true},{"text":"ference is even more pronounced. 
This deviation in the absolute scores is also reflected by the score density distributions plotted in Figure ","element":"span"},{"href":"#id-39","text":"9","element":"a"},{"text":": S","element":"span"},{"text":"EM","element":"span"},{"text":"-","element":"span"}],[{"id":"id-41","style":{"width":"100%"},"width":873,"height":273,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/9-7.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"(g2 / good-02 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"(c0 / contrast-01 :ARG1 (i3 / idea ","element":"span"},{"style":{"fontWeight":"bold"},"text":":ARG2 (i4 / idea","element":"span"}],[{"style":{"width":"84%"},"width":735,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/9-8.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":":polarity -) ","element":"span"},{"style":{"fontWeight":"bold"},"text":":ARG1-of (g3 / good-02 :polarity -)))","element":"span"}],[{"style":{"width":"98%"},"width":859,"height":108,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/9-9.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"-----------------------------------------------------","element":"span"}],[{"text":"Figure 10: An example from STS, where S","element":"figcaption","subtype":"caption"},{"style":{"height":13.39},"width":138.31,"height":33.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/9-10.png","element":"img","alt":"2MATCH","inline":true,"padRight":true},{"text":"yields a score that better reflects the human judgement, due to detecting a similarity between the abstract anaphora ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"it ","element":"figcaption","subtype":"caption"},{"text":"and ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"this 
","element":"figcaption","subtype":"caption"},{"text":".","element":"figcaption","subtype":"caption"}],[{"text":"BLEU ","element":"span"},{"text":"underrates a good proportion of graph pairs whose input sentences were rated as highly semantically similar or related by humans. ","element":"span"},{"text":"This may well relate to the biases of different node types (cf. §","element":"span"},{"href":"#id-40","text":"3","element":"a"},{"text":"). Overall S","element":"span"},{"style":{"height":14.73},"width":150.62,"height":36.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/9-11.png","element":"img","alt":"2MATCH","inline":true,"padRight":true},{"text":"appears to provide a better fit with the score-distribution of the human rater when measuring ","element":"span"},{"style":{"fontWeight":"bold"},"text":"semantic similarity ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontWeight":"bold"},"text":"relatedness","element":"span"},{"text":", the latter being notably closer to the human reference in some regions than the otherwise similar S","element":"span"},{"text":"MATCH","element":"span"},{"text":". A concrete example from the STS data is given in Figure ","element":"span"},{"href":"#id-41","text":"10","element":"a"},{"text":". 
Here, S","element":"span"},{"style":{"height":15.13},"width":282.39,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/9-12.png","element":"img","alt":"2MATCH detects","inline":true,"padRight":true},{"text":"the similarity between the abstract anaphors ","element":"span"},{"style":{"fontStyle":"italic"},"text":"it ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"this ","element":"span"},{"text":"and assigns a score that better reflects the human score compared to S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"and S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU","element":"span"},{"text":", the latter being far too low. ","element":"span"},{"text":"However, in total, we conclude that S","element":"span"},{"style":{"height":14.73},"width":150.62,"height":36.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/9-13.png","element":"img","alt":"2MATCH","inline":true},{"text":"’s improvements seem rather small and no metric is perfectly aligned with human scores, possibly because gradedness of semantic similarity that arises in combination with constructional variation is not yet captured – more research is needed to extend S","element":"span"},{"style":{"height":14.73},"width":150.62,"height":36.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/9-14.png","element":"img","alt":"2MATCH","inline":true},{"text":"’s scope to such cases.","element":"span"}]]},{"heading":"5 Metrics’ effects on parser evaluation","paragraphs":[[{"text":"We have seen that different metrics can assign different scores to the same pair of graphs. We now want to assess to what extent this affects rankings: Does one metric rank a graph higher or lower than the other? 
And can this affect the ranking of parsers on benchmark datasets?","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Quantitative study: ","element":"span"},{"style":{"fontWeight":"bold"},"text":"graph rankings ","element":"span"},{"text":"In this experiment, we assess whether our metrics rank graphs differently. ","element":"span"},{"text":"We use LDC2017T10 (dev)","element":"span"}],[{"id":"id-42","style":{"width":"96%"},"width":843,"height":358,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/10-0.png","element":"img"}],[{"text":"Table 8: Cross-metric comparison on individual graph rankings. % of cases where metrics differ in their preference for one parse over the other. metric","element":"figcaption","subtype":"caption"},{"style":{"height":17.77},"width":27,"height":44.44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/10-1.png","element":"img","alt":"YX","inline":true},{"text":": short for ","element":"figcaption","subtype":"caption"},{"text":"metric(","element":"figcaption","subtype":"caption"},{"style":{"height":14.4},"width":146.27,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/10-2.png","element":"img","alt":"X, Y ). 
†","inline":true,"padRight":true},{"text":"indicates significance in score differences assigned to parse pairs at p","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":"<","element":"figcaption","subtype":"caption"},{"text":"0.005.","element":"figcaption","subtype":"caption"}],[{"text":"parses by CAMR ","element":"span"},{"style":{"height":17.6},"width":530.11,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/10-3.png","element":"img","alt":" [c1...cn], JAMR [j1...jn] and","inline":true,"padRight":true},{"text":"gold graphs ","element":"span"},{"style":{"height":17.6},"width":144.75,"height":44,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/10-4.png","element":"img","alt":" [y1...yn]","inline":true},{"text":". Given metrics ","element":"span"},{"style":{"fontStyle":"italic"},"text":"F ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"G ","element":"span"},{"text":"we obtain results ","element":"span"},{"style":{"height":19.53},"width":623.12,"height":48.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/10-5.png","element":"img","alt":" FC := [F(c1, y1)...F(cn, yn)] and","inline":true,"padRight":true},{"text":"analogously ","element":"span"},{"style":{"height":17.93},"width":290.95,"height":44.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/10-6.png","element":"img","alt":" FJ, GC and GJ","inline":true},{"text":". 
We calculate two statistics: (i) the ratio of cases ","element":"span"},{"style":{"fontStyle":"italic"},"text":"i ","element":"span"},{"text":"where the metrics differ in their preference for one parse over the other ","element":"span"},{"style":{"height":20.02},"width":700.66,"height":50.06,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/10-7.png","element":"img","alt":" (FJi − FCi ) · (GJi − GCi ) < 0, and, to","inline":true,"padRight":true},{"text":"assess significance, (ii) a t-test for paired samples on the differences assigned by the metrics between the parsers: ","element":"span"},{"style":{"height":17.53},"width":434.74,"height":43.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/10-8.png","element":"img","alt":" FJ − FC and GJ − GC.","inline":true}],[{"text":"Table ","element":"span"},{"href":"#id-42","text":"8 ","element":"a"},{"text":"shows that S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"and S","element":"span"},{"style":{"height":15.13},"width":238.22,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/10-9.png","element":"img","alt":"2MATCH both","inline":true,"padRight":true},{"text":"differ (significantly) from S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"in 15% – 20% of cases. S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"and S","element":"span"},{"style":{"height":15.13},"width":321.04,"height":37.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/10-10.png","element":"img","alt":"2MATCH differ on","inline":true,"padRight":true},{"text":"individual rankings in appr. 4% of cases. 
","element":"span"},{"text":"Furthermore, we note a considerable amount of cases (8.1%) where S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"disagrees with itself in the preference for one parse over the other.","element":"span"},{"text":"15","element":"span"}],[{"text":"The differing preferences of S","element":"span"},{"style":{"height":16.33},"width":299.28,"height":40.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/10-11.png","element":"img","alt":"(2)MATCH for ei-","inline":true,"padRight":true},{"text":"ther candidate parse can be the outcome of small divergences due to the alignment search or because S","element":"span"},{"style":{"height":14.73},"width":150.62,"height":36.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/10-12.png","element":"img","alt":"2MATCH","inline":true,"padRight":true},{"text":"accounts for the lexical similarity of concepts, perhaps supported by a new variable alignment. Figure ","element":"span"},{"href":"#id-43","text":"11 ","element":"a"},{"text":"shows two examples where S","element":"span"},{"style":{"height":14.73},"width":150.62,"height":36.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/10-13.png","element":"img","alt":"2MATCH","inline":true,"padRight":true},{"text":"prefers a different candidate parse compared to S","element":"span"},{"text":"MATCH","element":"span"},{"text":". 
In the first example (Figure ","element":"span"},{"href":"#id-43","text":"11a","element":"a"},{"text":"), S","element":"span"},{"style":{"height":14.73},"width":150.62,"height":36.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/10-14.png","element":"img","alt":"2MATCH","inline":true,"padRight":true},{"text":"prefers the parse produced by JAMR and changes the alignment ","element":"span"},{"style":{"fontStyle":"italic"},"text":"legally-NULL ","element":"span"},{"text":"(S","element":"span"},{"text":"MATCH","element":"span"},{"text":") to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"legally-law ","element":"span"},{"text":"(S","element":"span"},{"style":{"height":14.73},"width":150.62,"height":36.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/10-15.png","element":"img","alt":"2MATCH","inline":true},{"text":"). In the second example (","element":"span"},{"href":"#id-43","text":"11b","element":"a"},{"text":"), S","element":"span"},{"style":{"height":14.73},"width":150.62,"height":36.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/10-16.png","element":"img","alt":"2MATCH","inline":true,"padRight":true},{"text":"prefers the parse produced by CAMR, because it detects the similarity between ","element":"span"},{"style":{"fontStyle":"italic"},"text":"military ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"navy ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"poor ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"poverty","element":"span"},{"text":". 
","element":"span"},{"text":"Therefore, S","element":"span"},{"style":{"height":14.73},"width":150.62,"height":36.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/10-17.png","element":"img","alt":"2MATCH","inline":true,"padRight":true},{"text":"can assess that the CAMR parse and the gold graph substantially agree on the root concept of the graph, which is not the case in the JAMR parse.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Quantitative study: parser rankings ","element":"span"},{"text":"Having seen that our metrics disagree on the ranking of","element":"span"}],[{"id":"id-43","style":{"width":"99%"},"width":869,"height":354,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/10-18.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"------------Alignments (parse, gold)----------------","element":"span"}],[{"style":{"width":"77%"},"width":678,"height":67,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/10-19.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"---------------------Scores-------------------------S","element":"span"},{"style":{"fontWeight":"bold"},"text":"MATCH","element":"span"},{"style":{"fontWeight":"bold"},"text":": ","element":"span"},{"style":{"fontWeight":"bold"},"text":"0.200 ","element":"span"},{"style":{"fontWeight":"bold"},"text":">> ","element":"span"},{"style":{"fontWeight":"bold"},"text":"0.167 S","element":"span"},{"style":{"height":10.41},"width":625.54,"height":26.03,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/10-20.png","element":"img","alt":"2 MATCH: 0.200 << 0.252","inline":true,"padRight":true},{"style":{"fontWeight":"bold"},"text":"----------------------------------------------------","element":"span"}],[{"style":{"width":"63%"},"width":552,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/10-21.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"------------Gold 
Graph & Input Sentence-------------","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"(n3 / navy ","element":"span"},{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"\"The Navy of the Russian ","element":"span"},{"style":{"fontWeight":"bold"},"text":":mod (c / country ","element":"span"},{"style":{"fontStyle":"italic","fontWeight":"bold"},"text":"Federation is in poor shape.\" ","element":"span"},{"style":{"fontWeight":"bold"},"text":":name (n2 / name :op1 \"Russian\" :op2 \"Federation\")) :mod (s / shape :mod ","element":"span"},{"style":{"fontWeight":"bold"},"text":"(p / poverty)))","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"----------CAMR Parse------------JAMR Parse----------","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"(x2 / military ","element":"span"},{"style":{"fontWeight":"bold"},"text":"(s / shape-01 :name (n / name ","element":"span"},{"style":{"fontWeight":"bold"},"text":":ARG1 (c / country :op1 \"Navy\") ","element":"span"},{"style":{"fontWeight":"bold"},"text":":name (n / name :poss (x5 / country ","element":"span"},{"style":{"fontWeight":"bold"},"text":":op1 \"Russian\" :name (n1 / name ","element":"span"},{"style":{"fontWeight":"bold"},"text":":op2 \"Federation\") :op1 \"Russia\" ","element":"span"},{"style":{"fontWeight":"bold"},"text":":poss (o / organization :op2 \"Federation\")) ","element":"span"},{"style":{"fontWeight":"bold"},"text":":name (n2 / name :prep-in (x10 / shape-01 ","element":"span"},{"style":{"fontWeight":"bold"},"text":":op1 \"Navy\" :op2 \"of\" :mod ","element":"span"},{"style":{"fontWeight":"bold"},"text":"(x9 / poor))) ","element":"span"},{"style":{"fontWeight":"bold"},"text":":op3 \"the\"))) :manner (p / poor))","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"------------Alignments (parse, 
gold)----------------","element":"span"}],[{"style":{"width":"84%"},"width":735,"height":205,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/10-22.png","element":"img"}],[{"style":{"fontWeight":"bold"},"text":"---------------------Scores-------------------------S","element":"span"},{"style":{"fontWeight":"bold"},"text":"MATCH","element":"span"},{"style":{"fontWeight":"bold"},"text":": ","element":"span"},{"style":{"fontWeight":"bold"},"text":"0.357 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"<< ","element":"span"},{"style":{"fontWeight":"bold"},"text":"0.387 S","element":"span"},{"style":{"height":10.41},"width":625.54,"height":26.03,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/10-23.png","element":"img","alt":"2 MATCH: 0.488 >> 0.460","inline":true,"padRight":true},{"style":{"fontWeight":"bold"},"text":"----------------------------------------------------","element":"span"}],[{"style":{"width":"65%"},"width":568,"height":42,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/10-24.png","element":"img"}],[{"text":"Figure 11: Two examples, where S","element":"figcaption","subtype":"caption"},{"style":{"height":16.99},"width":300.46,"height":42.46,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/10-25.png","element":"img","alt":"2MATCH disagrees","inline":true,"padRight":true},{"text":"with S","element":"figcaption","subtype":"caption"},{"text":"MATCH ","element":"figcaption","subtype":"caption"},{"text":"in its preference of a candidate parse (for clarity, wiki-links are omitted in this display).","element":"figcaption","subtype":"caption"}],[{"text":"individual graphs, we now quantify the effects on the ranking of parsers. 
We collect outputs of three state-of-art parsers on the test set of LDC2017T10: GPLA, a sequence-to-graph transducer (STOG) and a neural top-down parser (TOP-DOWN).","element":"span"}],[{"text":"Table ","element":"span"},{"href":"#id-44","text":"9 ","element":"a"},{"text":"shows that S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"and S","element":"span"},{"style":{"height":14.73},"width":150.62,"height":36.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/10-26.png","element":"img","alt":"2MATCH","inline":true,"padRight":true},{"text":"agree on the ranking of all three parsers, but both","element":"span"}],[{"id":"id-44","style":{"width":"96%"},"width":844,"height":200,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/11-0.png","element":"img"}],[{"text":"Table 9: ","element":"figcaption","subtype":"caption"},{"text":"Ranking parsers: ","element":"figcaption","subtype":"caption"},{"text":"STOG (","element":"figcaption","subtype":"caption"},{"href":"#id-45","referenceIndex":41,"text":"Zhang et al.","element":"a","subtype":"caption"},{"text":"); GPLA (","element":"figcaption","subtype":"caption"},{"href":"#id-17","referenceIndex":23,"text":"Lyu and Titov","element":"a","subtype":"caption"},{"text":"); ","element":"figcaption","subtype":"caption"},{"text":"TOP-DOWN (","element":"figcaption","subtype":"caption"},{"href":"#id-7","referenceIndex":7,"text":"Cai and ","element":"a","subtype":"caption"},{"href":"#id-7","referenceIndex":7,"text":"Lam","element":"a","subtype":"caption"},{"text":", ","element":"figcaption","subtype":"caption"},{"href":"#id-7","referenceIndex":7,"text":"2019","element":"a","subtype":"caption"},{"text":"). 
","element":"figcaption","subtype":"caption"},{"text":"The structure error is defined as","element":"figcaption","subtype":"caption"}],[{"href":"#id-7","referenceIndex":7,"style":{"height":20.8},"width":793.82,"height":51.99,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/11-1.png","element":"img","alt":"∑_{i=1}^{1371} |f(gold_i) − f(pred_i)|, where f either is","inline":true},{"text":"node degree or graph density. All four metrics differ significantly in their scores (paired t-test, p","element":"span"},{"style":{"fontStyle":"italic"},"text":"<","element":"span"},{"text":"0.05).","element":"span"}],[{"text":"disagree with S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"on the ranks of the 2","element":"span"},{"style":{"height":8.8},"width":38.56,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/11-2.png","element":"img","alt":"nd","inline":true,"padRight":true},{"text":"and 3","element":"span"},{"style":{"height":8.8},"width":34.26,"height":22,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/11-3.png","element":"img","alt":"rd ","inline":true,"padRight":true},{"text":"parser: unlike S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU","element":"span"},{"text":", the S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"variants rate GPLA higher than TOP-DOWN. A factor that may have contributed to the different rankings perhaps lies in S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU","element":"span"},{"text":"’s biases towards connected nodes: ","element":"span"},{"text":"Compared with TOPDOWN, GPLA delivers more complex parses, with more edges (avg. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"|","element":"span"},{"style":{"fontStyle":"italic"},"text":"E","element":"span"},{"style":{"fontStyle":"italic"},"text":"|","element":"span"},{"text":": 32.8 vs. 32.1) and higher graph density (avg. density: ","element":"span"},{"text":"0.065 vs. 0.059). This is a nice property, since it indicates that the graphs of GPLA better resemble the rich gold graph structures (avg. density: 0.063, avg. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"|","element":"span"},{"style":{"fontStyle":"italic"},"text":"E","element":"span"},{"style":{"fontStyle":"italic"},"text":"|","element":"span"},{"text":": 34.2). When inspecting this more closely, and looking at single (parse, gold) pairs, we observe further evidence for this: the structural error, in degree and density, is lower for GPLA than for TOP-DOWN (Table ","element":"span"},{"href":"#id-44","text":"9","element":"a"},{"text":", right columns), with an error reduction of -27% (degree, 0.08 vs. 0.11) and -14% (density, 0.0067 vs. 0.0078).","element":"span"}],[{"text":"In sum, by building graphs of higher complexity, GPLA takes a greater risk when attaching wrong concepts to connected nodes where errors are penalized more strongly by S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"than S","element":"span"},{"text":"MATCH","element":"span"},{"text":", according to the biases we have studied in §","element":"span"},{"href":"#id-40","text":"3 ","element":"a"},{"text":"(Table ","element":"span"},{"href":"#id-24","text":"4","element":"a"},{"text":"). 
In that sense, STOG also takes more risks, but it may get more of such concepts right and so the bias transitions from penalty to reward, potentially explaining the large performance ","element":"span"},{"style":{"height":12.8},"width":37,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/11-4.png","element":"img","alt":"∆","inline":true,"padRight":true},{"text":"(+6) of STOG to the other parsers, as measured by S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU","element":"span"},{"text":", in contrast to S(2)M","element":"span"},{"text":"ATCH ","element":"span"},{"text":"(","element":"span"},{"style":{"height":16},"width":132.8,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/11-5.png","element":"img","alt":"∆: +2).","inline":true}]]},{"heading":"6 Summary of our metric analyses","paragraphs":[[{"text":"Table ","element":"span"},{"href":"#id-46","text":"10 ","element":"a"},{"text":"summarizes our analyses’ integral results. ","element":"span"},{"text":"Principle I is fulfilled by all metrics as they exhibit ","element":"span"},{"style":{"fontStyle":"italic"},"text":"continuity, non-negativity and an upper bound","element":"span"},{"text":". Principle II, however, is not satisfied by S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"since it can mistake two graphs of different meaning as equivalent. This is because it ablates a variable-alignment and therefore cannot capture facets of coreference. Yet, a positive outcome of this is that it is ","element":"span"},{"style":{"fontStyle":"italic"},"text":"fast to compute","element":"span"},{"text":". 
This","element":"span"}],[{"id":"id-46","style":{"width":"99%"},"width":866,"height":360,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/11-6.png","element":"img"}],[{"text":"Table 10: Evaluation of three AMR metrics using our seven principles. ","element":"figcaption","subtype":"caption"},{"style":{"height":13.99},"width":44.09,"height":34.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/11-7.png","element":"img","alt":" \u0013ϵ","inline":true},{"text":": fulfilled with a very small ","element":"figcaption","subtype":"caption"},{"style":{"height":8},"width":114.68,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/11-8.png","element":"img","alt":" ϵ-error.","inline":true}],[{"text":"could make it first choice in some recent AMR parsing approaches that use reinforcement learning (","element":"span"},{"href":"#id-47","referenceIndex":26,"text":"Naseem et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-47","referenceIndex":26,"text":"2019","element":"a"},{"text":"), where rapid feedback is needed. It also marks a point by fully satisfying Principle IV, yielding fully deterministic results. S","element":"span"},{"text":"MATCH","element":"span"},{"text":", by contrast, either needs to resort to a costly ILP solution or (in practice) uses hill-climbing with multiple restarts to reduce divergence to a negligible amount.","element":"span"}],[{"text":"A central insight brought out by our analysis is that S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"exhibits ","element":"span"},{"style":{"fontStyle":"italic"},"text":"biases ","element":"span"},{"text":"that are hard to control. This is caused by two (interacting) factors: (i) The extraction of k-grams is applied on the graph top to bottom and visits some nodes more frequently than others. 
(ii) It raises some (but not all) leaf nodes to connected nodes, and these nodes will be overly frequently contained in extracted k-grams. We have shown that these two factors in combination lead to large biases that researchers should be aware of when using S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"(§","element":"span"},{"href":"#id-40","text":"3","element":"a"},{"text":"). Its ancestor B","element":"span"},{"text":"LEU ","element":"span"},{"text":"does not suffer from such biases since it extracts k-grams linearly from a sentence.","element":"span"}],[{"text":"Given that S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"is built on B","element":"span"},{"text":"LEU","element":"span"},{"text":", it is inherently ","element":"span"},{"style":{"fontStyle":"italic"},"text":"asymmetric","element":"span"},{"text":". However, we have shown that the asymmetry (Principle III) measured for B","element":"span"},{"text":"LEU ","element":"span"},{"text":"in MT is amplified by S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"in AMR, mainly due to the biases it incurs (Principle V). While asymmetry can be tolerated in parser evaluation if outputs are compared against gold graphs in a standardized manner, it is difficult to apply an asymmetric metric to measure IAA or to compare parses for detecting paraphrases, or in tri-parsing, where no reference is available. If the asymmetry is amplified by a bias, it becomes harder to judge the scores. 
","element":"span"},{"text":"Finally, considering that S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"does not match AMR graphs on the graph-level but matches extracted bags-of-k-grams, it turns out that it cannot be categorized as a graph matching algorithm as defined in Principle VI.","element":"span"}],[{"text":"Principle VI is fulfilled by S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"without any transformation on AMR graphs. It searches for an optimal variable alignment and counts matching triples. As a corollary, it fulfills principles I, II, III and V. The fact that S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"fulfills all but one principle backs up many prior works that use it as sole criterion for IAA and parse evaluation.","element":"span"}],[{"text":"Our principles also helped us detect a weakness of all present AMR metrics: they operate on a discrete level and cannot assess graded meaning differences. As a first step, we propose S","element":"span"},{"style":{"height":15.13},"width":163.89,"height":37.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/12-0.png","element":"img","alt":"2MATCH:","inline":true,"padRight":true},{"text":"it preserves beneficial properties of S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"but is benevolent to slight lexical meaning deviations. Besides parser evaluation, this property makes the metric also more suitable for other tasks, e.g., it can be used as a kernel in an SVM that classifies AMRs to determine whether two sentences are equivalent in meaning. 
In such a case, S","element":"span"},{"style":{"height":14.73},"width":150.62,"height":36.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/12-1.png","element":"img","alt":"2MATCH","inline":true,"padRight":true},{"text":"is bound to detect meaning-similarities that cannot be captured by S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"or S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU","element":"span"},{"text":", e.g., due to paraphrases being projected into the parses.","element":"span"}]]},{"heading":"7 Related work","paragraphs":[[{"text":"Developing similarity metrics for meaning representations (MRs) is important, as it, i.a., affects semantic parser evaluation and computation of IAA statistics for sembanking. MRs are designed to represent the meaning of text in a well-defined, interpretable form that is able to identify meaning differences and support inference. ","element":"span"},{"href":"#id-48","referenceIndex":4,"text":"Bos ","element":"a"},{"text":"(","element":"span"},{"href":"#id-48","referenceIndex":4,"text":"2016","element":"a"},{"text":", ","element":"span"},{"href":"#id-49","referenceIndex":5,"text":"2019","element":"a"},{"text":") has shown how AMR can be translated to FOL, a well-established MR formalism. Discourse Representation Theory (DRT, ","element":"span"},{"href":"#id-50","referenceIndex":20,"text":"Kamp ","element":"a"},{"text":"(","element":"span"},{"href":"#id-50","referenceIndex":20,"text":"1981","element":"a"},{"text":"); ","element":"span"},{"href":"#id-51","referenceIndex":21,"text":"Kamp and Reyle ","element":"a"},{"text":"(","element":"span"},{"href":"#id-51","referenceIndex":21,"text":"1993","element":"a"},{"text":")) is based on and extends FOL to discourse representation. 
A recent shared task on DRS parsing used the C","element":"span"},{"text":"OUNTER ","element":"span"},{"text":"metric (","element":"span"},{"href":"#id-52","referenceIndex":1,"text":"Abzianidze et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-52","referenceIndex":1,"text":"2019","element":"a"},{"text":"; ","element":"span"},{"href":"#id-53","referenceIndex":14,"text":"Evang","element":"a"},{"text":", ","element":"span"},{"href":"#id-53","referenceIndex":14,"text":"2019","element":"a"},{"text":"), an adaption of S","element":"span"},{"text":"MATCH","element":"span"},{"text":", underlining S","element":"span"},{"text":"MATCH","element":"span"},{"text":"’s general applicability. ","element":"span"},{"text":"Its extension S","element":"span"},{"style":{"height":18.73},"width":246.76,"height":46.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/12-2.png","element":"img","alt":"2MATCH may","inline":true,"padRight":true},{"text":"also prove beneficial for DRS.","element":"span"}],[{"text":"Other research into AMR metrics aims at making the comparison fairer by normalizing graphs (","element":"span"},{"href":"#id-54","referenceIndex":16,"text":"Goodman","element":"a"},{"text":", ","element":"span"},{"href":"#id-54","referenceIndex":16,"text":"2019","element":"a"},{"text":"). ","element":"span"},{"href":"#id-55","referenceIndex":2,"text":"Anchiêta et al. ","element":"a"},{"text":"(","element":"span"},{"href":"#id-55","referenceIndex":2,"text":"2019","element":"a"},{"text":") argue that one should not, e.g., insert an extra ","element":"span"},{"style":{"fontStyle":"italic"},"text":"is-root ","element":"span"},{"text":"node when comparing AMR graphs (as done in S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"and S","element":"span"},{"text":"MATCH","element":"span"},{"text":"). ","element":"span"},{"href":"#id-56","referenceIndex":12,"text":"Damonte et al. 
","element":"a"},{"text":"(","element":"span"},{"href":"#id-56","referenceIndex":12,"text":"2017","element":"a"},{"text":") extend S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"to analyze individual AMR facets (co-reference, WSD, etc.). ","element":"span"},{"href":"#id-7","referenceIndex":7,"text":"Cai and Lam ","element":"a"},{"text":"(","element":"span"},{"href":"#id-7","referenceIndex":7,"text":"2019","element":"a"},{"text":") adapt S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"to analyze their parser’s performance in predicting triples that are in close proximity to the root. Our metric S","element":"span"},{"style":{"height":14.73},"width":150.62,"height":36.83,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/12-3.png","element":"img","alt":"2MATCH","inline":true,"padRight":true},{"text":"allows for straightforward integration of these approaches.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Computational AMR tasks ","element":"span"},{"text":"Since the introduction of AMR, many AMR-related tasks have emerged. 
Most prominent is AMR parsing (","element":"span"},{"href":"#id-57","referenceIndex":39,"text":"Wang et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-57","referenceIndex":39,"text":"2015","element":"a"},{"text":", ","element":"span"},{"href":"#id-16","referenceIndex":38,"text":"2016","element":"a"},{"text":"; ","element":"span"},{"href":"#id-56","referenceIndex":12,"text":"Damonte et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-56","referenceIndex":12,"text":"2017","element":"a"},{"text":"; ","element":"span"},{"href":"#id-58","referenceIndex":22,"text":"Konstas et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-58","referenceIndex":22,"text":"2017","element":"a"},{"text":"; ","element":"span"},{"href":"#id-17","referenceIndex":23,"text":"Lyu and Titov","element":"a"},{"text":", ","element":"span"},{"href":"#id-17","referenceIndex":23,"text":"2018","element":"a"},{"text":"; ","element":"span"},{"href":"#id-45","referenceIndex":41,"text":"Zhang et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-45","referenceIndex":41,"text":"2019","element":"a"},{"text":"). The inverse task generates text from AMR graphs (","element":"span"},{"href":"#id-59","referenceIndex":35,"text":"Song et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-59","referenceIndex":35,"text":"2017","element":"a"},{"text":", ","element":"span"},{"href":"#id-60","referenceIndex":37,"text":"2018","element":"a"},{"text":"; ","element":"span"},{"href":"#id-61","referenceIndex":11,"text":"Damonte and Cohen","element":"a"},{"text":", ","element":"span"},{"href":"#id-61","referenceIndex":11,"text":"2019","element":"a"},{"text":"). 
","element":"span"},{"href":"#id-62","referenceIndex":27,"text":"Opitz and Frank ","element":"a"},{"text":"(","element":"span"},{"href":"#id-62","referenceIndex":27,"text":"2019","element":"a"},{"text":") rate the quality of automatic AMR parses without costly gold data.","element":"span"}]]},{"heading":"8 Conclusion","paragraphs":[[{"text":"We motivated seven principles for metrics measuring the similarity of graph-based (Abstract) Meaning Representations, from mathematical, linguistic and engineering perspectives. A metric that fulfills all principles is applicable to a wide spectrum of cases, ranging from parser evaluation to sound IAA calculation. ","element":"span"},{"text":"Hence ","element":"span"},{"style":{"fontWeight":"bold"},"text":"(i) our principles can inform (A)MR researchers who desire to compare and select among metrics","element":"span"},{"text":", and ","element":"span"},{"style":{"fontWeight":"bold"},"text":"(ii) they ease and guide the development of new metrics","element":"span"},{"text":".","element":"span"}],[{"text":"We provided examples for both scenarios. We showcased (i) by utilizing our principles as guidelines for an in-depth analysis of two AMR metrics: S","element":"span"},{"text":"MATCH ","element":"span"},{"text":"and the recent S","element":"span"},{"text":"EM","element":"span"},{"text":"B","element":"span"},{"text":"LEU ","element":"span"},{"text":"metrics, two quite distinct approaches. Our analysis uncovered that the latter does not satisfy some principles, which might reduce its safety and applicability. In line of (ii), we target the fulfilment of all seven principles and propose S","element":"span"},{"style":{"height":17.53},"width":282.74,"height":43.84,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2001.10929/images/12-4.png","element":"img","alt":"2MATCH, a met-","inline":true,"padRight":true},{"text":"ric that accounts for graded similarity of concepts as atomic graph components. 
In future work, we aim for a metric that accounts for graded compositional similarity of subgraphs.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Acknowledgements.","element":"span"}],[{"text":"We are grateful to the anonymous reviewers and the action editors for their valuable time and comments. This work has been partially funded by DFG within the project ","element":"span"},{"style":{"fontStyle":"italic"},"text":"ExpLAIN. Between the Lines ","element":"span"},{"style":{"fontStyle":"italic"},"text":"– ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Knowledge-based Analysis of Argumentation in a Formal Argumentation Inference System","element":"span"},{"text":", FR 1707/-4-1, as part of the RATIO Priority Program and by the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Leibniz ScienceCampus Empirical Linguistics & Computational Language Modeling","element":"span"},{"text":", supported by Leibniz Association grant no. SAS2015-IDS-LWC and by the Ministry of Science, Research, and Art of Baden-Württemberg.","element":"span"}]]},{"heading":"References","paragraphs":[[{"id":"id-52","text":"Lasha ","element":"span"},{"text":"Abzianidze, ","element":"span"},{"text":"Rik ","element":"span"},{"text":"van ","element":"span"},{"text":"Noord, ","element":"span"},{"text":"Hessel Haagsma, ","element":"span"},{"text":"and ","element":"span"},{"text":"Johan ","element":"span"},{"text":"Bos. ","element":"span"},{"text":"2019. ","element":"span"},{"href":"https://doi.org/10.18653/v1/W19-1201","text":"The ","element":"a"},{"href":"https://doi.org/10.18653/v1/W19-1201","text":"First Shared Task on Discourse Representation ","element":"a"},{"href":"https://doi.org/10.18653/v1/W19-1201","text":"Structure Parsing","element":"a"},{"text":". 
In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the IWCS Shared Task on Semantic Parsing","element":"span"},{"text":", Gothenburg, Sweden.","element":"span"}],[{"id":"id-55","text":"Rafael Torres Anchiêta, Marco Antonio Sobrevilla ","element":"span"},{"text":"Cabezudo, and Thiago Alexandre Salgueiro Pardo. 2019. ","element":"span"},{"text":"SEMA: an Extended Semantic Evaluation for AMR. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"(To appear) Proceedings of the 20th Computational Linguistics and Intelligent Text Processing","element":"span"},{"text":". Springer International Publishing.","element":"span"}],[{"id":"id-3","text":"Laura ","element":"span"},{"text":"Banarescu, ","element":"span"},{"text":"Claire ","element":"span"},{"text":"Bonial, ","element":"span"},{"text":"Shu ","element":"span"},{"text":"Cai, Madalina Georgescu, Kira Griffitt, Ulf Hermjakob, Kevin Knight, Philipp Koehn, Martha Palmer, and Nathan Schneider. 2013. Abstract meaning representation for sembanking. ","element":"span"},{"text":"In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 7th Linguistic Annotation Workshop and Interoperability with Discourse","element":"span"},{"text":", pages 178–186.","element":"span"}],[{"id":"id-48","text":"Johan Bos. 2016. ","element":"span"},{"href":"https://doi.org/10.1162/COLI_a_00257","text":"Expressive Power of Abstract ","element":"a"},{"href":"https://doi.org/10.1162/COLI_a_00257","text":"Meaning Representations","element":"a"},{"text":". ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Computational Linguistics","element":"span"},{"text":", 42(3):527–535.","element":"span"}],[{"id":"id-49","text":"Johan Bos. 2019. Separating Argument Structure ","element":"span"},{"text":"from Logical Structure in AMR. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1908.01355","element":"span"},{"text":".","element":"span"}],[{"id":"id-35","text":"Alexander Budanitsky and Graeme Hirst. 2006. ","element":"span"},{"href":"https://doi.org/10.1162/coli.2006.32.1.13","text":"Evaluating WordNet-based Measures of Lexi- ","element":"a"},{"href":"https://doi.org/10.1162/coli.2006.32.1.13","text":"cal Semantic Relatedness","element":"a"},{"text":". ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Computational Linguistics","element":"span"},{"text":", 32(1):13–47.","element":"span"}],[{"id":"id-7","text":"Deng Cai and Wai Lam. 2019. ","element":"span"},{"href":"https://doi.org/10.18653/v1/D19-1393","text":"Core Seman- ","element":"a"},{"text":"tic First: ","element":"span"},{"href":"https://doi.org/10.18653/v1/D19-1393","text":"A Top-down Approach for AMR ","element":"a"},{"href":"https://doi.org/10.18653/v1/D19-1393","text":"Parsing","element":"a"},{"text":". ","element":"span"},{"text":"In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)","element":"span"},{"text":", pages 3797–3807, Hong Kong, China.","element":"span"}],[{"id":"id-0","text":"Shu Cai and Kevin Knight. 2013. ","element":"span"},{"href":"https://www.aclweb.org/anthology/P13-2131","text":"Smatch: an ","element":"a"},{"href":"https://www.aclweb.org/anthology/P13-2131","text":"Evaluation Metric for Semantic Feature Struc- ","element":"a"},{"href":"https://www.aclweb.org/anthology/P13-2131","text":"tures","element":"a"},{"text":". 
In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)","element":"span"},{"text":", pages 748–752, Sofia, Bulgaria.","element":"span"}],[{"id":"id-36","text":"Daniel Cer, Mona Diab, Eneko Agirre, Iñigo ","element":"span"},{"text":"Lopez-Gazpio, ","element":"span"},{"text":"and ","element":"span"},{"text":"Lucia ","element":"span"},{"text":"Specia. ","element":"span"},{"text":"2017. ","element":"span"},{"href":"https://doi.org/10.18653/v1/S17-2001","text":"SemEval-2017 Task 1: ","element":"a"},{"text":"Semantic Textual ","element":"span"},{"href":"https://doi.org/10.18653/v1/S17-2001","text":"Similarity Multilingual and Crosslingual Fo- ","element":"a"},{"href":"https://doi.org/10.18653/v1/S17-2001","text":"cused Evaluation","element":"a"},{"text":". In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 11th International Workshop on Semantic Evaluation (SemEval-2017)","element":"span"},{"text":", pages 1–14, Vancouver, Canada.","element":"span"}],[{"id":"id-12","text":"Boxing Chen and Colin Cherry. 2014. ","element":"span"},{"href":"http://www.aclweb.org/anthology/W/W14/W14-3346","text":"A System- ","element":"a"},{"href":"http://www.aclweb.org/anthology/W/W14/W14-3346","text":"atic Comparison of Smoothing Techniques for ","element":"a"},{"href":"http://www.aclweb.org/anthology/W/W14/W14-3346","text":"Sentence-Level BLEU","element":"a"},{"text":". In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the Ninth Workshop on Statistical Machine Translation","element":"span"},{"text":", pages 362–367, Baltimore, Maryland, USA.","element":"span"}],[{"id":"id-61","text":"Marco Damonte and Shay B. Cohen. 2019. 
","element":"span"},{"href":"https://doi.org/10.18653/v1/N19-1366","text":"Struc- ","element":"a"},{"href":"https://doi.org/10.18653/v1/N19-1366","text":"tural Neural Encoders for AMR-to-text Gener- ","element":"a"},{"href":"https://doi.org/10.18653/v1/N19-1366","text":"ation","element":"a"},{"text":". In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)","element":"span"},{"text":", pages 3649–3658, Minneapolis, Minnesota.","element":"span"}],[{"id":"id-56","text":"Marco Damonte, Shay B. Cohen, and Giorgio ","element":"span"},{"text":"Satta. 2017. ","element":"span"},{"href":"https://www.aclweb.org/anthology/E17-1051","text":"An Incremental Parser for Abstract ","element":"a"},{"href":"https://www.aclweb.org/anthology/E17-1051","text":"Meaning Representation","element":"a"},{"text":". In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 1, Long Papers","element":"span"},{"text":", pages 536–546, Valencia, Spain.","element":"span"}],[{"id":"id-6","text":"Philip Edmonds and Graeme Hirst. 2002. ","element":"span"},{"href":"https://doi.org/10.1162/089120102760173625","text":"Near- ","element":"a"},{"href":"https://doi.org/10.1162/089120102760173625","text":"synonymy and Lexical Choice","element":"a"},{"text":". ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Computational Linguistics","element":"span"},{"text":", 28(2):105–144.","element":"span"}],[{"id":"id-53","text":"Kilian Evang. 2019. ","element":"span"},{"href":"https://doi.org/10.18653/v1/W19-1202","text":"Transition-based DRS Pars- ","element":"a"},{"href":"https://doi.org/10.18653/v1/W19-1202","text":"ing Using Stack-LSTMs","element":"a"},{"text":". 
","element":"span"},{"text":"In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the IWCS Shared Task on Semantic Parsing","element":"span"},{"text":", Gothenburg, Sweden.","element":"span"}],[{"id":"id-18","text":"Jeffrey Flanigan, Sam Thomson, Jaime Carbonell, ","element":"span"},{"text":"Chris Dyer, and Noah A. Smith. 2014. ","element":"span"},{"href":"https://doi.org/10.3115/v1/P14-1134","text":"A Dis- ","element":"a"},{"href":"https://doi.org/10.3115/v1/P14-1134","text":"criminative Graph-Based Parser for the Abstract ","element":"a"},{"href":"https://doi.org/10.3115/v1/P14-1134","text":"Meaning Representation","element":"a"},{"text":". ","element":"span"},{"text":"In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","element":"span"},{"text":", pages 1426–1436, Baltimore, Maryland.","element":"span"}],[{"id":"id-54","text":"Michael Wayne Goodman. 2019. AMR normal- ","element":"span"},{"text":"ization for fairer evaluation. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 33rd Pacific Asia Conference on Language, Information, and Computation","element":"span"},{"text":", Hakodate.","element":"span"}],[{"text":"Matthias Huck, Fabienne Braune, and Alexan- ","element":"span"},{"text":"der Fraser. 2017. ","element":"span"},{"href":"https://doi.org/10.18653/v1/W17-4730","text":"LMU Munich’s Neural Ma- ","element":"a"},{"href":"https://doi.org/10.18653/v1/W17-4730","text":"chine Translation Systems for News Articles ","element":"a"},{"href":"https://doi.org/10.18653/v1/W17-4730","text":"and Health Information Texts","element":"a"},{"text":". 
In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the Second Conference on Machine Translation","element":"span"},{"text":", pages 315–322, Copenhagen, Denmark.","element":"span"}],[{"id":"id-5","text":"Diana Inkpen and Graeme Hirst. 2006. ","element":"span"},{"href":"https://doi.org/10.1162/coli.2006.32.2.223","text":"Building ","element":"a"},{"href":"https://doi.org/10.1162/coli.2006.32.2.223","text":"and Using a Lexical Knowledge Base of Near- ","element":"a"},{"href":"https://doi.org/10.1162/coli.2006.32.2.223","text":"Synonym Differences","element":"a"},{"text":". ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Computational Linguistics","element":"span"},{"text":", 32(2):223–262.","element":"span"}],[{"id":"id-9","text":"Paul Jaccard. 1912. The distribution of the flora in ","element":"span"},{"text":"the alpine zone. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"New phytologist","element":"span"},{"text":", 11(2):37–50.","element":"span"}],[{"id":"id-50","text":"Hans Kamp. 1981. A theory of truth and semantic ","element":"span"},{"text":"representation. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Formal semantics: The Essential Readings","element":"span"},{"text":", pages 189–222.","element":"span"}],[{"id":"id-51","text":"Hans Kamp and Uwe Reyle. 1993. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"From Discourse to Logic. Introduction to Modeltheoretic Semantics of Natural Language, Formal Logic and Discourse Representation Theory","element":"span"},{"text":". Kluwer, Dordrecht.","element":"span"}],[{"id":"id-58","text":"Ioannis Konstas, Srinivasan Iyer, Mark Yatskar, ","element":"span"},{"text":"Yejin Choi, and Luke Zettlemoyer. 2017. 
","element":"span"},{"href":"https://doi.org/10.18653/v1/P17-1014","text":"Neu- ","element":"a"},{"href":"https://doi.org/10.18653/v1/P17-1014","text":"ral AMR: Sequence-to-Sequence Models for ","element":"a"},{"href":"https://doi.org/10.18653/v1/P17-1014","text":"Parsing and Generation","element":"a"},{"text":". ","element":"span"},{"text":"In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics, ACL 2017, Vancouver, Canada, July 30 - August 4, Volume 1: Long Papers","element":"span"},{"text":", pages 146–157.","element":"span"}],[{"id":"id-17","text":"Chunchuan Lyu and Ivan Titov. 2018. ","element":"span"},{"href":"https://www.aclweb.org/anthology/P18-1037","text":"AMR Pars- ","element":"a"},{"href":"https://www.aclweb.org/anthology/P18-1037","text":"ing as Graph Prediction with Latent Alignment","element":"a"},{"text":". In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","element":"span"},{"text":", pages 397–407, Melbourne, Australia.","element":"span"}],[{"id":"id-19","text":"Qingsong Ma, Ondrej Bojar, and Yvette Graham. ","element":"span"},{"text":"2018. ","element":"span"},{"href":"https://www.aclweb.org/anthology/W18-6450/","text":"Results of the WMT18 Metrics Shared ","element":"a"},{"href":"https://www.aclweb.org/anthology/W18-6450/","text":"Task: Both characters and embeddings achieve ","element":"a"},{"href":"https://www.aclweb.org/anthology/W18-6450/","text":"good performance","element":"a"},{"text":". 
In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the Third Conference on Machine Translation: Shared Task Papers, WMT 2018, Belgium, Brussels, October 31 - November 1, 2018","element":"span"},{"text":", pages 671–688.","element":"span"}],[{"id":"id-37","text":"Marco Marelli, Luisa Bentivogli, Marco Ba- ","element":"span"},{"text":"roni, Raffaella Bernardi, Stefano Menini, and Roberto Zamparelli. 2014. ","element":"span"},{"href":"https://doi.org/10.3115/v1/S14-2001","text":"SemEval-2014 Task ","element":"a"},{"href":"https://doi.org/10.3115/v1/S14-2001","text":"1: Evaluation of Compositional Distributional","element":"a"}],[{"href":"https://doi.org/10.3115/v1/S14-2001","text":"Semantic Models on Full Sentences through Se- ","element":"a"},{"href":"https://doi.org/10.3115/v1/S14-2001","text":"mantic Relatedness and Textual Entailment","element":"a"},{"text":". In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 8th International Workshop on Semantic Evaluation (SemEval 2014)","element":"span"},{"text":", pages 1–8, Dublin, Ireland.","element":"span"}],[{"id":"id-47","text":"Tahira Naseem, Abhishek Shah, Hui Wan, Radu ","element":"span"},{"text":"Florian, Salim Roukos, and Miguel Ballesteros. 2019. ","element":"span"},{"href":"https://doi.org/10.18653/v1/P19-1451","text":"Rewarding Smatch: Transition-Based ","element":"a"},{"href":"https://doi.org/10.18653/v1/P19-1451","text":"AMR Parsing with Reinforcement Learning","element":"a"},{"text":". In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics","element":"span"},{"text":", pages 4586–4592, Florence, Italy.","element":"span"}],[{"id":"id-62","text":"Juri Opitz and Anette Frank. 2019. 
","element":"span"},{"href":"https://www.aclweb.org/anthology/S19-1024","text":"Auto- ","element":"a"},{"href":"https://www.aclweb.org/anthology/S19-1024","text":"matic Accuracy Prediction for AMR Parsing","element":"a"},{"text":". In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the Eighth Joint Conference on Lexical and Computational Semantics (*SEM 2019)","element":"span"},{"text":", pages 212–223, Minneapolis, Minnesota.","element":"span"}],[{"id":"id-11","text":"Panagiotis Papadimitriou, Ali Dasdan, and Hector ","element":"span"},{"text":"Garcia-Molina. 2010. Web graph similarity for anomaly detection. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Internet Services and Applications","element":"span"},{"text":", 1(1):19–30.","element":"span"}],[{"id":"id-2","text":"Kishore Papineni, Salim Roukos, Todd Ward, and ","element":"span"},{"text":"Wei-Jing Zhu. 2002. ","element":"span"},{"text":"Bleu: ","element":"span"},{"href":"https://doi.org/10.3115/1073083.1073135","text":"a Method for ","element":"a"},{"href":"https://doi.org/10.3115/1073083.1073135","text":"Automatic Evaluation of Machine Translation","element":"a"},{"text":". In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics","element":"span"},{"text":", pages 311–318, Philadelphia, Pennsylvania, USA.","element":"span"}],[{"id":"id-31","text":"Jeffrey Pennington, Richard Socher, and Christo- ","element":"span"},{"text":"pher Manning. 2014. ","element":"span"},{"href":"https://doi.org/10.3115/v1/D14-1162","text":"GloVe: Global Vectors ","element":"a"},{"href":"https://doi.org/10.3115/v1/D14-1162","text":"for Word Representation","element":"a"},{"text":". 
","element":"span"},{"text":"In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)","element":"span"},{"text":", pages 1532–1543, Doha, Qatar.","element":"span"}],[{"id":"id-10","text":"Raimundo Real and Juan M Vargas. 1996. The ","element":"span"},{"text":"probabilistic basis of Jaccard’s index of similarity. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Systematic biology","element":"span"},{"text":", 45(3):380–385.","element":"span"}],[{"id":"id-26","text":"Kaspar Riesen, Xiaoyi Jiang, and Horst Bunke. ","element":"span"},{"text":"2010. ","element":"span"},{"text":"Exact and inexact graph matching: Methodology and applications. ","element":"span"},{"text":"In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Managing and Mining Graph Data","element":"span"},{"text":", pages 217–247. Springer.","element":"span"}],[{"id":"id-8","text":"Adam Schenker, Horst Bunke, Mark Last, and ","element":"span"},{"text":"Abraham Kandel. 2005. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Graph-Theoretic Techniques for Web Content Mining","element":"span"},{"text":". World Scien-tific Publishing Co., Inc., USA.","element":"span"}],[{"id":"id-1","text":"Linfeng Song and Daniel Gildea. 2019. ","element":"span"},{"href":"https://doi.org/10.18653/v1/P19-1446","text":"SemBleu: ","element":"a"},{"href":"https://doi.org/10.18653/v1/P19-1446","text":"A Robust Metric for AMR Parsing Evaluation","element":"a"},{"text":". In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics","element":"span"},{"text":", pages 4547–4552, Florence, Italy.","element":"span"}],[{"id":"id-59","text":"Linfeng Song, Xiaochang Peng, Yue Zhang, ","element":"span"},{"text":"Zhiguo Wang, and Daniel Gildea. 2017. 
","element":"span"},{"href":"https://doi.org/10.18653/v1/P17-2002","text":"AMR- ","element":"a"},{"href":"https://doi.org/10.18653/v1/P17-2002","text":"to-text Generation with Synchronous Node Re- ","element":"a"},{"href":"https://doi.org/10.18653/v1/P17-2002","text":"placement Grammar","element":"a"},{"text":". ","element":"span"},{"text":"In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 2: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Short Papers)","element":"span"},{"text":", pages 7–13, Vancouver, Canada.","element":"span"}],[{"text":"Linfeng Song, Yue Zhang, Xiaochang Peng, ","element":"span"},{"text":"Zhiguo Wang, and Daniel Gildea. 2016. ","element":"span"},{"href":"https://doi.org/10.18653/v1/D16-1224","text":"AMR- ","element":"a"},{"href":"https://doi.org/10.18653/v1/D16-1224","text":"to-text generation as a Traveling Salesman ","element":"a"},{"href":"https://doi.org/10.18653/v1/D16-1224","text":"Problem","element":"a"},{"text":". ","element":"span"},{"text":"In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing","element":"span"},{"text":", pages 2084–2089, Austin, Texas.","element":"span"}],[{"id":"id-60","text":"Linfeng Song, Yue Zhang, Zhiguo Wang, and ","element":"span"},{"text":"Daniel Gildea. 2018. ","element":"span"},{"href":"https://doi.org/10.18653/v1/P18-1150","text":"A Graph-to-Sequence ","element":"a"},{"href":"https://doi.org/10.18653/v1/P18-1150","text":"Model for AMR-to-Text Generation","element":"a"},{"text":". 
","element":"span"},{"text":"In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","element":"span"},{"text":", pages 1616–1626, Melbourne, Australia.","element":"span"}],[{"id":"id-16","text":"Chuan Wang, Sameer Pradhan, Xiaoman Pan, ","element":"span"},{"text":"Heng Ji, and Nianwen Xue. 2016. ","element":"span"},{"href":"https://doi.org/10.18653/v1/S16-1181","text":"CAMR ","element":"a"},{"text":"at ","element":"span"},{"href":"https://doi.org/10.18653/v1/S16-1181","text":"SemEval-2016 ","element":"a"},{"text":"Task ","element":"span"},{"text":"8: ","element":"span"},{"text":"An ","element":"span"},{"text":"Extended ","element":"span"},{"href":"https://doi.org/10.18653/v1/S16-1181","text":"Transition-based AMR Parser","element":"a"},{"text":". In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 10th International Workshop on Semantic Evaluation (SemEval-2016)","element":"span"},{"text":", pages 1173–1178, San Diego, California.","element":"span"}],[{"id":"id-57","text":"Chuan Wang, Nianwen Xue, and Sameer Pradhan. ","element":"span"},{"text":"2015. ","element":"span"},{"href":"https://doi.org/10.3115/v1/P15-2141","text":"Boosting Transition-based AMR Parsing ","element":"a"},{"href":"https://doi.org/10.3115/v1/P15-2141","text":"with Refined Actions and Auxiliary Analyzers","element":"a"},{"text":". In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)","element":"span"},{"text":", pages 857–862, Beijing, China.","element":"span"}],[{"id":"id-25","text":"Junchi Yan, Xu-Cheng Yin, Weiyao Lin, Cheng ","element":"span"},{"text":"Deng, Hongyuan Zha, and Xiaokang Yang. 2016. 
","element":"span"},{"text":"A short survey of recent advances in graph matching. ","element":"span"},{"text":"In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 2016 ACM on International Conference on Multimedia Retrieval","element":"span"},{"text":", pages 167–174. ACM.","element":"span"}],[{"id":"id-45","text":"Sheng Zhang, Xutai Ma, Kevin Duh, and Ben- ","element":"span"},{"text":"jamin Van Durme. 2019. ","element":"span"},{"href":"https://doi.org/10.18653/v1/P19-1009","text":"AMR Parsing as ","element":"a"},{"href":"https://doi.org/10.18653/v1/P19-1009","text":"Sequence-to-Graph Transduction","element":"a"},{"text":". In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics","element":"span"},{"text":", pages 80– 94, Florence, Italy.","element":"span"}]]}],"_version":"3.3.2"},"paperNode":"$1b:props:children:props:children:0:props:product"}]]]}]}]