浏览代码

Fixed conll2tree.py to correctly handle sentences that contain multiple elements with identical representations (#292)

sgn-andot 9 年之前
父节点
当前提交
4fddb5e09e
共有 1 个文件被更改,包括 10 次插入和 3 次删除
  1. 10 3
      syntaxnet/syntaxnet/conll2tree.py

+ 10 - 3
syntaxnet/syntaxnet/conll2tree.py

@@ -15,6 +15,7 @@
 """A program to generate ASCII trees from conll files."""
 
 import collections
+import re
 
 import asciitree
 import tensorflow as tf
@@ -39,18 +40,21 @@ flags.DEFINE_string('corpus_name', 'stdin-conll',
 
 def to_dict(sentence):
   """Builds a dictionary representing the parse tree of a sentence.
+     Note that the suffix "@id" (where 'id' is a number) is appended to each element
+     to handle sentences that have multiple elements with identical representations.
+     These suffixes need to be removed after the asciitree is rendered.
 
   Args:
     sentence: Sentence protocol buffer to represent.
   Returns:
     Dictionary mapping tokens to children.
   """
-  token_str = ['%s %s %s' % (token.word, token.tag, token.label)
-               for token in sentence.token]
+  token_str = list()
   children = [[] for token in sentence.token]
   root = -1
   for i in range(0, len(sentence.token)):
     token = sentence.token[i]
+    token_str.append('%s %s %s @%d' % (token.word, token.tag, token.label, (i+1)))
     if token.head == -1:
       root = i
     else:
@@ -83,7 +87,10 @@ def main(unused_argv):
         d = to_dict(sentence)
         print 'Input: %s' % sentence.text
         print 'Parse:'
-        print tr(d)
+        tr_str = tr(d)
+        pat = re.compile('\s*@\d+$')
+        for tr_ln in tr_str.splitlines():
+          print pat.sub('', tr_ln)
 
       if finished:
         break