# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""A program to generate ASCII trees from conll files."""

import collections
import re

import asciitree
import tensorflow as tf

# Imported only for its side effects; nothing in this file references it by
# name. NOTE(review): presumably this loads/registers the parser ops used by
# gen_parser_ops -- confirm before removing or reordering.
import syntaxnet.load_parser_ops

from tensorflow.python.platform import tf_logging as logging
from syntaxnet import sentence_pb2
from syntaxnet.ops import gen_parser_ops

flags = tf.app.flags
FLAGS = flags.FLAGS

flags.DEFINE_string('task_context',
                    'syntaxnet/models/parsey_mcparseface/context.pbtxt',
                    'Path to a task context with inputs and parameters for '
                    'feature extractors.')
# The help text used to be a verbatim copy of the task_context help text.
flags.DEFINE_string('corpus_name', 'stdin-conll',
                    'Name of the context input to read data from.')


def to_dict(sentence):
    """Builds a dictionary representing the parse tree of a sentence.

    Note that the suffix "@id" (where 'id' is a number) is appended to each
    element to handle sentences that have multiple elements with an identical
    representation. Those suffixes need to be removed after the asciitree is
    rendered.

    Args:
      sentence: Sentence protocol buffer to represent. Each entry of
        `sentence.token` is read for its `word`, `tag`, `label` and `head`
        fields; `head` is used as the index of the parent token, with -1
        marking the root.

    Returns:
      An OrderedDict with a single entry: the root token's string mapped to
      a nested OrderedDict of its children (recursively, in token order).

    Raises:
      IndexError: If the sentence has no tokens.

    NOTE: If no token has `head == -1`, `root` stays -1 and negative indexing
    silently selects the last token as the root. If several tokens have
    `head == -1`, the last one wins. This behaviour is unchanged.
    """
    token_str = []
    children = [[] for _ in sentence.token]
    root = -1
    for i, token in enumerate(sentence.token):
        # The 1-based "@<position>" suffix keeps dictionary keys unique even
        # when two tokens share the same word, tag and label.
        token_str.append('%s %s %s @%d' %
                         (token.word, token.tag, token.label, i + 1))
        if token.head == -1:
            root = i
        else:
            children[token.head].append(i)

    def _get_dict(i):
        """Returns the ordered subtree of the children of token `i`."""
        d = collections.OrderedDict()
        for c in children[i]:
            d[token_str[c]] = _get_dict(c)
        return d

    tree = collections.OrderedDict()
    tree[token_str[root]] = _get_dict(root)
    return tree


def main(unused_argv):
    """Reads documents from the configured corpus and prints ASCII trees.

    Documents are fetched in batches from `FLAGS.corpus_name` as described by
    `FLAGS.task_context`. For every document the input text is printed,
    followed by the rendered dependency tree with the "@id" disambiguation
    suffixes stripped.

    Args:
      unused_argv: Command-line arguments passed by `tf.app.run()`; ignored.
    """
    logging.set_verbosity(logging.INFO)

    # Both objects are loop-invariant, so build them once instead of once per
    # document.
    renderer = asciitree.LeftAligned()
    id_suffix = re.compile(r'\s*@\d+$')

    with tf.Session() as sess:
        src = gen_parser_ops.document_source(
            batch_size=32,
            corpus_name=FLAGS.corpus_name,
            task_context=FLAGS.task_context)
        sentence = sentence_pb2.Sentence()
        while True:
            documents, finished = sess.run(src)
            logging.info('Read %d documents', len(documents))
            for serialized in documents:
                sentence.ParseFromString(serialized)
                tree = to_dict(sentence)
                # Single-argument print(...) produces the same output under
                # the Python 2 print statement and the Python 3 function.
                print('Input: %s' % sentence.text)
                print('Parse:')
                for tree_line in renderer(tree).splitlines():
                    # Drop the "@id" suffix that to_dict() added.
                    print(id_suffix.sub('', tree_line))

            if finished:
                break


if __name__ == '__main__':
    tf.app.run()