# coding=utf-8
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
- """Tests for english_tokenizer."""
- # disable=no-name-in-module,unused-import,g-bad-import-order,maybe-no-member
import os.path

import tensorflow as tf

import syntaxnet.load_parser_ops

from tensorflow.python.framework import test_util
from tensorflow.python.platform import googletest
from tensorflow.python.platform import tf_logging as logging

from syntaxnet import sentence_pb2
from syntaxnet import task_spec_pb2
from syntaxnet.ops import gen_parser_ops

FLAGS = tf.app.flags.FLAGS


class TextFormatsTest(test_util.TensorFlowTestCase):
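  """Tests reading corpora in various text formats into Sentence protos."""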

  def setUp(self):
    if not hasattr(FLAGS, 'test_srcdir'):
      FLAGS.test_srcdir = ''
    if not hasattr(FLAGS, 'test_tmpdir'):
      FLAGS.test_tmpdir = tf.test.get_temp_dir()
    self.corpus_file = os.path.join(FLAGS.test_tmpdir, 'documents.conll')
    self.context_file = os.path.join(FLAGS.test_tmpdir, 'context.pbtxt')

  def AddInput(self, name, file_pattern, record_format, context):
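    """Adds an input with the given pattern and format to the context."""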
    inp = context.input.add()
    inp.name = name
    inp.record_format.append(record_format)
    inp.part.add().file_pattern = file_pattern

  def AddParameter(self, name, value, context):
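    """Adds a named parameter to the task context."""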
    param = context.parameter.add()
    param.name = name
    param.value = value

  def WriteContext(self, corpus_format):
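    """Writes a task context whose 'documents' input has `corpus_format`."""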
    context = task_spec_pb2.TaskSpec()
    self.AddInput('documents', self.corpus_file, corpus_format, context)
    for name in ('word-map', 'lcword-map', 'tag-map', 'category-map',
                 'label-map', 'prefix-table', 'suffix-table',
                 'tag-to-category'):
      self.AddInput(name, os.path.join(FLAGS.test_tmpdir, name), '', context)
    logging.info('Writing context to: %s', self.context_file)
    with open(self.context_file, 'w') as f:
      f.write(str(context))

  def ReadNextDocument(self, sess, sentence):
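    """Runs the source op once and returns the next Sentence, or None."""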
    sentence_str, = sess.run([sentence])
    if sentence_str:
      sentence_doc = sentence_pb2.Sentence()
      sentence_doc.ParseFromString(sentence_str[0])
    else:
      sentence_doc = None
    return sentence_doc

  def CheckTokenization(self, sentence, tokenization):
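    """Checks tokenization of `sentence` by the english-text reader."""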
    self.WriteContext('english-text')
    logging.info('Writing text file to: %s', self.corpus_file)
    with open(self.corpus_file, 'w') as f:
      f.write(sentence)
    sentence, _ = gen_parser_ops.document_source(
        task_context=self.context_file, batch_size=1)
    with self.test_session() as sess:
      sentence_doc = self.ReadNextDocument(sess, sentence)
      self.assertEqual(' '.join([t.word for t in sentence_doc.token]),
                       tokenization)

  def CheckUntokenizedDoc(self, sentence, words, starts, ends):
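    """Checks tokens and offsets from the untokenized-text reader."""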
    self.WriteContext('untokenized-text')
    logging.info('Writing text file to: %s', self.corpus_file)
    with open(self.corpus_file, 'w') as f:
      f.write(sentence)
    sentence, _ = gen_parser_ops.document_source(
        task_context=self.context_file, batch_size=1)
    with self.test_session() as sess:
      sentence_doc = self.ReadNextDocument(sess, sentence)
      self.assertEqual(len(sentence_doc.token), len(words))
      self.assertEqual(len(sentence_doc.token), len(starts))
      self.assertEqual(len(sentence_doc.token), len(ends))
      for i, token in enumerate(sentence_doc.token):
        self.assertEqual(token.word.encode('utf-8'), words[i])
        self.assertEqual(token.start, starts[i])
        self.assertEqual(token.end, ends[i])

  def testUntokenized(self):
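    # Start/end offsets are byte offsets: each of these Chinese characters is
    # 3 bytes in UTF-8, hence starts 0, 3, 6, 9 and ends 2, 5, 8, 11.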
    self.CheckUntokenizedDoc('一个测试', ['一', '个', '测', '试'], [0, 3, 6, 9],
                             [2, 5, 8, 11])
    self.CheckUntokenizedDoc('Hello ', ['H', 'e', 'l', 'l', 'o', ' '],
                             [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5])

  def testConllSentence(self):
    # This test sentence includes a multiword token and an empty node,
    # both of which are to be ignored.
    test_sentence = """
1-2 We've _
1 We we PRON PRP Case=Nom 3 nsubj _ SpaceAfter=No
2 've have AUX VBP Mood=Ind 3 aux _ _
3 moved move VERB VBN Tense=Past 0 root _ _
4 on on ADV RB _ 3 advmod _ SpaceAfter=No
4.1 ignored ignore VERB VBN Tense=Past 0 _ _ _
5 . . PUNCT . _ 3 punct _ _
"""

    # Prepare test sentence.
    with open(self.corpus_file, 'w') as f:
      f.write(test_sentence)

    # Prepare context.
    self.WriteContext('conll-sentence')

    # Test converted sentence.
    sentence, _ = gen_parser_ops.document_source(
        task_context=self.context_file, batch_size=1)

    # Expected texts, words, and start/end offsets.
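    # Offsets are inclusive positions in the reconstructed text;
    # "SpaceAfter=No" suppresses the space after "We" and after "on".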
    expected_text = u'We\'ve moved on.'
    expected_words = [u'We', u'\'ve', u'moved', u'on', u'.']
    expected_starts = [0, 2, 6, 12, 14]
    expected_ends = [1, 4, 10, 13, 14]
    with self.test_session() as sess:
      sentence_doc = self.ReadNextDocument(sess, sentence)
      self.assertEqual(expected_text, sentence_doc.text)
      self.assertEqual(expected_words, [t.word for t in sentence_doc.token])
      self.assertEqual(expected_starts, [t.start for t in sentence_doc.token])
      self.assertEqual(expected_ends, [t.end for t in sentence_doc.token])

  def testSentencePrototext(self):
    # Note: lstrip() is to avoid an empty line at the beginning, which will
    # cause an empty record to be emitted. These empty records currently
    # aren't supported by the sentence prototext format (which is currently
    # mostly for debugging).
    test_sentence = """
text: "fair enough; you people have eaten me."
token {
  word: "fair"
  start: 0
  end: 3
  break_level: NO_BREAK
}
token {
  word: "enough"
  start: 5
  end: 10
  head: 0
  break_level: SPACE_BREAK
}
""".lstrip()

    # Prepare test sentence.
    with open(self.corpus_file, 'w') as f:
      f.write(test_sentence)

    # Prepare context.
    self.WriteContext('sentence-prototext')

    # Test converted sentence.
    sentence, _ = gen_parser_ops.document_source(
        task_context=self.context_file, batch_size=1)

    # Expected texts, words, and start/end offsets.
    expected_text = u'fair enough; you people have eaten me.'
    expected_words = [u'fair', u'enough']
    expected_starts = [0, 5]
    expected_ends = [3, 10]
    with self.test_session() as sess:
      sentence_doc = self.ReadNextDocument(sess, sentence)
      self.assertEqual(expected_text, sentence_doc.text)
      self.assertEqual(expected_words, [t.word for t in sentence_doc.token])
      self.assertEqual(expected_starts, [t.start for t in sentence_doc.token])
      self.assertEqual(expected_ends, [t.end for t in sentence_doc.token])

  def testSegmentationTrainingData(self):
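    # Each input line is "<word> NO_SPACE|SPACE". The reader emits one token
    # per character, with break level 1 at the first character of a word (and
    # at inserted space characters) and 0 inside a word.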
    doc1_lines = ['测试 NO_SPACE\n', '的 NO_SPACE\n', '句子 NO_SPACE']
    doc1_text = '测试的句子'
    doc1_tokens = ['测', '试', '的', '句', '子']
    doc1_break_levels = [1, 0, 1, 1, 0]
    doc2_lines = [
        'That NO_SPACE\n', '\'s SPACE\n', 'a SPACE\n', 'good SPACE\n',
        'point NO_SPACE\n', '. NO_SPACE'
    ]
    doc2_text = 'That\'s a good point.'
    doc2_tokens = [
        'T', 'h', 'a', 't', '\'', 's', ' ', 'a', ' ', 'g', 'o', 'o', 'd', ' ',
        'p', 'o', 'i', 'n', 't', '.'
    ]
    doc2_break_levels = [
        1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1
    ]
    self.CheckSegmentationTrainingData(doc1_lines, doc1_text, doc1_tokens,
                                       doc1_break_levels)
    self.CheckSegmentationTrainingData(doc2_lines, doc2_text, doc2_tokens,
                                       doc2_break_levels)

  def CheckSegmentationTrainingData(self, doc_lines, doc_text, doc_words,
                                    break_levels):
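    """Checks text, tokens, and break levels read from `doc_lines`."""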
    # Prepare context.
    self.WriteContext('segment-train-data')

    # Prepare test sentence.
    with open(self.corpus_file, 'w') as f:
      f.write(''.join(doc_lines))

    # Test converted sentence.
    sentence, _ = gen_parser_ops.document_source(
        task_context=self.context_file, batch_size=1)
    with self.test_session() as sess:
      sentence_doc = self.ReadNextDocument(sess, sentence)
      self.assertEqual(doc_text.decode('utf-8'), sentence_doc.text)
      self.assertEqual([t.decode('utf-8') for t in doc_words],
                       [t.word for t in sentence_doc.token])
      self.assertEqual(break_levels,
                       [t.break_level for t in sentence_doc.token])

  def testSimple(self):
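    # The expected output uses Penn Treebank-style symbols: quotes become
    # `` and '', brackets become -LRB-/-RRB-, and clitics such as n't and
    # 's are split into separate tokens.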
    self.CheckTokenization('Hello, world!', 'Hello , world !')
    self.CheckTokenization('"Hello"', "`` Hello ''")
    self.CheckTokenization('{"Hello@#$', '-LRB- `` Hello @ # $')
    self.CheckTokenization('"Hello..."', "`` Hello ... ''")
    self.CheckTokenization('()[]{}<>',
                           '-LRB- -RRB- -LRB- -RRB- -LRB- -RRB- < >')
    self.CheckTokenization('Hello--world', 'Hello -- world')
    self.CheckTokenization("Isn't", "Is n't")
    self.CheckTokenization("n't", "n't")
    self.CheckTokenization('Hello Mr. Smith.', 'Hello Mr. Smith .')
    self.CheckTokenization("It's Mr. Smith's.", "It 's Mr. Smith 's .")
    self.CheckTokenization("It's the Smiths'.", "It 's the Smiths ' .")
    self.CheckTokenization('Gotta go', 'Got ta go')
    self.CheckTokenization('50-year-old', '50-year-old')

  def testUrl(self):
    self.CheckTokenization('http://www.google.com/news is down',
                           'http : //www.google.com/news is down')


if __name__ == '__main__':
  googletest.main()