# coding=utf-8
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Tests for lexicon_builder."""

# disable=no-name-in-module,unused-import,g-bad-import-order,maybe-no-member
import os.path

import tensorflow as tf

import syntaxnet.load_parser_ops

from tensorflow.python.framework import test_util
from tensorflow.python.platform import googletest
from tensorflow.python.platform import tf_logging as logging

from syntaxnet import sentence_pb2
from syntaxnet import task_spec_pb2
from syntaxnet.ops import gen_parser_ops

FLAGS = tf.app.flags.FLAGS
CONLL_DOC1 = u'''1 बात _ n NN _ _ _ _ _
2 गलत _ adj JJ _ _ _ _ _
3 हो _ v VM _ _ _ _ _
4 तो _ avy CC _ _ _ _ _
5 गुस्सा _ n NN _ _ _ _ _
6 सेलेब्रिटिज _ n NN _ _ _ _ _
7 को _ psp PSP _ _ _ _ _
8 भी _ avy RP _ _ _ _ _
9 आना _ v VM _ _ _ _ _
10 लाजमी _ adj JJ _ _ _ _ _
11 है _ v VM _ _ _ _ _
12 । _ punc SYM _ _ _ _ _'''

CONLL_DOC2 = u'''1 लेकिन _ avy CC _ _ _ _ _
2 अभिनेत्री _ n NN _ _ _ _ _
3 के _ psp PSP _ _ _ _ _
4 इस _ pn DEM _ _ _ _ _
5 कदम _ n NN _ _ _ _ _
6 से _ psp PSP _ _ _ _ _
7 वहां _ pn PRP _ _ _ _ _
8 रंग _ n NN _ _ _ _ _
9 में _ psp PSP _ _ _ _ _
10 भंग _ adj JJ _ _ _ _ _
11 पड़ _ v VM _ _ _ _ _
12 गया _ v VAUX _ _ _ _ _
13 । _ punc SYM _ _ _ _ _'''

TAGS = ['NN', 'JJ', 'VM', 'CC', 'PSP', 'RP', 'JJ', 'SYM', 'DEM', 'PRP', 'VAUX']

CATEGORIES = ['n', 'adj', 'v', 'avy', 'n', 'psp', 'punc', 'pn']

TOKENIZED_DOCS = u'''बात गलत हो तो गुस्सा सेलेब्रिटिज को भी आना लाजमी है ।
लेकिन अभिनेत्री के इस कदम से वहां रंग में भंग पड़ गया ।
'''

CHARS = u'''अ इ आ क ग ज ट त द न प भ ब य म र ल व ह स ि ा ु ी े ै ो ् ड़ । ं'''.split(' ')

CHAR_NGRAMS = u'''^ अ ^ अभ ^ आ ^ आन ^ इ ^ इस $ ^ क ^ कद ^ के $ ^ को $ ^ ग ^ गय ^ गल ^ गु ^ त ^ तो $ ^ प ^ पड़ $ ^ ब ^ बा ^ भ ^ भं ^ भी $ ^ म ^ मे ^ र ^ रं ^ ल ^ ला ^ ले ^ व ^ वह ^ स ^ से ^ से $ ^ ह ^ है $ ^ हो $ ^ । $ ं ं $ ंग $ क कि ग $ ज ज $ जम ट टि त त $ त् द दम $ न न $ ना $ ने ब ब् भ भि म म $ मी $ य या $ र रि री $ ल ल लत $ ले स स $ सा $ स् ह हा ा ा $ ां $ ाज ात $ ि िज $ िट िन िन $ ी $ ु ुस े े $ ें $ ेक ेत ेब ै $ ो $ ् ्र ्स ड़ $'''.split(' ')

COMMENTS = u'# Line with fake comments.'


class LexiconBuilderTest(test_util.TensorFlowTestCase):
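  """Tests the lexicon_builder op on small CoNLL and tokenized corpora."""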

  def setUp(self):
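    """Chooses temp-dir paths for the test corpus and task context files."""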
    if not hasattr(FLAGS, 'test_srcdir'):
      FLAGS.test_srcdir = ''
    if not hasattr(FLAGS, 'test_tmpdir'):
      FLAGS.test_tmpdir = tf.test.get_temp_dir()
    self.corpus_file = os.path.join(FLAGS.test_tmpdir, 'documents.conll')
    self.context_file = os.path.join(FLAGS.test_tmpdir, 'context.pbtxt')

  def AddInput(self, name, file_pattern, record_format, context):
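    """Adds an input with the given record format to the task context."""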
    inp = context.input.add()
    inp.name = name
    inp.record_format.append(record_format)
    inp.part.add().file_pattern = file_pattern

  def AddParameter(self, name, value, context):
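    """Adds a name/value parameter to the task context."""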
    param = context.parameter.add()
    param.name = name
    param.value = value

  def WriteContext(self, corpus_format):
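    """Writes a task context that reads the corpus in the given format."""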
    context = task_spec_pb2.TaskSpec()
    self.AddParameter('brain_parser_embedding_names', 'words;tags', context)
    self.AddParameter('brain_parser_features', 'input.token.word;input.tag',
                      context)
    self.AddInput('documents', self.corpus_file, corpus_format, context)
    for name in ('word-map', 'lcword-map', 'tag-map',
                 'category-map', 'label-map', 'prefix-table',
                 'suffix-table', 'tag-to-category', 'char-map',
                 'char-ngram-map'):
      self.AddInput(name, os.path.join(FLAGS.test_tmpdir, name), '', context)
    logging.info('Writing context to: %s', self.context_file)
    with open(self.context_file, 'w') as f:
      f.write(str(context))

  def ReadNextDocument(self, sess, doc_source):
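    """Runs doc_source once; returns (Sentence or None, end-of-corpus flag)."""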
    doc_str, last = sess.run(doc_source)
    if doc_str:
      doc = sentence_pb2.Sentence()
      doc.ParseFromString(doc_str[0])
    else:
      doc = None
    return doc, last

  def ValidateDocuments(self):
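    """Checks that both test sentences are read back with expected tokens."""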
    doc_source = gen_parser_ops.document_source(
        task_context=self.context_file, batch_size=1)
    with self.test_session() as sess:
      logging.info('Reading document1')
      doc, last = self.ReadNextDocument(sess, doc_source)
      self.assertEqual(len(doc.token), 12)
      self.assertEqual(u'लाजमी', doc.token[9].word)
      self.assertFalse(last)
      logging.info('Reading document2')
      doc, last = self.ReadNextDocument(sess, doc_source)
      self.assertEqual(len(doc.token), 13)
      self.assertEqual(u'भंग', doc.token[9].word)
      self.assertFalse(last)
      logging.info('Hitting end of the dataset')
      doc, last = self.ReadNextDocument(sess, doc_source)
      self.assertTrue(doc is None)
      self.assertTrue(last)

  def ValidateTagToCategoryMap(self):
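    """Checks tag-to-category entries against the known tags and categories."""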
    with open(os.path.join(FLAGS.test_tmpdir, 'tag-to-category'), 'r') as f:
      entries = [line.strip().split('\t') for line in f.readlines()]
    for tag, category in entries:
      self.assertIn(tag, TAGS)
      self.assertIn(category, CATEGORIES)

  def LoadMap(self, map_name):
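    """Reads a map file into a dict mapping each entry to its last field."""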
    loaded_map = {}
    with open(os.path.join(FLAGS.test_tmpdir, map_name), 'r') as f:
      for line in f:
        entries = line.strip().split(' ')
        if len(entries) >= 2:
          loaded_map[' '.join(entries[:-1])] = entries[-1]
    return loaded_map

  def ValidateCharMap(self):
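    """Checks that the char map holds exactly the expected characters."""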
    char_map = self.LoadMap('char-map')
    self.assertEqual(len(char_map), len(CHARS))
    for char in CHARS:
      self.assertIn(char.encode('utf-8'), char_map)

  def ValidateCharNgramMap(self):
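    """Checks the char-ngram map against the expected boundary-marked ngrams."""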
    char_ngram_map = self.LoadMap('char-ngram-map')
    self.assertEqual(len(char_ngram_map), len(CHAR_NGRAMS))
    for char_ngram in CHAR_NGRAMS:
      self.assertIn(char_ngram.encode('utf-8'), char_ngram_map)

  def ValidateWordMap(self):
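    """Checks that every corpus token appears in the word map."""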
    word_map = self.LoadMap('word-map')
    for word in filter(None, TOKENIZED_DOCS.replace('\n', ' ').split(' ')):
      self.assertIn(word.encode('utf-8'), word_map)

  def BuildLexicon(self):
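    """Runs the lexicon_builder op with 2-char ngrams and boundary marks."""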
    with self.test_session():
      gen_parser_ops.lexicon_builder(
          task_context=self.context_file,
          lexicon_max_char_ngram_length=2,
          lexicon_char_ngram_mark_boundaries=True).run()

  def testCoNLLFormat(self):
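    """Builds the lexicon from a CoNLL corpus and validates all maps."""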
    self.WriteContext('conll-sentence')
    logging.info('Writing conll file to: %s', self.corpus_file)
    with open(self.corpus_file, 'w') as f:
      f.write((CONLL_DOC1 + u'\n\n' + CONLL_DOC2 + u'\n')
              .replace(' ', '\t').encode('utf-8'))
    self.ValidateDocuments()
    self.BuildLexicon()
    self.ValidateTagToCategoryMap()
    self.ValidateCharMap()
    self.ValidateCharNgramMap()
    self.ValidateWordMap()

  def testCoNLLFormatExtraNewlinesAndComments(self):
    self.WriteContext('conll-sentence')
    with open(self.corpus_file, 'w') as f:
      f.write((u'\n\n\n' + CONLL_DOC1 + u'\n\n\n' + COMMENTS +
               u'\n\n' + CONLL_DOC2).replace(' ', '\t').encode('utf-8'))
    self.ValidateDocuments()
    self.BuildLexicon()
    self.ValidateTagToCategoryMap()

  def testTokenizedTextFormat(self):
    self.WriteContext('tokenized-text')
    with open(self.corpus_file, 'w') as f:
      f.write(TOKENIZED_DOCS.encode('utf-8'))
    self.ValidateDocuments()
    self.BuildLexicon()

  def testTokenizedTextFormatExtraNewlines(self):
    self.WriteContext('tokenized-text')
    with open(self.corpus_file, 'w') as f:
      f.write((u'\n\n\n' + TOKENIZED_DOCS + u'\n\n\n').encode('utf-8'))
    self.ValidateDocuments()
    self.BuildLexicon()

  def testFeatureVocab(self):
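    """Checks the word feature vocabulary against tokens in the test corpus."""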
    words_vocab_op = gen_parser_ops.feature_vocab(
        task_context=self.context_file)
    foo_vocab_op = gen_parser_ops.feature_vocab(
        task_context=self.context_file, embedding_name='foo')
    with self.test_session() as sess:
      words_vocab, foo_vocab = sess.run([words_vocab_op, foo_vocab_op])
      self.assertEqual(0, len(foo_vocab))

      # Explicitly generate the expected vocabulary from the test documents.
      expected_vocab = set(['<UNKNOWN>', '<OUTSIDE>'])
      for doc in [CONLL_DOC1, CONLL_DOC2]:
        for line in doc.split('\n'):
          expected_vocab.add(line.split(' ')[1])
      actual_vocab = set(s.decode('utf-8') for s in words_vocab)
      self.assertEqual(expected_vocab, actual_vocab)


if __name__ == '__main__':
  googletest.main()