# coding=utf-8
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Tests for lexicon_builder."""

# pylint: disable=no-name-in-module,unused-import,g-bad-import-order,maybe-no-member
import os.path

import tensorflow as tf
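
# Imported for its side effect: loading this module registers the SyntaxNet
# parser ops with TensorFlow, which is why unused-import is disabled above.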
import syntaxnet.load_parser_ops

from tensorflow.python.framework import test_util
from tensorflow.python.platform import googletest
from tensorflow.python.platform import tf_logging as logging

from syntaxnet import sentence_pb2
from syntaxnet import task_spec_pb2
from syntaxnet.ops import gen_parser_ops

FLAGS = tf.app.flags.FLAGS
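
# Test corpora: two short Hindi sentences in 10-column CoNLL format. Column
# separators are written as spaces here; the tests convert them to tabs
# before writing the corpus file.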
CONLL_DOC1 = u'''1 बात _ n NN _ _ _ _ _
2 गलत _ adj JJ _ _ _ _ _
3 हो _ v VM _ _ _ _ _
4 तो _ avy CC _ _ _ _ _
5 गुस्सा _ n NN _ _ _ _ _
6 सेलेब्रिटिज _ n NN _ _ _ _ _
7 को _ psp PSP _ _ _ _ _
8 भी _ avy RP _ _ _ _ _
9 आना _ v VM _ _ _ _ _
10 लाजमी _ adj JJ _ _ _ _ _
11 है _ v VM _ _ _ _ _
12 । _ punc SYM _ _ _ _ _'''

CONLL_DOC2 = u'''1 लेकिन _ avy CC _ _ _ _ _
2 अभिनेत्री _ n NN _ _ _ _ _
3 के _ psp PSP _ _ _ _ _
4 इस _ pn DEM _ _ _ _ _
5 कदम _ n NN _ _ _ _ _
6 से _ psp PSP _ _ _ _ _
7 वहां _ pn PRP _ _ _ _ _
8 रंग _ n NN _ _ _ _ _
9 में _ psp PSP _ _ _ _ _
10 भंग _ adj JJ _ _ _ _ _
11 पड़ _ v VM _ _ _ _ _
12 गया _ v VAUX _ _ _ _ _
13 । _ punc SYM _ _ _ _ _'''

TAGS = ['NN', 'JJ', 'VM', 'CC', 'PSP', 'RP', 'JJ', 'SYM', 'DEM', 'PRP', 'VAUX']

CATEGORIES = ['n', 'adj', 'v', 'avy', 'n', 'psp', 'punc', 'pn']

TOKENIZED_DOCS = u'''बात गलत हो तो गुस्सा सेलेब्रिटिज को भी आना लाजमी है ।
लेकिन अभिनेत्री के इस कदम से वहां रंग में भंग पड़ गया ।
'''

CHARS = u'''अ इ आ क ग ज ट त द न प भ ब य म र ल व ह स ि ा ु ी े ै ो ् ड़ । ं'''.split(' ')
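
# Expected character ngrams of length <= 2; '^' and '$' mark the start and
# end of a token, matching lexicon_char_ngram_mark_boundaries=True in
# BuildLexicon below.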
CHAR_NGRAMS = u'''^अ ^अभ ^आ ^आन ^इ ^इस$ ^क ^कद ^के$ ^को$ ^ग ^गय ^गल ^गु ^त ^तो$ ^प ^पड़$ ^ब ^बा ^भ ^भं ^भी$ ^म ^मे ^र ^रं ^ल ^ला ^ले ^व ^वह ^स ^से ^से$ ^ह ^है$ ^हो$ ^।$ ं ं$ ंग$ क कि ग$ ज ज$ जम ट टि त त$ त् द दम$ न न$ ना$ ने ब ब् भ भि म म$ मी$ य या$ र रि री$ ल ल लत$ ले स स$ सा$ स् ह हा ा ा$ ां$ ाज ात$ ि िज$ िट िन िन$ ी$ ु ुस े े$ ें$ ेक ेत ेब ै$ ो$ ् ्र ्स ड़$'''.split(' ')

COMMENTS = u'# Line with fake comments.'


class LexiconBuilderTest(test_util.TensorFlowTestCase):
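  """Tests the lexicon_builder and related ops on small Hindi corpora."""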

  def setUp(self):
    if not hasattr(FLAGS, 'test_srcdir'):
      FLAGS.test_srcdir = ''
    if not hasattr(FLAGS, 'test_tmpdir'):
      FLAGS.test_tmpdir = tf.test.get_temp_dir()
    self.corpus_file = os.path.join(FLAGS.test_tmpdir, 'documents.conll')
    self.context_file = os.path.join(FLAGS.test_tmpdir, 'context.pbtxt')

  def AddInput(self, name, file_pattern, record_format, context):
    inp = context.input.add()
    inp.name = name
    inp.record_format.append(record_format)
    inp.part.add().file_pattern = file_pattern

  def AddParameter(self, name, value, context):
    param = context.parameter.add()
    param.name = name
    param.value = value
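
  # Writes a task context (context.pbtxt) that points the 'documents' input
  # at the test corpus and maps each lexicon resource to a path in tmpdir.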
  def WriteContext(self, corpus_format):
    context = task_spec_pb2.TaskSpec()
    self.AddParameter('brain_parser_embedding_names', 'words;tags', context)
    self.AddParameter('brain_parser_features', 'input.token.word;input.tag',
                      context)
    self.AddInput('documents', self.corpus_file, corpus_format, context)
    for name in ('word-map', 'lcword-map', 'tag-map',
                 'category-map', 'label-map', 'prefix-table',
                 'suffix-table', 'tag-to-category', 'char-map',
                 'char-ngram-map'):
      self.AddInput(name, os.path.join(FLAGS.test_tmpdir, name), '', context)
    logging.info('Writing context to: %s', self.context_file)
    with open(self.context_file, 'w') as f:
      f.write(str(context))

  def ReadNextDocument(self, sess, doc_source):
    doc_str, last = sess.run(doc_source)
    if doc_str:
      doc = sentence_pb2.Sentence()
      doc.ParseFromString(doc_str[0])
    else:
      doc = None
    return doc, last
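
  # Reads both test documents back through the document_source op, checking
  # token counts, a sample token from each, and end-of-corpus signaling.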
  def ValidateDocuments(self):
    doc_source = gen_parser_ops.document_source(
        task_context=self.context_file, batch_size=1)
    with self.test_session() as sess:
      logging.info('Reading document1')
      doc, last = self.ReadNextDocument(sess, doc_source)
      self.assertEqual(len(doc.token), 12)
      self.assertEqual(u'लाजमी', doc.token[9].word)
      self.assertFalse(last)
      logging.info('Reading document2')
      doc, last = self.ReadNextDocument(sess, doc_source)
      self.assertEqual(len(doc.token), 13)
      self.assertEqual(u'भंग', doc.token[9].word)
      self.assertFalse(last)
      logging.info('Hitting end of the dataset')
      doc, last = self.ReadNextDocument(sess, doc_source)
      self.assertTrue(doc is None)
      self.assertTrue(last)

  def ValidateTagToCategoryMap(self):
    with open(os.path.join(FLAGS.test_tmpdir, 'tag-to-category'), 'r') as f:
      entries = [line.strip().split('\t') for line in f.readlines()]
    for tag, category in entries:
      self.assertIn(tag, TAGS)
      self.assertIn(category, CATEGORIES)
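
  # Term-map files written by the lexicon builder hold one entry per line;
  # LoadMap treats the last space-separated field as the entry's value and
  # joins the preceding fields back into the term itself.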
  def LoadMap(self, map_name):
    loaded_map = {}
    with open(os.path.join(FLAGS.test_tmpdir, map_name), 'r') as f:
      for line in f:
        entries = line.strip().split(' ')
        if len(entries) >= 2:
          loaded_map[' '.join(entries[:-1])] = entries[-1]
    return loaded_map

  def ValidateCharMap(self):
    char_map = self.LoadMap('char-map')
    self.assertEqual(len(char_map), len(CHARS))
    for char in CHARS:
      self.assertIn(char.encode('utf-8'), char_map)

  def ValidateCharNgramMap(self):
    char_ngram_map = self.LoadMap('char-ngram-map')
    self.assertEqual(len(char_ngram_map), len(CHAR_NGRAMS))
    for char_ngram in CHAR_NGRAMS:
      self.assertIn(char_ngram.encode('utf-8'), char_ngram_map)

  def ValidateWordMap(self):
    word_map = self.LoadMap('word-map')
    for word in filter(None, TOKENIZED_DOCS.replace('\n', ' ').split(' ')):
      self.assertIn(word.encode('utf-8'), word_map)
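
  # Runs the lexicon_builder op, which scans the corpus and writes every
  # lexicon resource declared in the task context (word-map, tag-map, ...).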
  def BuildLexicon(self):
    with self.test_session():
      gen_parser_ops.lexicon_builder(
          task_context=self.context_file,
          lexicon_max_char_ngram_length=2,
          lexicon_char_ngram_mark_boundaries=True).run()
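
  # End-to-end check for the CoNLL reader: write the corpus, read it back,
  # build the lexicon, and validate the generated maps.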
  def testCoNLLFormat(self):
    self.WriteContext('conll-sentence')
    logging.info('Writing conll file to: %s', self.corpus_file)
    with open(self.corpus_file, 'w') as f:
      f.write((CONLL_DOC1 + u'\n\n' + CONLL_DOC2 + u'\n')
              .replace(' ', '\t').encode('utf-8'))
    self.ValidateDocuments()
    self.BuildLexicon()
    self.ValidateTagToCategoryMap()
    self.ValidateCharMap()
    self.ValidateCharNgramMap()
    self.ValidateWordMap()

  def testCoNLLFormatExtraNewlinesAndComments(self):
    self.WriteContext('conll-sentence')
    with open(self.corpus_file, 'w') as f:
      f.write((u'\n\n\n' + CONLL_DOC1 + u'\n\n\n' + COMMENTS +
               u'\n\n' + CONLL_DOC2).replace(' ', '\t').encode('utf-8'))
    self.ValidateDocuments()
    self.BuildLexicon()
    self.ValidateTagToCategoryMap()

  def testTokenizedTextFormat(self):
    self.WriteContext('tokenized-text')
    with open(self.corpus_file, 'w') as f:
      f.write(TOKENIZED_DOCS.encode('utf-8'))
    self.ValidateDocuments()
    self.BuildLexicon()

  def testTokenizedTextFormatExtraNewlines(self):
    self.WriteContext('tokenized-text')
    with open(self.corpus_file, 'w') as f:
      f.write((u'\n\n\n' + TOKENIZED_DOCS + u'\n\n\n').encode('utf-8'))
    self.ValidateDocuments()
    self.BuildLexicon()
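
  # feature_vocab should return the stored vocabulary for the 'words'
  # embedding (plus the special <UNKNOWN> and <OUTSIDE> entries) and an
  # empty vocabulary for an unknown embedding name.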
  def testFeatureVocab(self):
    words_vocab_op = gen_parser_ops.feature_vocab(
        task_context=self.context_file)
    foo_vocab_op = gen_parser_ops.feature_vocab(
        task_context=self.context_file, embedding_name='foo')
    with self.test_session() as sess:
      words_vocab, foo_vocab = sess.run([words_vocab_op, foo_vocab_op])
      self.assertEqual(0, len(foo_vocab))

      # Explicitly generate the expected vocabulary from the test documents.
      expected_vocab = set(['<UNKNOWN>', '<OUTSIDE>'])
      for doc in [CONLL_DOC1, CONLL_DOC2]:
        for line in doc.split('\n'):
          expected_vocab.add(line.split(' ')[1])
      actual_vocab = set(s.decode('utf-8') for s in words_vocab)
      self.assertEqual(expected_vocab, actual_vocab)


if __name__ == '__main__':
  googletest.main()