text_formats_test.py

# coding=utf-8
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
  16. """Tests for english_tokenizer."""
  17. # disable=no-name-in-module,unused-import,g-bad-import-order,maybe-no-member
  18. import os.path
  19. import tensorflow as tf
  20. import syntaxnet.load_parser_ops
  21. from tensorflow.python.framework import test_util
  22. from tensorflow.python.platform import googletest
  23. from tensorflow.python.platform import tf_logging as logging
  24. from syntaxnet import sentence_pb2
  25. from syntaxnet import task_spec_pb2
  26. from syntaxnet.ops import gen_parser_ops
  27. FLAGS = tf.app.flags.FLAGS
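

# These tests exercise the gen_parser_ops.document_source op over the corpus
# formats wired up in the task context below: 'english-text',
# 'untokenized-text', 'conll-sentence', 'sentence-prototext', and
# 'segment-train-data'.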
class TextFormatsTest(test_util.TensorFlowTestCase):

  def setUp(self):
    if not hasattr(FLAGS, 'test_srcdir'):
      FLAGS.test_srcdir = ''
    if not hasattr(FLAGS, 'test_tmpdir'):
      FLAGS.test_tmpdir = tf.test.get_temp_dir()
    self.corpus_file = os.path.join(FLAGS.test_tmpdir, 'documents.conll')
    self.context_file = os.path.join(FLAGS.test_tmpdir, 'context.pbtxt')
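
  # Appends an input resource named `name` to the TaskSpec, recording its
  # record format and a single part with the given file pattern.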
  def AddInput(self, name, file_pattern, record_format, context):
    inp = context.input.add()
    inp.name = name
    inp.record_format.append(record_format)
    inp.part.add().file_pattern = file_pattern
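
  # Appends a named parameter to the TaskSpec. Not used by the tests below,
  # but kept as a small helper alongside AddInput.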
  def AddParameter(self, name, value, context):
    param = context.parameter.add()
    param.name = name
    param.value = value
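
  # Builds a minimal TaskSpec whose 'documents' input points at the test
  # corpus in `corpus_format`, adds empty entries for the term-map resources,
  # and writes it to self.context_file as prototext. The written context looks
  # roughly like the sketch below (paths abbreviated, not byte-exact):
  #
  #   input {
  #     name: "documents"
  #     record_format: "conll-sentence"   # or whichever corpus_format was passed
  #     part {
  #       file_pattern: ".../documents.conll"
  #     }
  #   }
  #   input {
  #     name: "word-map"
  #     ...
  #   }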
  def WriteContext(self, corpus_format):
    context = task_spec_pb2.TaskSpec()
    self.AddInput('documents', self.corpus_file, corpus_format, context)
    for name in ('word-map', 'lcword-map', 'tag-map', 'category-map',
                 'label-map', 'prefix-table', 'suffix-table',
                 'tag-to-category'):
      self.AddInput(name, os.path.join(FLAGS.test_tmpdir, name), '', context)
    logging.info('Writing context to: %s', self.context_file)
    with open(self.context_file, 'w') as f:
      f.write(str(context))
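
  # Runs the document_source output tensor once and parses the first
  # serialized Sentence proto; returns None once the source is exhausted.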
  def ReadNextDocument(self, sess, sentence):
    sentence_str, = sess.run([sentence])
    if sentence_str:
      sentence_doc = sentence_pb2.Sentence()
      sentence_doc.ParseFromString(sentence_str[0])
    else:
      sentence_doc = None
    return sentence_doc
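
  # Writes `sentence` as an 'english-text' corpus, reads it back through
  # document_source, and checks that the tokens joined by single spaces equal
  # `tokenization`.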
  def CheckTokenization(self, sentence, tokenization):
    self.WriteContext('english-text')
    logging.info('Writing text file to: %s', self.corpus_file)
    with open(self.corpus_file, 'w') as f:
      f.write(sentence)
    sentence, _ = gen_parser_ops.document_source(
        task_context=self.context_file, batch_size=1)
    with self.test_session() as sess:
      sentence_doc = self.ReadNextDocument(sess, sentence)
      self.assertEqual(' '.join([t.word
                                 for t in sentence_doc.token]), tokenization)
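
  # Same flow for the 'untokenized-text' format, which (as exercised in
  # testUntokenized) emits one token per character; `starts` and `ends` are
  # byte offsets into the UTF-8 encoded text.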
  def CheckUntokenizedDoc(self, sentence, words, starts, ends):
    self.WriteContext('untokenized-text')
    logging.info('Writing text file to: %s', self.corpus_file)
    with open(self.corpus_file, 'w') as f:
      f.write(sentence)
    sentence, _ = gen_parser_ops.document_source(
        task_context=self.context_file, batch_size=1)
    with self.test_session() as sess:
      sentence_doc = self.ReadNextDocument(sess, sentence)
      self.assertEqual(len(sentence_doc.token), len(words))
      self.assertEqual(len(sentence_doc.token), len(starts))
      self.assertEqual(len(sentence_doc.token), len(ends))
      for i, token in enumerate(sentence_doc.token):
        self.assertEqual(token.word.encode('utf-8'), words[i])
        self.assertEqual(token.start, starts[i])
        self.assertEqual(token.end, ends[i])
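
  # Each Chinese character below is 3 bytes in UTF-8, so the byte offsets are
  # 0, 3, 6, 9 with inclusive ends 2, 5, 8, 11.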
  def testUntokenized(self):
    self.CheckUntokenizedDoc('一个测试', ['一', '个', '测', '试'], [0, 3, 6, 9],
                             [2, 5, 8, 11])
    self.CheckUntokenizedDoc('Hello ', ['H', 'e', 'l', 'l', 'o', ' '],
                             [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5])
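
  # The corpus lines below follow the CoNLL-U column order: ID, FORM, LEMMA,
  # UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC. SpaceAfter=No in MISC
  # suppresses the space when the sentence text is reconstructed.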
  def testConllSentence(self):
    # This test sentence includes a multiword token and an empty node,
    # both of which are to be ignored.
    test_sentence = """
1-2 We've _
1 We we PRON PRP Case=Nom 3 nsubj _ SpaceAfter=No
2 've have AUX VBP Mood=Ind 3 aux _ _
3 moved move VERB VBN Tense=Past 0 root _ _
4 on on ADV RB _ 3 advmod _ SpaceAfter=No
4.1 ignored ignore VERB VBN Tense=Past 0 _ _ _
5 . . PUNCT . _ 3 punct _ _
"""

    # Prepare test sentence.
    with open(self.corpus_file, 'w') as f:
      f.write(test_sentence)

    # Prepare context.
    self.WriteContext('conll-sentence')

    # Test converted sentence.
    sentence, _ = gen_parser_ops.document_source(
        task_context=self.context_file, batch_size=1)

    # Expected texts, words, and start/end offsets.
    expected_text = u'We\'ve moved on.'
    expected_words = [u'We', u'\'ve', u'moved', u'on', u'.']
    expected_starts = [0, 2, 6, 12, 14]
    expected_ends = [1, 4, 10, 13, 14]

    with self.test_session() as sess:
      sentence_doc = self.ReadNextDocument(sess, sentence)
      self.assertEqual(expected_text, sentence_doc.text)
      self.assertEqual(expected_words, [t.word for t in sentence_doc.token])
      self.assertEqual(expected_starts, [t.start for t in sentence_doc.token])
      self.assertEqual(expected_ends, [t.end for t in sentence_doc.token])

  def testSentencePrototext(self):
    # Note: lstrip() is used to avoid an empty line at the beginning, which
    # would cause an empty record to be emitted. Such empty records are not
    # supported by the sentence-prototext format (which is currently mostly
    # for debugging).
    test_sentence = """
text: "fair enough; you people have eaten me."
token {
  word: "fair"
  start: 0
  end: 3
  break_level: NO_BREAK
}
token {
  word: "enough"
  start: 5
  end: 10
  head: 0
  break_level: SPACE_BREAK
}
""".lstrip()

    # Prepare test sentence.
    with open(self.corpus_file, 'w') as f:
      f.write(test_sentence)

    # Prepare context.
    self.WriteContext('sentence-prototext')

    # Test converted sentence.
    sentence, _ = gen_parser_ops.document_source(
        task_context=self.context_file, batch_size=1)

    # Expected texts, words, and start/end offsets.
    expected_text = u'fair enough; you people have eaten me.'
    expected_words = [u'fair', u'enough']
    expected_starts = [0, 5]
    expected_ends = [3, 10]

    with self.test_session() as sess:
      sentence_doc = self.ReadNextDocument(sess, sentence)
      self.assertEqual(expected_text, sentence_doc.text)
      self.assertEqual(expected_words, [t.word for t in sentence_doc.token])
      self.assertEqual(expected_starts, [t.start for t in sentence_doc.token])
      self.assertEqual(expected_ends, [t.end for t in sentence_doc.token])
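
  # In the segmentation training data below, a break level of 1 appears to
  # mark the first character of a word and 0 a continuation character
  # (matching SPACE_BREAK = 1 and NO_BREAK = 0 in the Sentence proto's
  # break-level enum); this is inferred from the expected values rather than
  # stated in the test itself.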
  def testSegmentationTrainingData(self):
    doc1_lines = ['测试 NO_SPACE\n', '的 NO_SPACE\n', '句子 NO_SPACE']
    doc1_text = '测试的句子'
    doc1_tokens = ['测', '试', '的', '句', '子']
    doc1_break_levels = [1, 0, 1, 1, 0]
    doc2_lines = [
        'That NO_SPACE\n', '\'s SPACE\n', 'a SPACE\n', 'good SPACE\n',
        'point NO_SPACE\n', '. NO_SPACE'
    ]
    doc2_text = 'That\'s a good point.'
    doc2_tokens = [
        'T', 'h', 'a', 't', '\'', 's', ' ', 'a', ' ', 'g', 'o', 'o', 'd', ' ',
        'p', 'o', 'i', 'n', 't', '.'
    ]
    doc2_break_levels = [
        1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1
    ]
    self.CheckSegmentationTrainingData(doc1_lines, doc1_text, doc1_tokens,
                                       doc1_break_levels)
    self.CheckSegmentationTrainingData(doc2_lines, doc2_text, doc2_tokens,
                                       doc2_break_levels)
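
  # Writes the 'word SPACE|NO_SPACE' lines as a 'segment-train-data' corpus
  # and checks the reconstructed text, per-character tokens, and break levels.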
  def CheckSegmentationTrainingData(self, doc_lines, doc_text, doc_words,
                                    break_levels):
    # Prepare context.
    self.WriteContext('segment-train-data')

    # Prepare test sentence.
    with open(self.corpus_file, 'w') as f:
      f.write(''.join(doc_lines))

    # Test converted sentence.
    sentence, _ = gen_parser_ops.document_source(
        task_context=self.context_file, batch_size=1)

    with self.test_session() as sess:
      sentence_doc = self.ReadNextDocument(sess, sentence)
      self.assertEqual(doc_text.decode('utf-8'), sentence_doc.text)
      self.assertEqual([t.decode('utf-8') for t in doc_words],
                       [t.word for t in sentence_doc.token])
      self.assertEqual(break_levels,
                       [t.break_level for t in sentence_doc.token])
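
  # The expected tokenizations follow Penn Treebank conventions: quotes become
  # `` and '', all bracket types map to -LRB-/-RRB- here, and clitics such as
  # n't and 's are split into their own tokens.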
  def testSimple(self):
    self.CheckTokenization('Hello, world!', 'Hello , world !')
    self.CheckTokenization('"Hello"', "`` Hello ''")
    self.CheckTokenization('{"Hello@#$', '-LRB- `` Hello @ # $')
    self.CheckTokenization('"Hello..."', "`` Hello ... ''")
    self.CheckTokenization('()[]{}<>',
                           '-LRB- -RRB- -LRB- -RRB- -LRB- -RRB- < >')
    self.CheckTokenization('Hello--world', 'Hello -- world')
    self.CheckTokenization("Isn't", "Is n't")
    self.CheckTokenization("n't", "n't")
    self.CheckTokenization('Hello Mr. Smith.', 'Hello Mr. Smith .')
    self.CheckTokenization("It's Mr. Smith's.", "It 's Mr. Smith 's .")
    self.CheckTokenization("It's the Smiths'.", "It 's the Smiths ' .")
    self.CheckTokenization('Gotta go', 'Got ta go')
    self.CheckTokenization('50-year-old', '50-year-old')

  def testUrl(self):
    self.CheckTokenization('http://www.google.com/news is down',
                           'http : //www.google.com/news is down')


if __name__ == '__main__':
  googletest.main()