"""Utilities for reading and writing sentences in dragnn."""

import tensorflow as tf

from syntaxnet.ops import gen_parser_ops
  4. class ConllSentenceReader(object):
  5. """A reader for conll files, with optional projectivizing."""
  6. def __init__(self, filepath, batch_size=32,
  7. projectivize=False, morph_to_pos=False):
  8. self._graph = tf.Graph()
  9. self._session = tf.Session(graph=self._graph)
  10. task_context_str = """
  11. input {
  12. name: 'documents'
  13. record_format: 'conll-sentence'
  14. Part {
  15. file_pattern: '%s'
  16. }
  17. }""" % filepath
  18. if morph_to_pos:
  19. task_context_str += """
  20. Parameter {
  21. name: "join_category_to_pos"
  22. value: "true"
  23. }
  24. Parameter {
  25. name: "add_pos_as_attribute"
  26. value: "true"
  27. }
  28. Parameter {
  29. name: "serialize_morph_to_pos"
  30. value: "true"
  31. }
  32. """
  33. with self._graph.as_default():
  34. self._source, self._is_last = gen_parser_ops.document_source(
  35. task_context_str=task_context_str, batch_size=batch_size)
  36. self._source = gen_parser_ops.well_formed_filter(self._source)
  37. if projectivize:
  38. self._source = gen_parser_ops.projectivize_filter(self._source)
  39. def read(self):
  40. """Reads a single batch of sentences."""
  41. if self._session:
  42. sentences, is_last = self._session.run([self._source, self._is_last])
  43. if is_last:
  44. self._session.close()
  45. self._session = None
  46. else:
  47. sentences, is_last = [], True
  48. return sentences, is_last
  49. def corpus(self):
  50. """Reads the entire corpus, and returns in a list."""
  51. tf.logging.info('Reading corpus...')
  52. corpus = []
  53. while True:
  54. sentences, is_last = self.read()
  55. corpus.extend(sentences)
  56. if is_last:
  57. break
  58. tf.logging.info('Read %d sentences.' % len(corpus))
  59. return corpus