# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""SyntaxNet lexicon utils."""
import os.path
import tensorflow as tf
from syntaxnet import task_spec_pb2
from syntaxnet.ops import gen_parser_ops
  20. def create_lexicon_context(path):
  21. """Construct a SyntaxNet TaskContext file for standard lexical resources."""
  22. context = task_spec_pb2.TaskSpec()
  23. for name in [
  24. 'word-map', 'tag-map', 'tag-to-category', 'lcword-map', 'category-map',
  25. 'char-map', 'char-ngram-map', 'label-map', 'prefix-table', 'suffix-table'
  26. ]:
  27. context.input.add(name=name).part.add(file_pattern=os.path.join(path, name))
  28. return context
  29. def build_lexicon(output_path,
  30. training_corpus_path,
  31. tf_master='',
  32. training_corpus_format='conll-sentence',
  33. morph_to_pos=False,
  34. **kwargs):
  35. """Constructs a SyntaxNet lexicon at the given path.
  36. Args:
  37. output_path: Location to construct the lexicon.
  38. training_corpus_path: Path to CONLL formatted training data.
  39. tf_master: TensorFlow master executor (string, defaults to '' to use the
  40. local instance).
  41. training_corpus_format: Format of the training corpus (defaults to CONLL;
  42. search for REGISTER_SYNTAXNET_DOCUMENT_FORMAT for other formats).
  43. morph_to_pos: Whether to serialize morph attributes to the tag field,
  44. combined with category and fine POS tag.
  45. **kwargs: Forwarded to the LexiconBuilder op.
  46. """
  47. context = create_lexicon_context(output_path)
  48. if morph_to_pos:
  49. context.parameter.add(name='join_category_to_pos', value='true')
  50. context.parameter.add(name='add_pos_as_attribute', value='true')
  51. context.parameter.add(name='serialize_morph_to_pos', value='true')
  52. # Add the training data to the context.
  53. resource = context.input.add()
  54. resource.name = 'corpus'
  55. resource.record_format.extend([training_corpus_format])
  56. part = resource.part.add()
  57. part.file_pattern = training_corpus_path
  58. # Run the lexicon builder op.
  59. with tf.Session(tf_master) as sess:
  60. sess.run(
  61. gen_parser_ops.lexicon_builder(
  62. task_context_str=str(context), corpus_name='corpus', **kwargs))