populate_test_inputs.h 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154
  1. /* Copyright 2016 Google Inc. All Rights Reserved.
  2. Licensed under the Apache License, Version 2.0 (the "License");
  3. you may not use this file except in compliance with the License.
  4. You may obtain a copy of the License at
  5. http://www.apache.org/licenses/LICENSE-2.0
  6. Unless required by applicable law or agreed to in writing, software
  7. distributed under the License is distributed on an "AS IS" BASIS,
  8. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  9. See the License for the specific language governing permissions and
  10. limitations under the License.
  11. ==============================================================================*/
  12. // A utility for populating a set of inputs of a task. This knows how to create
  13. // tag-map, category-map, label-map and has hooks to
  14. // populate other kinds of inputs. The expected set of operations is:
  15. //
  16. // Sentence document_for_init = ...;
  17. // TaskContext context;
  18. // context.SetParameter("my_parameter", "true");
  19. // MyDocumentProcessor processor;
  20. // processor.Setup(&context);
  21. // PopulateTestInputs::Defaults(document_for_init).Populate(&context);
  22. // processor.Init(&context);
  23. //
  24. // This will check the inputs requested by the processor's Setup(TaskContext *)
  25. // function, and create files corresponding to them. For example, if the processor
  26. // asked for a "tag-map" input, it will create a TermFrequencyMap, populate
  27. // it with the POS tags found in the Sentence document_for_init, save it to disk
  28. // and update the TaskContext with the location of the file. By convention, the
  29. // location is the name of the input. Conceptually, the logic is very simple:
  30. //
  31. // for (TaskInput &input : context->mutable_spec()->mutable_input()) {
  32. // creators[input.name()](&input);
  33. // // check for missing inputs, incompatible formats, etc...
  34. // }
  35. //
  36. // The Populate() routine will also check compatibility between requested and
  37. // supplied formats. The Default mapping knows how to populate the following
  38. // inputs:
  39. //
  40. // - category-map: TermFrequencyMap containing POS categories.
  41. //
  42. // - label-map: TermFrequencyMap containing parser labels.
  43. //
  44. // - tag-map: TermFrequencyMap containing POS tags.
  45. //
  46. // - tag-to-category: StringToStringMap mapping POS tags to categories.
  47. //
  48. // - word-map: TermFrequencyMap containing words.
  49. //
  50. // Clients can add creation routines by defining a std::function:
  51. //
  52. // auto creators = PopulateTestInputs::Defaults(document_for_init);
  53. // creators["my-input"] = [](TaskInput *input) { ...; };
  54. //
  55. // See also creators.Add() for more convenience functions.
  56. #ifndef SYNTAXNET_POPULATE_TEST_INPUTS_H_
  57. #define SYNTAXNET_POPULATE_TEST_INPUTS_H_
  58. #include <functional>
  59. #include <string>
  60. #include <unordered_map>
  61. #include <vector>
  62. #include "syntaxnet/utils.h"
  63. namespace syntaxnet {
  64. class Sentence;
  65. class TaskContext;
  66. class TaskInput;
  67. class TaskOutput;
  68. class Token;
  69. class PopulateTestInputs {
  70. public:
  71. // When called, Create() should populate an input by creating a file and
  72. // adding one or more parts to the TaskInput.
  73. typedef std::function<void(TaskInput *)> Create;
  74. // When called, CreateFile() should create a file resource at the given
  75. // path. These are typically less inconvient to write.
  76. typedef std::function<void(const string &)> CreateFile;
  77. // A set of creators, one for each input in a TaskContext.
  78. class CreatorMap : public std::unordered_map<string, Create> {
  79. public:
  80. // A simplified way to add a single-file creator. The name of the file
  81. // location will be file::JoinPath(FLAGS_test_tmpdir, name).
  82. void Add(const string &name, const string &file_format,
  83. const string &record_format, CreateFile makefile);
  84. // Convenience method to populate the inputs in context. Returns true if it
  85. // was possible to populate each input, and false otherwise. If a mandatory
  86. // input does not have a creator, then we LOG(FATAL).
  87. bool Populate(TaskContext *context) const;
  88. };
  89. // Default creator set. This knows how to generate from a given Document
  90. // - category-map
  91. // - label-map
  92. // - tag-map
  93. // - tag-to-category
  94. // - word-map
  95. //
  96. // Note: the default creators capture the document input by value: this means
  97. // that subsequent modifications to the document will NOT be
  98. // reflected in the inputs. However, the following is perfectly valid:
  99. //
  100. // CreatorMap creators;
  101. // {
  102. // Sentence document;
  103. // creators = PopulateTestInputs::Defaults(document);
  104. // }
  105. // creators.Populate(context);
  106. static CreatorMap Defaults(const Sentence &document);
  107. // Populates the TaskContext object from a map of creator functions. Note that
  108. // this static version is compatible with any hash map of the correct type.
  109. static bool Populate(const std::unordered_map<string, Create> &creator_map,
  110. TaskContext *context);
  111. // Helper function for creating a term frequency map from a document. This
  112. // iterates over all the tokens in the document, calls token2str on each
  113. // token, and adds each returned string to the term frequency map. The map is
  114. // then saved to FLAGS_test_tmpdir/name.
  115. static Create CreateTFMapFromDocumentTokens(
  116. const Sentence &document,
  117. std::function<vector<string>(const Token &)> token2str);
  118. // Creates a StringToStringMap protocol buffer input that maps tags to
  119. // categories. Uses whatever mapping is present in the document.
  120. static Create CreateTagToCategoryFromTokens(const Sentence &document);
  121. // Default implementations for "token2str" above.
  122. static vector<string> TokenCategory(const Token &token);
  123. static vector<string> TokenLabel(const Token &token);
  124. static vector<string> TokenTag(const Token &token);
  125. static vector<string> TokenWord(const Token &token);
  126. // Utility function. Sets the TaskInput->part() fields for a new input part.
  127. // Returns the file name.
  128. static string AddPart(TaskInput *input, const string &file_format,
  129. const string &record_format);
  130. };
  131. } // namespace syntaxnet
  132. #endif // SYNTAXNET_POPULATE_TEST_INPUTS_H_