sentence_features_test.cc 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275
  1. /* Copyright 2016 Google Inc. All Rights Reserved.
  2. Licensed under the Apache License, Version 2.0 (the "License");
  3. you may not use this file except in compliance with the License.
  4. You may obtain a copy of the License at
  5. http://www.apache.org/licenses/LICENSE-2.0
  6. Unless required by applicable law or agreed to in writing, software
  7. distributed under the License is distributed on an "AS IS" BASIS,
  8. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  9. See the License for the specific language governing permissions and
  10. limitations under the License.
  11. ==============================================================================*/
  12. #include "syntaxnet/sentence_features.h"
  13. #include <string>
  14. #include <vector>
  15. #include "syntaxnet/feature_extractor.h"
  16. #include "syntaxnet/populate_test_inputs.h"
  17. #include "syntaxnet/sentence.pb.h"
  18. #include "syntaxnet/task_context.h"
  19. #include "syntaxnet/task_spec.pb.h"
  20. #include "syntaxnet/utils.h"
  21. #include "syntaxnet/workspace.h"
  22. #include <gmock/gmock.h>
  23. #include "tensorflow/core/platform/test.h"
  24. using testing::UnorderedElementsAreArray;
  25. namespace syntaxnet {
  26. // A basic fixture for testing Features. Takes a string of a
  27. // Sentence protobuf that is used as the test data in the constructor.
  28. class SentenceFeaturesTest : public ::testing::Test {
  29. protected:
  30. explicit SentenceFeaturesTest(const string &prototxt)
  31. : sentence_(ParseASCII(prototxt)),
  32. creators_(PopulateTestInputs::Defaults(sentence_)) {}
  33. static Sentence ParseASCII(const string &prototxt) {
  34. Sentence document;
  35. CHECK(TextFormat::ParseFromString(prototxt, &document));
  36. return document;
  37. }
  38. // Prepares a new feature for extracting from the attached sentence,
  39. // regenerating the TaskContext and all resources. Will automatically add
  40. // anything in info_ field into the LexiFuse repository.
  41. virtual void PrepareFeature(const string &fml) {
  42. context_.mutable_spec()->mutable_input()->Clear();
  43. context_.mutable_spec()->mutable_output()->Clear();
  44. extractor_.reset(new SentenceExtractor());
  45. extractor_->Parse(fml);
  46. extractor_->Setup(&context_);
  47. creators_.Populate(&context_);
  48. extractor_->Init(&context_);
  49. extractor_->RequestWorkspaces(&registry_);
  50. workspaces_.Reset(registry_);
  51. extractor_->Preprocess(&workspaces_, &sentence_);
  52. }
  53. // Returns the string representation of the prepared feature extracted at the
  54. // given index.
  55. virtual string ExtractFeature(int index) {
  56. FeatureVector result;
  57. extractor_->ExtractFeatures(workspaces_, sentence_, index,
  58. &result);
  59. return result.type(0)->GetFeatureValueName(result.value(0));
  60. }
  61. // Extracts a vector of string representations from evaluating the prepared
  62. // set feature (returning multiple values) at the given index.
  63. virtual std::vector<string> ExtractMultiFeature(int index) {
  64. std::vector<string> values;
  65. FeatureVector result;
  66. extractor_->ExtractFeatures(workspaces_, sentence_, index,
  67. &result);
  68. for (int i = 0; i < result.size(); ++i) {
  69. values.push_back(result.type(i)->GetFeatureValueName(result.value(i)));
  70. }
  71. return values;
  72. }
  73. // Adds an input to the task context.
  74. void AddInputToContext(const string &name, const string &file_pattern,
  75. const string &file_format,
  76. const string &record_format) {
  77. TaskInput *input = context_.GetInput(name);
  78. TaskInput::Part *part = input->add_part();
  79. part->set_file_pattern(file_pattern);
  80. part->set_file_format(file_format);
  81. part->set_record_format(record_format);
  82. }
  83. // Checks that a vector workspace is equal to a target vector.
  84. void CheckVectorWorkspace(const VectorIntWorkspace &workspace,
  85. std::vector<int> target) {
  86. std::vector<int> src;
  87. for (int i = 0; i < workspace.size(); ++i) {
  88. src.push_back(workspace.element(i));
  89. }
  90. EXPECT_THAT(src, testing::ContainerEq(target));
  91. }
  92. Sentence sentence_;
  93. WorkspaceSet workspaces_;
  94. PopulateTestInputs::CreatorMap creators_;
  95. TaskContext context_;
  96. WorkspaceRegistry registry_;
  97. std::unique_ptr<SentenceExtractor> extractor_;
  98. };
  99. // Test fixture for simple common features that operate on just a sentence.
  100. class CommonSentenceFeaturesTest : public SentenceFeaturesTest {
  101. protected:
  102. CommonSentenceFeaturesTest()
  103. : SentenceFeaturesTest(
  104. "text: 'I saw a man with a telescope.' "
  105. "token { word: 'I' start: 0 end: 0 tag: 'PRP' category: 'PRON'"
  106. " head: 1 label: 'nsubj' break_level: NO_BREAK } "
  107. "token { word: 'saw' start: 2 end: 4 tag: 'VBD' category: 'VERB'"
  108. " label: 'ROOT' break_level: SPACE_BREAK } "
  109. "token { word: 'a' start: 6 end: 6 tag: 'DT' category: 'DET'"
  110. " head: 3 label: 'det' break_level: SPACE_BREAK } "
  111. "token { word: 'man' start: 8 end: 10 tag: 'NN' category: 'NOUN'"
  112. " head: 1 label: 'dobj' break_level: SPACE_BREAK"
  113. " [syntaxnet.TokenMorphology.morphology] { "
  114. " attribute { name:'morph' value:'Sg' } "
  115. " attribute { name:'morph' value:'Masc' } "
  116. " } "
  117. "} "
  118. "token { word: 'with' start: 12 end: 15 tag: 'IN' category: 'ADP'"
  119. " head: 1 label: 'prep' break_level: SPACE_BREAK } "
  120. "token { word: 'a' start: 17 end: 17 tag: 'DT' category: 'DET'"
  121. " head: 6 label: 'det' break_level: SPACE_BREAK } "
  122. "token { word: 'telescope' start: 19 end: 27 tag: 'NN' category: "
  123. "'NOUN'"
  124. " head: 4 label: 'pobj' break_level: SPACE_BREAK } "
  125. "token { word: '.' start: 28 end: 28 tag: '.' category: '.'"
  126. " head: 1 label: 'p' break_level: NO_BREAK }") {}
  127. };
  128. TEST_F(CommonSentenceFeaturesTest, TagFeature) {
  129. PrepareFeature("tag");
  130. EXPECT_EQ("<OUTSIDE>", ExtractFeature(-1));
  131. EXPECT_EQ("PRP", ExtractFeature(0));
  132. EXPECT_EQ("VBD", ExtractFeature(1));
  133. EXPECT_EQ("DT", ExtractFeature(2));
  134. EXPECT_EQ("NN", ExtractFeature(3));
  135. EXPECT_EQ("<OUTSIDE>", ExtractFeature(8));
  136. }
  137. TEST_F(CommonSentenceFeaturesTest, TagFeaturePassesArgs) {
  138. PrepareFeature("tag(min-freq=5)"); // don't load any tags
  139. EXPECT_EQ(ExtractFeature(-1), "<OUTSIDE>");
  140. EXPECT_EQ(ExtractFeature(0), "<UNKNOWN>");
  141. EXPECT_EQ(ExtractFeature(8), "<OUTSIDE>");
  142. // Only 2 features: <UNKNOWN> and <OUTSIDE>.
  143. EXPECT_EQ(2, extractor_->feature_type(0)->GetDomainSize());
  144. }
  145. TEST_F(CommonSentenceFeaturesTest, OffsetPlusTag) {
  146. PrepareFeature("offset(-1).tag(min-freq=2)");
  147. EXPECT_EQ("<OUTSIDE>", ExtractFeature(-1));
  148. EXPECT_EQ("<OUTSIDE>", ExtractFeature(0));
  149. EXPECT_EQ("<UNKNOWN>", ExtractFeature(1));
  150. EXPECT_EQ("<UNKNOWN>", ExtractFeature(2));
  151. EXPECT_EQ("DT", ExtractFeature(3)); // DT, NN are the only freq tags
  152. EXPECT_EQ("NN", ExtractFeature(4));
  153. EXPECT_EQ("<UNKNOWN>", ExtractFeature(5));
  154. EXPECT_EQ("DT", ExtractFeature(6));
  155. EXPECT_EQ("NN", ExtractFeature(7));
  156. EXPECT_EQ("<UNKNOWN>", ExtractFeature(8));
  157. EXPECT_EQ("<OUTSIDE>", ExtractFeature(9));
  158. }
  159. TEST_F(CommonSentenceFeaturesTest, CharNgramFeature) {
  160. TermFrequencyMap char_ngram_map;
  161. char_ngram_map.Increment("a");
  162. char_ngram_map.Increment("aw");
  163. char_ngram_map.Increment("sa");
  164. creators_.Add(
  165. "char-ngram-map", "text", "",
  166. [&char_ngram_map](const string &path) { char_ngram_map.Save(path); });
  167. // Test that CharNgram works as expected.
  168. PrepareFeature("char-ngram");
  169. EXPECT_EQ("", utils::Join(ExtractMultiFeature(-1), ","));
  170. EXPECT_EQ("", utils::Join(ExtractMultiFeature(0), ","));
  171. EXPECT_EQ("sa,a,aw", utils::Join(ExtractMultiFeature(1), ","));
  172. EXPECT_EQ("a", utils::Join(ExtractMultiFeature(2), ","));
  173. EXPECT_EQ("a", utils::Join(ExtractMultiFeature(3), ","));
  174. EXPECT_EQ("", utils::Join(ExtractMultiFeature(8), ","));
  175. }
  176. TEST_F(CommonSentenceFeaturesTest, MorphologySetFeature) {
  177. TermFrequencyMap morphology_map;
  178. morphology_map.Increment("morph=Sg");
  179. morphology_map.Increment("morph=Sg");
  180. morphology_map.Increment("morph=Masc");
  181. morphology_map.Increment("morph=Masc");
  182. morphology_map.Increment("morph=Pl");
  183. creators_.Add(
  184. "morphology-map", "text", "",
  185. [&morphology_map](const string &path) { morphology_map.Save(path); });
  186. // Test that CharNgram works as expected.
  187. PrepareFeature("morphology-set");
  188. EXPECT_EQ("", utils::Join(ExtractMultiFeature(-1), ","));
  189. EXPECT_EQ("", utils::Join(ExtractMultiFeature(0), ","));
  190. EXPECT_EQ("morph=Sg,morph=Masc", utils::Join(ExtractMultiFeature(3), ","));
  191. }
  192. TEST_F(CommonSentenceFeaturesTest, CapitalizationProcessesCorrectly) {
  193. Capitalization feature;
  194. feature.RequestWorkspaces(&registry_);
  195. workspaces_.Reset(registry_);
  196. feature.Preprocess(&workspaces_, &sentence_);
  197. // Check the workspace contains what we expect.
  198. EXPECT_TRUE(workspaces_.Has<VectorIntWorkspace>(feature.Workspace()));
  199. const VectorIntWorkspace &workspace =
  200. workspaces_.Get<VectorIntWorkspace>(feature.Workspace());
  201. constexpr int UPPERCASE = Capitalization::UPPERCASE;
  202. constexpr int LOWERCASE = Capitalization::LOWERCASE;
  203. constexpr int NON_ALPHABETIC = Capitalization::NON_ALPHABETIC;
  204. CheckVectorWorkspace(workspace,
  205. {UPPERCASE, LOWERCASE, LOWERCASE, LOWERCASE, LOWERCASE,
  206. LOWERCASE, LOWERCASE, NON_ALPHABETIC});
  207. }
  208. class CharFeatureTest : public SentenceFeaturesTest {
  209. protected:
  210. CharFeatureTest()
  211. : SentenceFeaturesTest(
  212. "text: '一 个 测 试 员 ' "
  213. "token { word: '一' start: 0 end: 2 } "
  214. "token { word: '个' start: 3 end: 5 } "
  215. "token { word: '测' start: 6 end: 8 } "
  216. "token { word: '试' start: 9 end: 11 } "
  217. "token { word: '员' start: 12 end: 14 } "
  218. "token { word: ' ' start: 15 end: 15 } "
  219. "token { word: '\t' start: 16 end: 16 } ") {}
  220. };
  221. TEST_F(CharFeatureTest, CharFeature) {
  222. TermFrequencyMap char_map;
  223. char_map.Increment("一");
  224. char_map.Increment("个");
  225. char_map.Increment("试");
  226. char_map.Increment("员");
  227. creators_.Add(
  228. "char-map", "text", "",
  229. [&char_map](const string &path) { char_map.Save(path); });
  230. // Test that Char works as expected.
  231. PrepareFeature("char");
  232. EXPECT_EQ("<OUTSIDE>", ExtractFeature(-1));
  233. EXPECT_EQ("一", ExtractFeature(0));
  234. EXPECT_EQ("个", ExtractFeature(1));
  235. EXPECT_EQ("<UNKNOWN>", ExtractFeature(2)); // "测" is not in the char map.
  236. EXPECT_EQ("试", ExtractFeature(3));
  237. EXPECT_EQ("员", ExtractFeature(4));
  238. EXPECT_EQ("<BREAK_CHAR>", ExtractFeature(5));
  239. EXPECT_EQ("<BREAK_CHAR>", ExtractFeature(6));
  240. EXPECT_EQ("<OUTSIDE>", ExtractFeature(7));
  241. }
  242. } // namespace syntaxnet