| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275 |
- /* Copyright 2016 Google Inc. All Rights Reserved.
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ==============================================================================*/
- #include "syntaxnet/sentence_features.h"
- #include <string>
- #include <vector>
- #include "syntaxnet/feature_extractor.h"
- #include "syntaxnet/populate_test_inputs.h"
- #include "syntaxnet/sentence.pb.h"
- #include "syntaxnet/task_context.h"
- #include "syntaxnet/task_spec.pb.h"
- #include "syntaxnet/utils.h"
- #include "syntaxnet/workspace.h"
- #include <gmock/gmock.h>
- #include "tensorflow/core/platform/test.h"
- using testing::UnorderedElementsAreArray;
- namespace syntaxnet {
- // A basic fixture for testing Features. Takes a string of a
- // Sentence protobuf that is used as the test data in the constructor.
- class SentenceFeaturesTest : public ::testing::Test {
- protected:
- explicit SentenceFeaturesTest(const string &prototxt)
- : sentence_(ParseASCII(prototxt)),
- creators_(PopulateTestInputs::Defaults(sentence_)) {}
- static Sentence ParseASCII(const string &prototxt) {
- Sentence document;
- CHECK(TextFormat::ParseFromString(prototxt, &document));
- return document;
- }
- // Prepares a new feature for extracting from the attached sentence,
- // regenerating the TaskContext and all resources. Will automatically add
- // anything in info_ field into the LexiFuse repository.
- virtual void PrepareFeature(const string &fml) {
- context_.mutable_spec()->mutable_input()->Clear();
- context_.mutable_spec()->mutable_output()->Clear();
- extractor_.reset(new SentenceExtractor());
- extractor_->Parse(fml);
- extractor_->Setup(&context_);
- creators_.Populate(&context_);
- extractor_->Init(&context_);
- extractor_->RequestWorkspaces(®istry_);
- workspaces_.Reset(registry_);
- extractor_->Preprocess(&workspaces_, &sentence_);
- }
- // Returns the string representation of the prepared feature extracted at the
- // given index.
- virtual string ExtractFeature(int index) {
- FeatureVector result;
- extractor_->ExtractFeatures(workspaces_, sentence_, index,
- &result);
- return result.type(0)->GetFeatureValueName(result.value(0));
- }
- // Extracts a vector of string representations from evaluating the prepared
- // set feature (returning multiple values) at the given index.
- virtual vector<string> ExtractMultiFeature(int index) {
- vector<string> values;
- FeatureVector result;
- extractor_->ExtractFeatures(workspaces_, sentence_, index,
- &result);
- for (int i = 0; i < result.size(); ++i) {
- values.push_back(result.type(i)->GetFeatureValueName(result.value(i)));
- }
- return values;
- }
- // Adds an input to the task context.
- void AddInputToContext(const string &name, const string &file_pattern,
- const string &file_format,
- const string &record_format) {
- TaskInput *input = context_.GetInput(name);
- TaskInput::Part *part = input->add_part();
- part->set_file_pattern(file_pattern);
- part->set_file_format(file_format);
- part->set_record_format(record_format);
- }
- // Checks that a vector workspace is equal to a target vector.
- void CheckVectorWorkspace(const VectorIntWorkspace &workspace,
- vector<int> target) {
- vector<int> src;
- for (int i = 0; i < workspace.size(); ++i) {
- src.push_back(workspace.element(i));
- }
- EXPECT_THAT(src, testing::ContainerEq(target));
- }
- Sentence sentence_;
- WorkspaceSet workspaces_;
- PopulateTestInputs::CreatorMap creators_;
- TaskContext context_;
- WorkspaceRegistry registry_;
- std::unique_ptr<SentenceExtractor> extractor_;
- };
- // Test fixture for simple common features that operate on just a sentence.
- class CommonSentenceFeaturesTest : public SentenceFeaturesTest {
- protected:
- CommonSentenceFeaturesTest()
- : SentenceFeaturesTest(
- "text: 'I saw a man with a telescope.' "
- "token { word: 'I' start: 0 end: 0 tag: 'PRP' category: 'PRON'"
- " head: 1 label: 'nsubj' break_level: NO_BREAK } "
- "token { word: 'saw' start: 2 end: 4 tag: 'VBD' category: 'VERB'"
- " label: 'ROOT' break_level: SPACE_BREAK } "
- "token { word: 'a' start: 6 end: 6 tag: 'DT' category: 'DET'"
- " head: 3 label: 'det' break_level: SPACE_BREAK } "
- "token { word: 'man' start: 8 end: 10 tag: 'NN' category: 'NOUN'"
- " head: 1 label: 'dobj' break_level: SPACE_BREAK"
- " [syntaxnet.TokenMorphology.morphology] { "
- " attribute { name:'morph' value:'Sg' } "
- " attribute { name:'morph' value:'Masc' } "
- " } "
- "} "
- "token { word: 'with' start: 12 end: 15 tag: 'IN' category: 'ADP'"
- " head: 1 label: 'prep' break_level: SPACE_BREAK } "
- "token { word: 'a' start: 17 end: 17 tag: 'DT' category: 'DET'"
- " head: 6 label: 'det' break_level: SPACE_BREAK } "
- "token { word: 'telescope' start: 19 end: 27 tag: 'NN' category: "
- "'NOUN'"
- " head: 4 label: 'pobj' break_level: SPACE_BREAK } "
- "token { word: '.' start: 28 end: 28 tag: '.' category: '.'"
- " head: 1 label: 'p' break_level: NO_BREAK }") {}
- };
- TEST_F(CommonSentenceFeaturesTest, TagFeature) {
- PrepareFeature("tag");
- EXPECT_EQ("<OUTSIDE>", ExtractFeature(-1));
- EXPECT_EQ("PRP", ExtractFeature(0));
- EXPECT_EQ("VBD", ExtractFeature(1));
- EXPECT_EQ("DT", ExtractFeature(2));
- EXPECT_EQ("NN", ExtractFeature(3));
- EXPECT_EQ("<OUTSIDE>", ExtractFeature(8));
- }
- TEST_F(CommonSentenceFeaturesTest, TagFeaturePassesArgs) {
- PrepareFeature("tag(min-freq=5)"); // don't load any tags
- EXPECT_EQ(ExtractFeature(-1), "<OUTSIDE>");
- EXPECT_EQ(ExtractFeature(0), "<UNKNOWN>");
- EXPECT_EQ(ExtractFeature(8), "<OUTSIDE>");
- // Only 2 features: <UNKNOWN> and <OUTSIDE>.
- EXPECT_EQ(2, extractor_->feature_type(0)->GetDomainSize());
- }
- TEST_F(CommonSentenceFeaturesTest, OffsetPlusTag) {
- PrepareFeature("offset(-1).tag(min-freq=2)");
- EXPECT_EQ("<OUTSIDE>", ExtractFeature(-1));
- EXPECT_EQ("<OUTSIDE>", ExtractFeature(0));
- EXPECT_EQ("<UNKNOWN>", ExtractFeature(1));
- EXPECT_EQ("<UNKNOWN>", ExtractFeature(2));
- EXPECT_EQ("DT", ExtractFeature(3)); // DT, NN are the only freq tags
- EXPECT_EQ("NN", ExtractFeature(4));
- EXPECT_EQ("<UNKNOWN>", ExtractFeature(5));
- EXPECT_EQ("DT", ExtractFeature(6));
- EXPECT_EQ("NN", ExtractFeature(7));
- EXPECT_EQ("<UNKNOWN>", ExtractFeature(8));
- EXPECT_EQ("<OUTSIDE>", ExtractFeature(9));
- }
- TEST_F(CommonSentenceFeaturesTest, CharNgramFeature) {
- TermFrequencyMap char_ngram_map;
- char_ngram_map.Increment("a");
- char_ngram_map.Increment("aw");
- char_ngram_map.Increment("sa");
- creators_.Add(
- "char-ngram-map", "text", "",
- [&char_ngram_map](const string &path) { char_ngram_map.Save(path); });
- // Test that CharNgram works as expected.
- PrepareFeature("char-ngram");
- EXPECT_EQ("", utils::Join(ExtractMultiFeature(-1), ","));
- EXPECT_EQ("", utils::Join(ExtractMultiFeature(0), ","));
- EXPECT_EQ("sa,a,aw", utils::Join(ExtractMultiFeature(1), ","));
- EXPECT_EQ("a", utils::Join(ExtractMultiFeature(2), ","));
- EXPECT_EQ("a", utils::Join(ExtractMultiFeature(3), ","));
- EXPECT_EQ("", utils::Join(ExtractMultiFeature(8), ","));
- }
- TEST_F(CommonSentenceFeaturesTest, MorphologySetFeature) {
- TermFrequencyMap morphology_map;
- morphology_map.Increment("morph=Sg");
- morphology_map.Increment("morph=Sg");
- morphology_map.Increment("morph=Masc");
- morphology_map.Increment("morph=Masc");
- morphology_map.Increment("morph=Pl");
- creators_.Add(
- "morphology-map", "text", "",
- [&morphology_map](const string &path) { morphology_map.Save(path); });
- // Test that CharNgram works as expected.
- PrepareFeature("morphology-set");
- EXPECT_EQ("", utils::Join(ExtractMultiFeature(-1), ","));
- EXPECT_EQ("", utils::Join(ExtractMultiFeature(0), ","));
- EXPECT_EQ("morph=Sg,morph=Masc", utils::Join(ExtractMultiFeature(3), ","));
- }
- TEST_F(CommonSentenceFeaturesTest, CapitalizationProcessesCorrectly) {
- Capitalization feature;
- feature.RequestWorkspaces(®istry_);
- workspaces_.Reset(registry_);
- feature.Preprocess(&workspaces_, &sentence_);
- // Check the workspace contains what we expect.
- EXPECT_TRUE(workspaces_.Has<VectorIntWorkspace>(feature.Workspace()));
- const VectorIntWorkspace &workspace =
- workspaces_.Get<VectorIntWorkspace>(feature.Workspace());
- constexpr int UPPERCASE = Capitalization::UPPERCASE;
- constexpr int LOWERCASE = Capitalization::LOWERCASE;
- constexpr int NON_ALPHABETIC = Capitalization::NON_ALPHABETIC;
- CheckVectorWorkspace(workspace,
- {UPPERCASE, LOWERCASE, LOWERCASE, LOWERCASE, LOWERCASE,
- LOWERCASE, LOWERCASE, NON_ALPHABETIC});
- }
- class CharFeatureTest : public SentenceFeaturesTest {
- protected:
- CharFeatureTest()
- : SentenceFeaturesTest(
- "text: '一 个 测 试 员 ' "
- "token { word: '一' start: 0 end: 2 } "
- "token { word: '个' start: 3 end: 5 } "
- "token { word: '测' start: 6 end: 8 } "
- "token { word: '试' start: 9 end: 11 } "
- "token { word: '员' start: 12 end: 14 } "
- "token { word: ' ' start: 15 end: 15 } "
- "token { word: '\t' start: 16 end: 16 } ") {}
- };
- TEST_F(CharFeatureTest, CharFeature) {
- TermFrequencyMap char_map;
- char_map.Increment("一");
- char_map.Increment("个");
- char_map.Increment("试");
- char_map.Increment("员");
- creators_.Add(
- "char-map", "text", "",
- [&char_map](const string &path) { char_map.Save(path); });
- // Test that Char works as expected.
- PrepareFeature("char");
- EXPECT_EQ("<OUTSIDE>", ExtractFeature(-1));
- EXPECT_EQ("一", ExtractFeature(0));
- EXPECT_EQ("个", ExtractFeature(1));
- EXPECT_EQ("<UNKNOWN>", ExtractFeature(2)); // "测" is not in the char map.
- EXPECT_EQ("试", ExtractFeature(3));
- EXPECT_EQ("员", ExtractFeature(4));
- EXPECT_EQ("<BREAK_CHAR>", ExtractFeature(5));
- EXPECT_EQ("<BREAK_CHAR>", ExtractFeature(6));
- EXPECT_EQ("<OUTSIDE>", ExtractFeature(7));
- }
- } // namespace syntaxnet
|