sentence_features.h 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318
  1. /* Copyright 2016 Google Inc. All Rights Reserved.
  2. Licensed under the Apache License, Version 2.0 (the "License");
  3. you may not use this file except in compliance with the License.
  4. You may obtain a copy of the License at
  5. http://www.apache.org/licenses/LICENSE-2.0
  6. Unless required by applicable law or agreed to in writing, software
  7. distributed under the License is distributed on an "AS IS" BASIS,
  8. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  9. See the License for the specific language governing permissions and
  10. limitations under the License.
  11. ==============================================================================*/
  12. // Features that operate on Sentence objects. Most features are defined
  13. // in this header so they may be re-used via composition into other more
  14. // advanced feature classes.
  15. #ifndef $TARGETDIR_SENTENCE_FEATURES_H_
  16. #define $TARGETDIR_SENTENCE_FEATURES_H_
  17. #include "syntaxnet/affix.h"
  18. #include "syntaxnet/feature_extractor.h"
  19. #include "syntaxnet/feature_types.h"
  20. #include "syntaxnet/shared_store.h"
  21. #include "syntaxnet/task_context.h"
  22. #include "syntaxnet/workspace.h"
  23. namespace syntaxnet {
  24. // Feature function for any component that processes Sentences, whose
  25. // focus is a token index into the sentence.
  26. typedef FeatureFunction<Sentence, int> SentenceFeature;
  27. // Alias for Locator type features that take (Sentence, int) signatures
  28. // and call other (Sentence, int) features.
  29. template <class DER>
  30. using Locator = FeatureLocator<DER, Sentence, int>;
  31. class TokenLookupFeature : public SentenceFeature {
  32. public:
  33. void Init(TaskContext *context) override {
  34. set_feature_type(new ResourceBasedFeatureType<TokenLookupFeature>(
  35. name(), this, {{NumValues(), "<OUTSIDE>"}}));
  36. }
  37. // Given a position in a sentence and workspaces, looks up the corresponding
  38. // feature value. The index is relative to the start of the sentence.
  39. virtual FeatureValue ComputeValue(const Token &token) const = 0;
  40. // Number of unique values.
  41. virtual int64 NumValues() const = 0;
  42. // Convert the numeric value of the feature to a human readable string.
  43. virtual string GetFeatureValueName(FeatureValue value) const = 0;
  44. // Name of the shared workspace.
  45. virtual string WorkspaceName() const = 0;
  46. // Runs ComputeValue for each token in the sentence.
  47. void Preprocess(WorkspaceSet *workspaces,
  48. Sentence *sentence) const override {
  49. if (workspaces->Has<VectorIntWorkspace>(workspace_)) return;
  50. VectorIntWorkspace *workspace = new VectorIntWorkspace(
  51. sentence->token_size());
  52. for (int i = 0; i < sentence->token_size(); ++i) {
  53. const int value = ComputeValue(sentence->token(i));
  54. workspace->set_element(i, value);
  55. }
  56. workspaces->Set<VectorIntWorkspace>(workspace_, workspace);
  57. }
  58. // Requests a vector of int's to store in the workspace registry.
  59. void RequestWorkspaces(WorkspaceRegistry *registry) override {
  60. workspace_ = registry->Request<VectorIntWorkspace>(WorkspaceName());
  61. }
  62. // Returns the precomputed value, or NumValues() for features outside
  63. // the sentence.
  64. FeatureValue Compute(const WorkspaceSet &workspaces,
  65. const Sentence &sentence, int focus,
  66. const FeatureVector *result) const override {
  67. if (focus < 0 || focus >= sentence.token_size()) return NumValues();
  68. return workspaces.Get<VectorIntWorkspace>(workspace_).element(focus);
  69. }
  70. private:
  71. int workspace_;
  72. };
  73. // Lookup feature that uses a TermFrequencyMap to store a string->int mapping.
  74. class TermFrequencyMapFeature : public TokenLookupFeature {
  75. public:
  76. explicit TermFrequencyMapFeature(const string &input_name)
  77. : input_name_(input_name), min_freq_(0), max_num_terms_(0) {}
  78. ~TermFrequencyMapFeature() override;
  79. // Requests the input map as a resource.
  80. void Setup(TaskContext *context) override;
  81. // Loads the input map into memory (using SharedStore to avoid redundancy.)
  82. void Init(TaskContext *context) override;
  83. // Number of unique values.
  84. virtual int64 NumValues() const { return term_map_->Size() + 1; }
  85. // Special value for strings not in the map.
  86. FeatureValue UnknownValue() const { return term_map_->Size(); }
  87. // Uses the TermFrequencyMap to lookup the string associated with a value.
  88. string GetFeatureValueName(FeatureValue value) const override;
  89. // Name of the shared workspace.
  90. string WorkspaceName() const override;
  91. protected:
  92. const TermFrequencyMap &term_map() const { return *term_map_; }
  93. private:
  94. // Shortcut pointer to shared map. Not owned.
  95. const TermFrequencyMap *term_map_ = nullptr;
  96. // Name of the input for the term map.
  97. string input_name_;
  98. // Filename of the underlying resource.
  99. string file_name_;
  100. // Minimum frequency for term map.
  101. int min_freq_;
  102. // Maximum number of terms for term map.
  103. int max_num_terms_;
  104. };
  105. class Word : public TermFrequencyMapFeature {
  106. public:
  107. Word() : TermFrequencyMapFeature("word-map") {}
  108. FeatureValue ComputeValue(const Token &token) const override {
  109. string form = token.word();
  110. return term_map().LookupIndex(form, UnknownValue());
  111. }
  112. };
  113. class LowercaseWord : public TermFrequencyMapFeature {
  114. public:
  115. LowercaseWord() : TermFrequencyMapFeature("lc-word-map") {}
  116. FeatureValue ComputeValue(const Token &token) const override {
  117. const string lcword = utils::Lowercase(token.word());
  118. return term_map().LookupIndex(lcword, UnknownValue());
  119. }
  120. };
  121. class Tag : public TermFrequencyMapFeature {
  122. public:
  123. Tag() : TermFrequencyMapFeature("tag-map") {}
  124. FeatureValue ComputeValue(const Token &token) const override {
  125. return term_map().LookupIndex(token.tag(), UnknownValue());
  126. }
  127. };
  128. class Label : public TermFrequencyMapFeature {
  129. public:
  130. Label() : TermFrequencyMapFeature("label-map") {}
  131. FeatureValue ComputeValue(const Token &token) const override {
  132. return term_map().LookupIndex(token.label(), UnknownValue());
  133. }
  134. };
  135. class LexicalCategoryFeature : public TokenLookupFeature {
  136. public:
  137. LexicalCategoryFeature(const string &name, int cardinality)
  138. : name_(name), cardinality_(cardinality) {}
  139. ~LexicalCategoryFeature() override {}
  140. FeatureValue NumValues() const override { return cardinality_; }
  141. // Returns the identifier for the workspace for this preprocessor.
  142. string WorkspaceName() const override {
  143. return tensorflow::strings::StrCat(name_, ":", cardinality_);
  144. }
  145. private:
  146. // Name of the category type.
  147. const string name_;
  148. // Number of values.
  149. const int cardinality_;
  150. };
  151. // Preprocessor that computes whether a word has a hyphen or not.
  152. class Hyphen : public LexicalCategoryFeature {
  153. public:
  154. // Enumeration of values.
  155. enum Category {
  156. NO_HYPHEN = 0,
  157. HAS_HYPHEN = 1,
  158. CARDINALITY = 2,
  159. };
  160. // Default constructor.
  161. Hyphen() : LexicalCategoryFeature("hyphen", CARDINALITY) {}
  162. // Returns a string representation of the enum value.
  163. string GetFeatureValueName(FeatureValue value) const override;
  164. // Returns the category value for the token.
  165. FeatureValue ComputeValue(const Token &token) const override;
  166. };
  167. // Preprocessor that computes whether a word has a hyphen or not.
  168. class Digit : public LexicalCategoryFeature {
  169. public:
  170. // Enumeration of values.
  171. enum Category {
  172. NO_DIGIT = 0,
  173. SOME_DIGIT = 1,
  174. ALL_DIGIT = 2,
  175. CARDINALITY = 3,
  176. };
  177. // Default constructor.
  178. Digit() : LexicalCategoryFeature("digit", CARDINALITY) {}
  179. // Returns a string representation of the enum value.
  180. string GetFeatureValueName(FeatureValue value) const override;
  181. // Returns the category value for the token.
  182. FeatureValue ComputeValue(const Token &token) const override;
  183. };
  184. // TokenLookupPreprocessor object to compute prefixes and suffixes of words. The
  185. // AffixTable is stored in the SharedStore. This is very similar to the
  186. // implementation of TermFrequencyMapPreprocessor, but using an AffixTable to
  187. // perform the lookups. There are only two specializations, for prefixes and
  188. // suffixes.
  189. class AffixTableFeature : public TokenLookupFeature {
  190. public:
  191. // Explicit constructor to set the type of the table. This determines the
  192. // requested input.
  193. explicit AffixTableFeature(AffixTable::Type type);
  194. ~AffixTableFeature() override;
  195. // Requests inputs for the affix table.
  196. void Setup(TaskContext *context) override;
  197. // Loads the affix table from the SharedStore.
  198. void Init(TaskContext *context) override;
  199. // The workspace name is specific to which affix length we are computing.
  200. string WorkspaceName() const override;
  201. // Returns the total number of affixes in the table, regardless of specified
  202. // length.
  203. FeatureValue NumValues() const override { return affix_table_->size() + 1; }
  204. // Special value for strings not in the map.
  205. FeatureValue UnknownValue() const { return affix_table_->size(); }
  206. // Looks up the affix for a given word.
  207. FeatureValue ComputeValue(const Token &token) const override;
  208. // Returns the string associated with a value.
  209. string GetFeatureValueName(FeatureValue value) const override;
  210. private:
  211. // Size parameter for the affix table.
  212. int affix_length_;
  213. // Name of the input for the table.
  214. string input_name_;
  215. // The type of the affix table.
  216. const AffixTable::Type type_;
  217. // Affix table used for indexing. This comes from the shared store, and is not
  218. // owned directly.
  219. const AffixTable *affix_table_ = nullptr;
  220. };
  221. // Specific instantiation for computing prefixes. This requires the input
  222. // "prefix-table".
  223. class PrefixFeature : public AffixTableFeature {
  224. public:
  225. PrefixFeature() : AffixTableFeature(AffixTable::PREFIX) {}
  226. };
  227. // Specific instantiation for computing suffixes. Requires the input
  228. // "suffix-table."
  229. class SuffixFeature : public AffixTableFeature {
  230. public:
  231. SuffixFeature() : AffixTableFeature(AffixTable::SUFFIX) {}
  232. };
  233. // Offset locator. Simple locator: just changes the focus by some offset.
  234. class Offset : public Locator<Offset> {
  235. public:
  236. void UpdateArgs(const WorkspaceSet &workspaces,
  237. const Sentence &sentence, int *focus) const {
  238. *focus += argument();
  239. }
  240. };
  241. typedef FeatureExtractor<Sentence, int> SentenceExtractor;
  // Convenience macro that registers |type| under |name| as a SentenceFeature
  // by forwarding to REGISTER_FEATURE_FUNCTION.
  #define REGISTER_SENTENCE_IDX_FEATURE(name, type) \
    REGISTER_FEATURE_FUNCTION(SentenceFeature, name, type)
  245. } // namespace syntaxnet
  246. #endif // $TARGETDIR_SENTENCE_FEATURES_H_