embedding_feature_extractor.h 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223
  1. /* Copyright 2016 Google Inc. All Rights Reserved.
  2. Licensed under the Apache License, Version 2.0 (the "License");
  3. you may not use this file except in compliance with the License.
  4. You may obtain a copy of the License at
  5. http://www.apache.org/licenses/LICENSE-2.0
  6. Unless required by applicable law or agreed to in writing, software
  7. distributed under the License is distributed on an "AS IS" BASIS,
  8. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  9. See the License for the specific language governing permissions and
  10. limitations under the License.
  11. ==============================================================================*/
  12. #ifndef SYNTAXNET_EMBEDDING_FEATURE_EXTRACTOR_H_
  13. #define SYNTAXNET_EMBEDDING_FEATURE_EXTRACTOR_H_
  14. #include <functional>
  15. #include <memory>
  16. #include <string>
  17. #include <vector>
  18. #include "syntaxnet/utils.h"
  19. #include "syntaxnet/feature_extractor.h"
  20. #include "syntaxnet/feature_types.h"
  21. #include "syntaxnet/parser_features.h"
  22. #include "syntaxnet/sentence_features.h"
  23. #include "syntaxnet/sparse.pb.h"
  24. #include "syntaxnet/task_context.h"
  25. #include "syntaxnet/workspace.h"
  26. #include "tensorflow/core/lib/strings/strcat.h"
  27. namespace syntaxnet {
  28. // An EmbeddingFeatureExtractor manages the extraction of features for
  29. // embedding-based models. It wraps a sequence of underlying classes of feature
  30. // extractors, along with associated predicate maps. Each class of feature
  31. // extractors is associated with a name, e.g., "words", "labels", "tags".
  32. //
  33. // The class is split between a generic abstract version,
  34. // GenericEmbeddingFeatureExtractor (that can be initialized without knowing the
  35. // signature of the ExtractFeatures method) and a typed version.
  36. //
  37. // The predicate maps must be initialized before use: they can be loaded using
  38. // Read() or updated via UpdateMapsForExample.
  39. class GenericEmbeddingFeatureExtractor {
  40. public:
  41. virtual ~GenericEmbeddingFeatureExtractor() {}
  42. // Get the prefix string to put in front of all arguments, so they don't
  43. // conflict with other embedding models.
  44. virtual const string ArgPrefix() const = 0;
  45. // Sets up predicate maps and embedding space names that are common for all
  46. // embedding based feature extractors.
  47. virtual void Setup(TaskContext *context);
  48. virtual void Init(TaskContext *context);
  49. // Requests workspace for the underlying feature extractors. This is
  50. // implemented in the typed class.
  51. virtual void RequestWorkspaces(WorkspaceRegistry *registry) = 0;
  52. // Number of predicates for the embedding at a given index (vocabulary size.)
  53. int EmbeddingSize(int index) const {
  54. return generic_feature_extractor(index).GetDomainSize();
  55. }
  56. // Returns number of embedding spaces.
  57. int NumEmbeddings() const { return embedding_dims_.size(); }
  58. // Returns the number of features in the embedding space.
  59. const int FeatureSize(int idx) const {
  60. return generic_feature_extractor(idx).feature_types();
  61. }
  62. // Returns the dimensionality of the embedding space.
  63. int EmbeddingDims(int index) const { return embedding_dims_[index]; }
  64. // Accessor for embedding dims (dimensions of the embedding spaces).
  65. const vector<int> &embedding_dims() const { return embedding_dims_; }
  66. const vector<string> &embedding_fml() const { return embedding_fml_; }
  67. // Get parameter name by concatenating the prefix and the original name.
  68. string GetParamName(const string &param_name) const {
  69. return tensorflow::strings::StrCat(ArgPrefix(), "_", param_name);
  70. }
  71. protected:
  72. // Provides the generic class with access to the templated extractors. This is
  73. // used to get the type information out of the feature extractor without
  74. // knowing the specific calling arguments of the extractor itself.
  75. virtual const GenericFeatureExtractor &generic_feature_extractor(
  76. int idx) const = 0;
  77. // Converts a vector of extracted features into
  78. // dist_belief::SparseFeatures. Each feature in each feature vector becomes a
  79. // single SparseFeatures. The predicates are mapped through map_fn which
  80. // should point to either mutable_map_fn or const_map_fn depending on whether
  81. // or not the predicate maps should be updated.
  82. vector<vector<SparseFeatures>> ConvertExample(
  83. const vector<FeatureVector> &feature_vectors) const;
  84. private:
  85. // Embedding space names for parameter sharing.
  86. vector<string> embedding_names_;
  87. // FML strings for each feature extractor.
  88. vector<string> embedding_fml_;
  89. // Size of each of the embedding spaces (maximum predicate id).
  90. vector<int> embedding_sizes_;
  91. // Embedding dimensions of the embedding spaces (i.e. 32, 64 etc.)
  92. vector<int> embedding_dims_;
  93. // Whether or not to add string descriptions to converted examples.
  94. bool add_strings_;
  95. };
  96. // Templated, object-specific implementation of the
  97. // EmbeddingFeatureExtractor. EXTRACTOR should be a FeatureExtractor<OBJ,
  98. // ARGS...> class that has the appropriate FeatureTraits() to ensure that
  99. // locator type features work.
  100. //
  101. // Note: for backwards compatibility purposes, this always reads the FML spec
  102. // from "<prefix>_features".
  103. template <class EXTRACTOR, class OBJ, class... ARGS>
  104. class EmbeddingFeatureExtractor : public GenericEmbeddingFeatureExtractor {
  105. public:
  106. // Sets up all predicate maps, feature extractors, and flags.
  107. void Setup(TaskContext *context) override {
  108. GenericEmbeddingFeatureExtractor::Setup(context);
  109. feature_extractors_.resize(embedding_fml().size());
  110. for (int i = 0; i < embedding_fml().size(); ++i) {
  111. feature_extractors_[i].Parse(embedding_fml()[i]);
  112. feature_extractors_[i].Setup(context);
  113. }
  114. }
  115. // Initializes resources needed by the feature extractors.
  116. void Init(TaskContext *context) override {
  117. GenericEmbeddingFeatureExtractor::Init(context);
  118. for (auto &feature_extractor : feature_extractors_) {
  119. feature_extractor.Init(context);
  120. }
  121. }
  122. // Requests workspaces from the registry. Must be called after Init(), and
  123. // before Preprocess().
  124. void RequestWorkspaces(WorkspaceRegistry *registry) override {
  125. for (auto &feature_extractor : feature_extractors_) {
  126. feature_extractor.RequestWorkspaces(registry);
  127. }
  128. }
  129. // Must be called on the object one state for each sentence, before any
  130. // feature extraction (e.g., UpdateMapsForExample, ExtractSparseFeatures).
  131. void Preprocess(WorkspaceSet *workspaces, OBJ *obj) const {
  132. for (auto &feature_extractor : feature_extractors_) {
  133. feature_extractor.Preprocess(workspaces, obj);
  134. }
  135. }
  136. // Returns a ragged array of SparseFeatures, for 1) each feature extractor
  137. // class e, and 2) each feature f extracted by e. Underlying predicate maps
  138. // will not be updated and so unrecognized predicates may occur. In such a
  139. // case the SparseFeatures object associated with a given extractor class and
  140. // feature will be empty.
  141. vector<vector<SparseFeatures>> ExtractSparseFeatures(
  142. const WorkspaceSet &workspaces, const OBJ &obj, ARGS... args) const {
  143. vector<FeatureVector> features(feature_extractors_.size());
  144. ExtractFeatures(workspaces, obj, args..., &features);
  145. return ConvertExample(features);
  146. }
  147. // Extracts features using the extractors. Note that features must already
  148. // be initialized to the correct number of feature extractors. No predicate
  149. // mapping is applied.
  150. void ExtractFeatures(const WorkspaceSet &workspaces, const OBJ &obj,
  151. ARGS... args,
  152. vector<FeatureVector> *features) const {
  153. DCHECK(features != nullptr);
  154. DCHECK_EQ(features->size(), feature_extractors_.size());
  155. for (int i = 0; i < feature_extractors_.size(); ++i) {
  156. (*features)[i].clear();
  157. feature_extractors_[i].ExtractFeatures(workspaces, obj, args...,
  158. &(*features)[i]);
  159. }
  160. }
  161. protected:
  162. // Provides generic access to the feature extractors.
  163. const GenericFeatureExtractor &generic_feature_extractor(
  164. int idx) const override {
  165. DCHECK_LT(idx, feature_extractors_.size());
  166. DCHECK_GE(idx, 0);
  167. return feature_extractors_[idx];
  168. }
  169. private:
  170. // Templated feature extractor class.
  171. vector<EXTRACTOR> feature_extractors_;
  172. };
  173. class ParserEmbeddingFeatureExtractor
  174. : public EmbeddingFeatureExtractor<ParserFeatureExtractor, ParserState> {
  175. public:
  176. explicit ParserEmbeddingFeatureExtractor(const string &arg_prefix)
  177. : arg_prefix_(arg_prefix) {}
  178. private:
  179. const string ArgPrefix() const override { return arg_prefix_; }
  180. // Prefix for context parameters.
  181. string arg_prefix_;
  182. };
  183. } // namespace syntaxnet
  184. #endif // SYNTAXNET_EMBEDDING_FEATURE_EXTRACTOR_H_