embedding_feature_extractor.h 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228
  1. /* Copyright 2016 Google Inc. All Rights Reserved.
  2. Licensed under the Apache License, Version 2.0 (the "License");
  3. you may not use this file except in compliance with the License.
  4. You may obtain a copy of the License at
  5. http://www.apache.org/licenses/LICENSE-2.0
  6. Unless required by applicable law or agreed to in writing, software
  7. distributed under the License is distributed on an "AS IS" BASIS,
  8. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  9. See the License for the specific language governing permissions and
  10. limitations under the License.
  11. ==============================================================================*/
  12. #ifndef SYNTAXNET_EMBEDDING_FEATURE_EXTRACTOR_H_
  13. #define SYNTAXNET_EMBEDDING_FEATURE_EXTRACTOR_H_
  14. #include <functional>
  15. #include <memory>
  16. #include <string>
  17. #include <vector>
  18. #include "syntaxnet/utils.h"
  19. #include "syntaxnet/feature_extractor.h"
  20. #include "syntaxnet/feature_types.h"
  21. #include "syntaxnet/parser_features.h"
  22. #include "syntaxnet/sentence_features.h"
  23. #include "syntaxnet/sparse.pb.h"
  24. #include "syntaxnet/task_context.h"
  25. #include "syntaxnet/workspace.h"
  26. #include "tensorflow/core/lib/strings/strcat.h"
  27. namespace syntaxnet {
  28. // An EmbeddingFeatureExtractor manages the extraction of features for
  29. // embedding-based models. It wraps a sequence of underlying classes of feature
  30. // extractors, along with associated predicate maps. Each class of feature
  31. // extractors is associated with a name, e.g., "words", "labels", "tags".
  32. //
  33. // The class is split between a generic abstract version,
  34. // GenericEmbeddingFeatureExtractor (that can be initialized without knowing the
  35. // signature of the ExtractFeatures method) and a typed version.
  36. //
  37. // The predicate maps must be initialized before use: they can be loaded using
  38. // Read() or updated via UpdateMapsForExample.
  39. class GenericEmbeddingFeatureExtractor {
  40. public:
  41. virtual ~GenericEmbeddingFeatureExtractor() {}
  42. // Get the prefix string to put in front of all arguments, so they don't
  43. // conflict with other embedding models.
  44. virtual const string ArgPrefix() const = 0;
  45. // Sets up predicate maps and embedding space names that are common for all
  46. // embedding based feature extractors.
  47. virtual void Setup(TaskContext *context);
  48. virtual void Init(TaskContext *context);
  49. // Requests workspace for the underlying feature extractors. This is
  50. // implemented in the typed class.
  51. virtual void RequestWorkspaces(WorkspaceRegistry *registry) = 0;
  52. // Number of predicates for the embedding at a given index (vocabulary size.)
  53. int EmbeddingSize(int index) const {
  54. return generic_feature_extractor(index).GetDomainSize();
  55. }
  56. // Returns number of embedding spaces.
  57. int NumEmbeddings() const { return embedding_dims_.size(); }
  58. // Returns the number of features in the embedding space.
  59. const int FeatureSize(int idx) const {
  60. return generic_feature_extractor(idx).feature_types();
  61. }
  62. // Returns the dimensionality of the embedding space.
  63. int EmbeddingDims(int index) const { return embedding_dims_[index]; }
  64. // Accessor for embedding dims (dimensions of the embedding spaces).
  65. const std::vector<int> &embedding_dims() const { return embedding_dims_; }
  66. const std::vector<string> &embedding_fml() const { return embedding_fml_; }
  67. // Get parameter name by concatenating the prefix and the original name.
  68. string GetParamName(const string &param_name) const {
  69. return tensorflow::strings::StrCat(ArgPrefix(), "_", param_name);
  70. }
  71. // Returns the name of the embedding space.
  72. const string &embedding_name(int index) const {
  73. return embedding_names_[index];
  74. }
  75. protected:
  76. // Provides the generic class with access to the templated extractors. This is
  77. // used to get the type information out of the feature extractor without
  78. // knowing the specific calling arguments of the extractor itself.
  79. virtual const GenericFeatureExtractor &generic_feature_extractor(
  80. int idx) const = 0;
  81. // Converts a vector of extracted features into
  82. // dist_belief::SparseFeatures. Each feature in each feature vector becomes a
  83. // single SparseFeatures. The predicates are mapped through map_fn which
  84. // should point to either mutable_map_fn or const_map_fn depending on whether
  85. // or not the predicate maps should be updated.
  86. std::vector<std::vector<SparseFeatures>> ConvertExample(
  87. const std::vector<FeatureVector> &feature_vectors) const;
  88. private:
  89. // Embedding space names for parameter sharing.
  90. std::vector<string> embedding_names_;
  91. // FML strings for each feature extractor.
  92. std::vector<string> embedding_fml_;
  93. // Size of each of the embedding spaces (maximum predicate id).
  94. std::vector<int> embedding_sizes_;
  95. // Embedding dimensions of the embedding spaces (i.e. 32, 64 etc.)
  96. std::vector<int> embedding_dims_;
  97. // Whether or not to add string descriptions to converted examples.
  98. bool add_strings_;
  99. };
  100. // Templated, object-specific implementation of the
  101. // EmbeddingFeatureExtractor. EXTRACTOR should be a FeatureExtractor<OBJ,
  102. // ARGS...> class that has the appropriate FeatureTraits() to ensure that
  103. // locator type features work.
  104. //
  105. // Note: for backwards compatibility purposes, this always reads the FML spec
  106. // from "<prefix>_features".
  107. template <class EXTRACTOR, class OBJ, class... ARGS>
  108. class EmbeddingFeatureExtractor : public GenericEmbeddingFeatureExtractor {
  109. public:
  110. // Sets up all predicate maps, feature extractors, and flags.
  111. void Setup(TaskContext *context) override {
  112. GenericEmbeddingFeatureExtractor::Setup(context);
  113. feature_extractors_.resize(embedding_fml().size());
  114. for (int i = 0; i < embedding_fml().size(); ++i) {
  115. feature_extractors_[i].Parse(embedding_fml()[i]);
  116. feature_extractors_[i].Setup(context);
  117. }
  118. }
  119. // Initializes resources needed by the feature extractors.
  120. void Init(TaskContext *context) override {
  121. GenericEmbeddingFeatureExtractor::Init(context);
  122. for (auto &feature_extractor : feature_extractors_) {
  123. feature_extractor.Init(context);
  124. }
  125. }
  126. // Requests workspaces from the registry. Must be called after Init(), and
  127. // before Preprocess().
  128. void RequestWorkspaces(WorkspaceRegistry *registry) override {
  129. for (auto &feature_extractor : feature_extractors_) {
  130. feature_extractor.RequestWorkspaces(registry);
  131. }
  132. }
  133. // Must be called on the object one state for each sentence, before any
  134. // feature extraction (e.g., UpdateMapsForExample, ExtractSparseFeatures).
  135. void Preprocess(WorkspaceSet *workspaces, OBJ *obj) const {
  136. for (auto &feature_extractor : feature_extractors_) {
  137. feature_extractor.Preprocess(workspaces, obj);
  138. }
  139. }
  140. // Returns a ragged array of SparseFeatures, for 1) each feature extractor
  141. // class e, and 2) each feature f extracted by e. Underlying predicate maps
  142. // will not be updated and so unrecognized predicates may occur. In such a
  143. // case the SparseFeatures object associated with a given extractor class and
  144. // feature will be empty.
  145. std::vector<std::vector<SparseFeatures>> ExtractSparseFeatures(
  146. const WorkspaceSet &workspaces, const OBJ &obj, ARGS... args) const {
  147. std::vector<FeatureVector> features(feature_extractors_.size());
  148. ExtractFeatures(workspaces, obj, args..., &features);
  149. return ConvertExample(features);
  150. }
  151. // Extracts features using the extractors. Note that features must already
  152. // be initialized to the correct number of feature extractors. No predicate
  153. // mapping is applied.
  154. void ExtractFeatures(const WorkspaceSet &workspaces, const OBJ &obj,
  155. ARGS... args,
  156. std::vector<FeatureVector> *features) const {
  157. DCHECK(features != nullptr);
  158. DCHECK_EQ(features->size(), feature_extractors_.size());
  159. for (int i = 0; i < feature_extractors_.size(); ++i) {
  160. (*features)[i].clear();
  161. feature_extractors_[i].ExtractFeatures(workspaces, obj, args...,
  162. &(*features)[i]);
  163. }
  164. }
  165. protected:
  166. // Provides generic access to the feature extractors.
  167. const GenericFeatureExtractor &generic_feature_extractor(
  168. int idx) const override {
  169. DCHECK_LT(idx, feature_extractors_.size());
  170. DCHECK_GE(idx, 0);
  171. return feature_extractors_[idx];
  172. }
  173. private:
  174. // Templated feature extractor class.
  175. std::vector<EXTRACTOR> feature_extractors_;
  176. };
  177. class ParserEmbeddingFeatureExtractor
  178. : public EmbeddingFeatureExtractor<ParserFeatureExtractor, ParserState> {
  179. public:
  180. explicit ParserEmbeddingFeatureExtractor(const string &arg_prefix)
  181. : arg_prefix_(arg_prefix) {}
  182. private:
  183. const string ArgPrefix() const override { return arg_prefix_; }
  184. // Prefix for context parameters.
  185. string arg_prefix_;
  186. };
  187. } // namespace syntaxnet
  188. #endif // SYNTAXNET_EMBEDDING_FEATURE_EXTRACTOR_H_