morphology_label_set.h 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111
  1. /* Copyright 2016 Google Inc. All Rights Reserved.
  2. Licensed under the Apache License, Version 2.0 (the "License");
  3. you may not use this file except in compliance with the License.
  4. You may obtain a copy of the License at
  5. http://www.apache.org/licenses/LICENSE-2.0
  6. Unless required by applicable law or agreed to in writing, software
  7. distributed under the License is distributed on an "AS IS" BASIS,
  8. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  9. See the License for the specific language governing permissions and
  10. limitations under the License.
  11. ==============================================================================*/
  12. // A class to store the set of possible TokenMorphology objects. This includes
  13. // lookup, iteration and serialization.
  14. #ifndef SYNTAXNET_MORPHOLOGY_LABEL_SET_H_
  15. #define SYNTAXNET_MORPHOLOGY_LABEL_SET_H_
  16. #include <unordered_map>
  17. #include <string>
  18. #include <vector>
  19. #include "syntaxnet/proto_io.h"
  20. #include "syntaxnet/sentence.pb.h"
  21. namespace syntaxnet {
  22. class MorphologyLabelSet {
  23. public:
  24. // Initalize as an empty morphology.
  25. MorphologyLabelSet() {}
  26. // Initalizes by reading the given file, which has been saved by Write().
  27. // This makes using the shared store easier.
  28. explicit MorphologyLabelSet(const string &fname) { Read(fname); }
  29. // Adds a TokenMorphology to the set if it is not present. In any case, return
  30. // its position in the list. Note: This is slow, and should not be called
  31. // outside of training or init.
  32. int Add(const TokenMorphology &morph);
  33. // Look up an existing TokenMorphology. If it is not present, return -1.
  34. // Note: This is slow, and should not be called outside of training workflow
  35. // or init.
  36. int LookupExisting(const TokenMorphology &morph) const;
  37. // Return the TokenMorphology at position i. The input i should be in the
  38. // range 0..size(). Note: this will be called at inference time and needs to
  39. // be kept fast.
  40. const TokenMorphology &Lookup(int i) const;
  41. // Return the number of elements.
  42. int Size() const { return label_set_.size(); }
  43. // Deserialization and serialization.
  44. void Read(const string &filename);
  45. void Write(const string &filename) const;
  46. private:
  47. string StringForMatch(const TokenMorphology &morhp) const;
  48. // Deserialization and serialziation implementation.
  49. void Read(ProtoRecordReader *reader);
  50. void Write(ProtoRecordWriter *writer) const;
  51. // List of all possible annotations. This is a unique list, where equality is
  52. // defined as follows:
  53. //
  54. // a == b iff the set of attribute pairs (attribute, value) is identical.
  55. vector<TokenMorphology> label_set_;
  56. // Because protocol buffer equality is complicated, we implement our own
  57. // equality operator based on strings. This unordered_map allows us to do the
  58. // lookup more quickly.
  59. unordered_map<string, int> fast_lookup_;
  60. // A separator string that should not occur in any of the attribute names.
  61. // This should never be serialized, so that it can be changed in the code if
  62. // we change attribute names and it occurs in the new names.
  63. static const char kSeparator[];
  64. };
  65. // A feature type with one value for each complete morphological analysis
  66. // (analogous to the fulltag analyzer).
  67. class FullLabelFeatureType : public FeatureType {
  68. public:
  69. FullLabelFeatureType(const string &name, const MorphologyLabelSet *label_set)
  70. : FeatureType(name), label_set_(label_set) {}
  71. ~FullLabelFeatureType() override {}
  72. // Converts a feature value to a name. We don't use StringForMatch, since the
  73. // goal of these are to be readable, even if they might occasionally be
  74. // non-unique.
  75. string GetFeatureValueName(FeatureValue value) const override;
  76. // Returns the size of the feature values domain.
  77. FeatureValue GetDomainSize() const override { return label_set_->Size(); }
  78. private:
  79. // Not owned.
  80. const MorphologyLabelSet *label_set_ = nullptr;
  81. };
  82. } // namespace syntaxnet
  83. #endif // SYNTAXNET_MORPHOLOGY_LABEL_SET_H_