// affix.h
/* Copyright 2016 Google Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
  12. #ifndef SYNTAXNET_AFFIX_H_
  13. #define SYNTAXNET_AFFIX_H_
  14. #include <stddef.h>
  15. #include <string>
  16. #include <vector>
  17. #include "syntaxnet/utils.h"
  18. #include "syntaxnet/dictionary.pb.h"
  19. #include "syntaxnet/feature_extractor.h"
  20. #include "syntaxnet/proto_io.h"
  21. #include "syntaxnet/sentence.pb.h"
  22. #include "syntaxnet/task_context.h"
  23. #include "syntaxnet/term_frequency_map.h"
  24. #include "syntaxnet/workspace.h"
  25. #include "tensorflow/core/lib/strings/strcat.h"
  26. namespace syntaxnet {
  27. // An affix represents a prefix or suffix of a word of a certain length. Each
  28. // affix has a unique id and a textual form. An affix also has a pointer to the
  29. // affix that is one character shorter. This creates a chain of affixes that are
  30. // successively shorter.
  31. class Affix {
  32. private:
  33. friend class AffixTable;
  34. Affix(int id, const char *form, int length)
  35. : id_(id),
  36. length_(length),
  37. form_(form),
  38. shorter_(nullptr),
  39. next_(nullptr) {}
  40. public:
  41. // Returns unique id of affix.
  42. int id() const { return id_; }
  43. // Returns the textual representation of the affix.
  44. string form() const { return form_; }
  45. // Returns the length of the affix.
  46. int length() const { return length_; }
  47. // Gets/sets the affix that is one character shorter.
  48. Affix *shorter() const { return shorter_; }
  49. void set_shorter(Affix *next) { shorter_ = next; }
  50. private:
  51. // Affix id.
  52. int id_;
  53. // Length (in characters) of affix.
  54. int length_;
  55. // Text form of affix.
  56. string form_;
  57. // Pointer to affix that is one character shorter.
  58. Affix *shorter_;
  59. // Next affix in bucket chain.
  60. Affix *next_;
  61. TF_DISALLOW_COPY_AND_ASSIGN(Affix);
  62. };
  63. // An affix table holds all prefixes/suffixes of all the words added to the
  64. // table up to a maximum length. The affixes are chained together to enable
  65. // fast lookup of all affixes for a word.
  66. class AffixTable {
  67. public:
  68. // Affix table type.
  69. enum Type { PREFIX, SUFFIX };
  70. AffixTable(Type type, int max_length);
  71. ~AffixTable();
  72. // Resets the affix table and initialize the table for affixes of up to the
  73. // maximum length specified.
  74. void Reset(int max_length);
  75. // De-serializes this from the given proto.
  76. void Read(const AffixTableEntry &table_entry);
  77. // De-serializes this from the given records.
  78. void Read(ProtoRecordReader *reader);
  79. // Serializes this to the given proto.
  80. void Write(AffixTableEntry *table_entry) const;
  81. // Serializes this to the given records.
  82. void Write(ProtoRecordWriter *writer) const;
  83. // Adds all prefixes/suffixes of the word up to the maximum length to the
  84. // table. The longest affix is returned. The pointers in the affix can be
  85. // used for getting shorter affixes.
  86. Affix *AddAffixesForWord(const char *word, size_t size);
  87. // Gets the affix information for the affix with a certain id. Returns NULL if
  88. // there is no affix in the table with this id.
  89. Affix *GetAffix(int id) const;
  90. // Gets affix form from id. If the affix does not exist in the table, an empty
  91. // string is returned.
  92. string AffixForm(int id) const;
  93. // Gets affix id for affix. If the affix does not exist in the table, -1 is
  94. // returned.
  95. int AffixId(const string &form) const;
  96. // Returns size of the affix table.
  97. int size() const { return affixes_.size(); }
  98. // Returns the maximum affix length.
  99. int max_length() const { return max_length_; }
  100. private:
  101. // Adds a new affix to table.
  102. Affix *AddNewAffix(const string &form, int length);
  103. // Finds existing affix in table.
  104. Affix *FindAffix(const string &form) const;
  105. // Resizes bucket array.
  106. void Resize(int size_hint);
  107. // Affix type (prefix or suffix).
  108. Type type_;
  109. // Maximum length of affix.
  110. int max_length_;
  111. // Index from affix ids to affix items.
  112. vector<Affix *> affixes_;
  113. // Buckets for word-to-affix hash map.
  114. vector<Affix *> buckets_;
  115. TF_DISALLOW_COPY_AND_ASSIGN(AffixTable);
  116. };
  117. } // namespace syntaxnet
  118. #endif // SYNTAXNET_AFFIX_H_