term_frequency_map.h 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
  1. /* Copyright 2016 Google Inc. All Rights Reserved.
  2. Licensed under the Apache License, Version 2.0 (the "License");
  3. you may not use this file except in compliance with the License.
  4. You may obtain a copy of the License at
  5. http://www.apache.org/licenses/LICENSE-2.0
  6. Unless required by applicable law or agreed to in writing, software
  7. distributed under the License is distributed on an "AS IS" BASIS,
  8. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  9. See the License for the specific language governing permissions and
  10. limitations under the License.
  11. ==============================================================================*/
  12. #ifndef SYNTAXNET_TERM_FREQUENCY_MAP_H_
  13. #define SYNTAXNET_TERM_FREQUENCY_MAP_H_
  #include <stddef.h>

  #include <map>
  #include <memory>
  #include <string>
  #include <unordered_map>
  #include <unordered_set>
  #include <utility>
  #include <vector>

  #include "syntaxnet/utils.h"
  21. namespace syntaxnet {
  22. // A mapping from strings to frequencies with save and load functionality.
  23. class TermFrequencyMap {
  24. public:
  25. // Creates an empty frequency map.
  26. TermFrequencyMap() {}
  27. // Creates a term frequency map by calling Load.
  28. TermFrequencyMap(const string &file, int min_frequency, int max_num_terms) {
  29. Load(file, min_frequency, max_num_terms);
  30. }
  31. // Returns the number of terms with positive frequency.
  32. int Size() const { return term_index_.size(); }
  33. // Returns the index associated with the given term. If the term does not
  34. // exist, the unknown index is returned instead.
  35. int LookupIndex(const string &term, int unknown) const {
  36. const TermIndex::const_iterator it = term_index_.find(term);
  37. return (it != term_index_.end() ? it->second : unknown);
  38. }
  39. // Returns the term associated with the given index.
  40. const string &GetTerm(int index) const { return term_data_[index].first; }
  41. // Increases the frequency of the given term by 1, creating a new entry if
  42. // necessary, and returns the index of the term.
  43. int Increment(const string &term);
  44. // Clears all frequencies.
  45. void Clear();
  46. // Loads a frequency mapping from the given file, which must have been created
  47. // by an earlier call to Save(). After loading, the term indices are
  48. // guaranteed to be ordered in descending order of frequency (breaking ties
  49. // arbitrarily). However, any new terms inserted after loading do not
  50. // maintain this sorting invariant.
  51. //
  52. // Only loads terms with frequency >= min_frequency. If max_num_terms <= 0,
  53. // then all qualifying terms are loaded; otherwise, max_num_terms terms with
  54. // maximal frequency are loaded (breaking ties arbitrarily).
  55. void Load(const string &filename, int min_frequency, int max_num_terms);
  56. // Saves a frequency mapping to the given file.
  57. void Save(const string &filename) const;
  58. private:
  59. // Hashtable for term-to-index mapping.
  60. typedef std::unordered_map<string, int> TermIndex;
  61. // Sorting functor for term data.
  62. struct SortByFrequencyThenTerm;
  63. // Mapping from terms to indices.
  64. TermIndex term_index_;
  65. // Mapping from indices to term and frequency.
  66. std::vector<std::pair<string, int64>> term_data_;
  67. TF_DISALLOW_COPY_AND_ASSIGN(TermFrequencyMap);
  68. };
  69. // A mapping from tags to categories.
  70. class TagToCategoryMap {
  71. public:
  72. TagToCategoryMap() {}
  73. ~TagToCategoryMap() {}
  74. // Loads a tag to category map from a text file.
  75. explicit TagToCategoryMap(const string &filename);
  76. // Sets the category for the given tag.
  77. void SetCategory(const string &tag, const string &category);
  78. // Returns the category associated with the given tag.
  79. const string &GetCategory(const string &tag) const;
  80. // Saves a tag to category map to the given file.
  81. void Save(const string &filename) const;
  82. private:
  83. // List of tags that have multiple coarse tags, and their mappings. Used only
  84. // for error reporting at Save() time.
  85. std::map<string, std::unordered_set<string>> invalid_mappings_;
  86. std::map<string, string> tag_to_category_;
  87. TF_DISALLOW_COPY_AND_ASSIGN(TagToCategoryMap);
  88. };
  89. } // namespace syntaxnet
  90. #endif // SYNTAXNET_TERM_FREQUENCY_MAP_H_