// utils.h
  1. /* Copyright 2016 Google Inc. All Rights Reserved.
  2. Licensed under the Apache License, Version 2.0 (the "License");
  3. you may not use this file except in compliance with the License.
  4. You may obtain a copy of the License at
  5. http://www.apache.org/licenses/LICENSE-2.0
  6. Unless required by applicable law or agreed to in writing, software
  7. distributed under the License is distributed on an "AS IS" BASIS,
  8. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  9. See the License for the specific language governing permissions and
  10. limitations under the License.
  11. ==============================================================================*/
  12. #ifndef SYNTAXNET_UTILS_H_
  13. #define SYNTAXNET_UTILS_H_
  14. #include <functional>
  15. #include <string>
  16. #include <vector>
  17. #include <unordered_set>
  18. #include "syntaxnet/base.h"
  19. #include "tensorflow/core/lib/core/status.h"
  20. #include "tensorflow/core/lib/strings/strcat.h"
  21. #include "tensorflow/core/platform/default/integral_types.h"
  22. #include "tensorflow/core/platform/mutex.h"
  23. #include "util/utf8/unicodetext.h"
  24. namespace syntaxnet {
  25. namespace utils {
  26. bool ParseInt32(const char *c_str, int *value);
  27. bool ParseInt64(const char *c_str, int64 *value);
  28. bool ParseDouble(const char *c_str, double *value);
  29. template <typename T>
  30. T ParseUsing(const string &str, std::function<bool(const char *, T *)> func) {
  31. T value;
  32. CHECK(func(str.c_str(), &value)) << "Failed to convert: " << str;
  33. return value;
  34. }
  35. template <typename T>
  36. T ParseUsing(const string &str, T defval,
  37. std::function<bool(const char *, T *)> func) {
  38. return str.empty() ? defval : ParseUsing<T>(str, func);
  39. }
  40. string CEscape(const string &src);
  41. // Splits the given string on every occurrence of the given delimiter char.
  42. std::vector<string> Split(const string &text, char delim);
  43. // Splits the given string on the first occurrence of the given delimiter char,
  44. // or returns the given string if the given delimiter is not found.
  45. std::vector<string> SplitOne(const string &text, char delim);
  46. template <typename T>
  47. string Join(const std::vector<T> &s, const char *sep) {
  48. string result;
  49. bool first = true;
  50. for (const auto &x : s) {
  51. tensorflow::strings::StrAppend(&result, (first ? "" : sep), x);
  52. first = false;
  53. }
  54. return result;
  55. }
  56. string JoinPath(std::initializer_list<tensorflow::StringPiece> paths);
  57. size_t RemoveLeadingWhitespace(tensorflow::StringPiece *text);
  58. size_t RemoveTrailingWhitespace(tensorflow::StringPiece *text);
  59. size_t RemoveWhitespaceContext(tensorflow::StringPiece *text);
  60. uint32 Hash32(const char *data, size_t n, uint32 seed);
  // Calls delete on every element of an STL container of pointers and then
  // empties the container. Works with a vector, set, hash_set, or any other
  // container that provides sensible begin(), end(), and clear() methods.
  // Passing a null |container| does nothing.
  template <typename T>
  void STLDeleteElements(T *container) {
    if (container == nullptr) return;
    for (auto it = container->begin(); it != container->end();) {
      // The post-increment moves the iterator forward before the element it
      // used to refer to is destroyed.
      delete *it++;
    }
    container->clear();
  }
  76. // Returns lower-cased version of s.
  77. string Lowercase(tensorflow::StringPiece s);
  78. class PunctuationUtil {
  79. public:
  80. // Unicode character ranges for punctuation characters according to CoNLL.
  81. struct CharacterRange {
  82. int first;
  83. int last;
  84. };
  85. static CharacterRange kPunctuation[];
  86. // Returns true if Unicode character is a punctuation character.
  87. static bool IsPunctuation(int u) {
  88. int i = 0;
  89. while (kPunctuation[i].first > 0) {
  90. if (u < kPunctuation[i].first) return false;
  91. if (u <= kPunctuation[i].last) return true;
  92. ++i;
  93. }
  94. return false;
  95. }
  96. // Determine if tag is a punctuation tag.
  97. static bool IsPunctuationTag(const string &tag) {
  98. for (size_t i = 0; i < tag.length(); ++i) {
  99. int c = tag[i];
  100. if (c != ',' && c != ':' && c != '.' && c != '\'' && c != '`') {
  101. return false;
  102. }
  103. }
  104. return true;
  105. }
  106. // Returns true if word consists of punctuation characters.
  107. static bool IsPunctuationToken(const string &word) {
  108. UnicodeText text;
  109. text.PointToUTF8(word.c_str(), word.length());
  110. UnicodeText::const_iterator it;
  111. for (it = text.begin(); it != text.end(); ++it) {
  112. if (!IsPunctuation(*it)) return false;
  113. }
  114. return true;
  115. }
  116. // Returns true if tag is non-empty and has only punctuation or parens
  117. // symbols.
  118. static bool IsPunctuationTagOrParens(const string &tag) {
  119. if (tag.empty()) return false;
  120. for (size_t i = 0; i < tag.length(); ++i) {
  121. int c = tag[i];
  122. if (c != '(' && c != ')' && c != ',' && c != ':' && c != '.' &&
  123. c != '\'' && c != '`') {
  124. return false;
  125. }
  126. }
  127. return true;
  128. }
  129. // Decides whether to score a token, given the word, the POS tag and
  130. // and the scoring type.
  131. static bool ScoreToken(const string &word, const string &tag,
  132. const string &scoring_type) {
  133. if (scoring_type == "default") {
  134. return tag.empty() || !IsPunctuationTag(tag);
  135. } else if (scoring_type == "conllx") {
  136. return !IsPunctuationToken(word);
  137. } else if (scoring_type == "ignore_parens") {
  138. return !IsPunctuationTagOrParens(tag);
  139. }
  140. CHECK(scoring_type.empty()) << "Unknown scoring strategy " << scoring_type;
  141. return true;
  142. }
  143. };
  144. void NormalizeDigits(string *form);
  // Placeholder type that marks a constructor argument of Type which a given
  // LazyStaticPtr<Type, ...> instantiation does not use.
  struct NoArg {};

  // Pointer-like wrapper that creates its Type object on first access.
  //
  // The object is built with `new`, passing the stored arguments that are not
  // of type NoArg to Type's constructor (zero to three arguments). This class
  // has no destructor and never deletes the object. The check-and-create step
  // in get() uses no locking.
  //
  // The class deliberately has no constructors and only public data, so that
  // instances can be initialized with the "= { arg_value, ... }" syntax.
  template <typename Type, typename Arg1 = NoArg, typename Arg2 = NoArg,
            typename Arg3 = NoArg>
  class LazyStaticPtr {
   public:
    using element_type = Type;  // per smart pointer convention

    // Named accessor/initializer: creates the object if this is the first
    // access, then returns it. The result is never null.
    Type *get() const {
      if (ptr_ == nullptr) {
        ptr_ = Factory(arg1_, arg2_, arg3_);
      }
      return ptr_;
    }

    // Pointer syntax on top of get().
    Type &operator*() const { return *get(); }
    Type *operator->() const { return get(); }

    // Data members. They are public only to allow brace initialization;
    // clients of LazyStaticPtr must not access them directly.

    // Arguments for Type's constructor (unused NoArg-typed arguments consume
    // either no space, or 1 byte to ensure address uniqueness):
    Arg1 arg1_;
    Arg2 arg2_;
    Arg3 arg3_;

    // The lazily created object; null until the first access. Mutable because
    // the const accessors fill it in.
    mutable Type *ptr_;

   private:
    // Overload set that picks the Type constructor matching the number of
    // leading arguments that are not NoArg.
    template <typename A1, typename A2, typename A3>
    static Type *Factory(const A1 &first, const A2 &second, const A3 &third) {
      return new Type(first, second, third);
    }

    template <typename A1, typename A2>
    static Type *Factory(const A1 &first, const A2 &second, NoArg) {
      return new Type(first, second);
    }

    template <typename A1>
    static Type *Factory(const A1 &first, NoArg, NoArg) {
      return new Type(first);
    }

    static Type *Factory(NoArg, NoArg, NoArg) { return new Type(); }
  };
  191. } // namespace utils
  192. } // namespace syntaxnet
  193. #endif // SYNTAXNET_UTILS_H_