utils.cc 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271
  1. /* Copyright 2016 Google Inc. All Rights Reserved.
  2. Licensed under the Apache License, Version 2.0 (the "License");
  3. you may not use this file except in compliance with the License.
  4. You may obtain a copy of the License at
  5. http://www.apache.org/licenses/LICENSE-2.0
  6. Unless required by applicable law or agreed to in writing, software
  7. distributed under the License is distributed on an "AS IS" BASIS,
  8. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  9. See the License for the specific language governing permissions and
  10. limitations under the License.
  11. ==============================================================================*/
  #include "syntaxnet/utils.h"

  #include <cerrno>
  #include <climits>
  #include <cstdlib>

  #include "tensorflow/core/platform/macros.h"
  14. namespace syntaxnet {
  15. namespace utils {
  // Parses the whole of c_str as a signed 32-bit integer. The base is
  // auto-detected by strtol (decimal, 0x-prefixed hex, or 0-prefixed octal).
  //
  // Returns true only if at least one digit was consumed, nothing follows the
  // number, and the result fits in an int. *value is always written: the
  // parsed number on success, the number parsed from the valid prefix (0 if
  // there is none) when the text is malformed, and INT_MIN/INT_MAX when the
  // number is out of range.
  bool ParseInt32(const char *c_str, int *value) {
    char *end = nullptr;
    errno = 0;
    const long parsed = strtol(c_str, &end, 0);  // NOLINT

    // strtol reports overflow of long through errno; overflow of int (when
    // long is wider) has to be detected explicitly.
    if (errno == ERANGE || parsed < INT_MIN || parsed > INT_MAX) {
      *value = (parsed < 0) ? INT_MIN : INT_MAX;
      return false;
    }

    *value = static_cast<int>(parsed);
    // end == c_str means no conversion was performed (e.g. empty input).
    return end != c_str && *end == '\0';
  }
  // Parses the whole of c_str as a signed 64-bit integer. The base is
  // auto-detected by strtoll (decimal, 0x-prefixed hex, or 0-prefixed octal).
  //
  // Returns true only if at least one digit was consumed, nothing follows the
  // number, and the value did not overflow. *value is always written; on
  // overflow it holds the saturated value returned by strtoll.
  bool ParseInt64(const char *c_str, int64 *value) {
    char *end = nullptr;
    errno = 0;
    // strtoll rather than strtol: long is only 32 bits wide on some platforms.
    const long long parsed = strtoll(c_str, &end, 0);  // NOLINT
    *value = static_cast<int64>(parsed);
    if (errno == ERANGE) {
      return false;
    }
    // end == c_str means no conversion was performed (e.g. empty input).
    return end != c_str && *end == '\0';
  }
  // Parses the whole of c_str as a double using strtod.
  //
  // Returns true only if a number was actually converted and nothing follows
  // it. *value is always written with whatever strtod returned (0.0 when no
  // conversion was possible).
  bool ParseDouble(const char *c_str, double *value) {
    char *end = nullptr;
    *value = strtod(c_str, &end);
    // end == c_str means no conversion was performed (e.g. empty input).
    return end != c_str && *end == '\0';
  }
  // Digit lookup table for CEscape(). Only entries 0-7 are used there, since
  // non-printable bytes are emitted as three-digit octal escapes.
  static char hex_char[] = "0123456789abcdef";

  // Returns a copy of src in which newline, carriage return, tab, both quote
  // characters and the backslash are replaced by their two-character C escape
  // sequences, and every byte that is >= 0x80 or not printable is replaced by
  // a backslash followed by exactly three octal digits. All other bytes are
  // copied through unchanged.
  string CEscape(const string &src) {
    string escaped;
    for (const unsigned char byte : src) {
      if (byte == '\n') {
        escaped += "\\n";
      } else if (byte == '\r') {
        escaped += "\\r";
      } else if (byte == '\t') {
        escaped += "\\t";
      } else if (byte == '\"') {
        escaped += "\\\"";
      } else if (byte == '\'') {
        escaped += "\\'";
      } else if (byte == '\\') {
        escaped += "\\\\";
      } else if (byte < 0x80 && isprint(byte)) {
        escaped += static_cast<char>(byte);
      } else {
        // A fixed-width three-digit octal escape cannot absorb a following
        // digit from the source, so the next byte needs no special handling.
        escaped += '\\';
        escaped += hex_char[byte >> 6];
        escaped += hex_char[(byte >> 3) & 7];
        escaped += hex_char[byte & 7];
      }
    }
    return escaped;
  }
  // Splits text at every occurrence of delim and returns the pieces in order.
  // Empty pieces are kept, so "a,,b" yields {"a", "", "b"} and "," yields
  // {"", ""}. An empty input yields an empty vector (not a single empty
  // piece).
  std::vector<string> Split(const string &text, char delim) {
    std::vector<string> result;
    if (text.empty()) {
      return result;
    }

    // Offsets are kept as size_t throughout so that they cannot be narrowed
    // or overflow on very large inputs.
    size_t token_start = 0;
    while (true) {
      const size_t token_end = text.find(delim, token_start);
      if (token_end == string::npos) {
        // Last piece: everything after the final delimiter (possibly empty).
        result.push_back(text.substr(token_start));
        break;
      }
      result.push_back(text.substr(token_start, token_end - token_start));
      token_start = token_end + 1;
    }
    return result;
  }
  // Splits text at the first occurrence of delim only. Returns {text} when
  // delim does not occur, otherwise {part before delim, part after delim}.
  std::vector<string> SplitOne(const string &text, char delim) {
    const size_t pos = text.find(delim);
    if (pos == string::npos) {
      return std::vector<string>(1, text);
    }
    std::vector<string> parts;
    parts.push_back(text.substr(0, pos));
    parts.push_back(text.substr(pos + 1));
    return parts;
  }
  93. bool IsAbsolutePath(tensorflow::StringPiece path) {
  94. return !path.empty() && path[0] == '/';
  95. }
  96. // For an array of paths of length count, append them all together,
  97. // ensuring that the proper path separators are inserted between them.
  98. string JoinPath(std::initializer_list<tensorflow::StringPiece> paths) {
  99. string result;
  100. for (tensorflow::StringPiece path : paths) {
  101. if (path.empty()) {
  102. continue;
  103. }
  104. if (result.empty()) {
  105. result = path.ToString();
  106. continue;
  107. }
  108. if (result[result.size() - 1] == '/') {
  109. if (IsAbsolutePath(path)) {
  110. tensorflow::strings::StrAppend(&result, path.substr(1));
  111. } else {
  112. tensorflow::strings::StrAppend(&result, path);
  113. }
  114. } else {
  115. if (IsAbsolutePath(path)) {
  116. tensorflow::strings::StrAppend(&result, path);
  117. } else {
  118. tensorflow::strings::StrAppend(&result, "/", path);
  119. }
  120. }
  121. }
  122. return result;
  123. }
  124. size_t RemoveLeadingWhitespace(tensorflow::StringPiece *text) {
  125. size_t count = 0;
  126. const char *ptr = text->data();
  127. while (count < text->size() && isspace(*ptr)) {
  128. count++;
  129. ptr++;
  130. }
  131. text->remove_prefix(count);
  132. return count;
  133. }
  134. size_t RemoveTrailingWhitespace(tensorflow::StringPiece *text) {
  135. size_t count = 0;
  136. const char *ptr = text->data() + text->size() - 1;
  137. while (count < text->size() && isspace(*ptr)) {
  138. ++count;
  139. --ptr;
  140. }
  141. text->remove_suffix(count);
  142. return count;
  143. }
  144. size_t RemoveWhitespaceContext(tensorflow::StringPiece *text) {
  145. // use RemoveLeadingWhitespace() and RemoveTrailingWhitespace() to do the job
  146. return RemoveLeadingWhitespace(text) + RemoveTrailingWhitespace(text);
  147. }
  namespace {

  // Lower-level version of Get... that reads directly from a character buffer
  // without any bounds checking: assembles ptr[0..3] into a 32-bit value with
  // ptr[0] as the least significant byte.
  inline uint32 DecodeFixed32(const char *ptr) {
    uint32 word = 0;
    // Fold in the bytes from most to least significant.
    for (int i = 3; i >= 0; --i) {
      word = (word << 8) | static_cast<uint32>(static_cast<unsigned char>(ptr[i]));
    }
    return word;
  }

  // Widens a char to 32 bits without sign extension, so the result is always
  // in [0, 255] even where char is signed.
  static inline uint32 ByteAs32(char c) {
    return static_cast<uint32>(static_cast<unsigned char>(c));
  }

  }  // namespace
  160. uint32 Hash32(const char *data, size_t n, uint32 seed) {
  161. // 'm' and 'r' are mixing constants generated offline.
  162. // They're not really 'magic', they just happen to work well.
  163. const uint32 m = 0x5bd1e995;
  164. const int r = 24;
  165. // Initialize the hash to a 'random' value
  166. uint32 h = seed ^ n;
  167. // Mix 4 bytes at a time into the hash
  168. while (n >= 4) {
  169. uint32 k = DecodeFixed32(data);
  170. k *= m;
  171. k ^= k >> r;
  172. k *= m;
  173. h *= m;
  174. h ^= k;
  175. data += 4;
  176. n -= 4;
  177. }
  178. // Handle the last few bytes of the input array
  179. switch (n) {
  180. case 3:
  181. h ^= ByteAs32(data[2]) << 16;
  182. TF_FALLTHROUGH_INTENDED;
  183. case 2:
  184. h ^= ByteAs32(data[1]) << 8;
  185. TF_FALLTHROUGH_INTENDED;
  186. case 1:
  187. h ^= ByteAs32(data[0]);
  188. h *= m;
  189. }
  190. // Do a few final mixes of the hash to ensure the last few
  191. // bytes are well-incorporated.
  192. h ^= h >> 13;
  193. h *= m;
  194. h ^= h >> 15;
  195. return h;
  196. }
  197. string Lowercase(tensorflow::StringPiece s) {
  198. string result(s.data(), s.size());
  199. for (char &c : result) {
  200. c = tolower(c);
  201. }
  202. return result;
  203. }
  // Table of {first, last} numeric ranges, listed in ascending order.
  // NOTE(review): given the class name these look like inclusive Unicode code
  // point ranges for punctuation characters, and the trailing {-1, -1} entry
  // looks like an end-of-table sentinel -- confirm against the lookup code
  // that walks this array (declared with PunctuationUtil, not visible here).
  PunctuationUtil::CharacterRange PunctuationUtil::kPunctuation[] = {
  {33, 35}, {37, 42}, {44, 47}, {58, 59},
  {63, 64}, {91, 93}, {95, 95}, {123, 123},
  {125, 125}, {161, 161}, {171, 171}, {183, 183},
  {187, 187}, {191, 191}, {894, 894}, {903, 903},
  {1370, 1375}, {1417, 1418}, {1470, 1470}, {1472, 1472},
  {1475, 1475}, {1478, 1478}, {1523, 1524}, {1548, 1549},
  {1563, 1563}, {1566, 1567}, {1642, 1645}, {1748, 1748},
  {1792, 1805}, {2404, 2405}, {2416, 2416}, {3572, 3572},
  {3663, 3663}, {3674, 3675}, {3844, 3858}, {3898, 3901},
  {3973, 3973}, {4048, 4049}, {4170, 4175}, {4347, 4347},
  {4961, 4968}, {5741, 5742}, {5787, 5788}, {5867, 5869},
  {5941, 5942}, {6100, 6102}, {6104, 6106}, {6144, 6154},
  {6468, 6469}, {6622, 6623}, {6686, 6687}, {8208, 8231},
  {8240, 8259}, {8261, 8273}, {8275, 8286}, {8317, 8318},
  {8333, 8334}, {9001, 9002}, {9140, 9142}, {10088, 10101},
  {10181, 10182}, {10214, 10219}, {10627, 10648}, {10712, 10715},
  {10748, 10749}, {11513, 11516}, {11518, 11519}, {11776, 11799},
  {11804, 11805}, {12289, 12291}, {12296, 12305}, {12308, 12319},
  {12336, 12336}, {12349, 12349}, {12448, 12448}, {12539, 12539},
  {64830, 64831}, {65040, 65049}, {65072, 65106}, {65108, 65121},
  {65123, 65123}, {65128, 65128}, {65130, 65131}, {65281, 65283},
  {65285, 65290}, {65292, 65295}, {65306, 65307}, {65311, 65312},
  {65339, 65341}, {65343, 65343}, {65371, 65371}, {65373, 65373},
  {65375, 65381}, {65792, 65793}, {66463, 66463}, {68176, 68184},
  {-1, -1}};
  // Rewrites *form in place so that every ASCII digit ('0'-'9') becomes '9'.
  // All other characters are left untouched.
  void NormalizeDigits(string *form) {
    for (char &ch : *form) {
      const bool is_ascii_digit = (ch >= '0' && ch <= '9');
      if (is_ascii_digit) {
        ch = '9';
      }
    }
  }
  235. } // namespace utils
  236. } // namespace syntaxnet