char_properties.h 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363
  1. /* Copyright 2016 Google Inc. All Rights Reserved.
  2. Licensed under the Apache License, Version 2.0 (the "License");
  3. you may not use this file except in compliance with the License.
  4. You may obtain a copy of the License at
  5. http://www.apache.org/licenses/LICENSE-2.0
  6. Unless required by applicable law or agreed to in writing, software
  7. distributed under the License is distributed on an "AS IS" BASIS,
  8. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  9. See the License for the specific language governing permissions and
  10. limitations under the License.
  11. ==============================================================================*/
  12. // char_properties.h - define is_X() tests for various character properties
  13. //
  14. // Character properties can be defined in two ways:
  15. //
  16. // (1) Set-based:
  17. //
  18. // Enumerate the chars that have the property. Example:
  19. //
  20. // DEFINE_CHAR_PROPERTY_AS_SET(my_fave,
  21. // RANGE('0', '9'),
  22. // '\'',
  23. // 0x00BF, // Spanish inverted question mark
  24. // )
  25. //
  26. // Characters are expressed as Unicode code points; note that ascii codes
  27. // are a subset. RANGE() specifies an inclusive range of code points.
  28. //
  29. // This defines two functions:
  30. //
  31. // bool is_my_fave(const char *str, int len)
  32. // bool is_my_fave(int c)
  33. //
  34. // Each returns true for precisely the 12 characters specified above.
  35. // Each takes a *single* UTF8 char as its argument -- the first expresses
  36. // it as a char * and a length, the second as a Unicode code point.
  37. // Please do not pass a string of multiple UTF8 chars to the first one.
  38. //
  39. // To make is_my_fave() externally accessible, put in your .h file:
  40. //
  41. // DECLARE_CHAR_PROPERTY(my_fave)
  42. //
  43. // (2) Function-based:
  44. //
  45. // Specify a function that assigns the desired chars to a CharProperty
  46. // object. Example:
  47. //
  48. // DEFINE_CHAR_PROPERTY(my_other_fave, prop) {
  49. // for (int i = '0'; i <= '9'; i += 2) {
  50. // prop->AddChar(i);
  51. // }
  52. // prop->AddAsciiPredicate(&ispunct);
  53. // prop->AddCharProperty("currency_symbol");
  54. // }
  55. //
  56. // This defines a function of one arg: CharProperty *prop. The function
  57. // calls various CharProperty methods to populate the prop. The last call
  58. // above, AddCharProperty(), adds the chars from another char property
  59. // ("currency_symbol").
  60. //
  61. // As in the set-based case, put a DECLARE_CHAR_PROPERTY(my_other_fave)
  62. // in your .h if you want is_my_other_fave() to be externally accessible.
  63. //
  64. #ifndef SYNTAXNET_CHAR_PROPERTIES_H_
  65. #define SYNTAXNET_CHAR_PROPERTIES_H_
  66. #include <string> // for string
  67. #include "syntaxnet/registry.h"
  68. #include "syntaxnet/utils.h"
  69. // =====================================================================
  70. // Registry for accessing CharProperties by name
  71. //
  72. // This is for internal use by the CharProperty class and macros; callers
  73. // should not use it explicitly.
  74. //
  75. namespace syntaxnet {
  76. class CharProperty; // forward declaration
  77. // Wrapper around a CharProperty, allowing it to be stored in a registry.
  78. struct CharPropertyWrapper : RegisterableClass<CharPropertyWrapper> {
  79. virtual ~CharPropertyWrapper() { }
  80. virtual CharProperty *GetCharProperty() = 0;
  81. };
  // Forwards to REGISTER_CLASS_COMPONENT with CharPropertyWrapper as the base
  // class.  NOTE(review): presumably `type` is the registry key and
  // `component` the class being registered -- confirm in syntaxnet/registry.h.
  #define REGISTER_CHAR_PROPERTY_WRAPPER(type, component) \
    REGISTER_CLASS_COMPONENT(CharPropertyWrapper, type, component)

  // Generates a CharPropertyWrapper subclass named <name>CharPropertyWrapper
  // whose GetCharProperty() returns (lsp).get(), then passes it to
  // REGISTER_CHAR_PROPERTY_WRAPPER together with the string "<name>".
  //
  //   lsp  - expression with a get() member yielding a CharProperty *.  It is
  //          parenthesized in the expansion so that any expression, not just
  //          a plain identifier, binds to .get() as a whole.
  //   name - property name; stringified for the registry and pasted into the
  //          generated struct's name.
  //
  // The generated method is marked `override` so that a signature drift in
  // CharPropertyWrapper::GetCharProperty() becomes a compile error.
  #define REGISTER_CHAR_PROPERTY(lsp, name)                            \
    struct name##CharPropertyWrapper : public CharPropertyWrapper {    \
      CharProperty *GetCharProperty() override { return (lsp).get(); } \
    };                                                                 \
    REGISTER_CHAR_PROPERTY_WRAPPER(#name, name##CharPropertyWrapper)
  89. // =====================================================================
  90. // Macros for defining character properties
  91. //
  // Emits the pair of is_<name>() functions that test whether a single
  // character has the '<name>' char prop:
  //
  //   bool is_<name>(const char *str, int len)  -- one UTF8-encoded char
  //   bool is_<name>(int c)                     -- one Unicode code point
  //
  // Both forward to the matching HoldsFor() overload of the object that `lsp`
  // points to.  `lsp` is parenthesized in the expansion so that any
  // pointer-like expression (not only a plain identifier) is dereferenced as
  // a whole.
  #define DEFINE_IS_X_CHAR_PROPERTY_FUNCTIONS(lsp, name) \
    bool is_##name(const char *str, int len) {           \
      return (lsp)->HoldsFor(str, len);                  \
    }                                                    \
    bool is_##name(int c) { return (lsp)->HoldsFor(c); }
  // Define a char property by enumerating the Unicode code points,
  // or RANGE()s thereof, for which it holds. Example:
  //
  //   DEFINE_CHAR_PROPERTY_AS_SET(my_fave,
  //     'q',
  //     RANGE('0', '9'),
  //     0x20AB,
  //   )
  //
  // The expansion consists of:
  //   * a static array k_<name>_unicodes holding the listed values;
  //   * a static utils::LazyStaticPtr<CharProperty, ...> called
  //     <name>_char_property, initialized with the property name, that array
  //     and its element count (arraysize);
  //   * the registry entry (REGISTER_CHAR_PROPERTY);
  //   * the two is_<name>() functions (DEFINE_IS_X_CHAR_PROPERTY_FUNCTIONS).
  //
  // The list is taken through the standard variadic form (`...` /
  // __VA_ARGS__) rather than a named variadic parameter, which is a GNU
  // extension; the expansion is the same.
  #define DEFINE_CHAR_PROPERTY_AS_SET(name, ...)                                 \
    static const int k_##name##_unicodes[] = {__VA_ARGS__};                      \
    static utils::LazyStaticPtr<CharProperty, const char *, const int *, size_t> \
        name##_char_property = {#name, k_##name##_unicodes,                      \
                                arraysize(k_##name##_unicodes)};                 \
    REGISTER_CHAR_PROPERTY(name##_char_property, name);                          \
    DEFINE_IS_X_CHAR_PROPERTY_FUNCTIONS(name##_char_property, name)
  // RANGE(first, last) denotes the inclusive run of Unicode character values
  // first..last; e.g. RANGE('0', '9') stands for the 10 digits.  It is meant
  // to be used as an element of a DEFINE_CHAR_PROPERTY_AS_SET() list, where it
  // expands to four entries: the two bounds bracketed by the negative marker
  // values below.
  static const int kPreUnicodeRange = -1;
  static const int kPostUnicodeRange = -2;
  #define RANGE(first, last) kPreUnicodeRange, first, last, kPostUnicodeRange
  125. // A function to initialize a CharProperty.
  126. typedef void CharPropertyInitializer(CharProperty *prop);
  // Function-based definition of a char property: the brace block written
  // right after the macro invocation becomes the body of the property's
  // initializer, with `charpropvar` naming the CharProperty being filled in.
  // Example:
  //
  //   DEFINE_CHAR_PROPERTY(my_other_fave, prop) {
  //     for (int i = '0'; i <= '9'; i += 2) {
  //       prop->AddChar(i);
  //     }
  //     prop->AddAsciiPredicate(&ispunct);
  //     prop->AddCharProperty("currency_symbol");
  //   }
  //
  // The expansion forward-declares init_<name>_char_property(), creates the
  // static <name>_char_property object from the property name and that
  // function, registers it, defines the is_<name>() functions, and finally
  // opens the definition of init_<name>_char_property() -- which is why the
  // macro must be followed by a function body.
  #define DEFINE_CHAR_PROPERTY(name, charpropvar)                       \
    static void init_##name##_char_property(CharProperty *charpropvar); \
    static utils::LazyStaticPtr<CharProperty, const char *,             \
                                CharPropertyInitializer *>              \
        name##_char_property = {#name, &init_##name##_char_property};   \
    REGISTER_CHAR_PROPERTY(name##_char_property, name);                 \
    DEFINE_IS_X_CHAR_PROPERTY_FUNCTIONS(name##_char_property, name)     \
    static void init_##name##_char_property(CharProperty *charpropvar)
  // =====================================================================
  // Macro for declaring character properties
  //
  // DECLARE_CHAR_PROPERTY(X) declares the two is_X() test functions so that
  // they can be called from other translation units:
  //
  //   bool is_X(const char *str, int len);
  //   bool is_X(int c);
  //
  // The last line of the definition deliberately has no trailing backslash: a
  // dangling continuation would splice the next physical line of the file
  // into the macro body.
  #define DECLARE_CHAR_PROPERTY(name)                \
    extern bool is_##name(const char *str, int len); \
    extern bool is_##name(int c);
  152. // ===========================================================
  153. // CharProperty - a property that holds for selected Unicode chars
  154. //
  155. // A CharProperty is semantically equivalent to set<char32>.
  156. //
  157. // The characters for which a CharProperty holds are represented as a trie,
  158. // i.e., a tree that is indexed by successive bytes of the UTF-8 encoding
  159. // of the characters. This permits fast lookup (HoldsFor).
  160. //
  // Type of a function that picks out a subset of [0..255], e.g., isspace.
  using AsciiPredicate = int(int c);
  163. class CharProperty {
  164. public:
  165. // Constructor for set-based char properties.
  166. CharProperty(const char *name, const int *unicodes, int num_unicodes);
  167. // Constructor for function-based char properties.
  168. CharProperty(const char *name, CharPropertyInitializer *init_fn);
  169. virtual ~CharProperty();
  170. // Various ways of adding chars to a CharProperty; for use only in
  171. // CharPropertyInitializer functions.
  172. void AddChar(int c);
  173. void AddCharRange(int c1, int c2);
  174. void AddAsciiPredicate(AsciiPredicate *pred);
  175. void AddCharProperty(const char *name);
  176. void AddCharSpec(const int *unicodes, int num_unicodes);
  177. // Return true iff the CharProperty holds for a single given UTF8 char.
  178. bool HoldsFor(const char *str, int len) const;
  179. // Return true iff the CharProperty holds for a single given Unicode char.
  180. bool HoldsFor(int c) const;
  181. // You can use this to enumerate the set elements (it was easier
  182. // than defining a real iterator). Returns -1 if there are no more.
  183. // Call with -1 to get the first element. Expects c == -1 or HoldsFor(c).
  184. int NextElementAfter(int c) const;
  185. // Return NULL or the CharProperty with the given name. Looks up the name
  186. // in a CharProperty registry.
  187. static const CharProperty *Lookup(const char *name);
  188. private:
  189. void CheckUnicodeVal(int c) const;
  190. static string UnicodeToString(int c);
  191. const char *name_;
  192. struct CharPropertyImplementation *impl_;
  193. TF_DISALLOW_COPY_AND_ASSIGN(CharProperty);
  194. };
  195. //======================================================================
  196. // Expression-level punctuation
  197. //
  198. // Punctuation that starts a sentence.
  199. DECLARE_CHAR_PROPERTY(start_sentence_punc);
  200. // Punctuation that ends a sentence.
  201. DECLARE_CHAR_PROPERTY(end_sentence_punc);
  202. // Punctuation, such as parens, that opens a "nested expression" of text.
  203. DECLARE_CHAR_PROPERTY(open_expr_punc);
  204. // Punctuation, such as parens, that closes a "nested expression" of text.
  205. DECLARE_CHAR_PROPERTY(close_expr_punc);
  206. // Chars that open a quotation.
  207. DECLARE_CHAR_PROPERTY(open_quote);
  208. // Chars that close a quotation.
  209. DECLARE_CHAR_PROPERTY(close_quote);
  210. // Punctuation chars that open an expression or a quotation.
  211. DECLARE_CHAR_PROPERTY(open_punc);
  212. // Punctuation chars that close an expression or a quotation.
  213. DECLARE_CHAR_PROPERTY(close_punc);
  214. // Punctuation chars that can come at the beginning of a sentence.
  215. DECLARE_CHAR_PROPERTY(leading_sentence_punc);
  216. // Punctuation chars that can come at the end of a sentence.
  217. DECLARE_CHAR_PROPERTY(trailing_sentence_punc);
  218. //======================================================================
  219. // Token-level punctuation
  220. //
  221. // Token-prefix symbols -- glom on to following token
  222. // (esp. if no space after) -- except for currency symbols.
  223. DECLARE_CHAR_PROPERTY(noncurrency_token_prefix_symbol);
  224. // Token-prefix symbols -- glom on to following token (esp. if no space after).
  225. DECLARE_CHAR_PROPERTY(token_prefix_symbol);
  226. // Token-suffix symbols -- glom on to preceding token (esp. if no space
  227. // before).
  228. DECLARE_CHAR_PROPERTY(token_suffix_symbol);
  229. // Subscripts.
  230. DECLARE_CHAR_PROPERTY(subscript_symbol);
  231. // Superscripts.
  232. DECLARE_CHAR_PROPERTY(superscript_symbol);
  233. //======================================================================
  234. // General punctuation
  235. //
  236. // Connector punctuation.
  237. DECLARE_CHAR_PROPERTY(connector_punc);
  238. // Dashes.
  239. DECLARE_CHAR_PROPERTY(dash_punc);
  240. // Other punctuation.
  241. DECLARE_CHAR_PROPERTY(other_punc);
  242. // All punctuation.
  243. DECLARE_CHAR_PROPERTY(punctuation);
  244. //======================================================================
  245. // Special symbols
  246. //
  247. // Currency symbols.
  248. DECLARE_CHAR_PROPERTY(currency_symbol);
  249. // Chinese bookquotes.
  250. DECLARE_CHAR_PROPERTY(open_bookquote);
  251. DECLARE_CHAR_PROPERTY(close_bookquote);
  252. //======================================================================
  253. // Separators
  254. //
  255. // Line separators.
  256. DECLARE_CHAR_PROPERTY(line_separator);
  257. // Paragraph separators.
  258. DECLARE_CHAR_PROPERTY(paragraph_separator);
  259. // Space separators.
  260. DECLARE_CHAR_PROPERTY(space_separator);
  261. // Separators -- all line, paragraph, and space separators.
  262. DECLARE_CHAR_PROPERTY(separator);
  263. //======================================================================
  264. // Alphanumeric Characters
  265. //
  266. // Digits.
  267. DECLARE_CHAR_PROPERTY(digit);
  268. // Japanese Katakana.
  269. DECLARE_CHAR_PROPERTY(katakana);
  270. //======================================================================
  271. // BiDi Directional Formatting Codes
  272. //
  273. // Explicit directional formatting codes (LRM, RLM, LRE, RLE, PDF, LRO, RLO)
  274. // used by the bidirectional algorithm.
  275. //
  276. // Note: Use this only to classify characters. To actually determine
  277. // directionality of BiDi text, look under i18n/bidi.
  278. //
  279. // See http://www.unicode.org/reports/tr9/ for a description of the algorithm
  280. // and http://www.unicode.org/charts/PDF/U2000.pdf for the character codes.
  281. DECLARE_CHAR_PROPERTY(directional_formatting_code);
  282. //======================================================================
  283. // Special collections
  284. //
  285. // NB: This does not check for all punctuation and symbols in the standard;
  286. // just those listed in our code. See the definitions in char_properties.cc.
  287. DECLARE_CHAR_PROPERTY(punctuation_or_symbol);
  288. } // namespace syntaxnet
  289. #endif // SYNTAXNET_CHAR_PROPERTIES_H_