char_properties.cc 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847
  1. /* Copyright 2016 Google Inc. All Rights Reserved.
  2. Licensed under the Apache License, Version 2.0 (the "License");
  3. you may not use this file except in compliance with the License.
  4. You may obtain a copy of the License at
  5. http://www.apache.org/licenses/LICENSE-2.0
  6. Unless required by applicable law or agreed to in writing, software
  7. distributed under the License is distributed on an "AS IS" BASIS,
  8. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  9. See the License for the specific language governing permissions and
  10. limitations under the License.
  11. ==============================================================================*/
  12. // char_properties.cc - define is_X() tests for various character properties
  13. //
  14. // See char_properties.h for how to write a character property.
  15. //
  16. // References for the char sets below:
  17. //
  18. // . http://www.unicode.org/Public/UNIDATA/PropList.txt
  19. //
  20. // Large (but not exhaustive) list of Unicode chars and their "properties"
  21. // (e.g., the property "Pi" = an initial quote punctuation char).
  22. //
  23. // . http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
  24. //
  25. // Defines the list of properties, such as "Pi", used in the above list.
  26. //
  27. // . http://www.unipad.org/unimap/index.php?param_char=XXXX&page=detail
  28. //
  29. // Gives detail about a particular character code.
  30. // XXXX is a 4-hex-digit Unicode character code.
  31. //
  32. // . http://www.unicode.org/Public/UNIDATA/UCD.html
  33. //
  34. // General reference for Unicode characters.
  35. //
  36. #include "syntaxnet/char_properties.h"
  37. #include <ctype.h> // for ispunct, isspace
  38. #include <memory>
  39. #include <utility>
  40. #include <vector> // for vector
  41. #include "tensorflow/core/lib/strings/str_util.h"
  42. #include "tensorflow/core/lib/strings/stringprintf.h"
  43. #include "third_party/utf/utf.h" // for runetochar, ::UTFmax, Rune
  44. #include "util/utf8/unilib.h" // for IsValidCodepoint, etc
  45. #include "util/utf8/unilib_utf8_utils.h"
  46. //============================================================
  47. // CharPropertyImplementation
  48. //
  49. // A CharPropertyImplementation stores a set of Unicode characters,
  50. // encoded in UTF-8, as a trie. The trie is represented as a vector
  51. // of nodes. Each node is a 256-element array that specifies what to
  52. // do with one byte of the UTF-8 sequence. Each element n of a node
  53. // is one of:
  54. // n = 0, indicating that the Property is not true of any
  55. // character whose UTF-8 encoding includes this byte at
  56. // this position
  57. // n = -1, indicating that the Property is true for the UTF-8 sequence
  58. // that ends with this byte.
  59. // n > 0, indicating the index of the row that describes the
  60. // remaining bytes in the UTF-8 sequence.
  61. //
  62. // The only operation that needs to be fast is HoldsFor, which tests
  63. // whether a character has a given property. We use each byte of the
  64. // character's UTF-8 encoding to index into a row. If the value is 0,
  65. // then the property is not true for the character. (We might discover
  66. // this even before getting to the end of the sequence.) If the value
  67. // is -1, then the property is true for this character. Otherwise,
  68. // the value is the index of another row, which we index using the next
  69. // byte in the sequence, and so on. The design of UTF-8 prevents
  70. // ambiguities here; no prefix of a UTF-8 sequence is a valid UTF-8
  71. // sequence.
  72. //
  // While it is possible to implement an iterator for this representation,
  // it is much easier to keep a separate unordered_set<char32> for that
  // purpose. In fact, we would use that as the entire representation, were
  // it not for concerns that HoldsFor might be slower.
  77. namespace syntaxnet {
  78. struct CharPropertyImplementation {
  79. unordered_set<char32> chars;
  80. std::vector<std::vector<int> > rows;
  81. CharPropertyImplementation() {
  82. rows.reserve(10);
  83. rows.resize(1);
  84. rows[0].resize(256, 0);
  85. }
  86. void AddChar(char *buf, int len) {
  87. int n = 0; // row index
  88. for (int i = 0; i < len; ++i) {
  89. int ch = reinterpret_cast<unsigned char *>(buf)[i];
  90. int m = rows[n][ch];
  91. if (m > 0) {
  92. CHECK_LT(i, len - 1)
  93. << " : " << (i + 1) << "-byte UTF-8 sequence "
  94. << "(" << tensorflow::str_util::CEscape(string(buf, i + 1)) << ")"
  95. << " is prefix of previously-seen UTF-8 sequence(s)";
  96. n = m;
  97. } else if (i == len - 1) {
  98. rows[n][ch] = -1;
  99. } else {
  100. CHECK_EQ(m, 0) << " : UTF-8 sequence is extension of previously-seen "
  101. << (i + 1) << "-byte UTF-8 sequence "
  102. << "("
  103. << tensorflow::str_util::CEscape(string(buf, i + 1))
  104. << ")";
  105. int a = rows.size();
  106. rows.resize(a + 1);
  107. rows[a].resize(256, 0);
  108. rows[n][ch] = a;
  109. n = a;
  110. }
  111. }
  112. }
  113. bool HoldsFor(const char *buf) const {
  114. const unsigned char *bytes = reinterpret_cast<const unsigned char *>(buf);
  115. // Lookup each byte of the UTF-8 sequence, starting in row 0.
  116. int n = rows[0][*bytes];
  117. if (n == 0) return false;
  118. if (n == -1) return true;
  119. // If the value is not 0 or -1, then it is the index of the row for the
  120. // second byte in the sequence.
  121. n = rows[n][*++bytes];
  122. if (n == 0) return false;
  123. if (n == -1) return true;
  124. n = rows[n][*++bytes]; // Likewise for the third byte.
  125. if (n == 0) return false;
  126. if (n == -1) return true;
  127. n = rows[n][*++bytes]; // Likewise for the fourth byte.
  128. if (n == 0) return false;
  129. // Since there can be at most 4 bytes in the sequence, n must be -1.
  130. return true;
  131. // Implementation note: it is possible (and perhaps clearer) to write this
  132. // code as a loop, "for (int i = 0; i < 4; ++i) ...", but the TestHoldsFor
  133. // benchmark results indicate that doing so produces slower code for
  134. // anything other than short 7-bit ASCII strings (< 512 bytes). This is
  135. // mysterious, since the compiler unrolls the loop, producing code that
  136. // is almost the same as what we have here, except for the shortcut on
  137. // the 4th byte.
  138. }
  139. };
  140. //============================================================
  141. // CharProperty - a property that holds for selected Unicode chars
  142. //
  143. CharProperty::CharProperty(const char *name,
  144. const int *unicodes,
  145. int num_unicodes)
  146. : name_(name),
  147. impl_(new CharPropertyImplementation) {
  148. // Initialize CharProperty to its char set.
  149. AddCharSpec(unicodes, num_unicodes);
  150. }
  151. CharProperty::CharProperty(const char *name, CharPropertyInitializer *init_fn)
  152. : name_(name),
  153. impl_(new CharPropertyImplementation) {
  154. (*init_fn)(this);
  155. }
  156. CharProperty::~CharProperty() {
  157. delete impl_;
  158. }
  159. void CharProperty::AddChar(int c) {
  160. CheckUnicodeVal(c);
  161. impl_->chars.insert(c);
  162. char buf[UTFmax];
  163. Rune r = c;
  164. int len = runetochar(buf, &r);
  165. impl_->AddChar(buf, len);
  166. }
  167. void CharProperty::AddCharRange(int c1, int c2) {
  168. for (int c = c1; c <= c2; ++c) {
  169. AddChar(c);
  170. }
  171. }
  172. void CharProperty::AddAsciiPredicate(AsciiPredicate *pred) {
  173. for (int c = 0; c < 256; ++c) {
  174. if ((*pred)(c)) {
  175. AddChar(c);
  176. }
  177. }
  178. }
  179. void CharProperty::AddCharProperty(const char *propname) {
  180. const CharProperty *prop = CharProperty::Lookup(propname);
  181. CHECK(prop != NULL) << ": unknown char property \"" << propname
  182. << "\" in " << name_;
  183. int c = -1;
  184. while ((c = prop->NextElementAfter(c)) >= 0) {
  185. AddChar(c);
  186. }
  187. }
  188. void CharProperty::AddCharSpec(const int *unicodes, int num_unicodes) {
  189. for (int i = 0; i < num_unicodes; ++i) {
  190. if (i + 3 < num_unicodes && unicodes[i] == kPreUnicodeRange &&
  191. unicodes[i + 3] == kPostUnicodeRange) {
  192. // Range of unicode values
  193. int lower = unicodes[i + 1];
  194. int upper = unicodes[i + 2];
  195. i += 3; // i will be incremented once more at top of loop
  196. CHECK(lower <= upper) << ": invalid char range in " << name_
  197. << ": [" << UnicodeToString(lower) << ", "
  198. << UnicodeToString(upper) << "]";
  199. AddCharRange(lower, upper);
  200. } else {
  201. AddChar(unicodes[i]);
  202. }
  203. }
  204. }
  205. bool CharProperty::HoldsFor(int c) const {
  206. if (!UniLib::IsValidCodepoint(c)) return false;
  207. char buf[UTFmax];
  208. Rune r = c;
  209. runetochar(buf, &r);
  210. return impl_->HoldsFor(buf);
  211. }
  212. bool CharProperty::HoldsFor(const char *str, int len) const {
  213. // UniLib::IsUTF8ValidCodepoint also checks for structural validity.
  214. return len > 0 && UniLib::IsUTF8ValidCodepoint(StringPiece(str, len)) &&
  215. impl_->HoldsFor(str);
  216. }
  217. // Return -1 or the smallest Unicode char greater than c for which
  218. // the CharProperty holds. Expects c == -1 or HoldsFor(c).
  219. int CharProperty::NextElementAfter(int c) const {
  220. DCHECK(c == -1 || HoldsFor(c));
  221. unordered_set<char32>::const_iterator end = impl_->chars.end();
  222. if (c < 0) {
  223. unordered_set<char32>::const_iterator it = impl_->chars.begin();
  224. if (it == end) return -1;
  225. return *it;
  226. }
  227. char32 r = c;
  228. unordered_set<char32>::const_iterator it = impl_->chars.find(r);
  229. if (it == end) return -1;
  230. it++;
  231. if (it == end) return -1;
  232. return *it;
  233. }
  234. REGISTER_SYNTAXNET_CLASS_REGISTRY("char property wrapper", CharPropertyWrapper);
  235. const CharProperty *CharProperty::Lookup(const char *subclass) {
  236. // Create a CharPropertyWrapper object and delete it. We only care about
  237. // the CharProperty it provides.
  238. std::unique_ptr<CharPropertyWrapper> wrapper(
  239. CharPropertyWrapper::Create(subclass));
  240. if (wrapper.get() == NULL) {
  241. LOG(ERROR) << "CharPropertyWrapper not found for subclass: "
  242. << "\"" << subclass << "\"";
  243. return NULL;
  244. }
  245. return wrapper->GetCharProperty();
  246. }
  247. // Check that a given Unicode value is in range.
  248. void CharProperty::CheckUnicodeVal(int c) const {
  249. CHECK(UniLib::IsValidCodepoint(c))
  250. << "Unicode in " << name_ << " out of range: " << UnicodeToString(c);
  251. }
  252. // Converts a Unicode value to a string (for error messages).
  253. string CharProperty::UnicodeToString(int c) {
  254. const char *fmt;
  255. if (c < 0) {
  256. fmt = "%d"; // out-of-range
  257. } else if (c <= 0x7f) {
  258. fmt = "'%c'"; // ascii
  259. } else if (c <= 0xffff) {
  260. fmt = "0x%04X"; // 4 hex digits
  261. } else {
  262. fmt = "0x%X"; // also out-of-range
  263. }
  264. return tensorflow::strings::Printf(fmt, c);
  265. }
  266. //======================================================================
  267. // Expression-level punctuation
  268. //
  269. // Punctuation that starts a sentence.
  270. DEFINE_CHAR_PROPERTY_AS_SET(start_sentence_punc,
  271. 0x00A1, // Spanish inverted exclamation mark
  272. 0x00BF, // Spanish inverted question mark
  273. )
  274. // Punctuation that ends a sentence.
  275. // Based on: http://www.unicode.org/unicode/reports/tr29/#Sentence_Boundaries
  276. DEFINE_CHAR_PROPERTY_AS_SET(end_sentence_punc,
  277. '.',
  278. '!',
  279. '?',
  280. 0x055C, // Armenian exclamation mark
  281. 0x055E, // Armenian question mark
  282. 0x0589, // Armenian full stop
  283. 0x061F, // Arabic question mark
  284. 0x06D4, // Arabic full stop
  285. 0x0700, // Syriac end of paragraph
  286. 0x0701, // Syriac supralinear full stop
  287. 0x0702, // Syriac sublinear full stop
  288. RANGE(0x0964, 0x0965), // Devanagari danda..Devanagari double danda
  289. 0x1362, // Ethiopic full stop
  290. 0x1367, // Ethiopic question mark
  291. 0x1368, // Ethiopic paragraph separator
  292. 0x104A, // Myanmar sign little section
  293. 0x104B, // Myanmar sign section
  294. 0x166E, // Canadian syllabics full stop
  295. 0x17d4, // Khmer sign khan
  296. 0x1803, // Mongolian full stop
  297. 0x1809, // Mongolian Manchu full stop
  298. 0x1944, // Limbu exclamation mark
  299. 0x1945, // Limbu question mark
  300. 0x203C, // double exclamation mark
  301. 0x203D, // interrobang
  302. 0x2047, // double question mark
  303. 0x2048, // question exclamation mark
  304. 0x2049, // exclamation question mark
  305. 0x3002, // ideographic full stop
  306. 0x037E, // Greek question mark
  307. 0xFE52, // small full stop
  308. 0xFE56, // small question mark
  309. 0xFE57, // small exclamation mark
  310. 0xFF01, // fullwidth exclamation mark
  311. 0xFF0E, // fullwidth full stop
  312. 0xFF1F, // fullwidth question mark
  313. 0xFF61, // halfwidth ideographic full stop
  314. 0x2026, // ellipsis
  315. )
  316. // Punctuation, such as parens, that opens a "nested expression" of text.
  317. DEFINE_CHAR_PROPERTY_AS_SET(open_expr_punc,
  318. '(',
  319. '[',
  320. '<',
  321. '{',
  322. 0x207D, // superscript left parenthesis
  323. 0x208D, // subscript left parenthesis
  324. 0x27E6, // mathematical left white square bracket
  325. 0x27E8, // mathematical left angle bracket
  326. 0x27EA, // mathematical left double angle bracket
  327. 0x2983, // left white curly bracket
  328. 0x2985, // left white parenthesis
  329. 0x2987, // Z notation left image bracket
  330. 0x2989, // Z notation left binding bracket
  331. 0x298B, // left square bracket with underbar
  332. 0x298D, // left square bracket with tick in top corner
  333. 0x298F, // left square bracket with tick in bottom corner
  334. 0x2991, // left angle bracket with dot
  335. 0x2993, // left arc less-than bracket
  336. 0x2995, // double left arc greater-than bracket
  337. 0x2997, // left black tortoise shell bracket
  338. 0x29D8, // left wiggly fence
  339. 0x29DA, // left double wiggly fence
  340. 0x29FC, // left-pointing curved angle bracket
  341. 0x3008, // CJK left angle bracket
  342. 0x300A, // CJK left double angle bracket
  343. 0x3010, // CJK left black lenticular bracket
  344. 0x3014, // CJK left tortoise shell bracket
  345. 0x3016, // CJK left white lenticular bracket
  346. 0x3018, // CJK left white tortoise shell bracket
  347. 0x301A, // CJK left white square bracket
  348. 0xFD3E, // Ornate left parenthesis
  349. 0xFE59, // small left parenthesis
  350. 0xFE5B, // small left curly bracket
  351. 0xFF08, // fullwidth left parenthesis
  352. 0xFF3B, // fullwidth left square bracket
  353. 0xFF5B, // fullwidth left curly bracket
  354. )
  355. // Punctuation, such as parens, that closes a "nested expression" of text.
  356. DEFINE_CHAR_PROPERTY_AS_SET(close_expr_punc,
  357. ')',
  358. ']',
  359. '>',
  360. '}',
  361. 0x207E, // superscript right parenthesis
  362. 0x208E, // subscript right parenthesis
  363. 0x27E7, // mathematical right white square bracket
  364. 0x27E9, // mathematical right angle bracket
  365. 0x27EB, // mathematical right double angle bracket
  366. 0x2984, // right white curly bracket
  367. 0x2986, // right white parenthesis
  368. 0x2988, // Z notation right image bracket
  369. 0x298A, // Z notation right binding bracket
  370. 0x298C, // right square bracket with underbar
  371. 0x298E, // right square bracket with tick in top corner
  372. 0x2990, // right square bracket with tick in bottom corner
  373. 0x2992, // right angle bracket with dot
  374. 0x2994, // right arc greater-than bracket
  375. 0x2996, // double right arc less-than bracket
  376. 0x2998, // right black tortoise shell bracket
  377. 0x29D9, // right wiggly fence
  378. 0x29DB, // right double wiggly fence
  379. 0x29FD, // right-pointing curved angle bracket
  380. 0x3009, // CJK right angle bracket
  381. 0x300B, // CJK right double angle bracket
  382. 0x3011, // CJK right black lenticular bracket
  383. 0x3015, // CJK right tortoise shell bracket
  384. 0x3017, // CJK right white lenticular bracket
  385. 0x3019, // CJK right white tortoise shell bracket
  386. 0x301B, // CJK right white square bracket
  387. 0xFD3F, // Ornate right parenthesis
  388. 0xFE5A, // small right parenthesis
  389. 0xFE5C, // small right curly bracket
  390. 0xFF09, // fullwidth right parenthesis
  391. 0xFF3D, // fullwidth right square bracket
  392. 0xFF5D, // fullwidth right curly bracket
  393. )
  394. // Chars that open a quotation.
  395. // Based on: http://www.unicode.org/uni2book/ch06.pdf
  396. DEFINE_CHAR_PROPERTY_AS_SET(open_quote,
  397. '"',
  398. '\'',
  399. '`',
  400. 0xFF07, // fullwidth apostrophe
  401. 0xFF02, // fullwidth quotation mark
  402. 0x2018, // left single quotation mark (English, others)
  403. 0x201C, // left double quotation mark (English, others)
  404. 0x201B, // single high-reveresed-9 quotation mark (PropList.txt)
  405. 0x201A, // single low-9 quotation mark (Czech, German, Slovak)
  406. 0x201E, // double low-9 quotation mark (Czech, German, Slovak)
  407. 0x201F, // double high-reversed-9 quotation mark (PropList.txt)
  408. 0x2019, // right single quotation mark (Danish, Finnish, Swedish, Norw.)
  409. 0x201D, // right double quotation mark (Danish, Finnish, Swedish, Norw.)
  410. 0x2039, // single left-pointing angle quotation mark (French, others)
  411. 0x00AB, // left-pointing double angle quotation mark (French, others)
  412. 0x203A, // single right-pointing angle quotation mark (Slovenian, others)
  413. 0x00BB, // right-pointing double angle quotation mark (Slovenian, others)
  414. 0x300C, // left corner bracket (East Asian languages)
  415. 0xFE41, // presentation form for vertical left corner bracket
  416. 0xFF62, // halfwidth left corner bracket (East Asian languages)
  417. 0x300E, // left white corner bracket (East Asian languages)
  418. 0xFE43, // presentation form for vertical left white corner bracket
  419. 0x301D, // reversed double prime quotation mark (East Asian langs, horiz.)
  420. )
  421. // Chars that close a quotation.
  422. // Based on: http://www.unicode.org/uni2book/ch06.pdf
  423. DEFINE_CHAR_PROPERTY_AS_SET(close_quote,
  424. '\'',
  425. '"',
  426. '`',
  427. 0xFF07, // fullwidth apostrophe
  428. 0xFF02, // fullwidth quotation mark
  429. 0x2019, // right single quotation mark (English, others)
  430. 0x201D, // right double quotation mark (English, others)
  431. 0x2018, // left single quotation mark (Czech, German, Slovak)
  432. 0x201C, // left double quotation mark (Czech, German, Slovak)
  433. 0x203A, // single right-pointing angle quotation mark (French, others)
  434. 0x00BB, // right-pointing double angle quotation mark (French, others)
  435. 0x2039, // single left-pointing angle quotation mark (Slovenian, others)
  436. 0x00AB, // left-pointing double angle quotation mark (Slovenian, others)
  437. 0x300D, // right corner bracket (East Asian languages)
  438. 0xfe42, // presentation form for vertical right corner bracket
  439. 0xFF63, // halfwidth right corner bracket (East Asian languages)
  440. 0x300F, // right white corner bracket (East Asian languages)
  441. 0xfe44, // presentation form for vertical right white corner bracket
  442. 0x301F, // low double prime quotation mark (East Asian languages)
  443. 0x301E, // close double prime (East Asian languages written horizontally)
  444. )
  445. // Punctuation chars that open an expression or a quotation.
  446. DEFINE_CHAR_PROPERTY(open_punc, prop) {
  447. prop->AddCharProperty("open_expr_punc");
  448. prop->AddCharProperty("open_quote");
  449. }
  450. // Punctuation chars that close an expression or a quotation.
  451. DEFINE_CHAR_PROPERTY(close_punc, prop) {
  452. prop->AddCharProperty("close_expr_punc");
  453. prop->AddCharProperty("close_quote");
  454. }
  455. // Punctuation chars that can come at the beginning of a sentence.
  456. DEFINE_CHAR_PROPERTY(leading_sentence_punc, prop) {
  457. prop->AddCharProperty("open_punc");
  458. prop->AddCharProperty("start_sentence_punc");
  459. }
  460. // Punctuation chars that can come at the end of a sentence.
  461. DEFINE_CHAR_PROPERTY(trailing_sentence_punc, prop) {
  462. prop->AddCharProperty("close_punc");
  463. prop->AddCharProperty("end_sentence_punc");
  464. }
  465. //======================================================================
  466. // Special symbols
  467. //
  468. // Currency symbols.
  469. // From: http://www.unicode.org/charts/PDF/U20A0.pdf
  470. DEFINE_CHAR_PROPERTY_AS_SET(currency_symbol,
  471. '$',
  472. // 0x00A2, // cents (NB: typically FOLLOWS the amount)
  473. 0x00A3, // pounds and liras
  474. 0x00A4, // general currency sign
  475. 0x00A5, // yen or yuan
  476. 0x0192, // Dutch florin (latin small letter "f" with hook)
  477. 0x09F2, // Bengali rupee mark
  478. 0x09F3, // Bengali rupee sign
  479. 0x0AF1, // Guajarati rupee sign
  480. 0x0BF9, // Tamil rupee sign
  481. 0x0E3F, // Thai baht
  482. 0x17DB, // Khmer riel
  483. 0x20A0, // alternative euro sign
  484. 0x20A1, // Costa Rica, El Salvador (colon sign)
  485. 0x20A2, // Brazilian cruzeiro
  486. 0x20A3, // French Franc
  487. 0x20A4, // alternative lira sign
  488. 0x20A5, // mill sign (USA 1/10 cent)
  489. 0x20A6, // Nigerian Naira
  490. 0x20A7, // Spanish peseta
  491. 0x20A8, // Indian rupee
  492. 0x20A9, // Korean won
  493. 0x20AA, // Israeli new sheqel
  494. 0x20AB, // Vietnam dong
  495. 0x20AC, // euro sign
  496. 0x20AD, // Laotian kip
  497. 0x20AE, // Mongolian tugrik
  498. 0x20AF, // Greek drachma
  499. 0x20B0, // German penny
  500. 0x20B1, // Philippine peso (Mexican peso uses "$")
  501. 0x2133, // Old German mark (script capital M)
  502. 0xFDFC, // rial sign
  503. 0xFFE0, // fullwidth cents
  504. 0xFFE1, // fullwidth pounds
  505. 0xFFE5, // fullwidth Japanese yen
  506. 0xFFE6, // fullwidth Korean won
  507. )
  508. // Chinese bookquotes.
  509. // They look like "<<" and ">>" except that they are single UTF8 chars
  510. // (U+300A, U+300B). These are used in chinese as special
  511. // punctuation, refering to the title of a book, an article, a movie,
  512. // etc. For example: "cellphone" means cellphone, but <<cellphone>>
  513. // means (exclusively) the movie.
  514. DEFINE_CHAR_PROPERTY_AS_SET(open_bookquote,
  515. 0x300A
  516. )
  517. DEFINE_CHAR_PROPERTY_AS_SET(close_bookquote,
  518. 0x300B
  519. )
  520. //======================================================================
  521. // Token-level punctuation
  522. //
  523. // Token-prefix symbols, excluding currency symbols -- glom on
  524. // to following token (esp. if no space after)
  525. DEFINE_CHAR_PROPERTY_AS_SET(noncurrency_token_prefix_symbol,
  526. '#',
  527. 0x2116, // numero sign ("No")
  528. )
  529. // Token-prefix symbols -- glom on to following token (esp. if no space after)
  530. DEFINE_CHAR_PROPERTY(token_prefix_symbol, prop) {
  531. prop->AddCharProperty("currency_symbol");
  532. prop->AddCharProperty("noncurrency_token_prefix_symbol");
  533. }
  534. // Token-suffix symbols -- glom on to preceding token (esp. if no space before)
  535. DEFINE_CHAR_PROPERTY_AS_SET(token_suffix_symbol,
  536. '%',
  537. 0x066A, // Arabic percent sign
  538. 0x2030, // per mille
  539. 0x2031, // per ten thousand
  540. 0x00A2, // cents sign
  541. 0x2125, // ounces sign
  542. 0x00AA, // feminine ordinal indicator (Spanish)
  543. 0x00BA, // masculine ordinal indicator (Spanish)
  544. 0x00B0, // degrees
  545. 0x2109, // degrees Fahrenheit
  546. 0x2103, // degrees Celsius
  547. 0x2126, // ohms
  548. 0x212A, // Kelvin
  549. 0x212B, // Angstroms ("A" with circle on top)
  550. 0x00A9, // copyright
  551. 0x2117, // sound recording copyright (circled "P")
  552. 0x2122, // trade mark
  553. 0x00AE, // registered trade mark
  554. 0x2120, // service mark
  555. 0x2106, // cada una ("c/a" == "each" in Spanish)
  556. 0x2020, // dagger (can be used for footnotes)
  557. 0x2021, // double dagger (can be used for footnotes)
  558. )
  559. // Subscripts
  560. DEFINE_CHAR_PROPERTY_AS_SET(subscript_symbol,
  561. 0x2080, // subscript 0
  562. 0x2081, // subscript 1
  563. 0x2082, // subscript 2
  564. 0x2083, // subscript 3
  565. 0x2084, // subscript 4
  566. 0x2085, // subscript 5
  567. 0x2086, // subscript 6
  568. 0x2087, // subscript 7
  569. 0x2088, // subscript 8
  570. 0x2089, // subscript 9
  571. 0x208A, // subscript "+"
  572. 0x208B, // subscript "-"
  573. 0x208C, // subscript "="
  574. 0x208D, // subscript "("
  575. 0x208E, // subscript ")"
  576. )
  577. // Superscripts
  578. DEFINE_CHAR_PROPERTY_AS_SET(superscript_symbol,
  579. 0x2070, // superscript 0
  580. 0x00B9, // superscript 1
  581. 0x00B2, // superscript 2
  582. 0x00B3, // superscript 3
  583. 0x2074, // superscript 4
  584. 0x2075, // superscript 5
  585. 0x2076, // superscript 6
  586. 0x2077, // superscript 7
  587. 0x2078, // superscript 8
  588. 0x2079, // superscript 9
  589. 0x2071, // superscript Latin small "i"
  590. 0x207A, // superscript "+"
  591. 0x207B, // superscript "-"
  592. 0x207C, // superscript "="
  593. 0x207D, // superscript "("
  594. 0x207E, // superscript ")"
  595. 0x207F, // superscript Latin small "n"
  596. )
  597. //======================================================================
  598. // General punctuation
  599. //
  600. // Connector punctuation
  601. // Code Pc from http://www.unicode.org/Public/UNIDATA/PropList.txt
  602. // NB: This list is not necessarily exhaustive.
  603. DEFINE_CHAR_PROPERTY_AS_SET(connector_punc,
  604. 0x30fb, // Katakana middle dot
  605. 0xff65, // halfwidth Katakana middle dot
  606. 0x2040, // character tie
  607. )
  608. // Dashes
  609. // Code Pd from http://www.unicode.org/Public/UNIDATA/PropList.txt
  610. // NB: This list is not necessarily exhaustive.
  611. DEFINE_CHAR_PROPERTY_AS_SET(dash_punc,
  612. '-',
  613. '~',
  614. 0x058a, // Armenian hyphen
  615. 0x1806, // Mongolian todo soft hyphen
  616. RANGE(0x2010, 0x2015), // hyphen..horizontal bar
  617. 0x2053, // swung dash -- from Table 6-3 of Unicode book
  618. 0x207b, // superscript minus
  619. 0x208b, // subscript minus
  620. 0x2212, // minus sign
  621. 0x301c, // wave dash
  622. 0x3030, // wavy dash
  623. RANGE(0xfe31, 0xfe32), // presentation form for vertical em dash..en dash
  624. 0xfe58, // small em dash
  625. 0xfe63, // small hyphen-minus
  626. 0xff0d, // fullwidth hyphen-minus
  627. )
  628. // Other punctuation
  629. // Code Po from http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
  630. // NB: This list is not exhaustive.
  631. DEFINE_CHAR_PROPERTY_AS_SET(other_punc,
  632. ',',
  633. ':',
  634. ';',
  635. 0x00b7, // middle dot
  636. 0x0387, // Greek ano teleia
  637. 0x05c3, // Hebrew punctuation sof pasuq
  638. 0x060c, // Arabic comma
  639. 0x061b, // Arabic semicolon
  640. 0x066b, // Arabic decimal separator
  641. 0x066c, // Arabic thousands separator
  642. RANGE(0x0703, 0x70a), // Syriac contraction and others
  643. 0x070c, // Syric harklean metobelus
  644. 0x0e5a, // Thai character angkhankhu
  645. 0x0e5b, // Thai character khomut
  646. 0x0f08, // Tibetan mark sbrul shad
  647. RANGE(0x0f0d, 0x0f12), // Tibetan mark shad..Tibetan mark rgya gram shad
  648. 0x1361, // Ethiopic wordspace
  649. RANGE(0x1363, 0x1366), // other Ethiopic chars
  650. 0x166d, // Canadian syllabics chi sign
  651. RANGE(0x16eb, 0x16ed), // Runic single punctuation..Runic cross punctuation
  652. RANGE(0x17d5, 0x17d6), // Khmer sign camnuc pii huuh and other
  653. 0x17da, // Khmer sign koomut
  654. 0x1802, // Mongolian comma
  655. RANGE(0x1804, 0x1805), // Mongolian four dots and other
  656. 0x1808, // Mongolian manchu comma
  657. 0x3001, // ideographic comma
  658. RANGE(0xfe50, 0xfe51), // small comma and others
  659. RANGE(0xfe54, 0xfe55), // small semicolon and other
  660. 0xff0c, // fullwidth comma
  661. RANGE(0xff0e, 0xff0f), // fullwidth stop..fullwidth solidus
  662. RANGE(0xff1a, 0xff1b), // fullwidth colon..fullwidth semicolon
  663. 0xff64, // halfwidth ideographic comma
  664. 0x2016, // double vertical line
  665. RANGE(0x2032, 0x2034), // prime..triple prime
  666. 0xfe61, // small asterisk
  667. 0xfe68, // small reverse solidus
  668. 0xff3c, // fullwidth reverse solidus
  669. )
  670. // All punctuation.
  671. // Code P from http://www.unicode.org/Public/UNIDATA/PropList.txt
  672. // NB: This list is not necessarily exhaustive.
  673. DEFINE_CHAR_PROPERTY(punctuation, prop) {
  674. prop->AddCharProperty("open_punc");
  675. prop->AddCharProperty("close_punc");
  676. prop->AddCharProperty("leading_sentence_punc");
  677. prop->AddCharProperty("trailing_sentence_punc");
  678. prop->AddCharProperty("connector_punc");
  679. prop->AddCharProperty("dash_punc");
  680. prop->AddCharProperty("other_punc");
  681. prop->AddAsciiPredicate(&ispunct);
  682. }
  683. //======================================================================
  684. // Separators
  685. //
  686. // Line separators
  687. // Code Zl from http://www.unicode.org/Public/UNIDATA/PropList.txt
  688. // NB: This list is not necessarily exhaustive.
  689. DEFINE_CHAR_PROPERTY_AS_SET(line_separator,
  690. 0x2028, // line separator
  691. )
  692. // Paragraph separators
  693. // Code Zp from http://www.unicode.org/Public/UNIDATA/PropList.txt
  694. // NB: This list is not necessarily exhaustive.
  695. DEFINE_CHAR_PROPERTY_AS_SET(paragraph_separator,
  696. 0x2029, // paragraph separator
  697. )
  698. // Space separators
  699. // Code Zs from http://www.unicode.org/Public/UNIDATA/PropList.txt
  700. // NB: This list is not necessarily exhaustive.
  701. DEFINE_CHAR_PROPERTY_AS_SET(space_separator,
  702. 0x0020, // space
  703. 0x00a0, // no-break space
  704. 0x1680, // Ogham space mark
  705. 0x180e, // Mongolian vowel separator
  706. RANGE(0x2000, 0x200a), // en quad..hair space
  707. 0x202f, // narrow no-break space
  708. 0x205f, // medium mathematical space
  709. 0x3000, // ideographic space
  710. // Google additions
  711. 0xe5e5, // "private" char used as space in Chinese
  712. )
  713. // Separators -- all line, paragraph, and space separators.
  714. // Code Z from http://www.unicode.org/Public/UNIDATA/PropList.txt
  715. // NB: This list is not necessarily exhaustive.
  716. DEFINE_CHAR_PROPERTY(separator, prop) {
  717. prop->AddCharProperty("line_separator");
  718. prop->AddCharProperty("paragraph_separator");
  719. prop->AddCharProperty("space_separator");
  720. prop->AddAsciiPredicate(&isspace);
  721. }
  722. //======================================================================
  723. // Alphanumeric Characters
  724. //
  725. // Digits
  726. DEFINE_CHAR_PROPERTY_AS_SET(digit,
  727. RANGE('0', '9'),
  728. RANGE(0x0660, 0x0669), // Arabic-Indic digits
  729. RANGE(0x06F0, 0x06F9), // Eastern Arabic-Indic digits
  730. )
  731. //======================================================================
  732. // Japanese Katakana
  733. //
  734. DEFINE_CHAR_PROPERTY_AS_SET(katakana,
  735. 0x3099, // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
  736. 0x309A, // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
  737. 0x309B, // KATAKANA-HIRAGANA VOICED SOUND MARK
  738. 0x309C, // KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
  739. RANGE(0x30A0, 0x30FF), // Fullwidth Katakana
  740. RANGE(0xFF65, 0xFF9F), // Halfwidth Katakana
  741. )
  742. //======================================================================
  743. // BiDi Directional Formatting Codes
  744. //
  745. // See http://www.unicode.org/reports/tr9/ for a description of Bidi
  746. // and http://www.unicode.org/charts/PDF/U2000.pdf for the character codes.
  747. DEFINE_CHAR_PROPERTY_AS_SET(directional_formatting_code,
  748. 0x200E, // LRM (Left-to-Right Mark)
  749. 0x200F, // RLM (Right-to-Left Mark)
  750. 0x202A, // LRE (Left-to-Right Embedding)
  751. 0x202B, // RLE (Right-to-Left Embedding)
  752. 0x202C, // PDF (Pop Directional Format)
  753. 0x202D, // LRO (Left-to-Right Override)
  754. 0x202E, // RLO (Right-to-Left Override)
  755. )
  756. //======================================================================
  757. // Special collections
  758. //
  759. // NB: This does not check for all punctuation and symbols in the
  760. // standard; just those listed in our code. See the definitions in
  761. // char_properties.cc
  762. DEFINE_CHAR_PROPERTY(punctuation_or_symbol, prop) {
  763. prop->AddCharProperty("punctuation");
  764. prop->AddCharProperty("subscript_symbol");
  765. prop->AddCharProperty("superscript_symbol");
  766. prop->AddCharProperty("token_prefix_symbol");
  767. prop->AddCharProperty("token_suffix_symbol");
  768. }
  769. } // namespace syntaxnet