| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847 |
- /* Copyright 2016 Google Inc. All Rights Reserved.
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ==============================================================================*/
- // char_properties.cc - define is_X() tests for various character properties
- //
- // See char_properties.h for how to write a character property.
- //
- // References for the char sets below:
- //
- // . http://www.unicode.org/Public/UNIDATA/PropList.txt
- //
- // Large (but not exhaustive) list of Unicode chars and their "properties"
- // (e.g., the property "Pi" = an initial quote punctuation char).
- //
- // . http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
- //
- // Defines the list of properties, such as "Pi", used in the above list.
- //
- // . http://www.unipad.org/unimap/index.php?param_char=XXXX&page=detail
- //
- // Gives detail about a particular character code.
- // XXXX is a 4-hex-digit Unicode character code.
- //
- // . http://www.unicode.org/Public/UNIDATA/UCD.html
- //
- // General reference for Unicode characters.
- //
- #include "syntaxnet/char_properties.h"
- #include <ctype.h> // for ispunct, isspace
- #include <memory>
- #include <utility>
- #include <vector> // for vector
- #include "tensorflow/core/lib/strings/str_util.h"
- #include "tensorflow/core/lib/strings/stringprintf.h"
- #include "third_party/utf/utf.h" // for runetochar, ::UTFmax, Rune
- #include "util/utf8/unilib.h" // for IsValidCodepoint, etc
- #include "util/utf8/unilib_utf8_utils.h"
- //============================================================
- // CharPropertyImplementation
- //
- // A CharPropertyImplementation stores a set of Unicode characters,
- // encoded in UTF-8, as a trie. The trie is represented as a vector
- // of nodes. Each node is a 256-element array that specifies what to
- // do with one byte of the UTF-8 sequence. Each element n of a node
- // is one of:
- // n = 0, indicating that the Property is not true of any
- // character whose UTF-8 encoding includes this byte at
- // this position
- // n = -1, indicating that the Property is true for the UTF-8 sequence
- // that ends with this byte.
- // n > 0, indicating the index of the row that describes the
- // remaining bytes in the UTF-8 sequence.
- //
- // The only operation that needs to be fast is HoldsFor, which tests
- // whether a character has a given property. We use each byte of the
- // character's UTF-8 encoding to index into a row. If the value is 0,
- // then the property is not true for the character. (We might discover
- // this even before getting to the end of the sequence.) If the value
- // is -1, then the property is true for this character. Otherwise,
- // the value is the index of another row, which we index using the next
- // byte in the sequence, and so on. The design of UTF-8 prevents
- // ambiguities here; no prefix of a UTF-8 sequence is a valid UTF-8
- // sequence.
- //
- // While it is possible to implement an iterator for this representation,
- // it is much easier to use set<char32> for this purpose. In fact, we
- // would use that as the entire representation, were it not for concerns
- // that HoldsFor might be slower.
- namespace syntaxnet {
- struct CharPropertyImplementation {
- unordered_set<char32> chars;
- std::vector<std::vector<int> > rows;
- CharPropertyImplementation() {
- rows.reserve(10);
- rows.resize(1);
- rows[0].resize(256, 0);
- }
- void AddChar(char *buf, int len) {
- int n = 0; // row index
- for (int i = 0; i < len; ++i) {
- int ch = reinterpret_cast<unsigned char *>(buf)[i];
- int m = rows[n][ch];
- if (m > 0) {
- CHECK_LT(i, len - 1)
- << " : " << (i + 1) << "-byte UTF-8 sequence "
- << "(" << tensorflow::str_util::CEscape(string(buf, i + 1)) << ")"
- << " is prefix of previously-seen UTF-8 sequence(s)";
- n = m;
- } else if (i == len - 1) {
- rows[n][ch] = -1;
- } else {
- CHECK_EQ(m, 0) << " : UTF-8 sequence is extension of previously-seen "
- << (i + 1) << "-byte UTF-8 sequence "
- << "("
- << tensorflow::str_util::CEscape(string(buf, i + 1))
- << ")";
- int a = rows.size();
- rows.resize(a + 1);
- rows[a].resize(256, 0);
- rows[n][ch] = a;
- n = a;
- }
- }
- }
- bool HoldsFor(const char *buf) const {
- const unsigned char *bytes = reinterpret_cast<const unsigned char *>(buf);
- // Lookup each byte of the UTF-8 sequence, starting in row 0.
- int n = rows[0][*bytes];
- if (n == 0) return false;
- if (n == -1) return true;
- // If the value is not 0 or -1, then it is the index of the row for the
- // second byte in the sequence.
- n = rows[n][*++bytes];
- if (n == 0) return false;
- if (n == -1) return true;
- n = rows[n][*++bytes]; // Likewise for the third byte.
- if (n == 0) return false;
- if (n == -1) return true;
- n = rows[n][*++bytes]; // Likewise for the fourth byte.
- if (n == 0) return false;
- // Since there can be at most 4 bytes in the sequence, n must be -1.
- return true;
- // Implementation note: it is possible (and perhaps clearer) to write this
- // code as a loop, "for (int i = 0; i < 4; ++i) ...", but the TestHoldsFor
- // benchmark results indicate that doing so produces slower code for
- // anything other than short 7-bit ASCII strings (< 512 bytes). This is
- // mysterious, since the compiler unrolls the loop, producing code that
- // is almost the same as what we have here, except for the shortcut on
- // the 4th byte.
- }
- };
- //============================================================
- // CharProperty - a property that holds for selected Unicode chars
- //
- CharProperty::CharProperty(const char *name,
- const int *unicodes,
- int num_unicodes)
- : name_(name),
- impl_(new CharPropertyImplementation) {
- // Initialize CharProperty to its char set.
- AddCharSpec(unicodes, num_unicodes);
- }
- CharProperty::CharProperty(const char *name, CharPropertyInitializer *init_fn)
- : name_(name),
- impl_(new CharPropertyImplementation) {
- (*init_fn)(this);
- }
- CharProperty::~CharProperty() {
- delete impl_;
- }
- void CharProperty::AddChar(int c) {
- CheckUnicodeVal(c);
- impl_->chars.insert(c);
- char buf[UTFmax];
- Rune r = c;
- int len = runetochar(buf, &r);
- impl_->AddChar(buf, len);
- }
- void CharProperty::AddCharRange(int c1, int c2) {
- for (int c = c1; c <= c2; ++c) {
- AddChar(c);
- }
- }
- void CharProperty::AddAsciiPredicate(AsciiPredicate *pred) {
- for (int c = 0; c < 256; ++c) {
- if ((*pred)(c)) {
- AddChar(c);
- }
- }
- }
- void CharProperty::AddCharProperty(const char *propname) {
- const CharProperty *prop = CharProperty::Lookup(propname);
- CHECK(prop != NULL) << ": unknown char property \"" << propname
- << "\" in " << name_;
- int c = -1;
- while ((c = prop->NextElementAfter(c)) >= 0) {
- AddChar(c);
- }
- }
- void CharProperty::AddCharSpec(const int *unicodes, int num_unicodes) {
- for (int i = 0; i < num_unicodes; ++i) {
- if (i + 3 < num_unicodes && unicodes[i] == kPreUnicodeRange &&
- unicodes[i + 3] == kPostUnicodeRange) {
- // Range of unicode values
- int lower = unicodes[i + 1];
- int upper = unicodes[i + 2];
- i += 3; // i will be incremented once more at top of loop
- CHECK(lower <= upper) << ": invalid char range in " << name_
- << ": [" << UnicodeToString(lower) << ", "
- << UnicodeToString(upper) << "]";
- AddCharRange(lower, upper);
- } else {
- AddChar(unicodes[i]);
- }
- }
- }
- bool CharProperty::HoldsFor(int c) const {
- if (!UniLib::IsValidCodepoint(c)) return false;
- char buf[UTFmax];
- Rune r = c;
- runetochar(buf, &r);
- return impl_->HoldsFor(buf);
- }
- bool CharProperty::HoldsFor(const char *str, int len) const {
- // UniLib::IsUTF8ValidCodepoint also checks for structural validity.
- return len > 0 && UniLib::IsUTF8ValidCodepoint(StringPiece(str, len)) &&
- impl_->HoldsFor(str);
- }
- // Return -1 or the smallest Unicode char greater than c for which
- // the CharProperty holds. Expects c == -1 or HoldsFor(c).
- int CharProperty::NextElementAfter(int c) const {
- DCHECK(c == -1 || HoldsFor(c));
- unordered_set<char32>::const_iterator end = impl_->chars.end();
- if (c < 0) {
- unordered_set<char32>::const_iterator it = impl_->chars.begin();
- if (it == end) return -1;
- return *it;
- }
- char32 r = c;
- unordered_set<char32>::const_iterator it = impl_->chars.find(r);
- if (it == end) return -1;
- it++;
- if (it == end) return -1;
- return *it;
- }
- REGISTER_SYNTAXNET_CLASS_REGISTRY("char property wrapper", CharPropertyWrapper);
- const CharProperty *CharProperty::Lookup(const char *subclass) {
- // Create a CharPropertyWrapper object and delete it. We only care about
- // the CharProperty it provides.
- std::unique_ptr<CharPropertyWrapper> wrapper(
- CharPropertyWrapper::Create(subclass));
- if (wrapper.get() == NULL) {
- LOG(ERROR) << "CharPropertyWrapper not found for subclass: "
- << "\"" << subclass << "\"";
- return NULL;
- }
- return wrapper->GetCharProperty();
- }
- // Check that a given Unicode value is in range.
- void CharProperty::CheckUnicodeVal(int c) const {
- CHECK(UniLib::IsValidCodepoint(c))
- << "Unicode in " << name_ << " out of range: " << UnicodeToString(c);
- }
- // Converts a Unicode value to a string (for error messages).
- string CharProperty::UnicodeToString(int c) {
- const char *fmt;
- if (c < 0) {
- fmt = "%d"; // out-of-range
- } else if (c <= 0x7f) {
- fmt = "'%c'"; // ascii
- } else if (c <= 0xffff) {
- fmt = "0x%04X"; // 4 hex digits
- } else {
- fmt = "0x%X"; // also out-of-range
- }
- return tensorflow::strings::Printf(fmt, c);
- }
- //======================================================================
- // Expression-level punctuation
- //
- // Punctuation that starts a sentence.
- DEFINE_CHAR_PROPERTY_AS_SET(start_sentence_punc,
- 0x00A1, // Spanish inverted exclamation mark
- 0x00BF, // Spanish inverted question mark
- )
- // Punctuation that ends a sentence.
- // Based on: http://www.unicode.org/unicode/reports/tr29/#Sentence_Boundaries
- DEFINE_CHAR_PROPERTY_AS_SET(end_sentence_punc,
- '.',
- '!',
- '?',
- 0x055C, // Armenian exclamation mark
- 0x055E, // Armenian question mark
- 0x0589, // Armenian full stop
- 0x061F, // Arabic question mark
- 0x06D4, // Arabic full stop
- 0x0700, // Syriac end of paragraph
- 0x0701, // Syriac supralinear full stop
- 0x0702, // Syriac sublinear full stop
- RANGE(0x0964, 0x0965), // Devanagari danda..Devanagari double danda
- 0x1362, // Ethiopic full stop
- 0x1367, // Ethiopic question mark
- 0x1368, // Ethiopic paragraph separator
- 0x104A, // Myanmar sign little section
- 0x104B, // Myanmar sign section
- 0x166E, // Canadian syllabics full stop
- 0x17d4, // Khmer sign khan
- 0x1803, // Mongolian full stop
- 0x1809, // Mongolian Manchu full stop
- 0x1944, // Limbu exclamation mark
- 0x1945, // Limbu question mark
- 0x203C, // double exclamation mark
- 0x203D, // interrobang
- 0x2047, // double question mark
- 0x2048, // question exclamation mark
- 0x2049, // exclamation question mark
- 0x3002, // ideographic full stop
- 0x037E, // Greek question mark
- 0xFE52, // small full stop
- 0xFE56, // small question mark
- 0xFE57, // small exclamation mark
- 0xFF01, // fullwidth exclamation mark
- 0xFF0E, // fullwidth full stop
- 0xFF1F, // fullwidth question mark
- 0xFF61, // halfwidth ideographic full stop
- 0x2026, // ellipsis
- )
- // Punctuation, such as parens, that opens a "nested expression" of text.
- DEFINE_CHAR_PROPERTY_AS_SET(open_expr_punc,
- '(',
- '[',
- '<',
- '{',
- 0x207D, // superscript left parenthesis
- 0x208D, // subscript left parenthesis
- 0x27E6, // mathematical left white square bracket
- 0x27E8, // mathematical left angle bracket
- 0x27EA, // mathematical left double angle bracket
- 0x2983, // left white curly bracket
- 0x2985, // left white parenthesis
- 0x2987, // Z notation left image bracket
- 0x2989, // Z notation left binding bracket
- 0x298B, // left square bracket with underbar
- 0x298D, // left square bracket with tick in top corner
- 0x298F, // left square bracket with tick in bottom corner
- 0x2991, // left angle bracket with dot
- 0x2993, // left arc less-than bracket
- 0x2995, // double left arc greater-than bracket
- 0x2997, // left black tortoise shell bracket
- 0x29D8, // left wiggly fence
- 0x29DA, // left double wiggly fence
- 0x29FC, // left-pointing curved angle bracket
- 0x3008, // CJK left angle bracket
- 0x300A, // CJK left double angle bracket
- 0x3010, // CJK left black lenticular bracket
- 0x3014, // CJK left tortoise shell bracket
- 0x3016, // CJK left white lenticular bracket
- 0x3018, // CJK left white tortoise shell bracket
- 0x301A, // CJK left white square bracket
- 0xFD3E, // Ornate left parenthesis
- 0xFE59, // small left parenthesis
- 0xFE5B, // small left curly bracket
- 0xFF08, // fullwidth left parenthesis
- 0xFF3B, // fullwidth left square bracket
- 0xFF5B, // fullwidth left curly bracket
- )
- // Punctuation, such as parens, that closes a "nested expression" of text.
- DEFINE_CHAR_PROPERTY_AS_SET(close_expr_punc,
- ')',
- ']',
- '>',
- '}',
- 0x207E, // superscript right parenthesis
- 0x208E, // subscript right parenthesis
- 0x27E7, // mathematical right white square bracket
- 0x27E9, // mathematical right angle bracket
- 0x27EB, // mathematical right double angle bracket
- 0x2984, // right white curly bracket
- 0x2986, // right white parenthesis
- 0x2988, // Z notation right image bracket
- 0x298A, // Z notation right binding bracket
- 0x298C, // right square bracket with underbar
- 0x298E, // right square bracket with tick in top corner
- 0x2990, // right square bracket with tick in bottom corner
- 0x2992, // right angle bracket with dot
- 0x2994, // right arc greater-than bracket
- 0x2996, // double right arc less-than bracket
- 0x2998, // right black tortoise shell bracket
- 0x29D9, // right wiggly fence
- 0x29DB, // right double wiggly fence
- 0x29FD, // right-pointing curved angle bracket
- 0x3009, // CJK right angle bracket
- 0x300B, // CJK right double angle bracket
- 0x3011, // CJK right black lenticular bracket
- 0x3015, // CJK right tortoise shell bracket
- 0x3017, // CJK right white lenticular bracket
- 0x3019, // CJK right white tortoise shell bracket
- 0x301B, // CJK right white square bracket
- 0xFD3F, // Ornate right parenthesis
- 0xFE5A, // small right parenthesis
- 0xFE5C, // small right curly bracket
- 0xFF09, // fullwidth right parenthesis
- 0xFF3D, // fullwidth right square bracket
- 0xFF5D, // fullwidth right curly bracket
- )
- // Chars that open a quotation.
- // Based on: http://www.unicode.org/uni2book/ch06.pdf
- DEFINE_CHAR_PROPERTY_AS_SET(open_quote,
- '"',
- '\'',
- '`',
- 0xFF07, // fullwidth apostrophe
- 0xFF02, // fullwidth quotation mark
- 0x2018, // left single quotation mark (English, others)
- 0x201C, // left double quotation mark (English, others)
- 0x201B, // single high-reveresed-9 quotation mark (PropList.txt)
- 0x201A, // single low-9 quotation mark (Czech, German, Slovak)
- 0x201E, // double low-9 quotation mark (Czech, German, Slovak)
- 0x201F, // double high-reversed-9 quotation mark (PropList.txt)
- 0x2019, // right single quotation mark (Danish, Finnish, Swedish, Norw.)
- 0x201D, // right double quotation mark (Danish, Finnish, Swedish, Norw.)
- 0x2039, // single left-pointing angle quotation mark (French, others)
- 0x00AB, // left-pointing double angle quotation mark (French, others)
- 0x203A, // single right-pointing angle quotation mark (Slovenian, others)
- 0x00BB, // right-pointing double angle quotation mark (Slovenian, others)
- 0x300C, // left corner bracket (East Asian languages)
- 0xFE41, // presentation form for vertical left corner bracket
- 0xFF62, // halfwidth left corner bracket (East Asian languages)
- 0x300E, // left white corner bracket (East Asian languages)
- 0xFE43, // presentation form for vertical left white corner bracket
- 0x301D, // reversed double prime quotation mark (East Asian langs, horiz.)
- )
- // Chars that close a quotation.
- // Based on: http://www.unicode.org/uni2book/ch06.pdf
- DEFINE_CHAR_PROPERTY_AS_SET(close_quote,
- '\'',
- '"',
- '`',
- 0xFF07, // fullwidth apostrophe
- 0xFF02, // fullwidth quotation mark
- 0x2019, // right single quotation mark (English, others)
- 0x201D, // right double quotation mark (English, others)
- 0x2018, // left single quotation mark (Czech, German, Slovak)
- 0x201C, // left double quotation mark (Czech, German, Slovak)
- 0x203A, // single right-pointing angle quotation mark (French, others)
- 0x00BB, // right-pointing double angle quotation mark (French, others)
- 0x2039, // single left-pointing angle quotation mark (Slovenian, others)
- 0x00AB, // left-pointing double angle quotation mark (Slovenian, others)
- 0x300D, // right corner bracket (East Asian languages)
- 0xfe42, // presentation form for vertical right corner bracket
- 0xFF63, // halfwidth right corner bracket (East Asian languages)
- 0x300F, // right white corner bracket (East Asian languages)
- 0xfe44, // presentation form for vertical right white corner bracket
- 0x301F, // low double prime quotation mark (East Asian languages)
- 0x301E, // close double prime (East Asian languages written horizontally)
- )
- // Punctuation chars that open an expression or a quotation.
- DEFINE_CHAR_PROPERTY(open_punc, prop) {
- prop->AddCharProperty("open_expr_punc");
- prop->AddCharProperty("open_quote");
- }
- // Punctuation chars that close an expression or a quotation.
- DEFINE_CHAR_PROPERTY(close_punc, prop) {
- prop->AddCharProperty("close_expr_punc");
- prop->AddCharProperty("close_quote");
- }
- // Punctuation chars that can come at the beginning of a sentence.
- DEFINE_CHAR_PROPERTY(leading_sentence_punc, prop) {
- prop->AddCharProperty("open_punc");
- prop->AddCharProperty("start_sentence_punc");
- }
- // Punctuation chars that can come at the end of a sentence.
- DEFINE_CHAR_PROPERTY(trailing_sentence_punc, prop) {
- prop->AddCharProperty("close_punc");
- prop->AddCharProperty("end_sentence_punc");
- }
- //======================================================================
- // Special symbols
- //
- // Currency symbols.
- // From: http://www.unicode.org/charts/PDF/U20A0.pdf
- DEFINE_CHAR_PROPERTY_AS_SET(currency_symbol,
- '$',
- // 0x00A2, // cents (NB: typically FOLLOWS the amount)
- 0x00A3, // pounds and liras
- 0x00A4, // general currency sign
- 0x00A5, // yen or yuan
- 0x0192, // Dutch florin (latin small letter "f" with hook)
- 0x09F2, // Bengali rupee mark
- 0x09F3, // Bengali rupee sign
- 0x0AF1, // Guajarati rupee sign
- 0x0BF9, // Tamil rupee sign
- 0x0E3F, // Thai baht
- 0x17DB, // Khmer riel
- 0x20A0, // alternative euro sign
- 0x20A1, // Costa Rica, El Salvador (colon sign)
- 0x20A2, // Brazilian cruzeiro
- 0x20A3, // French Franc
- 0x20A4, // alternative lira sign
- 0x20A5, // mill sign (USA 1/10 cent)
- 0x20A6, // Nigerian Naira
- 0x20A7, // Spanish peseta
- 0x20A8, // Indian rupee
- 0x20A9, // Korean won
- 0x20AA, // Israeli new sheqel
- 0x20AB, // Vietnam dong
- 0x20AC, // euro sign
- 0x20AD, // Laotian kip
- 0x20AE, // Mongolian tugrik
- 0x20AF, // Greek drachma
- 0x20B0, // German penny
- 0x20B1, // Philippine peso (Mexican peso uses "$")
- 0x2133, // Old German mark (script capital M)
- 0xFDFC, // rial sign
- 0xFFE0, // fullwidth cents
- 0xFFE1, // fullwidth pounds
- 0xFFE5, // fullwidth Japanese yen
- 0xFFE6, // fullwidth Korean won
- )
- // Chinese bookquotes.
- // They look like "<<" and ">>" except that they are single UTF8 chars
- // (U+300A, U+300B). These are used in chinese as special
- // punctuation, refering to the title of a book, an article, a movie,
- // etc. For example: "cellphone" means cellphone, but <<cellphone>>
- // means (exclusively) the movie.
- DEFINE_CHAR_PROPERTY_AS_SET(open_bookquote,
- 0x300A
- )
- DEFINE_CHAR_PROPERTY_AS_SET(close_bookquote,
- 0x300B
- )
- //======================================================================
- // Token-level punctuation
- //
- // Token-prefix symbols, excluding currency symbols -- glom on
- // to following token (esp. if no space after)
- DEFINE_CHAR_PROPERTY_AS_SET(noncurrency_token_prefix_symbol,
- '#',
- 0x2116, // numero sign ("No")
- )
- // Token-prefix symbols -- glom on to following token (esp. if no space after)
- DEFINE_CHAR_PROPERTY(token_prefix_symbol, prop) {
- prop->AddCharProperty("currency_symbol");
- prop->AddCharProperty("noncurrency_token_prefix_symbol");
- }
- // Token-suffix symbols -- glom on to preceding token (esp. if no space before)
- DEFINE_CHAR_PROPERTY_AS_SET(token_suffix_symbol,
- '%',
- 0x066A, // Arabic percent sign
- 0x2030, // per mille
- 0x2031, // per ten thousand
- 0x00A2, // cents sign
- 0x2125, // ounces sign
- 0x00AA, // feminine ordinal indicator (Spanish)
- 0x00BA, // masculine ordinal indicator (Spanish)
- 0x00B0, // degrees
- 0x2109, // degrees Fahrenheit
- 0x2103, // degrees Celsius
- 0x2126, // ohms
- 0x212A, // Kelvin
- 0x212B, // Angstroms ("A" with circle on top)
- 0x00A9, // copyright
- 0x2117, // sound recording copyright (circled "P")
- 0x2122, // trade mark
- 0x00AE, // registered trade mark
- 0x2120, // service mark
- 0x2106, // cada una ("c/a" == "each" in Spanish)
- 0x2020, // dagger (can be used for footnotes)
- 0x2021, // double dagger (can be used for footnotes)
- )
- // Subscripts
- DEFINE_CHAR_PROPERTY_AS_SET(subscript_symbol,
- 0x2080, // subscript 0
- 0x2081, // subscript 1
- 0x2082, // subscript 2
- 0x2083, // subscript 3
- 0x2084, // subscript 4
- 0x2085, // subscript 5
- 0x2086, // subscript 6
- 0x2087, // subscript 7
- 0x2088, // subscript 8
- 0x2089, // subscript 9
- 0x208A, // subscript "+"
- 0x208B, // subscript "-"
- 0x208C, // subscript "="
- 0x208D, // subscript "("
- 0x208E, // subscript ")"
- )
- // Superscripts
- DEFINE_CHAR_PROPERTY_AS_SET(superscript_symbol,
- 0x2070, // superscript 0
- 0x00B9, // superscript 1
- 0x00B2, // superscript 2
- 0x00B3, // superscript 3
- 0x2074, // superscript 4
- 0x2075, // superscript 5
- 0x2076, // superscript 6
- 0x2077, // superscript 7
- 0x2078, // superscript 8
- 0x2079, // superscript 9
- 0x2071, // superscript Latin small "i"
- 0x207A, // superscript "+"
- 0x207B, // superscript "-"
- 0x207C, // superscript "="
- 0x207D, // superscript "("
- 0x207E, // superscript ")"
- 0x207F, // superscript Latin small "n"
- )
- //======================================================================
- // General punctuation
- //
- // Connector punctuation
- // Code Pc from http://www.unicode.org/Public/UNIDATA/PropList.txt
- // NB: This list is not necessarily exhaustive.
- DEFINE_CHAR_PROPERTY_AS_SET(connector_punc,
- 0x30fb, // Katakana middle dot
- 0xff65, // halfwidth Katakana middle dot
- 0x2040, // character tie
- )
- // Dashes
- // Code Pd from http://www.unicode.org/Public/UNIDATA/PropList.txt
- // NB: This list is not necessarily exhaustive.
- DEFINE_CHAR_PROPERTY_AS_SET(dash_punc,
- '-',
- '~',
- 0x058a, // Armenian hyphen
- 0x1806, // Mongolian todo soft hyphen
- RANGE(0x2010, 0x2015), // hyphen..horizontal bar
- 0x2053, // swung dash -- from Table 6-3 of Unicode book
- 0x207b, // superscript minus
- 0x208b, // subscript minus
- 0x2212, // minus sign
- 0x301c, // wave dash
- 0x3030, // wavy dash
- RANGE(0xfe31, 0xfe32), // presentation form for vertical em dash..en dash
- 0xfe58, // small em dash
- 0xfe63, // small hyphen-minus
- 0xff0d, // fullwidth hyphen-minus
- )
- // Other punctuation
- // Code Po from http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
- // NB: This list is not exhaustive.
- DEFINE_CHAR_PROPERTY_AS_SET(other_punc,
- ',',
- ':',
- ';',
- 0x00b7, // middle dot
- 0x0387, // Greek ano teleia
- 0x05c3, // Hebrew punctuation sof pasuq
- 0x060c, // Arabic comma
- 0x061b, // Arabic semicolon
- 0x066b, // Arabic decimal separator
- 0x066c, // Arabic thousands separator
- RANGE(0x0703, 0x70a), // Syriac contraction and others
- 0x070c, // Syric harklean metobelus
- 0x0e5a, // Thai character angkhankhu
- 0x0e5b, // Thai character khomut
- 0x0f08, // Tibetan mark sbrul shad
- RANGE(0x0f0d, 0x0f12), // Tibetan mark shad..Tibetan mark rgya gram shad
- 0x1361, // Ethiopic wordspace
- RANGE(0x1363, 0x1366), // other Ethiopic chars
- 0x166d, // Canadian syllabics chi sign
- RANGE(0x16eb, 0x16ed), // Runic single punctuation..Runic cross punctuation
- RANGE(0x17d5, 0x17d6), // Khmer sign camnuc pii huuh and other
- 0x17da, // Khmer sign koomut
- 0x1802, // Mongolian comma
- RANGE(0x1804, 0x1805), // Mongolian four dots and other
- 0x1808, // Mongolian manchu comma
- 0x3001, // ideographic comma
- RANGE(0xfe50, 0xfe51), // small comma and others
- RANGE(0xfe54, 0xfe55), // small semicolon and other
- 0xff0c, // fullwidth comma
- RANGE(0xff0e, 0xff0f), // fullwidth stop..fullwidth solidus
- RANGE(0xff1a, 0xff1b), // fullwidth colon..fullwidth semicolon
- 0xff64, // halfwidth ideographic comma
- 0x2016, // double vertical line
- RANGE(0x2032, 0x2034), // prime..triple prime
- 0xfe61, // small asterisk
- 0xfe68, // small reverse solidus
- 0xff3c, // fullwidth reverse solidus
- )
- // All punctuation.
- // Code P from http://www.unicode.org/Public/UNIDATA/PropList.txt
- // NB: This list is not necessarily exhaustive.
- DEFINE_CHAR_PROPERTY(punctuation, prop) {
- prop->AddCharProperty("open_punc");
- prop->AddCharProperty("close_punc");
- prop->AddCharProperty("leading_sentence_punc");
- prop->AddCharProperty("trailing_sentence_punc");
- prop->AddCharProperty("connector_punc");
- prop->AddCharProperty("dash_punc");
- prop->AddCharProperty("other_punc");
- prop->AddAsciiPredicate(&ispunct);
- }
- //======================================================================
- // Separators
- //
- // Line separators
- // Code Zl from http://www.unicode.org/Public/UNIDATA/PropList.txt
- // NB: This list is not necessarily exhaustive.
- DEFINE_CHAR_PROPERTY_AS_SET(line_separator,
- 0x2028, // line separator
- )
- // Paragraph separators
- // Code Zp from http://www.unicode.org/Public/UNIDATA/PropList.txt
- // NB: This list is not necessarily exhaustive.
- DEFINE_CHAR_PROPERTY_AS_SET(paragraph_separator,
- 0x2029, // paragraph separator
- )
- // Space separators
- // Code Zs from http://www.unicode.org/Public/UNIDATA/PropList.txt
- // NB: This list is not necessarily exhaustive.
- DEFINE_CHAR_PROPERTY_AS_SET(space_separator,
- 0x0020, // space
- 0x00a0, // no-break space
- 0x1680, // Ogham space mark
- 0x180e, // Mongolian vowel separator
- RANGE(0x2000, 0x200a), // en quad..hair space
- 0x202f, // narrow no-break space
- 0x205f, // medium mathematical space
- 0x3000, // ideographic space
- // Google additions
- 0xe5e5, // "private" char used as space in Chinese
- )
- // Separators -- all line, paragraph, and space separators.
- // Code Z from http://www.unicode.org/Public/UNIDATA/PropList.txt
- // NB: This list is not necessarily exhaustive.
- DEFINE_CHAR_PROPERTY(separator, prop) {
- prop->AddCharProperty("line_separator");
- prop->AddCharProperty("paragraph_separator");
- prop->AddCharProperty("space_separator");
- prop->AddAsciiPredicate(&isspace);
- }
- //======================================================================
- // Alphanumeric Characters
- //
- // Digits
- DEFINE_CHAR_PROPERTY_AS_SET(digit,
- RANGE('0', '9'),
- RANGE(0x0660, 0x0669), // Arabic-Indic digits
- RANGE(0x06F0, 0x06F9), // Eastern Arabic-Indic digits
- )
- //======================================================================
- // Japanese Katakana
- //
- DEFINE_CHAR_PROPERTY_AS_SET(katakana,
- 0x3099, // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
- 0x309A, // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
- 0x309B, // KATAKANA-HIRAGANA VOICED SOUND MARK
- 0x309C, // KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
- RANGE(0x30A0, 0x30FF), // Fullwidth Katakana
- RANGE(0xFF65, 0xFF9F), // Halfwidth Katakana
- )
- //======================================================================
- // BiDi Directional Formatting Codes
- //
- // See http://www.unicode.org/reports/tr9/ for a description of Bidi
- // and http://www.unicode.org/charts/PDF/U2000.pdf for the character codes.
- DEFINE_CHAR_PROPERTY_AS_SET(directional_formatting_code,
- 0x200E, // LRM (Left-to-Right Mark)
- 0x200F, // RLM (Right-to-Left Mark)
- 0x202A, // LRE (Left-to-Right Embedding)
- 0x202B, // RLE (Right-to-Left Embedding)
- 0x202C, // PDF (Pop Directional Format)
- 0x202D, // LRO (Left-to-Right Override)
- 0x202E, // RLO (Right-to-Left Override)
- )
- //======================================================================
- // Special collections
- //
- // NB: This does not check for all punctuation and symbols in the
- // standard; just those listed in our code. See the definitions in
- // char_properties.cc
- DEFINE_CHAR_PROPERTY(punctuation_or_symbol, prop) {
- prop->AddCharProperty("punctuation");
- prop->AddCharProperty("subscript_symbol");
- prop->AddCharProperty("superscript_symbol");
- prop->AddCharProperty("token_prefix_symbol");
- prop->AddCharProperty("token_suffix_symbol");
- }
- } // namespace syntaxnet
|