char_properties_test.cc 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365
  1. /* Copyright 2016 Google Inc. All Rights Reserved.
  2. Licensed under the Apache License, Version 2.0 (the "License");
  3. you may not use this file except in compliance with the License.
  4. You may obtain a copy of the License at
  5. http://www.apache.org/licenses/LICENSE-2.0
  6. Unless required by applicable law or agreed to in writing, software
  7. distributed under the License is distributed on an "AS IS" BASIS,
  8. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  9. See the License for the specific language governing permissions and
  10. limitations under the License.
  11. ==============================================================================*/
  12. // Tests for char_properties.cc:
  13. //
  14. // (1) Test the DEFINE_CHAR_PROPERTY_AS_SET and DEFINE_CHAR_PROPERTY macros
  15. // by defining a few fake char properties and verifying their contents.
  16. //
  17. // (2) Test the char properties defined in char_properties.cc by spot-checking
  18. // a few chars.
  19. //
  20. #include "syntaxnet/char_properties.h"
  21. #include <ctype.h> // for ispunct, isspace
  22. #include <map>
  23. #include <set>
  24. #include <utility>
  25. #include <vector>
  26. #include <gmock/gmock.h> // for ContainerEq, EXPECT_THAT
  27. #include "tensorflow/core/platform/test.h"
  28. #include "third_party/utf/utf.h"
  29. #include "util/utf8/unilib.h" // for IsValidCodepoint, etc
  30. #include "util/utf8/unilib_utf8_utils.h"
  31. using ::testing::ContainerEq;
  32. namespace syntaxnet {
// Invalid UTF-8 bytes are decoded as the Replacement Character, U+FFFD
// (which is also Runeerror). Invalid code points are encoded in UTF-8
// with the UTF-8 representation of the Replacement Character.
//
// The array holds exactly the three bytes EF BF BD and has no terminating
// NUL, so it must be copied by explicit length (EncodeAsUTF8Char() below
// copies 3 bytes) rather than being treated as a C string.
static const char ReplacementCharacterUTF8[3] = {'\xEF', '\xBF', '\xBD'};
  37. // ====================================================================
  38. // CharPropertiesTest
  39. //
  40. class CharPropertiesTest : public testing::Test {
  41. protected:
  42. // Collect a set of chars.
  43. void CollectChars(const std::set<char32> &chars) {
  44. collected_set_.insert(chars.begin(), chars.end());
  45. }
  46. // Collect an array of chars.
  47. void CollectArray(const char32 arr[], int len) {
  48. collected_set_.insert(arr, arr + len);
  49. }
  50. // Collect the chars for which the named CharProperty holds.
  51. void CollectCharProperty(const char *name) {
  52. const CharProperty *prop = CharProperty::Lookup(name);
  53. ASSERT_TRUE(prop != nullptr) << "for " << name;
  54. for (char32 c = 0; c <= 0x10FFFF; ++c) {
  55. if (UniLib::IsValidCodepoint(c) && prop->HoldsFor(c)) {
  56. collected_set_.insert(c);
  57. }
  58. }
  59. }
  60. // Collect the chars for which an ascii predicate holds.
  61. void CollectAsciiPredicate(AsciiPredicate *pred) {
  62. for (char32 c = 0; c < 256; ++c) {
  63. if ((*pred)(c)) {
  64. collected_set_.insert(c);
  65. }
  66. }
  67. }
  68. // Expect the named char property to be true for precisely the chars in
  69. // the collected set.
  70. void ExpectCharPropertyEqualsCollectedSet(const char *name) {
  71. const CharProperty *prop = CharProperty::Lookup(name);
  72. ASSERT_TRUE(prop != nullptr) << "for " << name;
  73. // Test that char property holds for all collected chars. Exercises both
  74. // signatures of CharProperty::HoldsFor().
  75. for (std::set<char32>::const_iterator it = collected_set_.begin();
  76. it != collected_set_.end(); ++it) {
  77. // Test utf8 version of is_X().
  78. const char32 c = *it;
  79. string utf8_char = EncodeAsUTF8(&c, 1);
  80. EXPECT_TRUE(prop->HoldsFor(utf8_char.c_str(), utf8_char.size()));
  81. // Test ucs-2 version of is_X().
  82. EXPECT_TRUE(prop->HoldsFor(static_cast<int>(c)));
  83. }
  84. // Test that the char property holds for precisely the collected chars.
  85. // Somewhat redundant with previous test, but exercises
  86. // CharProperty::NextElementAfter().
  87. std::set<char32> actual_chars;
  88. int c = -1;
  89. while ((c = prop->NextElementAfter(c)) >= 0) {
  90. actual_chars.insert(static_cast<char32>(c));
  91. }
  92. EXPECT_THAT(actual_chars, ContainerEq(collected_set_))
  93. << " for " << name;
  94. }
  95. // Expect the named char property to be true for at least the chars in
  96. // the collected set.
  97. void ExpectCharPropertyContainsCollectedSet(const char *name) {
  98. const CharProperty *prop = CharProperty::Lookup(name);
  99. ASSERT_TRUE(prop != nullptr) << "for " << name;
  100. for (std::set<char32>::const_iterator it = collected_set_.begin();
  101. it != collected_set_.end(); ++it) {
  102. EXPECT_TRUE(prop->HoldsFor(static_cast<int>(*it)));
  103. }
  104. }
  105. string EncodeAsUTF8(const char32 *in, int size) {
  106. string out;
  107. out.reserve(size);
  108. for (int i = 0; i < size; ++i) {
  109. char buf[UTFmax];
  110. int len = EncodeAsUTF8Char(*in++, buf);
  111. out.append(buf, len);
  112. }
  113. return out;
  114. }
  115. int EncodeAsUTF8Char(char32 in, char *out) {
  116. if (UniLib::IsValidCodepoint(in)) {
  117. return runetochar(out, &in);
  118. } else {
  119. memcpy(out, ReplacementCharacterUTF8, 3);
  120. return 3;
  121. }
  122. }
  123. private:
  124. std::set<char32> collected_set_;
  125. };
//======================================================================
// Declarations of the sample character sets below
// (to test the DECLARE_CHAR_PROPERTY() macro)
//
// Every name declared here is defined further down in this file, either
// with DEFINE_CHAR_PROPERTY_AS_SET or with DEFINE_CHAR_PROPERTY.
DECLARE_CHAR_PROPERTY(test_digit);
DECLARE_CHAR_PROPERTY(test_wavy_dash);
DECLARE_CHAR_PROPERTY(test_digit_or_wavy_dash);
DECLARE_CHAR_PROPERTY(test_punctuation_plus);
  134. //======================================================================
  135. // Definitions of sample character sets
  136. //
// Digits: the characters '0' through '9', written as a single RANGE entry.
// kTestDigit below lists the same characters one by one, and the TestDigit
// test expects the two to match exactly.
DEFINE_CHAR_PROPERTY_AS_SET(test_digit,
    RANGE('0', '9'),
)
// Wavy dashes: one ASCII character plus two non-ASCII code points, each
// listed individually (no RANGE).  kTestWavyDash below repeats the same
// three values for verification.
DEFINE_CHAR_PROPERTY_AS_SET(test_wavy_dash,
    '~',
    0x301C,  // wave dash
    0x3030,  // wavy dash
)
// Digits or wavy dashes.  Composed from the two sample properties above,
// referring to them by their registered names via AddCharProperty().  The
// TestDigitOrWavyDash test expects the result to equal kTestDigit plus
// kTestWavyDash.
DEFINE_CHAR_PROPERTY(test_digit_or_wavy_dash, prop) {
  prop->AddCharProperty("test_digit");
  prop->AddCharProperty("test_wavy_dash");
}
// Punctuation plus a few extraneous chars.  The extras are 'a' through 'j'
// (see kTestPunctuationPlusExtras); they are added in pieces so that each
// of AddChar(), AddCharRange(), AddCharSpec() and AddCharProperty() is
// called at least once.
DEFINE_CHAR_PROPERTY(test_punctuation_plus, prop) {
  prop->AddChar('a');            // single char
  prop->AddCharRange('b', 'b');  // range whose two endpoints are equal
  prop->AddCharRange('c', 'e');  // range with distinct endpoints
  // Spec array that mixes single chars with a RANGE entry.
  static const int kUnicodes[] = {'f', RANGE('g', 'i'), 'j'};
  prop->AddCharSpec(kUnicodes, arraysize(kUnicodes));
  // The "punctuation" property itself is not defined in this file.
  prop->AddCharProperty("punctuation");
}
  161. //====================================================================
  162. // Another form of the character sets above -- for verification
  163. //
// Expected contents of "test_digit", spelled out one char at a time instead
// of with RANGE.
const char32 kTestDigit[] = {
  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
};
// Expected contents of "test_wavy_dash".
const char32 kTestWavyDash[] = {
  '~',
  0x301C,  // wave dash
  0x3030,  // wavy dash
};
// The chars that "test_punctuation_plus" adds on top of "punctuation":
// 'a' through 'j', listed individually.  The TestPunctuationPlus test
// combines these with the members of "punctuation" to form the expected set.
const char32 kTestPunctuationPlusExtras[] = {
  'a',
  'b',
  'c',
  'd',
  'e',
  'f',
  'g',
  'h',
  'i',
  'j',
};
  184. // ====================================================================
  185. // Tests
  186. //
  187. TEST_F(CharPropertiesTest, TestDigit) {
  188. CollectArray(kTestDigit, arraysize(kTestDigit));
  189. ExpectCharPropertyEqualsCollectedSet("test_digit");
  190. }
  191. TEST_F(CharPropertiesTest, TestWavyDash) {
  192. CollectArray(kTestWavyDash, arraysize(kTestWavyDash));
  193. ExpectCharPropertyEqualsCollectedSet("test_wavy_dash");
  194. }
  195. TEST_F(CharPropertiesTest, TestDigitOrWavyDash) {
  196. CollectArray(kTestDigit, arraysize(kTestDigit));
  197. CollectArray(kTestWavyDash, arraysize(kTestWavyDash));
  198. ExpectCharPropertyEqualsCollectedSet("test_digit_or_wavy_dash");
  199. }
  200. TEST_F(CharPropertiesTest, TestPunctuationPlus) {
  201. CollectCharProperty("punctuation");
  202. CollectArray(kTestPunctuationPlusExtras,
  203. arraysize(kTestPunctuationPlusExtras));
  204. ExpectCharPropertyEqualsCollectedSet("test_punctuation_plus");
  205. }
  206. // ====================================================================
  207. // Spot-check predicates in char_properties.cc
  208. //
  209. TEST_F(CharPropertiesTest, StartSentencePunc) {
  210. CollectChars({0x00A1, 0x00BF});
  211. ExpectCharPropertyContainsCollectedSet("start_sentence_punc");
  212. }
  213. TEST_F(CharPropertiesTest, EndSentencePunc) {
  214. CollectChars({'.', '!', '?'});
  215. ExpectCharPropertyContainsCollectedSet("end_sentence_punc");
  216. }
  217. TEST_F(CharPropertiesTest, OpenExprPunc) {
  218. CollectChars({'(', '['});
  219. ExpectCharPropertyContainsCollectedSet("open_expr_punc");
  220. }
  221. TEST_F(CharPropertiesTest, CloseExprPunc) {
  222. CollectChars({')', ']'});
  223. ExpectCharPropertyContainsCollectedSet("close_expr_punc");
  224. }
  225. TEST_F(CharPropertiesTest, OpenQuote) {
  226. CollectChars({'\'', '"'});
  227. ExpectCharPropertyContainsCollectedSet("open_quote");
  228. }
  229. TEST_F(CharPropertiesTest, CloseQuote) {
  230. CollectChars({'\'', '"'});
  231. ExpectCharPropertyContainsCollectedSet("close_quote");
  232. }
  233. TEST_F(CharPropertiesTest, OpenBookquote) {
  234. CollectChars({0x300A});
  235. ExpectCharPropertyContainsCollectedSet("open_bookquote");
  236. }
  237. TEST_F(CharPropertiesTest, CloseBookquote) {
  238. CollectChars({0x300B});
  239. ExpectCharPropertyContainsCollectedSet("close_bookquote");
  240. }
  241. TEST_F(CharPropertiesTest, OpenPunc) {
  242. CollectChars({'(', '['});
  243. CollectChars({'\'', '"'});
  244. ExpectCharPropertyContainsCollectedSet("open_punc");
  245. }
  246. TEST_F(CharPropertiesTest, ClosePunc) {
  247. CollectChars({')', ']'});
  248. CollectChars({'\'', '"'});
  249. ExpectCharPropertyContainsCollectedSet("close_punc");
  250. }
  251. TEST_F(CharPropertiesTest, LeadingSentencePunc) {
  252. CollectChars({'(', '['});
  253. CollectChars({'\'', '"'});
  254. CollectChars({0x00A1, 0x00BF});
  255. ExpectCharPropertyContainsCollectedSet("leading_sentence_punc");
  256. }
  257. TEST_F(CharPropertiesTest, TrailingSentencePunc) {
  258. CollectChars({')', ']'});
  259. CollectChars({'\'', '"'});
  260. CollectChars({'.', '!', '?'});
  261. ExpectCharPropertyContainsCollectedSet("trailing_sentence_punc");
  262. }
  263. TEST_F(CharPropertiesTest, NoncurrencyTokenPrefixSymbol) {
  264. CollectChars({'#'});
  265. ExpectCharPropertyContainsCollectedSet("noncurrency_token_prefix_symbol");
  266. }
  267. TEST_F(CharPropertiesTest, TokenSuffixSymbol) {
  268. CollectChars({'%', 0x2122, 0x00A9, 0x00B0});
  269. ExpectCharPropertyContainsCollectedSet("token_suffix_symbol");
  270. }
  271. TEST_F(CharPropertiesTest, TokenPrefixSymbol) {
  272. CollectChars({'#'});
  273. CollectChars({'$', 0x00A5, 0x20AC});
  274. ExpectCharPropertyContainsCollectedSet("token_prefix_symbol");
  275. }
  276. TEST_F(CharPropertiesTest, SubscriptSymbol) {
  277. CollectChars({0x2082, 0x2083});
  278. ExpectCharPropertyContainsCollectedSet("subscript_symbol");
  279. }
  280. TEST_F(CharPropertiesTest, SuperscriptSymbol) {
  281. CollectChars({0x00B2, 0x00B3});
  282. ExpectCharPropertyContainsCollectedSet("superscript_symbol");
  283. }
  284. TEST_F(CharPropertiesTest, CurrencySymbol) {
  285. CollectChars({'$', 0x00A5, 0x20AC});
  286. ExpectCharPropertyContainsCollectedSet("currency_symbol");
  287. }
  288. TEST_F(CharPropertiesTest, DirectionalFormattingCode) {
  289. CollectChars({0x200E, 0x200F, 0x202A, 0x202B, 0x202C, 0x202D, 0x202E});
  290. ExpectCharPropertyContainsCollectedSet("directional_formatting_code");
  291. }
  292. TEST_F(CharPropertiesTest, Punctuation) {
  293. CollectAsciiPredicate(ispunct);
  294. ExpectCharPropertyContainsCollectedSet("punctuation");
  295. }
  296. TEST_F(CharPropertiesTest, Separator) {
  297. CollectAsciiPredicate(isspace);
  298. ExpectCharPropertyContainsCollectedSet("separator");
  299. }
  300. } // namespace syntaxnet