// unicodetext_unittest.cc
  1. /**
  2. * Copyright 2010 Google Inc.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "util/utf8/unicodetext.h"
  17. #include <iterator>
  18. #include <set>
  19. #include "gtest/gtest.h"
  20. #include "third_party/utf/utf.h"
  21. #include "util/utf8/unilib.h"
  22. namespace {
  23. class UnicodeTextTest : public testing::Test {
  24. protected:
  25. UnicodeTextTest() : empty_text_() {
  26. const char32 text[] = {0x1C0, 0x4E8C, 0xD7DB, 0x34, 0x1D11E};
  27. // Construct a UnicodeText from those codepoints.
  28. text_.append(&text[0], text + arraysize(text));
  29. }
  30. UnicodeText empty_text_;
  31. UnicodeText text_;
  32. };
  33. TEST(UnicodeTextTest, Ownership) {
  34. const string src = "\u304A\u00B0\u106B";
  35. {
  36. string s = src;
  37. char* sbuf = new char[s.size()];
  38. memcpy(sbuf, s.data(), s.size());
  39. UnicodeText owned;
  40. owned.TakeOwnershipOfUTF8(sbuf, s.size(), s.size());
  41. EXPECT_EQ(owned.utf8_data(), sbuf);
  42. s.clear();
  43. // owned should be OK even after s has been cleared.
  44. UnicodeText::const_iterator it = owned.begin();
  45. EXPECT_EQ(*it++, 0x304A);
  46. EXPECT_EQ(*it++, 0x00B0);
  47. EXPECT_EQ(*it++, 0x106B);
  48. CHECK(it == owned.end());
  49. }
  50. {
  51. UnicodeText owner;
  52. { // Create a new scope for s.
  53. string s = src;
  54. char* sbuf = new char[s.size()];
  55. memcpy(sbuf, s.data(), s.size());
  56. UnicodeText t;
  57. t.TakeOwnershipOfUTF8(sbuf, s.size(), s.size());
  58. EXPECT_EQ(t.utf8_data(), sbuf);
  59. owner = t; // Copies the data
  60. EXPECT_NE(owner.utf8_data(), sbuf);
  61. }
  62. // owner should be OK even after s has gone out of scope
  63. UnicodeText::const_iterator it = owner.begin();
  64. EXPECT_EQ(*it++, 0x304A);
  65. EXPECT_EQ(*it++, 0x00B0);
  66. EXPECT_EQ(*it++, 0x106B);
  67. CHECK(it == owner.end());
  68. }
  69. {
  70. UnicodeText alias;
  71. alias.PointToUTF8(src.data(), src.size());
  72. EXPECT_EQ(alias.utf8_data(), src.data());
  73. UnicodeText::const_iterator it = alias.begin();
  74. EXPECT_EQ(*it++, 0x304A);
  75. EXPECT_EQ(*it++, 0x00B0);
  76. EXPECT_EQ(*it++, 0x106B);
  77. CHECK(it == alias.end());
  78. UnicodeText t = alias; // Copy initialization copies the data.
  79. EXPECT_NE(t.utf8_data(), alias.utf8_data());
  80. UnicodeText t2;
  81. t2 = alias; // Assignment copies the data.
  82. EXPECT_NE(t2.utf8_data(), alias.utf8_data());
  83. // Preserve an alias.
  84. t.PointTo(alias); // This does not copy the data.
  85. EXPECT_EQ(t.utf8_data(), alias.utf8_data());
  86. t.push_back(0x0020); // Modify the alias
  87. EXPECT_NE(t.utf8_data(), alias.utf8_data()); // It's no longer an alias.
  88. }
  89. }
  90. class IteratorTest : public UnicodeTextTest {};
  91. TEST_F(IteratorTest, Iterates) {
  92. UnicodeText::const_iterator iter = text_.begin();
  93. EXPECT_EQ(0x1C0, *iter);
  94. EXPECT_EQ(&iter, &++iter); // operator++ returns *this.
  95. EXPECT_EQ(0x4E8C, *iter++);
  96. EXPECT_EQ(0xD7DB, *iter);
  97. // Make sure you can dereference more than once.
  98. EXPECT_EQ(0xD7DB, *iter);
  99. EXPECT_EQ(0x34, *++iter);
  100. EXPECT_EQ(0x1D11E, *++iter);
  101. ASSERT_TRUE(iter != text_.end());
  102. iter++;
  103. EXPECT_TRUE(iter == text_.end());
  104. }
  105. TEST_F(IteratorTest, Reverse) {
  106. UnicodeText::const_reverse_iterator iter = text_.rbegin();
  107. EXPECT_EQ(0x1D11E, *iter);
  108. EXPECT_EQ(&iter, &++iter); // operator++ returns *this.
  109. EXPECT_EQ(0x34, *iter++);
  110. EXPECT_EQ(0xD7DB, *iter);
  111. // Make sure you can dereference more than once.
  112. EXPECT_EQ(0xD7DB, *iter);
  113. EXPECT_EQ(0x4E8C, *++iter);
  114. EXPECT_EQ(0x1C0, *++iter);
  115. ASSERT_TRUE(iter != text_.rend());
  116. iter++;
  117. EXPECT_TRUE(iter == text_.rend());
  118. }
  119. TEST_F(IteratorTest, MultiPass) {
  120. // Also tests Default Constructible and Assignable.
  121. UnicodeText::const_iterator i1, i2;
  122. i1 = text_.begin();
  123. i2 = i1;
  124. EXPECT_EQ(0x4E8C, *++i1);
  125. EXPECT_TRUE(i1 != i2);
  126. EXPECT_EQ(0x1C0, *i2);
  127. ++i2;
  128. EXPECT_TRUE(i1 == i2);
  129. EXPECT_EQ(0x4E8C, *i2);
  130. }
  131. TEST_F(IteratorTest, ReverseIterates) {
  132. UnicodeText::const_iterator iter = text_.end();
  133. EXPECT_TRUE(iter == text_.end());
  134. iter--;
  135. ASSERT_TRUE(iter != text_.end());
  136. EXPECT_EQ(0x1D11E, *iter--);
  137. EXPECT_EQ(0x34, *iter);
  138. EXPECT_EQ(0xD7DB, *--iter);
  139. // Make sure you can dereference more than once.
  140. EXPECT_EQ(0xD7DB, *iter);
  141. --iter;
  142. EXPECT_EQ(0x4E8C, *iter--);
  143. EXPECT_EQ(0x1C0, *iter);
  144. EXPECT_TRUE(iter == text_.begin());
  145. }
  146. TEST_F(IteratorTest, Comparable) {
  147. UnicodeText::const_iterator i1, i2;
  148. i1 = text_.begin();
  149. i2 = i1;
  150. ++i2;
  151. EXPECT_TRUE(i1 < i2);
  152. EXPECT_TRUE(text_.begin() <= i1);
  153. EXPECT_FALSE(i1 >= i2);
  154. EXPECT_FALSE(i1 > text_.end());
  155. }
  156. TEST_F(IteratorTest, Advance) {
  157. UnicodeText::const_iterator iter = text_.begin();
  158. EXPECT_EQ(0x1C0, *iter);
  159. std::advance(iter, 4);
  160. EXPECT_EQ(0x1D11E, *iter);
  161. ++iter;
  162. EXPECT_TRUE(iter == text_.end());
  163. }
  164. TEST_F(IteratorTest, Distance) {
  165. UnicodeText::const_iterator iter = text_.begin();
  166. EXPECT_EQ(0, distance(text_.begin(), iter));
  167. EXPECT_EQ(5, distance(iter, text_.end()));
  168. ++iter;
  169. ++iter;
  170. EXPECT_EQ(2, distance(text_.begin(), iter));
  171. EXPECT_EQ(3, distance(iter, text_.end()));
  172. ++iter;
  173. ++iter;
  174. EXPECT_EQ(4, distance(text_.begin(), iter));
  175. ++iter;
  176. EXPECT_EQ(0, distance(iter, text_.end()));
  177. }
  178. TEST_F(IteratorTest, Encode) {
  179. const string utf8 = "\xC7\x80"
  180. "\xE4\xBA\x8C"
  181. "\xED\x9F\x9B"
  182. "\x34"
  183. "\xF0\x9D\x84\x9E";
  184. const int lengths[] = {2, 3, 3, 1, 4};
  185. EXPECT_EQ(text_.size(), 5);
  186. EXPECT_EQ(text_.utf8_length(), 13);
  187. EXPECT_TRUE(memcmp(text_.utf8_data(), utf8.data(), text_.utf8_length())
  188. == 0);
  189. {
  190. // Test the iterator
  191. UnicodeText::const_iterator iter = text_.begin(), end = text_.end();
  192. const char* u = utf8.data();
  193. int i = 0;
  194. while (iter != end) {
  195. char buf[5];
  196. int n = iter.get_utf8(buf);
  197. buf[n] = '\0';
  198. EXPECT_TRUE(strncmp(buf, u, n) == 0);
  199. EXPECT_EQ(buf, iter.get_utf8_string());
  200. EXPECT_EQ(lengths[i], iter.utf8_length());
  201. u += n;
  202. iter++;
  203. i++;
  204. }
  205. }
  206. {
  207. // Test the reverse_iterator
  208. UnicodeText::const_reverse_iterator iter = text_.rbegin();
  209. UnicodeText::const_reverse_iterator end = text_.rend();
  210. const char* u = utf8.data() + utf8.size();
  211. int i = 0;
  212. while (iter != end) {
  213. char buf[5];
  214. int n = iter.get_utf8(buf);
  215. buf[n] = '\0';
  216. u -= n;
  217. EXPECT_TRUE(strncmp(buf, u, n) == 0);
  218. EXPECT_EQ(buf, iter.get_utf8_string());
  219. EXPECT_EQ(lengths[text_.size() - i - 1], iter.utf8_length());
  220. iter++;
  221. i++;
  222. }
  223. }
  224. text_.push_back('$');
  225. EXPECT_EQ(text_.size(), 6);
  226. EXPECT_EQ(text_.utf8_length(), 14);
  227. text_.push_back('\xAE'); // registered sign
  228. EXPECT_EQ(text_.size(), 7);
  229. EXPECT_EQ(text_.utf8_length(), 16); // 2 bytes long
  230. }
  231. TEST_F(IteratorTest, Decode) {
  232. const char32 text[] = {0x1C0, 0x4E8C, 0xD7DB, 0x34, 0x1D11E};
  233. UnicodeText::const_iterator iter = text_.begin();
  234. for (int i = 0; i < 5; ++i)
  235. EXPECT_EQ(text[i], *iter++);
  236. string s = CodepointString(text_);
  237. EXPECT_EQ(s, "1C0 4E8C D7DB 34 1D11E ");
  238. }
  239. class OperatorTest : public UnicodeTextTest {};
  240. TEST_F(OperatorTest, Clear) {
  241. UnicodeText empty_text(UTF8ToUnicodeText(""));
  242. EXPECT_FALSE(text_ == empty_text);
  243. text_.clear();
  244. EXPECT_TRUE(text_ == empty_text);
  245. }
  246. TEST_F(OperatorTest, Empty) {
  247. EXPECT_TRUE(empty_text_.empty());
  248. EXPECT_FALSE(text_.empty());
  249. text_.clear();
  250. EXPECT_TRUE(text_.empty());
  251. }
  252. TEST(UnicodeTextTest, InterchangeValidity) {
  253. char* FDD0 = new char[3];
  254. memcpy(FDD0, "\xEF\xB7\x90", 3);
  255. EXPECT_FALSE(UniLib::IsInterchangeValid(FDD0, 3));
  256. UnicodeText a = MakeUnicodeTextWithoutAcceptingOwnership(FDD0, 3);
  257. EXPECT_EQ(a.size(), 1);
  258. EXPECT_EQ(*a.begin(), 0x20);
  259. a.clear();
  260. a.push_back(0xFDD0);
  261. EXPECT_EQ(a.size(), 1);
  262. EXPECT_EQ(*a.begin(), 0x20);
  263. a = MakeUnicodeTextAcceptingOwnership(FDD0, 3, 3);
  264. EXPECT_EQ(a.size(), 1);
  265. EXPECT_EQ(*a.begin(), 0x20);
  266. a.clear();
  267. a.push_back(0xFDD0);
  268. EXPECT_EQ(a.size(), 1);
  269. EXPECT_EQ(*a.begin(), 0x20);
  270. }
  271. class SubstringSearchTest : public UnicodeTextTest {};
  272. // TEST_F(SubstringSearchTest, FindEmpty) {
  273. // EXPECT_TRUE(text_.find(empty_text_) == text_.begin());
  274. // EXPECT_TRUE(empty_text_.find(text_) == empty_text_.end());
  275. // }
  276. // TEST_F(SubstringSearchTest, Find) {
  277. // UnicodeText::const_iterator second_pos = text_.begin();
  278. // ++second_pos;
  279. // UnicodeText::const_iterator third_pos = second_pos;
  280. // ++third_pos;
  281. // UnicodeText::const_iterator fourth_pos = third_pos;
  282. // ++fourth_pos;
  283. // // same as text_
  284. // const char32 text[] = {0x1C0, 0x4E8C, 0xD7DB, 0x34, 0x1D11E};
  285. // UnicodeText prefix;
  286. // prefix.append(&text[0], &text[2]);
  287. // EXPECT_TRUE(text_.find(prefix) == text_.begin());
  288. // EXPECT_TRUE(text_.find(prefix, second_pos) == text_.end());
  289. // UnicodeText suffix;
  290. // suffix.append(&text[2], text + arraysize(text));
  291. // EXPECT_TRUE(text_.find(suffix) == third_pos);
  292. // EXPECT_TRUE(text_.find(suffix, second_pos) == third_pos);
  293. // EXPECT_TRUE(text_.find(suffix, third_pos) == third_pos);
  294. // EXPECT_TRUE(text_.find(suffix, fourth_pos) == text_.end());
  295. // }
  296. // TEST_F(SubstringSearchTest, HasConversionError) {
  297. // EXPECT_FALSE(text_.HasReplacementChar());
  298. // const char32 beg[] = {0xFFFD, 0x1C0, 0x4E8C, 0xD7DB, 0x34, 0x1D11E};
  299. // UnicodeText beg_uni;
  300. // beg_uni.append(&beg[0], beg + arraysize(beg));
  301. // EXPECT_TRUE(beg_uni.HasReplacementChar());
  302. // const char32 mid[] = {0x1C0, 0x4E8C, 0xFFFD, 0xD7DB, 0x34, 0x1D11E};
  303. // UnicodeText mid_uni;
  304. // mid_uni.append(&mid[0], mid + arraysize(mid));
  305. // EXPECT_TRUE(mid_uni.HasReplacementChar());
  306. // const char32 end[] = {0x1C0, 0x4E8C, 0xD7DB, 0x34, 0x1D11E, 0xFFFD};
  307. // UnicodeText end_uni;
  308. // end_uni.append(&end[0], end + arraysize(end));
  309. // EXPECT_TRUE(end_uni.HasReplacementChar());
  310. // const char32 two[] = {0xFFFD, 0x1C0, 0x4E8C, 0xD7DB, 0x34, 0x1D11E, 0xFFFD};
  311. // UnicodeText two_uni;
  312. // two_uni.append(&two[0], two + arraysize(two));
  313. // EXPECT_TRUE(two_uni.HasReplacementChar());
  314. // const char32 adj[] = {0x1C0, 0xFFFD, 0xFFFD, 0x4E8C, 0xD7DB, 0x34, 0x1D11E};
  315. // UnicodeText adj_uni;
  316. // adj_uni.append(&adj[0], adj + arraysize(adj));
  317. // EXPECT_TRUE(adj_uni.HasReplacementChar());
  318. // }
  319. } // namespace