unicodetext.cc 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508
  1. /**
  2. * Copyright 2010 Google Inc.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "util/utf8/unicodetext.h"
  17. #include <string.h> // for memcpy, NULL, memcmp, etc
  18. #include <algorithm> // for max
  19. //#include "base/logging.h" // for operator<<, CHECK, etc
  20. //#include "base/stringprintf.h" // for StringPrintf, StringAppendF
  21. //#include "strings/stringpiece.h" // for StringPiece, etc
  22. #include "third_party/utf/utf.h" // for isvalidcharntorune, etc
  23. #include "util/utf8/unilib.h" // for IsInterchangeValid, etc
  24. #include "util/utf8/unilib_utf8_utils.h" // for OneCharLen
  // Returns the number of UTF-8 lead (non-continuation) bytes found in
  // [start, end). A continuation byte has the bit pattern 10xxxxxx; every
  // other byte begins a new code point. Yields 0 when start >= end.
  static int CodepointDistance(const char* start, const char* end) {
    int lead_bytes = 0;
    const char* cursor = start;
    while (cursor < end) {
      const unsigned char byte = static_cast<unsigned char>(*cursor);
      ++cursor;
      if ((byte & 0xC0) != 0x80) {
        ++lead_bytes;
      }
    }
    return lead_bytes;
  }
  33. static int CodepointCount(const char* utf8, int len) {
  34. return CodepointDistance(utf8, utf8 + len);
  35. }
  36. UnicodeText::const_iterator::difference_type
  37. distance(const UnicodeText::const_iterator& first,
  38. const UnicodeText::const_iterator& last) {
  39. return CodepointDistance(first.it_, last.it_);
  40. }
  41. // ---------- Utility ----------
  42. static int ConvertToInterchangeValid(char* start, int len) {
  43. // This routine is called only when we've discovered that a UTF-8 buffer
  44. // that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8
  45. // was not interchange valid. This indicates a bug in the caller, and
  46. // a LOG(WARNING) is done in that case.
  47. // This is similar to CoerceToInterchangeValid, but it replaces each
  48. // structurally valid byte with a space, and each non-interchange
  49. // character with a space, even when that character requires more
  50. // than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is
  51. // structurally valid UTF8, but U+FDD0 is not an interchange-valid
  52. // code point. The result should contain one space, not three.
  53. //
  54. // Since the conversion never needs to write more data than it
  55. // reads, it is safe to change the buffer in place. It returns the
  56. // number of bytes written.
  57. char* const in = start;
  58. char* out = start;
  59. char* const end = start + len;
  60. while (start < end) {
  61. int good = UniLib::SpanInterchangeValid(start, end - start);
  62. if (good > 0) {
  63. if (out != start) {
  64. memmove(out, start, good);
  65. }
  66. out += good;
  67. start += good;
  68. if (start == end) {
  69. break;
  70. }
  71. }
  72. // Is the current string invalid UTF8 or just non-interchange UTF8?
  73. char32 rune;
  74. int n;
  75. if (isvalidcharntorune(start, end - start, &rune, &n)) {
  76. // structurally valid UTF8, but not interchange valid
  77. start += n; // Skip over the whole character.
  78. } else { // bad UTF8
  79. start += 1; // Skip over just one byte
  80. }
  81. *out++ = ' ';
  82. }
  83. return out - in;
  84. }
  85. // *************** Data representation **********
  86. // Note: the copy constructor is undefined.
  87. // After reserve(), resize(), or clear(), we're an owner, not an alias.
  88. void UnicodeText::Repr::reserve(int new_capacity) {
  89. // If there's already enough capacity, and we're an owner, do nothing.
  90. if (capacity_ >= new_capacity && ours_) return;
  91. // Otherwise, allocate a new buffer.
  92. capacity_ = std::max(new_capacity, (3 * capacity_) / 2 + 20);
  93. char* new_data = new char[capacity_];
  94. // If there is an old buffer, copy it into the new buffer.
  95. if (data_) {
  96. memcpy(new_data, data_, size_);
  97. if (ours_) delete[] data_; // If we owned the old buffer, free it.
  98. }
  99. data_ = new_data;
  100. ours_ = true; // We own the new buffer.
  101. // size_ is unchanged.
  102. }
  103. void UnicodeText::Repr::resize(int new_size) {
  104. if (new_size == 0) {
  105. clear();
  106. } else {
  107. if (!ours_ || new_size > capacity_) reserve(new_size);
  108. // Clear the memory in the expanded part.
  109. if (size_ < new_size) memset(data_ + size_, 0, new_size - size_);
  110. size_ = new_size;
  111. ours_ = true;
  112. }
  113. }
  114. // This implementation of clear() deallocates the buffer if we're an owner.
  115. // That's not strictly necessary; we could just set size_ to 0.
  116. void UnicodeText::Repr::clear() {
  117. if (ours_) delete[] data_;
  118. data_ = nullptr;
  119. size_ = capacity_ = 0;
  120. ours_ = true;
  121. }
  122. void UnicodeText::Repr::Copy(const char* data, int size) {
  123. resize(size);
  124. memcpy(data_, data, size);
  125. }
  126. void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) {
  127. if (data == data_) return; // We already own this memory. (Weird case.)
  128. if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it.
  129. data_ = data;
  130. size_ = size;
  131. capacity_ = capacity;
  132. ours_ = true;
  133. }
  134. void UnicodeText::Repr::PointTo(const char* data, int size) {
  135. if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it.
  136. data_ = const_cast<char*>(data);
  137. size_ = size;
  138. capacity_ = size;
  139. ours_ = false;
  140. }
  141. void UnicodeText::Repr::append(const char* bytes, int byte_length) {
  142. reserve(size_ + byte_length);
  143. memcpy(data_ + size_, bytes, byte_length);
  144. size_ += byte_length;
  145. }
  146. string UnicodeText::Repr::DebugString() const {
  147. return tensorflow::strings::Printf("{Repr %p data=%p size=%d capacity=%d %s}",
  148. this,
  149. data_, size_, capacity_,
  150. ours_ ? "Owned" : "Alias");
  151. }
  152. // *************** UnicodeText ******************
  153. // ----- Constructors -----
  154. // Default constructor
  155. UnicodeText::UnicodeText() {
  156. }
  157. // Copy constructor
  158. UnicodeText::UnicodeText(const UnicodeText& src) {
  159. Copy(src);
  160. }
  161. // Substring constructor
  162. UnicodeText::UnicodeText(const UnicodeText::const_iterator& first,
  163. const UnicodeText::const_iterator& last) {
  164. CHECK(first <= last) << " Incompatible iterators";
  165. repr_.append(first.it_, last.it_ - first.it_);
  166. }
  167. string UnicodeText::UTF8Substring(const const_iterator& first,
  168. const const_iterator& last) {
  169. CHECK(first <= last) << " Incompatible iterators";
  170. return string(first.it_, last.it_ - first.it_);
  171. }
  172. // ----- Copy -----
  173. UnicodeText& UnicodeText::operator=(const UnicodeText& src) {
  174. if (this != &src) {
  175. Copy(src);
  176. }
  177. return *this;
  178. }
  179. UnicodeText& UnicodeText::Copy(const UnicodeText& src) {
  180. repr_.Copy(src.repr_.data_, src.repr_.size_);
  181. return *this;
  182. }
  183. UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {
  184. repr_.Copy(buffer, byte_length);
  185. if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {
  186. LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
  187. repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
  188. }
  189. return *this;
  190. }
  191. UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer,
  192. int byte_length) {
  193. repr_.Copy(buffer, byte_length);
  194. return *this;
  195. }
  196. // ----- TakeOwnershipOf -----
  197. UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer,
  198. int byte_length,
  199. int byte_capacity) {
  200. repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
  201. if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {
  202. LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
  203. repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
  204. }
  205. return *this;
  206. }
  207. UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer,
  208. int byte_length,
  209. int byte_capacity) {
  210. repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
  211. return *this;
  212. }
  213. // ----- PointTo -----
  214. UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {
  215. if (UniLib:: IsInterchangeValid(buffer, byte_length)) {
  216. repr_.PointTo(buffer, byte_length);
  217. } else {
  218. LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
  219. repr_.Copy(buffer, byte_length);
  220. repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
  221. }
  222. return *this;
  223. }
  224. UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer,
  225. int byte_length) {
  226. repr_.PointTo(buffer, byte_length);
  227. return *this;
  228. }
  229. UnicodeText& UnicodeText::PointTo(const UnicodeText& src) {
  230. repr_.PointTo(src.repr_.data_, src.repr_.size_);
  231. return *this;
  232. }
  233. UnicodeText& UnicodeText::PointTo(const const_iterator &first,
  234. const const_iterator &last) {
  235. CHECK(first <= last) << " Incompatible iterators";
  236. repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data());
  237. return *this;
  238. }
  239. // ----- Append -----
  240. UnicodeText& UnicodeText::append(const UnicodeText& u) {
  241. repr_.append(u.repr_.data_, u.repr_.size_);
  242. return *this;
  243. }
  244. UnicodeText& UnicodeText::append(const const_iterator& first,
  245. const const_iterator& last) {
  246. CHECK(first <= last) << " Incompatible iterators";
  247. repr_.append(first.it_, last.it_ - first.it_);
  248. return *this;
  249. }
  250. UnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) {
  251. repr_.append(utf8, len);
  252. return *this;
  253. }
  254. // ----- substring searching -----
  255. UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look,
  256. const_iterator start_pos) const {
  257. CHECK_GE(start_pos.utf8_data(), utf8_data());
  258. CHECK_LE(start_pos.utf8_data(), utf8_data() + utf8_length());
  259. return UnsafeFind(look, start_pos);
  260. }
  261. UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const {
  262. return UnsafeFind(look, begin());
  263. }
  264. UnicodeText::const_iterator UnicodeText::UnsafeFind(
  265. const UnicodeText& look, const_iterator start_pos) const {
  266. // Due to the magic of the UTF8 encoding, searching for a sequence of
  267. // letters is equivalent to substring search.
  268. StringPiece searching(utf8_data(), utf8_length());
  269. StringPiece look_piece(look.utf8_data(), look.utf8_length());
  270. LOG(FATAL) << "Not implemented";
  271. //StringPiece::size_type found =
  272. // searching.find(look_piece, start_pos.utf8_data() - utf8_data());
  273. StringPiece::size_type found = StringPiece::npos;
  274. if (found == StringPiece::npos) return end();
  275. return const_iterator(utf8_data() + found);
  276. }
  277. bool UnicodeText::HasReplacementChar() const {
  278. // Equivalent to:
  279. // UnicodeText replacement_char;
  280. // replacement_char.push_back(0xFFFD);
  281. // return find(replacement_char) != end();
  282. StringPiece searching(utf8_data(), utf8_length());
  283. StringPiece looking_for("\xEF\xBF\xBD", 3);
  284. LOG(FATAL) << "Not implemented";
  285. //return searching.find(looking_for) != StringPiece::npos;
  286. return false;
  287. }
  288. // ----- other methods -----
  289. // Clear operator
  290. void UnicodeText::clear() {
  291. repr_.clear();
  292. }
  293. // Destructor
  294. UnicodeText::~UnicodeText() {}
  295. void UnicodeText::push_back(char32 c) {
  296. if (UniLib::IsValidCodepoint(c)) {
  297. char buf[UTFmax];
  298. int len = runetochar(buf, &c);
  299. if (UniLib::IsInterchangeValid(buf, len)) {
  300. repr_.append(buf, len);
  301. } else {
  302. LOG(WARNING) << "Unicode value 0x" << std::hex << c
  303. << " is not valid for interchange";
  304. repr_.append(" ", 1);
  305. }
  306. } else {
  307. LOG(WARNING) << "Illegal Unicode value: 0x" << std::hex << c;
  308. repr_.append(" ", 1);
  309. }
  310. }
  311. int UnicodeText::size() const {
  312. return CodepointCount(repr_.data_, repr_.size_);
  313. }
  314. bool operator==(const UnicodeText& lhs, const UnicodeText& rhs) {
  315. if (&lhs == &rhs) return true;
  316. if (lhs.repr_.size_ != rhs.repr_.size_) return false;
  317. return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0;
  318. }
  319. string UnicodeText::DebugString() const {
  320. return tensorflow::strings::Printf("{UnicodeText %p chars=%d repr=%s}",
  321. this,
  322. size(),
  323. repr_.DebugString().c_str());
  324. }
  325. // ******************* UnicodeText::const_iterator *********************
  326. // The implementation of const_iterator would be nicer if it
  327. // inherited from boost::iterator_facade
  328. // (http://boost.org/libs/iterator/doc/iterator_facade.html).
  329. UnicodeText::const_iterator::const_iterator() : it_(nullptr) {}
  330. UnicodeText::const_iterator::const_iterator(const const_iterator& other)
  331. : it_(other.it_) {
  332. }
  333. UnicodeText::const_iterator&
  334. UnicodeText::const_iterator::operator=(const const_iterator& other) {
  335. if (&other != this)
  336. it_ = other.it_;
  337. return *this;
  338. }
  339. UnicodeText::const_iterator UnicodeText::begin() const {
  340. return const_iterator(repr_.data_);
  341. }
  342. UnicodeText::const_iterator UnicodeText::end() const {
  343. return const_iterator(repr_.data_ + repr_.size_);
  344. }
  345. bool operator<(const UnicodeText::const_iterator& lhs,
  346. const UnicodeText::const_iterator& rhs) {
  347. return lhs.it_ < rhs.it_;
  348. }
  349. char32 UnicodeText::const_iterator::operator*() const {
  350. // (We could call chartorune here, but that does some
  351. // error-checking, and we're guaranteed that our data is valid
  352. // UTF-8. Also, we expect this routine to be called very often. So
  353. // for speed, we do the calculation ourselves.)
  354. // Convert from UTF-8
  355. int byte1 = it_[0];
  356. if (byte1 < 0x80)
  357. return byte1;
  358. int byte2 = it_[1];
  359. if (byte1 < 0xE0)
  360. return ((byte1 & 0x1F) << 6)
  361. | (byte2 & 0x3F);
  362. int byte3 = it_[2];
  363. if (byte1 < 0xF0)
  364. return ((byte1 & 0x0F) << 12)
  365. | ((byte2 & 0x3F) << 6)
  366. | (byte3 & 0x3F);
  367. int byte4 = it_[3];
  368. return ((byte1 & 0x07) << 18)
  369. | ((byte2 & 0x3F) << 12)
  370. | ((byte3 & 0x3F) << 6)
  371. | (byte4 & 0x3F);
  372. }
  373. UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() {
  374. it_ += UniLib::OneCharLen(it_);
  375. return *this;
  376. }
  377. UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() {
  378. while (UniLib::IsTrailByte(*--it_));
  379. return *this;
  380. }
  381. int UnicodeText::const_iterator::get_utf8(char* utf8_output) const {
  382. utf8_output[0] = it_[0]; if (it_[0] < 0x80) return 1;
  383. utf8_output[1] = it_[1]; if (it_[0] < 0xE0) return 2;
  384. utf8_output[2] = it_[2]; if (it_[0] < 0xF0) return 3;
  385. utf8_output[3] = it_[3];
  386. return 4;
  387. }
  388. string UnicodeText::const_iterator::get_utf8_string() const {
  389. return string(utf8_data(), utf8_length());
  390. }
  391. int UnicodeText::const_iterator::utf8_length() const {
  392. if (it_[0] < 0x80) {
  393. return 1;
  394. } else if (it_[0] < 0xE0) {
  395. return 2;
  396. } else if (it_[0] < 0xF0) {
  397. return 3;
  398. } else {
  399. return 4;
  400. }
  401. }
  402. UnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const {
  403. CHECK(p != nullptr);
  404. const char* start = utf8_data();
  405. int len = utf8_length();
  406. const char* end = start + len;
  407. CHECK(p >= start);
  408. CHECK(p <= end);
  409. CHECK(p == end || !UniLib::IsTrailByte(*p));
  410. return const_iterator(p);
  411. }
  412. string UnicodeText::const_iterator::DebugString() const {
  413. return tensorflow::strings::Printf("{iter %p}", it_);
  414. }
  415. // *************************** Utilities *************************
  416. string CodepointString(const UnicodeText& t) {
  417. string s;
  418. UnicodeText::const_iterator it = t.begin(), end = t.end();
  419. while (it != end) tensorflow::strings::Appendf(&s, "%X ", *it++);
  420. return s;
  421. }