unilib.cc 2.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859
  1. /**
  2. * Copyright 2010 Google Inc.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. // Author: sligocki@google.com (Shawn Ligocki)
  17. #include "util/utf8/unilib.h"
  18. #include "syntaxnet/base.h"
  19. #include "third_party/utf/utf.h"
  20. namespace UniLib {
  21. // Codepoints not allowed for interchange are:
  22. // C0 (ASCII) controls: U+0000 to U+001F excluding Space (SP, U+0020),
  23. // Horizontal Tab (HT, U+0009), Line-Feed (LF, U+000A),
  24. // Form Feed (FF, U+000C) and Carriage-Return (CR, U+000D)
  25. // C1 controls: U+007F to U+009F
  26. // Surrogates: U+D800 to U+DFFF
  27. // Non-characters: U+FDD0 to U+FDEF and U+xxFFFE to U+xxFFFF for all xx
  28. bool IsInterchangeValid(char32 c) {
  29. return !((c >= 0x00 && c <= 0x08) || c == 0x0B || (c >= 0x0E && c <= 0x1F) ||
  30. (c >= 0x7F && c <= 0x9F) ||
  31. (c >= 0xD800 && c <= 0xDFFF) ||
  32. (c >= 0xFDD0 && c <= 0xFDEF) || (c&0xFFFE) == 0xFFFE);
  33. }
  34. int SpanInterchangeValid(const char* begin, int byte_length) {
  35. char32 rune;
  36. const char* p = begin;
  37. const char* end = begin + byte_length;
  38. while (p < end) {
  39. int bytes_consumed = charntorune(&rune, p, end - p);
  40. // We want to accept Runeerror == U+FFFD as a valid char, but it is used
  41. // by chartorune to indicate error. Luckily, the real codepoint is size 3
  42. // while errors return bytes_consumed <= 1.
  43. if ((rune == Runeerror && bytes_consumed <= 1) ||
  44. !IsInterchangeValid(rune)) {
  45. break; // Found
  46. }
  47. p += bytes_consumed;
  48. }
  49. return p - begin;
  50. }
  51. } // namespace UniLib