unilib.h 2.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364
  1. /**
  2. * Copyright 2010 Google Inc.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. // Routines to do manipulation of Unicode characters or text
  17. //
  18. // The StructurallyValid routines accept buffers of arbitrary bytes.
  19. // For CoerceToStructurallyValid(), the input buffer and output buffers may
  20. // point to exactly the same memory.
  21. //
  22. // In all other cases, the UTF-8 string must be structurally valid and
  23. // have all codepoints in the range U+0000 to U+D7FF or U+E000 to U+10FFFF.
  24. // Debug builds take a fatal error for invalid UTF-8 input.
  25. // The input and output buffers may not overlap at all.
  26. //
  27. // The char32 routines are here only for convenience; they convert to UTF-8
  28. // internally and use the UTF-8 routines.
  29. #ifndef UTIL_UTF8_UNILIB_H__
  30. #define UTIL_UTF8_UNILIB_H__
  31. #include <string>
  32. #include "syntaxnet/base.h"
  33. // We export OneCharLen, IsValidCodepoint, and IsTrailByte from here,
  34. // but they are defined in unilib_utf8_utils.h.
  35. //#include "util/utf8/public/unilib_utf8_utils.h" // IWYU pragma: export
  36. namespace UniLib {
  37. // Returns the length in bytes of the prefix of src that is all
  38. // interchange valid UTF-8
  39. int SpanInterchangeValid(const char* src, int byte_length);
  40. inline int SpanInterchangeValid(const std::string& src) {
  41. return SpanInterchangeValid(src.data(), src.size());
  42. }
  43. // Returns true if the source is all interchange valid UTF-8
  44. // "Interchange valid" is a stronger than structurally valid --
  45. // no C0 or C1 control codes (other than CR LF HT FF) and no non-characters.
  46. bool IsInterchangeValid(char32 codepoint);
  47. inline bool IsInterchangeValid(const char* src, int byte_length) {
  48. return (byte_length == SpanInterchangeValid(src, byte_length));
  49. }
  50. inline bool IsInterchangeValid(const std::string& src) {
  51. return IsInterchangeValid(src.data(), src.size());
  52. }
  53. } // namespace UniLib
  54. #endif // UTIL_UTF8_UNILIB_H__