utf.h 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247
  1. /*
  2. * The authors of this software are Rob Pike and Ken Thompson.
  3. * Copyright (c) 2002 by Lucent Technologies.
  4. * Permission to use, copy, modify, and distribute this software for any
  5. * purpose without fee is hereby granted, provided that this entire notice
  6. * is included in all copies of any software which is or includes a copy
  7. * or modification of this software and in all copies of the supporting
  8. * documentation for such software.
  9. * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
  10. * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
  11. * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
  12. * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
  13. */
  14. #ifndef _UTFH_
  15. #define _UTFH_ 1
  16. #include <stdint.h>
  17. typedef signed int Rune; /* Holds one Unicode code point. Code-point values in Unicode 4.0 are 21 bits wide. NOTE(review): ISO C only guarantees 16 bits for int -- confirm int is at least 32 bits on every target. */
  18. enum
  19. {
  20. UTFmax = 4, /* maximum number of bytes in the UTF encoding of one rune */
  21. Runesync = 0x80, /* a byte value less than this cannot be part of a multi-byte UTF sequence */
  22. Runeself = 0x80, /* a rune less than this is its own single-byte UTF encoding */
  23. Runeerror = 0xFFFD, /* rune value produced when UTF decoding fails */
  24. Runemax = 0x10FFFF, /* maximum rune value */
  25. };
  26. #ifdef __cplusplus
  27. extern "C" {
  28. #endif
  29. /*
  30. * rune routines
  31. */
  32. /*
  33. * These routines were written by Rob Pike and Ken Thompson
  34. * and first appeared in Plan 9.
  35. * SEE ALSO
  36. * utf (7)
  37. * tcs (1)
  38. */
  39. // runetochar copies (encodes) one rune, pointed to by r, to at most
  40. // UTFmax bytes starting at s and returns the number of bytes generated.
  41. int runetochar(char* s, const Rune* r);
  42. // chartorune copies (decodes) at most UTFmax bytes starting at s to
  43. // one rune, pointed to by r, and returns the number of bytes consumed.
  44. // If the input is not exactly in UTF format, chartorune will set *r
  45. // to Runeerror and return 1.
  46. //
  47. // Note: There is no special case for a "null-terminated" string. A
  48. // string whose first byte has the value 0 is the UTF8 encoding of the
  49. // Unicode value 0 (i.e., ASCII NUL). A byte value of 0 is illegal
  50. // anywhere else in a UTF sequence.
  51. int chartorune(Rune* r, const char* s);
  52. // charntorune is like chartorune, except that it will access at most
  53. // n bytes of s. If the UTF sequence is incomplete within n bytes,
  54. // charntorune will set *r to Runeerror and return 0. If it is complete
  55. // but not in UTF format, it will set *r to Runeerror and return 1.
  56. //
  57. // Added 2004-09-24 by Wei-Hwa Huang
  58. int charntorune(Rune* r, const char* s, int n);
  59. // isvalidcharntorune(str, n, r, consumed)
  60. // is a convenience function that calls "*consumed = charntorune(r, str, n)"
  61. // and returns an int (logically boolean) indicating whether the first
  62. // n bytes of str were a valid and complete UTF sequence.
  63. int isvalidcharntorune(const char* str, int n, Rune* r, int* consumed);
  64. // runelen returns the number of bytes required to convert r into UTF.
  65. int runelen(Rune r);
  66. // runenlen returns the number of bytes required to convert the n
  67. // runes pointed to by r into UTF.
  68. int runenlen(const Rune* r, int n);
  69. // fullrune returns 1 if the string s of length n is long enough to be
  70. // decoded by chartorune, and 0 otherwise. This does not guarantee
  71. // that the string contains a legal UTF encoding. This routine is used
  72. // by programs that obtain input one byte at a time and need to know
  73. // when a full rune has arrived.
  74. int fullrune(const char* s, int n);
  75. // The following routines are analogous to the corresponding string
  76. // routines with "utf" substituted for "str", and "rune" substituted
  77. // for "chr".
  78. // utflen returns the number of runes that are represented by the UTF
  79. // string s. (cf. strlen)
  80. int utflen(const char* s);
  81. // utfnlen returns the number of complete runes that are represented
  82. // by the first n bytes of the UTF string s. If the last few bytes of
  83. // the string contain an incompletely coded rune, utfnlen will not
  84. // count them; in this way, it differs from utflen, which includes
  85. // every byte of the string. (cf. strnlen)
  86. int utfnlen(const char* s, long n);
  87. // utfrune returns a pointer to the first occurrence of rune r in the
  88. // UTF string s, or 0 if r does not occur in the string. The NUL
  89. // byte terminating a string is considered to be part of the string s.
  90. // (cf. strchr)
  91. const char* utfrune(const char* s, Rune r);
  92. // utfrrune returns a pointer to the last occurrence of rune r in the
  93. // UTF string s, or 0 if r does not occur in the string. The NUL
  94. // byte terminating a string is considered to be part of the string s.
  95. // (cf. strrchr)
  96. const char* utfrrune(const char* s, Rune r);
  97. // utfutf returns a pointer to the first occurrence of the UTF string
  98. // s2 as a UTF substring of s1, or 0 if there is none. If s2 is the
  99. // null string, utfutf returns s1. (cf. strstr)
  100. const char* utfutf(const char* s1, const char* s2);
  101. // utfecpy copies UTF sequences until a null sequence has been copied,
  102. // but writes no sequences beyond es1. If any sequences are copied,
  103. // s1 is terminated by a null sequence, and a pointer to that sequence
  104. // is returned. Otherwise, the original s1 is returned. (cf. strecpy)
  105. char* utfecpy(char *s1, char *es1, const char *s2);
  106. // These functions are rune-string analogues of the corresponding
  107. // functions in strcat (3).
  108. //
  109. // These routines first appeared in Plan 9.
  110. // SEE ALSO
  111. // memmove (3)
  112. // rune (3)
  113. // strcat (3)
  114. //
  115. // BUGS: The outcome of overlapping moves varies among implementations.
  116. Rune* runestrcat(Rune* s1, const Rune* s2); // (cf. strcat)
  117. Rune* runestrncat(Rune* s1, const Rune* s2, long n); // (cf. strncat)
  118. const Rune* runestrchr(const Rune* s, Rune c); // (cf. strchr)
  119. int runestrcmp(const Rune* s1, const Rune* s2); // (cf. strcmp)
  120. int runestrncmp(const Rune* s1, const Rune* s2, long n); // (cf. strncmp)
  121. Rune* runestrcpy(Rune* s1, const Rune* s2); // (cf. strcpy)
  122. Rune* runestrncpy(Rune* s1, const Rune* s2, long n); // (cf. strncpy)
  123. Rune* runestrecpy(Rune* s1, Rune* es1, const Rune* s2); // (cf. strecpy)
  124. Rune* runestrdup(const Rune* s); // (cf. strdup)
  125. const Rune* runestrrchr(const Rune* s, Rune c); // (cf. strrchr)
  126. long runestrlen(const Rune* s); // (cf. strlen)
  127. const Rune* runestrstr(const Rune* s1, const Rune* s2); // (cf. strstr)
  128. // The following routines test types and modify cases for Unicode
  129. // characters. Unicode defines some characters as letters and
  130. // specifies three cases: upper, lower, and title. Mappings among the
  131. // cases are also defined, although they are not exhaustive: some
  132. // upper case letters have no lower case mapping, and so on. Unicode
  133. // also defines several character properties, a subset of which are
  134. // checked by these routines. These routines are based on Unicode
  135. // version 3.0.0.
  136. //
  137. // NOTE: The routines are implemented in C, so the boolean functions
  138. // (e.g., isupperrune) return 0 for false and 1 for true.
  139. //
  140. //
  141. // toupperrune, tolowerrune, and totitlerune are the Unicode case
  142. // mappings. These routines return the character unchanged if it has
  143. // no defined mapping.
  144. Rune toupperrune(Rune r);
  145. Rune tolowerrune(Rune r);
  146. Rune totitlerune(Rune r);
  147. // isupperrune tests for upper case characters, including Unicode
  148. // upper case letters and targets of the toupper mapping. islowerrune
  149. // and istitlerune are defined analogously.
  150. int isupperrune(Rune r);
  151. int islowerrune(Rune r);
  152. int istitlerune(Rune r);
  153. // isalpharune tests for Unicode letters; this includes ideographs in
  154. // addition to alphabetic characters.
  155. int isalpharune(Rune r);
  156. // isdigitrune tests for digits. Non-digit numbers, such as Roman
  157. // numerals, are not included.
  158. int isdigitrune(Rune r);
  159. // isideographicrune tests for ideographic characters and numbers, as
  160. // defined by the Unicode standard.
  161. int isideographicrune(Rune r);
  162. // isspacerune tests for whitespace characters, including "C" locale
  163. // whitespace, Unicode defined whitespace, and the "zero-width
  164. // non-break space" character.
  165. int isspacerune(Rune r);
  166. // (The comments in this file were copied from the manpage files rune.3,
  167. // isalpharune.3, and runestrcat.3. Some formatting changes were also made
  168. // to conform to Google style. /JRM 11/11/05)
  169. #ifdef __cplusplus
  170. }
  171. #endif
  172. #endif