utf8proc.h 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397
  1. /*
  2. * Copyright (c) 2006-2007 Jan Behrens, FlexiGuided GmbH, Berlin
  3. *
  4. * Permission is hereby granted, free of charge, to any person obtaining a
  5. * copy of this software and associated documentation files (the "Software"),
  6. * to deal in the Software without restriction, including without limitation
  7. * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8. * and/or sell copies of the Software, and to permit persons to whom the
  9. * Software is furnished to do so, subject to the following conditions:
  10. *
  11. * The above copyright notice and this permission notice shall be included in
  12. * all copies or substantial portions of the Software.
  13. *
  14. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  19. * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  20. * DEALINGS IN THE SOFTWARE.
  21. */
  22. /*
  23. * File name: utf8proc.h
  24. * Version: 1.1.1
  25. * Last changed: 2007-07-22
  26. * Changed 2008-05-16 by rkr to add config.h and replacement for stdbool.h
  27. * for pre-C99 compilers that don't support bool.
  28. * Changed 2008-06-05 by rkr to add utf8proc_check(str, options) function
  29. * for just checking UTF-8 validity
  30. * Description:
  31. * Header files for libutf8proc, which is a mapping tool for UTF-8 strings
  32. * with following features:
  33. * - decomposing and composing of strings
  34. * - replacing compatibility characters with their equivalents
  35. * - stripping of "default ignorable characters"
  36. * like SOFT-HYPHEN or ZERO-WIDTH-SPACE
  37. * - folding of certain characters for string comparison
  38. * (e.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-")
  39. * (see "LUMP" option)
  40. * - optional rejection of strings containing non-assigned code points
  41. * - stripping of control characters
  42. * - stripping of character marks (accents, etc.)
  43. * - transformation of LF, CRLF, CR and NEL to line-feed (LF)
  44. * or to the Unicode characters for paragraph separation (PS)
  45. * or line separation (LS).
  46. * - unicode case folding (for case insensitive string comparisons)
  47. * - rejection of illegal UTF-8 data
  48. * (i.e. UTF-8 encoded UTF-16 surrogates)
  49. * - support for Korean Hangul characters
  50. * Unicode Version 5.0.0 is supported.
  51. */
  52. #ifndef UTF8PROC_H
  53. #define UTF8PROC_H
  54. #include "config.h"
  55. #include <stdlib.h>
  /*
   * Boolean support. When HAVE_STDBOOL_H is defined (presumably by config.h),
   * <stdbool.h> is used as-is. Otherwise bool, true and false are supplied
   * here as macros, and _Bool itself is typedef'd when HAVE__BOOL is unset
   * or 0: to the built-in bool under C++, to unsigned char under C.
   */
  56. #ifdef HAVE_STDBOOL_H
  57. #include <stdbool.h>
  58. #else
  59. # if ! HAVE__BOOL
  60. # ifdef __cplusplus
  61. typedef bool _Bool;
  62. # else
  63. typedef unsigned char _Bool;
  64. # endif
  65. # endif
  66. # define bool _Bool
  67. # define false 0
  68. # define true 1
  /* Same marker macro that <stdbool.h> defines, so code can test for it. */
  69. # define __bool_true_false_are_defined 1
  70. #endif
  71. #include <sys/types.h>
  72. #ifdef HAVE_INTTYPES_H
  73. #include <inttypes.h>
  74. #else /* HAVE_INTTYPES_H */
  75. #include <pstdint.h>
  76. #endif /* HAVE_INTTYPES_H */
  77. #include <limits.h>
  /*
   * Fallbacks for ssize_t and SSIZE_MAX.
   *
   * When HAVE_SSIZE_T is not defined, ssize_t is substituted by int through
   * a macro. In that case SSIZE_MAX must describe int as well: a value
   * supplied by the system headers (which describes the platform's own
   * ssize_t) or one derived from SIZE_MAX (wider than int on 64-bit targets)
   * would let range checks against SSIZE_MAX pass for lengths that do not
   * fit into the substituted type. INT_MAX comes from <limits.h>.
   *
   * When HAVE_SSIZE_T is defined but the headers supply no SSIZE_MAX, half
   * of SIZE_MAX is used.
   */
  #ifndef HAVE_SSIZE_T
  #define ssize_t int
  #undef SSIZE_MAX
  #define SSIZE_MAX INT_MAX
  #endif
  #ifndef SSIZE_MAX
  #define SSIZE_MAX (SIZE_MAX/2)
  #endif
  84. #define UTF8PROC_NULLTERM (1<<0)
  85. #define UTF8PROC_STABLE (1<<1)
  86. #define UTF8PROC_COMPAT (1<<2)
  87. #define UTF8PROC_COMPOSE (1<<3)
  88. #define UTF8PROC_DECOMPOSE (1<<4)
  89. #define UTF8PROC_IGNORE (1<<5)
  90. #define UTF8PROC_REJECTNA (1<<6)
  91. #define UTF8PROC_NLF2LS (1<<7)
  92. #define UTF8PROC_NLF2PS (1<<8)
  93. #define UTF8PROC_NLF2LF (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS)
  94. #define UTF8PROC_STRIPCC (1<<9)
  95. #define UTF8PROC_CASEFOLD (1<<10)
  96. #define UTF8PROC_CHARBOUND (1<<11)
  97. #define UTF8PROC_LUMP (1<<12)
  98. #define UTF8PROC_STRIPMARK (1<<13)
  99. /*
  100. * Flags being regarded by several functions in the library.
       * Each flag is a distinct bit, so flags are combined with bitwise OR
       * in the 'options' argument of those functions:
  101. * NULLTERM: The given UTF-8 input is NULL terminated.
  102. * STABLE: Unicode Versioning Stability has to be respected.
  103. * COMPAT: Compatibility decomposition
  104. * (i.e. formatting information is lost)
  105. * COMPOSE: Return a result with composed characters.
  106. * DECOMPOSE: Return a result with decomposed characters.
  107. * IGNORE: Strip "default ignorable characters"
  108. * REJECTNA: Return an error, if the input contains unassigned
  109. * code points.
  110. * NLF2LS: Indicating that NLF-sequences (LF, CRLF, CR, NEL) are
  111. * representing a line break, and should be converted to the
  112. * unicode character for line separation (LS).
  113. * NLF2PS: Indicating that NLF-sequences are representing a paragraph
  114. * break, and should be converted to the unicode character for
  115. * paragraph separation (PS).
  116. * NLF2LF: Indicating that the meaning of NLF-sequences is unknown
       * (this is the combination NLF2LS | NLF2PS).
  117. * STRIPCC: Strips and/or converts control characters.
  118. * NLF-sequences are transformed into space, except if one of
  119. * the NLF2LS/PS/LF options is given.
  120. * HorizontalTab (HT) and FormFeed (FF) are treated as a
  121. * NLF-sequence in this case.
  122. * All other control characters are simply removed.
  123. * CASEFOLD: Performs unicode case folding, to be able to do a
  124. * case-insensitive string comparison.
  125. * CHARBOUND: Inserts 0xFF bytes at the beginning of each sequence which
  126. * is representing a single grapheme cluster (see UAX#29).
  127. * LUMP: Lumps certain characters together
  128. * (e.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-").
  129. * (See lump.txt for details.)
  130. * If NLF2LF is set, this includes a transformation of
  131. * paragraph and line separators to ASCII line-feed (LF).
  132. * STRIPMARK: Strips all character markings
  133. * (non-spacing, spacing and enclosing) (i.e. accents)
  134. * NOTE: this option works only with COMPOSE or DECOMPOSE
  135. */
  136. #define UTF8PROC_ERROR_NOMEM -1
  137. #define UTF8PROC_ERROR_OVERFLOW -2
  138. #define UTF8PROC_ERROR_INVALIDUTF8 -3
  139. #define UTF8PROC_ERROR_NOTASSIGNED -4
  140. #define UTF8PROC_ERROR_INVALIDOPTS -5
  141. /*
  142. * Error codes being returned by almost all functions.
       * All codes are negative, which keeps them apart from the non-negative
       * counts and lengths that the functions return on success:
  143. * ERROR_NOMEM: Memory could not be allocated.
  144. * ERROR_OVERFLOW: The given string is too long to be processed.
  145. * ERROR_INVALIDUTF8: The given string is not a legal UTF-8 string.
  146. * ERROR_NOTASSIGNED: The REJECTNA flag was set,
  147. * and an unassigned code point was found.
  148. * ERROR_INVALIDOPTS: Invalid options have been used.
  149. */
  /*
   * Per-code-point property record; utf8proc_get_property() returns a
   * pointer to a constant instance of it.
   * NOTE(review): the field meanings noted below are inferred from the
   * field names and from the UTF8PROC_* constants in this header; confirm
   * against the data tables before relying on them.
   * NOTE(review): do not reorder the fields without checking the code that
   * initializes these records - presumably positional initializers are
   * used; verify.
   */
  150. typedef int16_t utf8proc_propval_t; /* 16-bit signed type used for the small enumerated property fields */
  151. typedef struct utf8proc_property_struct {
  152. utf8proc_propval_t category; /* presumably one of UTF8PROC_CATEGORY_* */
  153. utf8proc_propval_t combining_class;
  154. utf8proc_propval_t bidi_class; /* presumably one of UTF8PROC_BIDI_CLASS_* */
  155. utf8proc_propval_t decomp_type; /* presumably one of UTF8PROC_DECOMP_TYPE_* */
  156. const int32_t *decomp_mapping; /* read-only int32_t data; length/termination not visible in this header */
  157. unsigned bidi_mirrored:1; /* 1-bit flag */
  158. int32_t uppercase_mapping;
  159. int32_t lowercase_mapping;
  160. int32_t titlecase_mapping;
  161. int32_t comb1st_index;
  162. int32_t comb2nd_index;
  163. unsigned comp_exclusion:1; /* 1-bit flag */
  164. unsigned ignorable:1; /* 1-bit flag; presumably the "default ignorable" property used by UTF8PROC_IGNORE */
  165. unsigned control_boundary:1; /* 1-bit flag */
  166. unsigned extend:1; /* 1-bit flag */
  167. const int32_t *casefold_mapping; /* read-only int32_t data; length/termination not visible in this header */
  168. } utf8proc_property_t;
  /*
   * Category codes, numbered consecutively from 1 to 30. The two-letter
   * suffixes match the Unicode general category abbreviations (Lu, Ll, ...,
   * Cn). Presumably these are the values stored in the 'category' field of
   * utf8proc_property_t - confirm against the data tables.
   */
  169. #define UTF8PROC_CATEGORY_LU 1
  170. #define UTF8PROC_CATEGORY_LL 2
  171. #define UTF8PROC_CATEGORY_LT 3
  172. #define UTF8PROC_CATEGORY_LM 4
  173. #define UTF8PROC_CATEGORY_LO 5
  174. #define UTF8PROC_CATEGORY_MN 6
  175. #define UTF8PROC_CATEGORY_MC 7
  176. #define UTF8PROC_CATEGORY_ME 8
  177. #define UTF8PROC_CATEGORY_ND 9
  178. #define UTF8PROC_CATEGORY_NL 10
  179. #define UTF8PROC_CATEGORY_NO 11
  180. #define UTF8PROC_CATEGORY_PC 12
  181. #define UTF8PROC_CATEGORY_PD 13
  182. #define UTF8PROC_CATEGORY_PS 14
  183. #define UTF8PROC_CATEGORY_PE 15
  184. #define UTF8PROC_CATEGORY_PI 16
  185. #define UTF8PROC_CATEGORY_PF 17
  186. #define UTF8PROC_CATEGORY_PO 18
  187. #define UTF8PROC_CATEGORY_SM 19
  188. #define UTF8PROC_CATEGORY_SC 20
  189. #define UTF8PROC_CATEGORY_SK 21
  190. #define UTF8PROC_CATEGORY_SO 22
  191. #define UTF8PROC_CATEGORY_ZS 23
  192. #define UTF8PROC_CATEGORY_ZL 24
  193. #define UTF8PROC_CATEGORY_ZP 25
  194. #define UTF8PROC_CATEGORY_CC 26
  195. #define UTF8PROC_CATEGORY_CF 27
  196. #define UTF8PROC_CATEGORY_CS 28
  197. #define UTF8PROC_CATEGORY_CO 29
  198. #define UTF8PROC_CATEGORY_CN 30
  /*
   * Bidirectional class codes, numbered consecutively from 1 to 19. The
   * suffixes match the Unicode Bidi_Class abbreviations (L, R, AL, ...).
   * Presumably these are the values stored in the 'bidi_class' field of
   * utf8proc_property_t - confirm against the data tables.
   */
  199. #define UTF8PROC_BIDI_CLASS_L 1
  200. #define UTF8PROC_BIDI_CLASS_LRE 2
  201. #define UTF8PROC_BIDI_CLASS_LRO 3
  202. #define UTF8PROC_BIDI_CLASS_R 4
  203. #define UTF8PROC_BIDI_CLASS_AL 5
  204. #define UTF8PROC_BIDI_CLASS_RLE 6
  205. #define UTF8PROC_BIDI_CLASS_RLO 7
  206. #define UTF8PROC_BIDI_CLASS_PDF 8
  207. #define UTF8PROC_BIDI_CLASS_EN 9
  208. #define UTF8PROC_BIDI_CLASS_ES 10
  209. #define UTF8PROC_BIDI_CLASS_ET 11
  210. #define UTF8PROC_BIDI_CLASS_AN 12
  211. #define UTF8PROC_BIDI_CLASS_CS 13
  212. #define UTF8PROC_BIDI_CLASS_NSM 14
  213. #define UTF8PROC_BIDI_CLASS_BN 15
  214. #define UTF8PROC_BIDI_CLASS_B 16
  215. #define UTF8PROC_BIDI_CLASS_S 17
  216. #define UTF8PROC_BIDI_CLASS_WS 18
  217. #define UTF8PROC_BIDI_CLASS_ON 19
  /*
   * Decomposition type codes, numbered consecutively from 1 to 16.
   * Presumably these are the values stored in the 'decomp_type' field of
   * utf8proc_property_t - confirm against the data tables.
   * NOTE(review): no constant is defined for the value 0; what 0 stands
   * for in 'decomp_type' is not visible in this header.
   */
  218. #define UTF8PROC_DECOMP_TYPE_FONT 1
  219. #define UTF8PROC_DECOMP_TYPE_NOBREAK 2
  220. #define UTF8PROC_DECOMP_TYPE_INITIAL 3
  221. #define UTF8PROC_DECOMP_TYPE_MEDIAL 4
  222. #define UTF8PROC_DECOMP_TYPE_FINAL 5
  223. #define UTF8PROC_DECOMP_TYPE_ISOLATED 6
  224. #define UTF8PROC_DECOMP_TYPE_CIRCLE 7
  225. #define UTF8PROC_DECOMP_TYPE_SUPER 8
  226. #define UTF8PROC_DECOMP_TYPE_SUB 9
  227. #define UTF8PROC_DECOMP_TYPE_VERTICAL 10
  228. #define UTF8PROC_DECOMP_TYPE_WIDE 11
  229. #define UTF8PROC_DECOMP_TYPE_NARROW 12
  230. #define UTF8PROC_DECOMP_TYPE_SMALL 13
  231. #define UTF8PROC_DECOMP_TYPE_SQUARE 14
  232. #define UTF8PROC_DECOMP_TYPE_FRACTION 15
  233. #define UTF8PROC_DECOMP_TYPE_COMPAT 16
  /*
   * Constant table of 256 int8_t entries, i.e. one entry per possible byte
   * value; only declared here, the definition lives elsewhere.
   * NOTE(review): presumably indexed by a byte of the input to classify it
   * for UTF-8 decoding - confirm against the definition.
   */
  234. extern const int8_t utf8proc_utf8class[256];
  235. const char *utf8proc_errmsg(ssize_t errcode);
  236. /*
  237. * Returns a static error string for the given error code.
       * NOTE(review): 'errcode' is presumably one of the UTF8PROC_ERROR_*
       * values; what is returned for other values is not visible here.
  238. */
  239. ssize_t utf8proc_iterate(const uint8_t *str, ssize_t strlen, int32_t *dst);
  240. /*
  241. * Reads a single char from the UTF-8 sequence pointed to by 'str'.
  242. * At most 'strlen' bytes are read, unless 'strlen' is negative
  243. * (NOTE(review): presumably meaning "no length limit" - confirm).
  244. * If a valid Unicode char could be read, it is stored in the variable
  245. * pointed to by 'dst'; otherwise that variable is set to -1.
  246. * On success the number of bytes read is returned, otherwise a
  247. * negative error code is returned.
  248. */
  249. bool utf8proc_codepoint_valid(int32_t uc);
  250. /*
  251. * Returns true (1) if the given Unicode code point is valid, otherwise false (0).
  252. */
  253. ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst);
  254. /*
  255. * Encodes the Unicode char with the code point 'uc' as a UTF-8 sequence in
  256. * the byte array pointed to by 'dst'. This array has to be at least
  257. * 4 bytes long.
  258. * On success the number of bytes written is returned,
  259. * otherwise 0.
  260. * This function does not check whether 'uc' is a valid Unicode code point.
  261. */
  262. const utf8proc_property_t *utf8proc_get_property(int32_t uc);
  263. /*
  264. * Returns a pointer to a (constant) struct containing information about
  265. * the Unicode char with the given code point 'uc'.
  266. * If the character does not exist, a pointer to a special struct is
  267. * returned, where 'category' is null (the field is an integer of type
       * utf8proc_propval_t, not a pointer; presumably 0 - TODO confirm).
  268. * WARNING: The parameter 'uc' has to be in the range of 0x0000 to
  269. * 0x10FFFF, otherwise the program might crash!
  270. */
  271. ssize_t utf8proc_decompose_char(
  272. int32_t uc, int32_t *dst, ssize_t bufsize,
  273. int options, int *last_boundclass
  274. );
  275. /*
  276. * Writes a decomposition of the Unicode char 'uc' into the array
  277. * pointed to by 'dst'.
  278. * The following flags in the 'options' field are regarded:
  279. * REJECTNA: an unassigned unicode code point leads to an error
  280. * IGNORE: "default ignorable" chars are stripped
  281. * CASEFOLD: unicode casefolding is applied
  282. * COMPAT: replace certain characters with their
  283. * compatibility decomposition
  284. * CHARBOUND: Inserts 0xFF bytes before each grapheme cluster
  285. * LUMP: lumps certain different characters together
  286. * STRIPMARK: removes all character marks
  287. * The pointer 'last_boundclass' has to point to an integer variable which
  288. * is storing the last character boundary class, if the CHARBOUND option
  289. * is used.
  290. * In case of success the number of chars written is returned,
  291. * in case of an error, a negative error code is returned.
  292. * If the number of written chars would be bigger than 'bufsize',
  293. * the buffer (up to 'bufsize') has unpredictable data, and the needed
  294. * buffer size is returned.
  295. * WARNING: The parameter 'uc' has to be in the range of 0x0000 to
  296. * 0x10FFFF, otherwise the program might crash!
  297. */
  298. ssize_t utf8proc_decompose(
  299. const uint8_t *str, ssize_t strlen,
  300. int32_t *buffer, ssize_t bufsize, int options
  301. );
  302. /*
  303. * Does the same as 'utf8proc_decompose_char', but acts on a whole UTF-8
  304. * string, and orders the decomposed sequences correctly.
  305. * If the NULLTERM flag in 'options' is set, processing will be stopped
  306. * when a NULL byte is encountered, otherwise 'strlen' bytes are processed.
  307. * The result in form of Unicode code points is written into the buffer
  308. * pointed to by 'buffer', having the length of 'bufsize' entries.
  309. * In case of success the number of chars written is returned,
  310. * in case of an error, a negative error code is returned.
  311. * If the number of written chars would be bigger than 'bufsize',
  312. * the buffer (up to 'bufsize') has unpredictable data, and the needed
  313. * buffer size is returned.
  314. */
  315. ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options);
  316. /*
  317. * Reencodes the sequence of Unicode characters given by the pointer
  318. * 'buffer' and 'length' as UTF-8.
  319. * The result is stored in the same memory area where the data is read.
  320. * The following flags in the 'options' field are regarded:
  321. * NLF2LS: converts LF, CRLF, CR and NEL into LS
  322. * NLF2PS: converts LF, CRLF, CR and NEL into PS
  323. * NLF2LF: converts LF, CRLF, CR and NEL into LF
  324. * STRIPCC: strips or converts all non-affected control characters
  325. * COMPOSE: tries to combine decomposed characters into composite
  326. * characters
  327. * STABLE: prohibits combining characters which would violate
  328. * the unicode versioning stability
  329. * In case of success the length of the resulting UTF-8 string is
  330. * returned, otherwise a negative error code is returned.
  331. * WARNING: The amount of free space pointed to by 'buffer' has to
  332. * exceed the amount of the input data by one byte, and the
  333. * entries of the array pointed to by 'buffer' have to be in the
  334. * range of 0x0000 to 0x10FFFF, otherwise the program might
  335. * crash!
  336. */
  337. ssize_t utf8proc_map(
  338. const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
  339. );
  340. /*
  341. * Maps the given UTF-8 string pointed to by 'str' to a new UTF-8
  342. * string, which is allocated dynamically, and afterwards pointed to by
  343. * the pointer that 'dstptr' points to.
  344. * If the NULLTERM flag in the 'options' field is set, the length is
  345. * determined by a NULL terminator, otherwise the parameter 'strlen' is
  346. * evaluated to determine the string length, but in any case the result
  347. * will be NULL terminated (though it might contain NULL characters
  348. * before). Other flags in the 'options' field are passed to the functions
  349. * defined above, and regarded as described.
  350. * In case of success the length of the new string is returned,
  351. * otherwise a negative error code is returned.
  352. * NOTICE: The memory of the new UTF-8 string will have been allocated with
  353. * 'malloc', and therefore has to be freed with 'free'.
  354. */
  355. uint8_t *utf8proc_NFD(const uint8_t *str);
  356. uint8_t *utf8proc_NFC(const uint8_t *str);
  357. uint8_t *utf8proc_NFKD(const uint8_t *str);
  358. uint8_t *utf8proc_NFKC(const uint8_t *str);
  359. /*
  360. * Each returns a pointer to newly allocated memory holding the NFD, NFC,
  361. * NFKD or NFKC normalized version of the null-terminated string 'str'.
       * NOTE(review): the caller presumably has to release the result with
       * 'free', as for 'utf8proc_map'; the return value in case of an error
       * is not stated here - confirm both against the implementation.
  362. */
  363. ssize_t utf8proc_check(const uint8_t *str);
  364. /*
  365. * Just checks a UTF-8 string for validity. Returns 0 if it is valid, or
  366. * one of the negative UTF8PROC_ERROR_* codes if it is invalid or if
  367. * memory was exhausted while checking. Assumes that 'str' is
  368. * null-terminated and that the UTF8PROC_STABLE option applies.
  369. */
  370. #endif