utf8-norm.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * Copyright (c) 2014 SGI.
  4. * All rights reserved.
  5. */
  6. #include "utf8n.h"
  7. int utf8version_is_supported(const struct unicode_map *um, unsigned int version)
  8. {
  9. int i = um->tables->utf8agetab_size - 1;
  10. while (i >= 0 && um->tables->utf8agetab[i] != 0) {
  11. if (version == um->tables->utf8agetab[i])
  12. return 1;
  13. i--;
  14. }
  15. return 0;
  16. }
  17. /*
  18. * UTF-8 valid ranges.
  19. *
  20. * The UTF-8 encoding spreads the bits of a 32bit word over several
  21. * bytes. This table gives the ranges that can be held and how they'd
  22. * be represented.
  23. *
  24. * 0x00000000 0x0000007F: 0xxxxxxx
  25. * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
  26. * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
  27. * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  28. * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  29. * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  30. *
  31. * There is an additional requirement on UTF-8, in that only the
  32. * shortest representation of a 32bit value is to be used. A decoder
  33. * must not decode sequences that do not satisfy this requirement.
  34. * Thus the allowed ranges have a lower bound.
  35. *
  36. * 0x00000000 0x0000007F: 0xxxxxxx
  37. * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
  38. * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
  39. * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  40. * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  41. * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  42. *
  43. * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
  44. * 17 planes of 65536 values. This limits the sequences actually seen
  45. * even more, to just the following.
  46. *
  47. * 0 - 0x7F: 0 - 0x7F
  48. * 0x80 - 0x7FF: 0xC2 0x80 - 0xDF 0xBF
  49. * 0x800 - 0xFFFF: 0xE0 0xA0 0x80 - 0xEF 0xBF 0xBF
  50. * 0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF
  51. *
  52. * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed.
  53. *
  54. * Note that the longest sequence seen with valid usage is 4 bytes,
  55. * the same a single UTF-32 character. This makes the UTF-8
  56. * representation of Unicode strictly smaller than UTF-32.
  57. *
  58. * The shortest sequence requirement was introduced by:
  59. * Corrigendum #1: UTF-8 Shortest Form
  60. * It can be found here:
  61. * http://www.unicode.org/versions/corrigendum1.html
  62. *
  63. */
  64. /*
  65. * Return the number of bytes used by the current UTF-8 sequence.
  66. * Assumes the input points to the first byte of a valid UTF-8
  67. * sequence.
  68. */
  69. static inline int utf8clen(const char *s)
  70. {
  71. unsigned char c = *s;
  72. return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
  73. }
  74. /*
  75. * Decode a 3-byte UTF-8 sequence.
  76. */
  77. static unsigned int
  78. utf8decode3(const char *str)
  79. {
  80. unsigned int uc;
  81. uc = *str++ & 0x0F;
  82. uc <<= 6;
  83. uc |= *str++ & 0x3F;
  84. uc <<= 6;
  85. uc |= *str++ & 0x3F;
  86. return uc;
  87. }
  88. /*
  89. * Encode a 3-byte UTF-8 sequence.
  90. */
  91. static int
  92. utf8encode3(char *str, unsigned int val)
  93. {
  94. str[2] = (val & 0x3F) | 0x80;
  95. val >>= 6;
  96. str[1] = (val & 0x3F) | 0x80;
  97. val >>= 6;
  98. str[0] = val | 0xE0;
  99. return 3;
  100. }
  101. /*
  102. * utf8trie_t
  103. *
  104. * A compact binary tree, used to decode UTF-8 characters.
  105. *
  106. * Internal nodes are one byte for the node itself, and up to three
  107. * bytes for an offset into the tree. The first byte contains the
  108. * following information:
  109. * NEXTBYTE - flag - advance to next byte if set
  110. * BITNUM - 3 bit field - the bit number to tested
  111. * OFFLEN - 2 bit field - number of bytes in the offset
  112. * if offlen == 0 (non-branching node)
  113. * RIGHTPATH - 1 bit field - set if the following node is for the
  114. * right-hand path (tested bit is set)
  115. * TRIENODE - 1 bit field - set if the following node is an internal
  116. * node, otherwise it is a leaf node
  117. * if offlen != 0 (branching node)
  118. * LEFTNODE - 1 bit field - set if the left-hand node is internal
  119. * RIGHTNODE - 1 bit field - set if the right-hand node is internal
  120. *
  121. * Due to the way utf8 works, there cannot be branching nodes with
  122. * NEXTBYTE set, and moreover those nodes always have a righthand
  123. * descendant.
  124. */
  125. typedef const unsigned char utf8trie_t;
  126. #define BITNUM 0x07
  127. #define NEXTBYTE 0x08
  128. #define OFFLEN 0x30
  129. #define OFFLEN_SHIFT 4
  130. #define RIGHTPATH 0x40
  131. #define TRIENODE 0x80
  132. #define RIGHTNODE 0x40
  133. #define LEFTNODE 0x80
  134. /*
  135. * utf8leaf_t
  136. *
  137. * The leaves of the trie are embedded in the trie, and so the same
  138. * underlying datatype: unsigned char.
  139. *
  140. * leaf[0]: The unicode version, stored as a generation number that is
  141. * an index into ->utf8agetab[]. With this we can filter code
  142. * points based on the unicode version in which they were
  143. * defined. The CCC of a non-defined code point is 0.
  144. * leaf[1]: Canonical Combining Class. During normalization, we need
  145. * to do a stable sort into ascending order of all characters
  146. * with a non-zero CCC that occur between two characters with
  147. * a CCC of 0, or at the begin or end of a string.
  148. * The unicode standard guarantees that all CCC values are
  149. * between 0 and 254 inclusive, which leaves 255 available as
  150. * a special value.
  151. * Code points with CCC 0 are known as stoppers.
  152. * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
  153. * start of a NUL-terminated string that is the decomposition
  154. * of the character.
  155. * The CCC of a decomposable character is the same as the CCC
  156. * of the first character of its decomposition.
  157. * Some characters decompose as the empty string: these are
  158. * characters with the Default_Ignorable_Code_Point property.
  159. * These do affect normalization, as they all have CCC 0.
  160. *
  161. * The decompositions in the trie have been fully expanded, with the
  162. * exception of Hangul syllables, which are decomposed algorithmically.
  163. *
  164. * Casefolding, if applicable, is also done using decompositions.
  165. *
  166. * The trie is constructed in such a way that leaves exist for all
  167. * UTF-8 sequences that match the criteria from the "UTF-8 valid
  168. * ranges" comment above, and only for those sequences. Therefore a
  169. * lookup in the trie can be used to validate the UTF-8 input.
  170. */
  171. typedef const unsigned char utf8leaf_t;
  172. #define LEAF_GEN(LEAF) ((LEAF)[0])
  173. #define LEAF_CCC(LEAF) ((LEAF)[1])
  174. #define LEAF_STR(LEAF) ((const char *)((LEAF) + 2))
  175. #define MINCCC (0)
  176. #define MAXCCC (254)
  177. #define STOPPER (0)
  178. #define DECOMPOSE (255)
  179. /* Marker for hangul syllable decomposition. */
  180. #define HANGUL ((char)(255))
  181. /* Size of the synthesized leaf used for Hangul syllable decomposition. */
  182. #define UTF8HANGULLEAF (12)
  183. /*
  184. * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
  185. *
  186. * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
  187. * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
  188. *
  189. * SBase = 0xAC00
  190. * LBase = 0x1100
  191. * VBase = 0x1161
  192. * TBase = 0x11A7
  193. * LCount = 19
  194. * VCount = 21
  195. * TCount = 28
  196. * NCount = 588 (VCount * TCount)
  197. * SCount = 11172 (LCount * NCount)
  198. *
  199. * Decomposition:
  200. * SIndex = s - SBase
  201. *
  202. * LV (Canonical/Full)
  203. * LIndex = SIndex / NCount
  204. * VIndex = (Sindex % NCount) / TCount
  205. * LPart = LBase + LIndex
  206. * VPart = VBase + VIndex
  207. *
  208. * LVT (Canonical)
  209. * LVIndex = (SIndex / TCount) * TCount
  210. * TIndex = (Sindex % TCount)
  211. * LVPart = SBase + LVIndex
  212. * TPart = TBase + TIndex
  213. *
  214. * LVT (Full)
  215. * LIndex = SIndex / NCount
  216. * VIndex = (Sindex % NCount) / TCount
  217. * TIndex = (Sindex % TCount)
  218. * LPart = LBase + LIndex
  219. * VPart = VBase + VIndex
  220. * if (TIndex == 0) {
  221. * d = <LPart, VPart>
  222. * } else {
  223. * TPart = TBase + TIndex
  224. * d = <LPart, TPart, VPart>
  225. * }
  226. */
  227. /* Constants */
  228. #define SB (0xAC00)
  229. #define LB (0x1100)
  230. #define VB (0x1161)
  231. #define TB (0x11A7)
  232. #define LC (19)
  233. #define VC (21)
  234. #define TC (28)
  235. #define NC (VC * TC)
  236. #define SC (LC * NC)
  237. /* Algorithmic decomposition of hangul syllable. */
  238. static utf8leaf_t *
  239. utf8hangul(const char *str, unsigned char *hangul)
  240. {
  241. unsigned int si;
  242. unsigned int li;
  243. unsigned int vi;
  244. unsigned int ti;
  245. unsigned char *h;
  246. /* Calculate the SI, LI, VI, and TI values. */
  247. si = utf8decode3(str) - SB;
  248. li = si / NC;
  249. vi = (si % NC) / TC;
  250. ti = si % TC;
  251. /* Fill in base of leaf. */
  252. h = hangul;
  253. LEAF_GEN(h) = 2;
  254. LEAF_CCC(h) = DECOMPOSE;
  255. h += 2;
  256. /* Add LPart, a 3-byte UTF-8 sequence. */
  257. h += utf8encode3((char *)h, li + LB);
  258. /* Add VPart, a 3-byte UTF-8 sequence. */
  259. h += utf8encode3((char *)h, vi + VB);
  260. /* Add TPart if required, also a 3-byte UTF-8 sequence. */
  261. if (ti)
  262. h += utf8encode3((char *)h, ti + TB);
  263. /* Terminate string. */
  264. h[0] = '\0';
  265. return hangul;
  266. }
  267. /*
  268. * Use trie to scan s, touching at most len bytes.
  269. * Returns the leaf if one exists, NULL otherwise.
  270. *
  271. * A non-NULL return guarantees that the UTF-8 sequence starting at s
  272. * is well-formed and corresponds to a known unicode code point. The
  273. * shorthand for this will be "is valid UTF-8 unicode".
  274. */
  275. static utf8leaf_t *utf8nlookup(const struct unicode_map *um,
  276. enum utf8_normalization n, unsigned char *hangul, const char *s,
  277. size_t len)
  278. {
  279. utf8trie_t *trie = um->tables->utf8data + um->ntab[n]->offset;
  280. int offlen;
  281. int offset;
  282. int mask;
  283. int node;
  284. if (len == 0)
  285. return NULL;
  286. node = 1;
  287. while (node) {
  288. offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
  289. if (*trie & NEXTBYTE) {
  290. if (--len == 0)
  291. return NULL;
  292. s++;
  293. }
  294. mask = 1 << (*trie & BITNUM);
  295. if (*s & mask) {
  296. /* Right leg */
  297. if (offlen) {
  298. /* Right node at offset of trie */
  299. node = (*trie & RIGHTNODE);
  300. offset = trie[offlen];
  301. while (--offlen) {
  302. offset <<= 8;
  303. offset |= trie[offlen];
  304. }
  305. trie += offset;
  306. } else if (*trie & RIGHTPATH) {
  307. /* Right node after this node */
  308. node = (*trie & TRIENODE);
  309. trie++;
  310. } else {
  311. /* No right node. */
  312. return NULL;
  313. }
  314. } else {
  315. /* Left leg */
  316. if (offlen) {
  317. /* Left node after this node. */
  318. node = (*trie & LEFTNODE);
  319. trie += offlen + 1;
  320. } else if (*trie & RIGHTPATH) {
  321. /* No left node. */
  322. return NULL;
  323. } else {
  324. /* Left node after this node */
  325. node = (*trie & TRIENODE);
  326. trie++;
  327. }
  328. }
  329. }
  330. /*
  331. * Hangul decomposition is done algorithmically. These are the
  332. * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
  333. * always 3 bytes long, so s has been advanced twice, and the
  334. * start of the sequence is at s-2.
  335. */
  336. if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
  337. trie = utf8hangul(s - 2, hangul);
  338. return trie;
  339. }
  340. /*
  341. * Use trie to scan s.
  342. * Returns the leaf if one exists, NULL otherwise.
  343. *
  344. * Forwards to utf8nlookup().
  345. */
  346. static utf8leaf_t *utf8lookup(const struct unicode_map *um,
  347. enum utf8_normalization n, unsigned char *hangul, const char *s)
  348. {
  349. return utf8nlookup(um, n, hangul, s, (size_t)-1);
  350. }
  351. /*
  352. * Length of the normalization of s, touch at most len bytes.
  353. * Return -1 if s is not valid UTF-8 unicode.
  354. */
  355. ssize_t utf8nlen(const struct unicode_map *um, enum utf8_normalization n,
  356. const char *s, size_t len)
  357. {
  358. utf8leaf_t *leaf;
  359. size_t ret = 0;
  360. unsigned char hangul[UTF8HANGULLEAF];
  361. while (len && *s) {
  362. leaf = utf8nlookup(um, n, hangul, s, len);
  363. if (!leaf)
  364. return -1;
  365. if (um->tables->utf8agetab[LEAF_GEN(leaf)] >
  366. um->ntab[n]->maxage)
  367. ret += utf8clen(s);
  368. else if (LEAF_CCC(leaf) == DECOMPOSE)
  369. ret += strlen(LEAF_STR(leaf));
  370. else
  371. ret += utf8clen(s);
  372. len -= utf8clen(s);
  373. s += utf8clen(s);
  374. }
  375. return ret;
  376. }
  377. /*
  378. * Set up an utf8cursor for use by utf8byte().
  379. *
  380. * u8c : pointer to cursor.
  381. * data : const struct utf8data to use for normalization.
  382. * s : string.
  383. * len : length of s.
  384. *
  385. * Returns -1 on error, 0 on success.
  386. */
  387. int utf8ncursor(struct utf8cursor *u8c, const struct unicode_map *um,
  388. enum utf8_normalization n, const char *s, size_t len)
  389. {
  390. if (!s)
  391. return -1;
  392. u8c->um = um;
  393. u8c->n = n;
  394. u8c->s = s;
  395. u8c->p = NULL;
  396. u8c->ss = NULL;
  397. u8c->sp = NULL;
  398. u8c->len = len;
  399. u8c->slen = 0;
  400. u8c->ccc = STOPPER;
  401. u8c->nccc = STOPPER;
  402. /* Check we didn't clobber the maximum length. */
  403. if (u8c->len != len)
  404. return -1;
  405. /* The first byte of s may not be an utf8 continuation. */
  406. if (len > 0 && (*s & 0xC0) == 0x80)
  407. return -1;
  408. return 0;
  409. }
  410. /*
  411. * Get one byte from the normalized form of the string described by u8c.
  412. *
  413. * Returns the byte cast to an unsigned char on succes, and -1 on failure.
  414. *
  415. * The cursor keeps track of the location in the string in u8c->s.
  416. * When a character is decomposed, the current location is stored in
  417. * u8c->p, and u8c->s is set to the start of the decomposition. Note
  418. * that bytes from a decomposition do not count against u8c->len.
  419. *
  420. * Characters are emitted if they match the current CCC in u8c->ccc.
  421. * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
  422. * and the function returns 0 in that case.
  423. *
  424. * Sorting by CCC is done by repeatedly scanning the string. The
  425. * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
  426. * the start of the scan. The first pass finds the lowest CCC to be
  427. * emitted and stores it in u8c->nccc, the second pass emits the
  428. * characters with this CCC and finds the next lowest CCC. This limits
  429. * the number of passes to 1 + the number of different CCCs in the
  430. * sequence being scanned.
  431. *
  432. * Therefore:
  433. * u8c->p != NULL -> a decomposition is being scanned.
  434. * u8c->ss != NULL -> this is a repeating scan.
  435. * u8c->ccc == -1 -> this is the first scan of a repeating scan.
  436. */
  437. int utf8byte(struct utf8cursor *u8c)
  438. {
  439. utf8leaf_t *leaf;
  440. int ccc;
  441. for (;;) {
  442. /* Check for the end of a decomposed character. */
  443. if (u8c->p && *u8c->s == '\0') {
  444. u8c->s = u8c->p;
  445. u8c->p = NULL;
  446. }
  447. /* Check for end-of-string. */
  448. if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
  449. /* There is no next byte. */
  450. if (u8c->ccc == STOPPER)
  451. return 0;
  452. /* End-of-string during a scan counts as a stopper. */
  453. ccc = STOPPER;
  454. goto ccc_mismatch;
  455. } else if ((*u8c->s & 0xC0) == 0x80) {
  456. /* This is a continuation of the current character. */
  457. if (!u8c->p)
  458. u8c->len--;
  459. return (unsigned char)*u8c->s++;
  460. }
  461. /* Look up the data for the current character. */
  462. if (u8c->p) {
  463. leaf = utf8lookup(u8c->um, u8c->n, u8c->hangul, u8c->s);
  464. } else {
  465. leaf = utf8nlookup(u8c->um, u8c->n, u8c->hangul,
  466. u8c->s, u8c->len);
  467. }
  468. /* No leaf found implies that the input is a binary blob. */
  469. if (!leaf)
  470. return -1;
  471. ccc = LEAF_CCC(leaf);
  472. /* Characters that are too new have CCC 0. */
  473. if (u8c->um->tables->utf8agetab[LEAF_GEN(leaf)] >
  474. u8c->um->ntab[u8c->n]->maxage) {
  475. ccc = STOPPER;
  476. } else if (ccc == DECOMPOSE) {
  477. u8c->len -= utf8clen(u8c->s);
  478. u8c->p = u8c->s + utf8clen(u8c->s);
  479. u8c->s = LEAF_STR(leaf);
  480. /* Empty decomposition implies CCC 0. */
  481. if (*u8c->s == '\0') {
  482. if (u8c->ccc == STOPPER)
  483. continue;
  484. ccc = STOPPER;
  485. goto ccc_mismatch;
  486. }
  487. leaf = utf8lookup(u8c->um, u8c->n, u8c->hangul, u8c->s);
  488. if (!leaf)
  489. return -1;
  490. ccc = LEAF_CCC(leaf);
  491. }
  492. /*
  493. * If this is not a stopper, then see if it updates
  494. * the next canonical class to be emitted.
  495. */
  496. if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
  497. u8c->nccc = ccc;
  498. /*
  499. * Return the current byte if this is the current
  500. * combining class.
  501. */
  502. if (ccc == u8c->ccc) {
  503. if (!u8c->p)
  504. u8c->len--;
  505. return (unsigned char)*u8c->s++;
  506. }
  507. /* Current combining class mismatch. */
  508. ccc_mismatch:
  509. if (u8c->nccc == STOPPER) {
  510. /*
  511. * Scan forward for the first canonical class
  512. * to be emitted. Save the position from
  513. * which to restart.
  514. */
  515. u8c->ccc = MINCCC - 1;
  516. u8c->nccc = ccc;
  517. u8c->sp = u8c->p;
  518. u8c->ss = u8c->s;
  519. u8c->slen = u8c->len;
  520. if (!u8c->p)
  521. u8c->len -= utf8clen(u8c->s);
  522. u8c->s += utf8clen(u8c->s);
  523. } else if (ccc != STOPPER) {
  524. /* Not a stopper, and not the ccc we're emitting. */
  525. if (!u8c->p)
  526. u8c->len -= utf8clen(u8c->s);
  527. u8c->s += utf8clen(u8c->s);
  528. } else if (u8c->nccc != MAXCCC + 1) {
  529. /* At a stopper, restart for next ccc. */
  530. u8c->ccc = u8c->nccc;
  531. u8c->nccc = MAXCCC + 1;
  532. u8c->s = u8c->ss;
  533. u8c->p = u8c->sp;
  534. u8c->len = u8c->slen;
  535. } else {
  536. /* All done, proceed from here. */
  537. u8c->ccc = STOPPER;
  538. u8c->nccc = STOPPER;
  539. u8c->sp = NULL;
  540. u8c->ss = NULL;
  541. u8c->slen = 0;
  542. }
  543. }
  544. }
  545. #ifdef CONFIG_UNICODE_NORMALIZATION_SELFTEST_MODULE
  546. EXPORT_SYMBOL_GPL(utf8version_is_supported);
  547. EXPORT_SYMBOL_GPL(utf8nlen);
  548. EXPORT_SYMBOL_GPL(utf8ncursor);
  549. EXPORT_SYMBOL_GPL(utf8byte);
  550. #endif