mkutf8data.c 80 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
7327832793280328132823283328432853286328732883289329032913292329332943295329632973298329933003301330233033304330533063307330833093310331133123313331433153316331733183319332033213322332333243325332633273328332933303331333233333334333533363337333833393340334133423343334433453346334733483349335033513352335333543355335633573358335933603361336233633364336533663367336833693370337133723373337433753376337733783379338033813382338333843385338633873388338933903391339233933394339533963397339833993400340134023403340434053406340734083409341034113412341334143415341634173418341934203421342234233424342534263427342834293430343134323433
  1. /*
  2. * Copyright (c) 2014 SGI.
  3. * All rights reserved.
  4. *
  5. * This program is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU General Public License as
  7. * published by the Free Software Foundation.
  8. *
  9. * This program is distributed in the hope that it would be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public License
  15. * along with this program; if not, write the Free Software Foundation,
  16. * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  17. */
  18. /* Generator for a compact trie for unicode normalization */
  19. #include <sys/types.h>
  20. #include <stddef.h>
  21. #include <stdlib.h>
  22. #include <stdio.h>
  23. #include <assert.h>
  24. #include <string.h>
  25. #include <unistd.h>
  26. #include <errno.h>
  27. /* Default names of the in- and output files. */
  28. #define AGE_NAME "DerivedAge.txt"
  29. #define CCC_NAME "DerivedCombiningClass.txt"
  30. #define PROP_NAME "DerivedCoreProperties.txt"
  31. #define DATA_NAME "UnicodeData.txt"
  32. #define FOLD_NAME "CaseFolding.txt"
  33. #define NORM_NAME "NormalizationCorrections.txt"
  34. #define TEST_NAME "NormalizationTest.txt"
  35. #define UTF8_NAME "utf8data.h"
  36. const char *age_name = AGE_NAME;
  37. const char *ccc_name = CCC_NAME;
  38. const char *prop_name = PROP_NAME;
  39. const char *data_name = DATA_NAME;
  40. const char *fold_name = FOLD_NAME;
  41. const char *norm_name = NORM_NAME;
  42. const char *test_name = TEST_NAME;
  43. const char *utf8_name = UTF8_NAME;
  44. int verbose = 0;
  45. /* An arbitrary line size limit on input lines. */
  46. #define LINESIZE 1024
  47. char line[LINESIZE];
  48. char buf0[LINESIZE];
  49. char buf1[LINESIZE];
  50. char buf2[LINESIZE];
  51. char buf3[LINESIZE];
  52. const char *argv0;
  53. #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
  54. /* ------------------------------------------------------------------ */
  55. /*
  56. * Unicode version numbers consist of three parts: major, minor, and a
  57. * revision. These numbers are packed into an unsigned int to obtain
  58. * a single version number.
  59. *
  60. * To save space in the generated trie, the unicode version is not
  61. * stored directly, instead we calculate a generation number from the
  62. * unicode versions seen in the DerivedAge file, and use that as an
  63. * index into a table of unicode versions.
  64. */
  65. #define UNICODE_MAJ_SHIFT (16)
  66. #define UNICODE_MIN_SHIFT (8)
  67. #define UNICODE_MAJ_MAX ((unsigned short)-1)
  68. #define UNICODE_MIN_MAX ((unsigned char)-1)
  69. #define UNICODE_REV_MAX ((unsigned char)-1)
  70. #define UNICODE_AGE(MAJ,MIN,REV) \
  71. (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \
  72. ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \
  73. ((unsigned int)(REV)))
  74. unsigned int *ages;
  75. int ages_count;
  76. unsigned int unicode_maxage;
  77. static int age_valid(unsigned int major, unsigned int minor,
  78. unsigned int revision)
  79. {
  80. if (major > UNICODE_MAJ_MAX)
  81. return 0;
  82. if (minor > UNICODE_MIN_MAX)
  83. return 0;
  84. if (revision > UNICODE_REV_MAX)
  85. return 0;
  86. return 1;
  87. }
  88. /* ------------------------------------------------------------------ */
  89. /*
  90. * utf8trie_t
  91. *
  92. * A compact binary tree, used to decode UTF-8 characters.
  93. *
  94. * Internal nodes are one byte for the node itself, and up to three
  95. * bytes for an offset into the tree. The first byte contains the
  96. * following information:
  97. * NEXTBYTE - flag - advance to next byte if set
  98. * BITNUM - 3 bit field - the bit number to tested
  99. * OFFLEN - 2 bit field - number of bytes in the offset
  100. * if offlen == 0 (non-branching node)
  101. * RIGHTPATH - 1 bit field - set if the following node is for the
  102. * right-hand path (tested bit is set)
  103. * TRIENODE - 1 bit field - set if the following node is an internal
  104. * node, otherwise it is a leaf node
  105. * if offlen != 0 (branching node)
  106. * LEFTNODE - 1 bit field - set if the left-hand node is internal
  107. * RIGHTNODE - 1 bit field - set if the right-hand node is internal
  108. *
  109. * Due to the way utf8 works, there cannot be branching nodes with
  110. * NEXTBYTE set, and moreover those nodes always have a righthand
  111. * descendant.
  112. */
  113. typedef unsigned char utf8trie_t;
  114. #define BITNUM 0x07
  115. #define NEXTBYTE 0x08
  116. #define OFFLEN 0x30
  117. #define OFFLEN_SHIFT 4
  118. #define RIGHTPATH 0x40
  119. #define TRIENODE 0x80
  120. #define RIGHTNODE 0x40
  121. #define LEFTNODE 0x80
  122. /*
  123. * utf8leaf_t
  124. *
  125. * The leaves of the trie are embedded in the trie, and so the same
  126. * underlying datatype, unsigned char.
  127. *
  128. * leaf[0]: The unicode version, stored as a generation number that is
  129. * an index into utf8agetab[]. With this we can filter code
  130. * points based on the unicode version in which they were
  131. * defined. The CCC of a non-defined code point is 0.
  132. * leaf[1]: Canonical Combining Class. During normalization, we need
  133. * to do a stable sort into ascending order of all characters
  134. * with a non-zero CCC that occur between two characters with
  135. * a CCC of 0, or at the begin or end of a string.
  136. * The unicode standard guarantees that all CCC values are
  137. * between 0 and 254 inclusive, which leaves 255 available as
  138. * a special value.
  139. * Code points with CCC 0 are known as stoppers.
  140. * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
  141. * start of a NUL-terminated string that is the decomposition
  142. * of the character.
  143. * The CCC of a decomposable character is the same as the CCC
  144. * of the first character of its decomposition.
  145. * Some characters decompose as the empty string: these are
  146. * characters with the Default_Ignorable_Code_Point property.
  147. * These do affect normalization, as they all have CCC 0.
  148. *
  149. * The decompositions in the trie have been fully expanded.
  150. *
  151. * Casefolding, if applicable, is also done using decompositions.
  152. */
  153. typedef unsigned char utf8leaf_t;
  154. #define LEAF_GEN(LEAF) ((LEAF)[0])
  155. #define LEAF_CCC(LEAF) ((LEAF)[1])
  156. #define LEAF_STR(LEAF) ((const char*)((LEAF) + 2))
  157. #define MAXGEN (255)
  158. #define MINCCC (0)
  159. #define MAXCCC (254)
  160. #define STOPPER (0)
  161. #define DECOMPOSE (255)
  162. #define HANGUL ((char)(255))
  163. #define UTF8HANGULLEAF (12)
  164. struct tree;
  165. static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
  166. const char *, size_t);
  167. static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);
  168. unsigned char *utf8data;
  169. size_t utf8data_size;
  170. utf8trie_t *nfdi;
  171. utf8trie_t *nfdicf;
  172. /* ------------------------------------------------------------------ */
  173. /*
  174. * UTF8 valid ranges.
  175. *
  176. * The UTF-8 encoding spreads the bits of a 32bit word over several
  177. * bytes. This table gives the ranges that can be held and how they'd
  178. * be represented.
  179. *
  180. * 0x00000000 0x0000007F: 0xxxxxxx
  181. * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
  182. * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
  183. * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  184. * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  185. * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  186. *
  187. * There is an additional requirement on UTF-8, in that only the
  188. * shortest representation of a 32bit value is to be used. A decoder
  189. * must not decode sequences that do not satisfy this requirement.
  190. * Thus the allowed ranges have a lower bound.
  191. *
  192. * 0x00000000 0x0000007F: 0xxxxxxx
  193. * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
  194. * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
  195. * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  196. * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  197. * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  198. *
  199. * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
  200. * 17 planes of 65536 values. This limits the sequences actually seen
  201. * even more, to just the following.
  202. *
  203. * 0 - 0x7f: 0 0x7f
  204. * 0x80 - 0x7ff: 0xc2 0x80 0xdf 0xbf
  205. * 0x800 - 0xffff: 0xe0 0xa0 0x80 0xef 0xbf 0xbf
  206. * 0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80 0xf4 0x8f 0xbf 0xbf
  207. *
  208. * Even within those ranges not all values are allowed: the surrogates
  209. * 0xd800 - 0xdfff should never be seen.
  210. *
  211. * Note that the longest sequence seen with valid usage is 4 bytes,
  212. * the same as a single UTF-32 character. This makes the UTF-8
  213. * representation of Unicode strictly smaller than UTF-32.
  214. *
  215. * The shortest sequence requirement was introduced by:
  216. * Corrigendum #1: UTF-8 Shortest Form
  217. * It can be found here:
  218. * http://www.unicode.org/versions/corrigendum1.html
  219. *
  220. */
  221. #define UTF8_2_BITS 0xC0
  222. #define UTF8_3_BITS 0xE0
  223. #define UTF8_4_BITS 0xF0
  224. #define UTF8_N_BITS 0x80
  225. #define UTF8_2_MASK 0xE0
  226. #define UTF8_3_MASK 0xF0
  227. #define UTF8_4_MASK 0xF8
  228. #define UTF8_N_MASK 0xC0
  229. #define UTF8_V_MASK 0x3F
  230. #define UTF8_V_SHIFT 6
  231. static int utf8encode(char *str, unsigned int val)
  232. {
  233. int len;
  234. if (val < 0x80) {
  235. str[0] = val;
  236. len = 1;
  237. } else if (val < 0x800) {
  238. str[1] = val & UTF8_V_MASK;
  239. str[1] |= UTF8_N_BITS;
  240. val >>= UTF8_V_SHIFT;
  241. str[0] = val;
  242. str[0] |= UTF8_2_BITS;
  243. len = 2;
  244. } else if (val < 0x10000) {
  245. str[2] = val & UTF8_V_MASK;
  246. str[2] |= UTF8_N_BITS;
  247. val >>= UTF8_V_SHIFT;
  248. str[1] = val & UTF8_V_MASK;
  249. str[1] |= UTF8_N_BITS;
  250. val >>= UTF8_V_SHIFT;
  251. str[0] = val;
  252. str[0] |= UTF8_3_BITS;
  253. len = 3;
  254. } else if (val < 0x110000) {
  255. str[3] = val & UTF8_V_MASK;
  256. str[3] |= UTF8_N_BITS;
  257. val >>= UTF8_V_SHIFT;
  258. str[2] = val & UTF8_V_MASK;
  259. str[2] |= UTF8_N_BITS;
  260. val >>= UTF8_V_SHIFT;
  261. str[1] = val & UTF8_V_MASK;
  262. str[1] |= UTF8_N_BITS;
  263. val >>= UTF8_V_SHIFT;
  264. str[0] = val;
  265. str[0] |= UTF8_4_BITS;
  266. len = 4;
  267. } else {
  268. printf("%#x: illegal val\n", val);
  269. len = 0;
  270. }
  271. return len;
  272. }
  273. static unsigned int utf8decode(const char *str)
  274. {
  275. const unsigned char *s = (const unsigned char*)str;
  276. unsigned int unichar = 0;
  277. if (*s < 0x80) {
  278. unichar = *s;
  279. } else if (*s < UTF8_3_BITS) {
  280. unichar = *s++ & 0x1F;
  281. unichar <<= UTF8_V_SHIFT;
  282. unichar |= *s & 0x3F;
  283. } else if (*s < UTF8_4_BITS) {
  284. unichar = *s++ & 0x0F;
  285. unichar <<= UTF8_V_SHIFT;
  286. unichar |= *s++ & 0x3F;
  287. unichar <<= UTF8_V_SHIFT;
  288. unichar |= *s & 0x3F;
  289. } else {
  290. unichar = *s++ & 0x0F;
  291. unichar <<= UTF8_V_SHIFT;
  292. unichar |= *s++ & 0x3F;
  293. unichar <<= UTF8_V_SHIFT;
  294. unichar |= *s++ & 0x3F;
  295. unichar <<= UTF8_V_SHIFT;
  296. unichar |= *s & 0x3F;
  297. }
  298. return unichar;
  299. }
  300. static int utf32valid(unsigned int unichar)
  301. {
  302. return unichar < 0x110000;
  303. }
  304. #define HANGUL_SYLLABLE(U) ((U) >= 0xAC00 && (U) <= 0xD7A3)
  305. #define NODE 1
  306. #define LEAF 0
  307. struct tree {
  308. void *root;
  309. int childnode;
  310. const char *type;
  311. unsigned int maxage;
  312. struct tree *next;
  313. int (*leaf_equal)(void *, void *);
  314. void (*leaf_print)(void *, int);
  315. int (*leaf_mark)(void *);
  316. int (*leaf_size)(void *);
  317. int *(*leaf_index)(struct tree *, void *);
  318. unsigned char *(*leaf_emit)(void *, unsigned char *);
  319. int leafindex[0x110000];
  320. int index;
  321. };
  322. struct node {
  323. int index;
  324. int offset;
  325. int mark;
  326. int size;
  327. struct node *parent;
  328. void *left;
  329. void *right;
  330. unsigned char bitnum;
  331. unsigned char nextbyte;
  332. unsigned char leftnode;
  333. unsigned char rightnode;
  334. unsigned int keybits;
  335. unsigned int keymask;
  336. };
  337. /*
  338. * Example lookup function for a tree.
  339. */
  340. static void *lookup(struct tree *tree, const char *key)
  341. {
  342. struct node *node;
  343. void *leaf = NULL;
  344. node = tree->root;
  345. while (!leaf && node) {
  346. if (node->nextbyte)
  347. key++;
  348. if (*key & (1 << (node->bitnum & 7))) {
  349. /* Right leg */
  350. if (node->rightnode == NODE) {
  351. node = node->right;
  352. } else if (node->rightnode == LEAF) {
  353. leaf = node->right;
  354. } else {
  355. node = NULL;
  356. }
  357. } else {
  358. /* Left leg */
  359. if (node->leftnode == NODE) {
  360. node = node->left;
  361. } else if (node->leftnode == LEAF) {
  362. leaf = node->left;
  363. } else {
  364. node = NULL;
  365. }
  366. }
  367. }
  368. return leaf;
  369. }
  370. /*
  371. * A simple non-recursive tree walker: keep track of visits to the
  372. * left and right branches in the leftmask and rightmask.
  373. */
  374. static void tree_walk(struct tree *tree)
  375. {
  376. struct node *node;
  377. unsigned int leftmask;
  378. unsigned int rightmask;
  379. unsigned int bitmask;
  380. int indent = 1;
  381. int nodes, singletons, leaves;
  382. nodes = singletons = leaves = 0;
  383. printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
  384. if (tree->childnode == LEAF) {
  385. assert(tree->root);
  386. tree->leaf_print(tree->root, indent);
  387. leaves = 1;
  388. } else {
  389. assert(tree->childnode == NODE);
  390. node = tree->root;
  391. leftmask = rightmask = 0;
  392. while (node) {
  393. printf("%*snode @ %p bitnum %d nextbyte %d"
  394. " left %p right %p mask %x bits %x\n",
  395. indent, "", node,
  396. node->bitnum, node->nextbyte,
  397. node->left, node->right,
  398. node->keymask, node->keybits);
  399. nodes += 1;
  400. if (!(node->left && node->right))
  401. singletons += 1;
  402. while (node) {
  403. bitmask = 1 << node->bitnum;
  404. if ((leftmask & bitmask) == 0) {
  405. leftmask |= bitmask;
  406. if (node->leftnode == LEAF) {
  407. assert(node->left);
  408. tree->leaf_print(node->left,
  409. indent+1);
  410. leaves += 1;
  411. } else if (node->left) {
  412. assert(node->leftnode == NODE);
  413. indent += 1;
  414. node = node->left;
  415. break;
  416. }
  417. }
  418. if ((rightmask & bitmask) == 0) {
  419. rightmask |= bitmask;
  420. if (node->rightnode == LEAF) {
  421. assert(node->right);
  422. tree->leaf_print(node->right,
  423. indent+1);
  424. leaves += 1;
  425. } else if (node->right) {
  426. assert(node->rightnode == NODE);
  427. indent += 1;
  428. node = node->right;
  429. break;
  430. }
  431. }
  432. leftmask &= ~bitmask;
  433. rightmask &= ~bitmask;
  434. node = node->parent;
  435. indent -= 1;
  436. }
  437. }
  438. }
  439. printf("nodes %d leaves %d singletons %d\n",
  440. nodes, leaves, singletons);
  441. }
  442. /*
  443. * Allocate an initialize a new internal node.
  444. */
  445. static struct node *alloc_node(struct node *parent)
  446. {
  447. struct node *node;
  448. int bitnum;
  449. node = malloc(sizeof(*node));
  450. node->left = node->right = NULL;
  451. node->parent = parent;
  452. node->leftnode = NODE;
  453. node->rightnode = NODE;
  454. node->keybits = 0;
  455. node->keymask = 0;
  456. node->mark = 0;
  457. node->index = 0;
  458. node->offset = -1;
  459. node->size = 4;
  460. if (node->parent) {
  461. bitnum = parent->bitnum;
  462. if ((bitnum & 7) == 0) {
  463. node->bitnum = bitnum + 7 + 8;
  464. node->nextbyte = 1;
  465. } else {
  466. node->bitnum = bitnum - 1;
  467. node->nextbyte = 0;
  468. }
  469. } else {
  470. node->bitnum = 7;
  471. node->nextbyte = 0;
  472. }
  473. return node;
  474. }
  475. /*
  476. * Insert a new leaf into the tree, and collapse any subtrees that are
  477. * fully populated and end in identical leaves. A nextbyte tagged
  478. * internal node will not be removed to preserve the tree's integrity.
  479. * Note that due to the structure of utf8, no nextbyte tagged node
  480. * will be a candidate for removal.
  481. */
  482. static int insert(struct tree *tree, char *key, int keylen, void *leaf)
  483. {
  484. struct node *node;
  485. struct node *parent;
  486. void **cursor;
  487. int keybits;
  488. assert(keylen >= 1 && keylen <= 4);
  489. node = NULL;
  490. cursor = &tree->root;
  491. keybits = 8 * keylen;
  492. /* Insert, creating path along the way. */
  493. while (keybits) {
  494. if (!*cursor)
  495. *cursor = alloc_node(node);
  496. node = *cursor;
  497. if (node->nextbyte)
  498. key++;
  499. if (*key & (1 << (node->bitnum & 7)))
  500. cursor = &node->right;
  501. else
  502. cursor = &node->left;
  503. keybits--;
  504. }
  505. *cursor = leaf;
  506. /* Merge subtrees if possible. */
  507. while (node) {
  508. if (*key & (1 << (node->bitnum & 7)))
  509. node->rightnode = LEAF;
  510. else
  511. node->leftnode = LEAF;
  512. if (node->nextbyte)
  513. break;
  514. if (node->leftnode == NODE || node->rightnode == NODE)
  515. break;
  516. assert(node->left);
  517. assert(node->right);
  518. /* Compare */
  519. if (! tree->leaf_equal(node->left, node->right))
  520. break;
  521. /* Keep left, drop right leaf. */
  522. leaf = node->left;
  523. /* Check in parent */
  524. parent = node->parent;
  525. if (!parent) {
  526. /* root of tree! */
  527. tree->root = leaf;
  528. tree->childnode = LEAF;
  529. } else if (parent->left == node) {
  530. parent->left = leaf;
  531. parent->leftnode = LEAF;
  532. if (parent->right) {
  533. parent->keymask = 0;
  534. parent->keybits = 0;
  535. } else {
  536. parent->keymask |= (1 << node->bitnum);
  537. }
  538. } else if (parent->right == node) {
  539. parent->right = leaf;
  540. parent->rightnode = LEAF;
  541. if (parent->left) {
  542. parent->keymask = 0;
  543. parent->keybits = 0;
  544. } else {
  545. parent->keymask |= (1 << node->bitnum);
  546. parent->keybits |= (1 << node->bitnum);
  547. }
  548. } else {
  549. /* internal tree error */
  550. assert(0);
  551. }
  552. free(node);
  553. node = parent;
  554. }
  555. /* Propagate keymasks up along singleton chains. */
  556. while (node) {
  557. parent = node->parent;
  558. if (!parent)
  559. break;
  560. /* Nix the mask for parents with two children. */
  561. if (node->keymask == 0) {
  562. parent->keymask = 0;
  563. parent->keybits = 0;
  564. } else if (parent->left && parent->right) {
  565. parent->keymask = 0;
  566. parent->keybits = 0;
  567. } else {
  568. assert((parent->keymask & node->keymask) == 0);
  569. parent->keymask |= node->keymask;
  570. parent->keymask |= (1 << parent->bitnum);
  571. parent->keybits |= node->keybits;
  572. if (parent->right)
  573. parent->keybits |= (1 << parent->bitnum);
  574. }
  575. node = parent;
  576. }
  577. return 0;
  578. }
  579. /*
  580. * Prune internal nodes.
  581. *
  582. * Fully populated subtrees that end at the same leaf have already
  583. * been collapsed. There are still internal nodes that have for both
  584. * their left and right branches a sequence of singletons that make
  585. * identical choices and end in identical leaves. The keymask and
  586. * keybits collected in the nodes describe the choices made in these
  587. * singleton chains. When they are identical for the left and right
  588. * branch of a node, and the two leaves comare identical, the node in
  589. * question can be removed.
  590. *
  591. * Note that nodes with the nextbyte tag set will not be removed by
  592. * this to ensure tree integrity. Note as well that the structure of
  593. * utf8 ensures that these nodes would not have been candidates for
  594. * removal in any case.
  595. */
  596. static void prune(struct tree *tree)
  597. {
  598. struct node *node;
  599. struct node *left;
  600. struct node *right;
  601. struct node *parent;
  602. void *leftleaf;
  603. void *rightleaf;
  604. unsigned int leftmask;
  605. unsigned int rightmask;
  606. unsigned int bitmask;
  607. int count;
  608. if (verbose > 0)
  609. printf("Pruning %s_%x\n", tree->type, tree->maxage);
  610. count = 0;
  611. if (tree->childnode == LEAF)
  612. return;
  613. if (!tree->root)
  614. return;
  615. leftmask = rightmask = 0;
  616. node = tree->root;
  617. while (node) {
  618. if (node->nextbyte)
  619. goto advance;
  620. if (node->leftnode == LEAF)
  621. goto advance;
  622. if (node->rightnode == LEAF)
  623. goto advance;
  624. if (!node->left)
  625. goto advance;
  626. if (!node->right)
  627. goto advance;
  628. left = node->left;
  629. right = node->right;
  630. if (left->keymask == 0)
  631. goto advance;
  632. if (right->keymask == 0)
  633. goto advance;
  634. if (left->keymask != right->keymask)
  635. goto advance;
  636. if (left->keybits != right->keybits)
  637. goto advance;
  638. leftleaf = NULL;
  639. while (!leftleaf) {
  640. assert(left->left || left->right);
  641. if (left->leftnode == LEAF)
  642. leftleaf = left->left;
  643. else if (left->rightnode == LEAF)
  644. leftleaf = left->right;
  645. else if (left->left)
  646. left = left->left;
  647. else if (left->right)
  648. left = left->right;
  649. else
  650. assert(0);
  651. }
  652. rightleaf = NULL;
  653. while (!rightleaf) {
  654. assert(right->left || right->right);
  655. if (right->leftnode == LEAF)
  656. rightleaf = right->left;
  657. else if (right->rightnode == LEAF)
  658. rightleaf = right->right;
  659. else if (right->left)
  660. right = right->left;
  661. else if (right->right)
  662. right = right->right;
  663. else
  664. assert(0);
  665. }
  666. if (! tree->leaf_equal(leftleaf, rightleaf))
  667. goto advance;
  668. /*
  669. * This node has identical singleton-only subtrees.
  670. * Remove it.
  671. */
  672. parent = node->parent;
  673. left = node->left;
  674. right = node->right;
  675. if (parent->left == node)
  676. parent->left = left;
  677. else if (parent->right == node)
  678. parent->right = left;
  679. else
  680. assert(0);
  681. left->parent = parent;
  682. left->keymask |= (1 << node->bitnum);
  683. node->left = NULL;
  684. while (node) {
  685. bitmask = 1 << node->bitnum;
  686. leftmask &= ~bitmask;
  687. rightmask &= ~bitmask;
  688. if (node->leftnode == NODE && node->left) {
  689. left = node->left;
  690. free(node);
  691. count++;
  692. node = left;
  693. } else if (node->rightnode == NODE && node->right) {
  694. right = node->right;
  695. free(node);
  696. count++;
  697. node = right;
  698. } else {
  699. node = NULL;
  700. }
  701. }
  702. /* Propagate keymasks up along singleton chains. */
  703. node = parent;
  704. /* Force re-check */
  705. bitmask = 1 << node->bitnum;
  706. leftmask &= ~bitmask;
  707. rightmask &= ~bitmask;
  708. for (;;) {
  709. if (node->left && node->right)
  710. break;
  711. if (node->left) {
  712. left = node->left;
  713. node->keymask |= left->keymask;
  714. node->keybits |= left->keybits;
  715. }
  716. if (node->right) {
  717. right = node->right;
  718. node->keymask |= right->keymask;
  719. node->keybits |= right->keybits;
  720. }
  721. node->keymask |= (1 << node->bitnum);
  722. node = node->parent;
  723. /* Force re-check */
  724. bitmask = 1 << node->bitnum;
  725. leftmask &= ~bitmask;
  726. rightmask &= ~bitmask;
  727. }
  728. advance:
  729. bitmask = 1 << node->bitnum;
  730. if ((leftmask & bitmask) == 0 &&
  731. node->leftnode == NODE &&
  732. node->left) {
  733. leftmask |= bitmask;
  734. node = node->left;
  735. } else if ((rightmask & bitmask) == 0 &&
  736. node->rightnode == NODE &&
  737. node->right) {
  738. rightmask |= bitmask;
  739. node = node->right;
  740. } else {
  741. leftmask &= ~bitmask;
  742. rightmask &= ~bitmask;
  743. node = node->parent;
  744. }
  745. }
  746. if (verbose > 0)
  747. printf("Pruned %d nodes\n", count);
  748. }
  749. /*
  750. * Mark the nodes in the tree that lead to leaves that must be
  751. * emitted.
  752. */
  753. static void mark_nodes(struct tree *tree)
  754. {
  755. struct node *node;
  756. struct node *n;
  757. unsigned int leftmask;
  758. unsigned int rightmask;
  759. unsigned int bitmask;
  760. int marked;
  761. marked = 0;
  762. if (verbose > 0)
  763. printf("Marking %s_%x\n", tree->type, tree->maxage);
  764. if (tree->childnode == LEAF)
  765. goto done;
  766. assert(tree->childnode == NODE);
  767. node = tree->root;
  768. leftmask = rightmask = 0;
  769. while (node) {
  770. bitmask = 1 << node->bitnum;
  771. if ((leftmask & bitmask) == 0) {
  772. leftmask |= bitmask;
  773. if (node->leftnode == LEAF) {
  774. assert(node->left);
  775. if (tree->leaf_mark(node->left)) {
  776. n = node;
  777. while (n && !n->mark) {
  778. marked++;
  779. n->mark = 1;
  780. n = n->parent;
  781. }
  782. }
  783. } else if (node->left) {
  784. assert(node->leftnode == NODE);
  785. node = node->left;
  786. continue;
  787. }
  788. }
  789. if ((rightmask & bitmask) == 0) {
  790. rightmask |= bitmask;
  791. if (node->rightnode == LEAF) {
  792. assert(node->right);
  793. if (tree->leaf_mark(node->right)) {
  794. n = node;
  795. while (n && !n->mark) {
  796. marked++;
  797. n->mark = 1;
  798. n = n->parent;
  799. }
  800. }
  801. } else if (node->right) {
  802. assert(node->rightnode == NODE);
  803. node = node->right;
  804. continue;
  805. }
  806. }
  807. leftmask &= ~bitmask;
  808. rightmask &= ~bitmask;
  809. node = node->parent;
  810. }
  811. /* second pass: left siblings and singletons */
  812. assert(tree->childnode == NODE);
  813. node = tree->root;
  814. leftmask = rightmask = 0;
  815. while (node) {
  816. bitmask = 1 << node->bitnum;
  817. if ((leftmask & bitmask) == 0) {
  818. leftmask |= bitmask;
  819. if (node->leftnode == LEAF) {
  820. assert(node->left);
  821. if (tree->leaf_mark(node->left)) {
  822. n = node;
  823. while (n && !n->mark) {
  824. marked++;
  825. n->mark = 1;
  826. n = n->parent;
  827. }
  828. }
  829. } else if (node->left) {
  830. assert(node->leftnode == NODE);
  831. node = node->left;
  832. if (!node->mark && node->parent->mark) {
  833. marked++;
  834. node->mark = 1;
  835. }
  836. continue;
  837. }
  838. }
  839. if ((rightmask & bitmask) == 0) {
  840. rightmask |= bitmask;
  841. if (node->rightnode == LEAF) {
  842. assert(node->right);
  843. if (tree->leaf_mark(node->right)) {
  844. n = node;
  845. while (n && !n->mark) {
  846. marked++;
  847. n->mark = 1;
  848. n = n->parent;
  849. }
  850. }
  851. } else if (node->right) {
  852. assert(node->rightnode == NODE);
  853. node = node->right;
  854. if (!node->mark && node->parent->mark &&
  855. !node->parent->left) {
  856. marked++;
  857. node->mark = 1;
  858. }
  859. continue;
  860. }
  861. }
  862. leftmask &= ~bitmask;
  863. rightmask &= ~bitmask;
  864. node = node->parent;
  865. }
  866. done:
  867. if (verbose > 0)
  868. printf("Marked %d nodes\n", marked);
  869. }
  870. /*
  871. * Compute the index of each node and leaf, which is the offset in the
  872. * emitted trie. These values must be pre-computed because relative
  873. * offsets between nodes are used to navigate the tree.
  874. */
  875. static int index_nodes(struct tree *tree, int index)
  876. {
  877. struct node *node;
  878. unsigned int leftmask;
  879. unsigned int rightmask;
  880. unsigned int bitmask;
  881. int count;
  882. int indent;
  883. /* Align to a cache line (or half a cache line?). */
  884. while (index % 64)
  885. index++;
  886. tree->index = index;
  887. indent = 1;
  888. count = 0;
  889. if (verbose > 0)
  890. printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
  891. if (tree->childnode == LEAF) {
  892. index += tree->leaf_size(tree->root);
  893. goto done;
  894. }
  895. assert(tree->childnode == NODE);
  896. node = tree->root;
  897. leftmask = rightmask = 0;
  898. while (node) {
  899. if (!node->mark)
  900. goto skip;
  901. count++;
  902. if (node->index != index)
  903. node->index = index;
  904. index += node->size;
  905. skip:
  906. while (node) {
  907. bitmask = 1 << node->bitnum;
  908. if (node->mark && (leftmask & bitmask) == 0) {
  909. leftmask |= bitmask;
  910. if (node->leftnode == LEAF) {
  911. assert(node->left);
  912. *tree->leaf_index(tree, node->left) =
  913. index;
  914. index += tree->leaf_size(node->left);
  915. count++;
  916. } else if (node->left) {
  917. assert(node->leftnode == NODE);
  918. indent += 1;
  919. node = node->left;
  920. break;
  921. }
  922. }
  923. if (node->mark && (rightmask & bitmask) == 0) {
  924. rightmask |= bitmask;
  925. if (node->rightnode == LEAF) {
  926. assert(node->right);
  927. *tree->leaf_index(tree, node->right) = index;
  928. index += tree->leaf_size(node->right);
  929. count++;
  930. } else if (node->right) {
  931. assert(node->rightnode == NODE);
  932. indent += 1;
  933. node = node->right;
  934. break;
  935. }
  936. }
  937. leftmask &= ~bitmask;
  938. rightmask &= ~bitmask;
  939. node = node->parent;
  940. indent -= 1;
  941. }
  942. }
  943. done:
  944. /* Round up to a multiple of 16 */
  945. while (index % 16)
  946. index++;
  947. if (verbose > 0)
  948. printf("Final index %d\n", index);
  949. return index;
  950. }
  951. /*
  952. * Mark the nodes in a subtree, helper for size_nodes().
  953. */
  954. static int mark_subtree(struct node *node)
  955. {
  956. int changed;
  957. if (!node || node->mark)
  958. return 0;
  959. node->mark = 1;
  960. node->index = node->parent->index;
  961. changed = 1;
  962. if (node->leftnode == NODE)
  963. changed += mark_subtree(node->left);
  964. if (node->rightnode == NODE)
  965. changed += mark_subtree(node->right);
  966. return changed;
  967. }
  968. /*
  969. * Compute the size of nodes and leaves. We start by assuming that
  970. * each node needs to store a three-byte offset. The indexes of the
  971. * nodes are calculated based on that, and then this function is
  972. * called to see if the sizes of some nodes can be reduced. This is
  973. * repeated until no more changes are seen.
  974. */
  975. static int size_nodes(struct tree *tree)
  976. {
  977. struct tree *next;
  978. struct node *node;
  979. struct node *right;
  980. struct node *n;
  981. unsigned int leftmask;
  982. unsigned int rightmask;
  983. unsigned int bitmask;
  984. unsigned int pathbits;
  985. unsigned int pathmask;
  986. unsigned int nbit;
  987. int changed;
  988. int offset;
  989. int size;
  990. int indent;
  991. indent = 1;
  992. changed = 0;
  993. size = 0;
  994. if (verbose > 0)
  995. printf("Sizing %s_%x\n", tree->type, tree->maxage);
  996. if (tree->childnode == LEAF)
  997. goto done;
  998. assert(tree->childnode == NODE);
  999. pathbits = 0;
  1000. pathmask = 0;
  1001. node = tree->root;
  1002. leftmask = rightmask = 0;
  1003. while (node) {
  1004. if (!node->mark)
  1005. goto skip;
  1006. offset = 0;
  1007. if (!node->left || !node->right) {
  1008. size = 1;
  1009. } else {
  1010. if (node->rightnode == NODE) {
  1011. /*
  1012. * If the right node is not marked,
  1013. * look for a corresponding node in
  1014. * the next tree. Such a node need
  1015. * not exist.
  1016. */
  1017. right = node->right;
  1018. next = tree->next;
  1019. while (!right->mark) {
  1020. assert(next);
  1021. n = next->root;
  1022. while (n->bitnum != node->bitnum) {
  1023. nbit = 1 << n->bitnum;
  1024. if (!(pathmask & nbit))
  1025. break;
  1026. if (pathbits & nbit) {
  1027. if (n->rightnode == LEAF)
  1028. break;
  1029. n = n->right;
  1030. } else {
  1031. if (n->leftnode == LEAF)
  1032. break;
  1033. n = n->left;
  1034. }
  1035. }
  1036. if (n->bitnum != node->bitnum)
  1037. break;
  1038. n = n->right;
  1039. right = n;
  1040. next = next->next;
  1041. }
  1042. /* Make sure the right node is marked. */
  1043. if (!right->mark)
  1044. changed += mark_subtree(right);
  1045. offset = right->index - node->index;
  1046. } else {
  1047. offset = *tree->leaf_index(tree, node->right);
  1048. offset -= node->index;
  1049. }
  1050. assert(offset >= 0);
  1051. assert(offset <= 0xffffff);
  1052. if (offset <= 0xff) {
  1053. size = 2;
  1054. } else if (offset <= 0xffff) {
  1055. size = 3;
  1056. } else { /* offset <= 0xffffff */
  1057. size = 4;
  1058. }
  1059. }
  1060. if (node->size != size || node->offset != offset) {
  1061. node->size = size;
  1062. node->offset = offset;
  1063. changed++;
  1064. }
  1065. skip:
  1066. while (node) {
  1067. bitmask = 1 << node->bitnum;
  1068. pathmask |= bitmask;
  1069. if (node->mark && (leftmask & bitmask) == 0) {
  1070. leftmask |= bitmask;
  1071. if (node->leftnode == LEAF) {
  1072. assert(node->left);
  1073. } else if (node->left) {
  1074. assert(node->leftnode == NODE);
  1075. indent += 1;
  1076. node = node->left;
  1077. break;
  1078. }
  1079. }
  1080. if (node->mark && (rightmask & bitmask) == 0) {
  1081. rightmask |= bitmask;
  1082. pathbits |= bitmask;
  1083. if (node->rightnode == LEAF) {
  1084. assert(node->right);
  1085. } else if (node->right) {
  1086. assert(node->rightnode == NODE);
  1087. indent += 1;
  1088. node = node->right;
  1089. break;
  1090. }
  1091. }
  1092. leftmask &= ~bitmask;
  1093. rightmask &= ~bitmask;
  1094. pathmask &= ~bitmask;
  1095. pathbits &= ~bitmask;
  1096. node = node->parent;
  1097. indent -= 1;
  1098. }
  1099. }
  1100. done:
  1101. if (verbose > 0)
  1102. printf("Found %d changes\n", changed);
  1103. return changed;
  1104. }
  1105. /*
  1106. * Emit a trie for the given tree into the data array.
  1107. */
  1108. static void emit(struct tree *tree, unsigned char *data)
  1109. {
  1110. struct node *node;
  1111. unsigned int leftmask;
  1112. unsigned int rightmask;
  1113. unsigned int bitmask;
  1114. int offlen;
  1115. int offset;
  1116. int index;
  1117. int indent;
  1118. int size;
  1119. int bytes;
  1120. int leaves;
  1121. int nodes[4];
  1122. unsigned char byte;
  1123. nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
  1124. leaves = 0;
  1125. bytes = 0;
  1126. index = tree->index;
  1127. data += index;
  1128. indent = 1;
  1129. if (verbose > 0)
  1130. printf("Emitting %s_%x\n", tree->type, tree->maxage);
  1131. if (tree->childnode == LEAF) {
  1132. assert(tree->root);
  1133. tree->leaf_emit(tree->root, data);
  1134. size = tree->leaf_size(tree->root);
  1135. index += size;
  1136. leaves++;
  1137. goto done;
  1138. }
  1139. assert(tree->childnode == NODE);
  1140. node = tree->root;
  1141. leftmask = rightmask = 0;
  1142. while (node) {
  1143. if (!node->mark)
  1144. goto skip;
  1145. assert(node->offset != -1);
  1146. assert(node->index == index);
  1147. byte = 0;
  1148. if (node->nextbyte)
  1149. byte |= NEXTBYTE;
  1150. byte |= (node->bitnum & BITNUM);
  1151. if (node->left && node->right) {
  1152. if (node->leftnode == NODE)
  1153. byte |= LEFTNODE;
  1154. if (node->rightnode == NODE)
  1155. byte |= RIGHTNODE;
  1156. if (node->offset <= 0xff)
  1157. offlen = 1;
  1158. else if (node->offset <= 0xffff)
  1159. offlen = 2;
  1160. else
  1161. offlen = 3;
  1162. nodes[offlen]++;
  1163. offset = node->offset;
  1164. byte |= offlen << OFFLEN_SHIFT;
  1165. *data++ = byte;
  1166. index++;
  1167. while (offlen--) {
  1168. *data++ = offset & 0xff;
  1169. index++;
  1170. offset >>= 8;
  1171. }
  1172. } else if (node->left) {
  1173. if (node->leftnode == NODE)
  1174. byte |= TRIENODE;
  1175. nodes[0]++;
  1176. *data++ = byte;
  1177. index++;
  1178. } else if (node->right) {
  1179. byte |= RIGHTNODE;
  1180. if (node->rightnode == NODE)
  1181. byte |= TRIENODE;
  1182. nodes[0]++;
  1183. *data++ = byte;
  1184. index++;
  1185. } else {
  1186. assert(0);
  1187. }
  1188. skip:
  1189. while (node) {
  1190. bitmask = 1 << node->bitnum;
  1191. if (node->mark && (leftmask & bitmask) == 0) {
  1192. leftmask |= bitmask;
  1193. if (node->leftnode == LEAF) {
  1194. assert(node->left);
  1195. data = tree->leaf_emit(node->left,
  1196. data);
  1197. size = tree->leaf_size(node->left);
  1198. index += size;
  1199. bytes += size;
  1200. leaves++;
  1201. } else if (node->left) {
  1202. assert(node->leftnode == NODE);
  1203. indent += 1;
  1204. node = node->left;
  1205. break;
  1206. }
  1207. }
  1208. if (node->mark && (rightmask & bitmask) == 0) {
  1209. rightmask |= bitmask;
  1210. if (node->rightnode == LEAF) {
  1211. assert(node->right);
  1212. data = tree->leaf_emit(node->right,
  1213. data);
  1214. size = tree->leaf_size(node->right);
  1215. index += size;
  1216. bytes += size;
  1217. leaves++;
  1218. } else if (node->right) {
  1219. assert(node->rightnode == NODE);
  1220. indent += 1;
  1221. node = node->right;
  1222. break;
  1223. }
  1224. }
  1225. leftmask &= ~bitmask;
  1226. rightmask &= ~bitmask;
  1227. node = node->parent;
  1228. indent -= 1;
  1229. }
  1230. }
  1231. done:
  1232. if (verbose > 0) {
  1233. printf("Emitted %d (%d) leaves",
  1234. leaves, bytes);
  1235. printf(" %d (%d+%d+%d+%d) nodes",
  1236. nodes[0] + nodes[1] + nodes[2] + nodes[3],
  1237. nodes[0], nodes[1], nodes[2], nodes[3]);
  1238. printf(" %d total\n", index - tree->index);
  1239. }
  1240. }
  1241. /* ------------------------------------------------------------------ */
  1242. /*
  1243. * Unicode data.
  1244. *
  1245. * We need to keep track of the Canonical Combining Class, the Age,
  1246. * and decompositions for a code point.
  1247. *
  1248. * For the Age, we store the index into the ages table. Effectively
  1249. * this is a generation number that the table maps to a unicode
  1250. * version.
  1251. *
  1252. * The correction field is used to indicate that this entry is in the
  1253. * corrections array, which contains decompositions that were
  1254. * corrected in later revisions. The value of the correction field is
  1255. * the Unicode version in which the mapping was corrected.
  1256. */
  1257. struct unicode_data {
  1258. unsigned int code;
  1259. int ccc;
  1260. int gen;
  1261. int correction;
  1262. unsigned int *utf32nfdi;
  1263. unsigned int *utf32nfdicf;
  1264. char *utf8nfdi;
  1265. char *utf8nfdicf;
  1266. };
  1267. struct unicode_data unicode_data[0x110000];
  1268. struct unicode_data *corrections;
  1269. int corrections_count;
  1270. struct tree *nfdi_tree;
  1271. struct tree *nfdicf_tree;
  1272. struct tree *trees;
  1273. int trees_count;
  1274. /*
  1275. * Check the corrections array to see if this entry was corrected at
  1276. * some point.
  1277. */
  1278. static struct unicode_data *corrections_lookup(struct unicode_data *u)
  1279. {
  1280. int i;
  1281. for (i = 0; i != corrections_count; i++)
  1282. if (u->code == corrections[i].code)
  1283. return &corrections[i];
  1284. return u;
  1285. }
  1286. static int nfdi_equal(void *l, void *r)
  1287. {
  1288. struct unicode_data *left = l;
  1289. struct unicode_data *right = r;
  1290. if (left->gen != right->gen)
  1291. return 0;
  1292. if (left->ccc != right->ccc)
  1293. return 0;
  1294. if (left->utf8nfdi && right->utf8nfdi &&
  1295. strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
  1296. return 1;
  1297. if (left->utf8nfdi || right->utf8nfdi)
  1298. return 0;
  1299. return 1;
  1300. }
  1301. static int nfdicf_equal(void *l, void *r)
  1302. {
  1303. struct unicode_data *left = l;
  1304. struct unicode_data *right = r;
  1305. if (left->gen != right->gen)
  1306. return 0;
  1307. if (left->ccc != right->ccc)
  1308. return 0;
  1309. if (left->utf8nfdicf && right->utf8nfdicf &&
  1310. strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0)
  1311. return 1;
  1312. if (left->utf8nfdicf && right->utf8nfdicf)
  1313. return 0;
  1314. if (left->utf8nfdicf || right->utf8nfdicf)
  1315. return 0;
  1316. if (left->utf8nfdi && right->utf8nfdi &&
  1317. strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
  1318. return 1;
  1319. if (left->utf8nfdi || right->utf8nfdi)
  1320. return 0;
  1321. return 1;
  1322. }
  1323. static void nfdi_print(void *l, int indent)
  1324. {
  1325. struct unicode_data *leaf = l;
  1326. printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
  1327. leaf->code, leaf->ccc, leaf->gen);
  1328. if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
  1329. printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
  1330. else if (leaf->utf8nfdi)
  1331. printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
  1332. printf("\n");
  1333. }
  1334. static void nfdicf_print(void *l, int indent)
  1335. {
  1336. struct unicode_data *leaf = l;
  1337. printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
  1338. leaf->code, leaf->ccc, leaf->gen);
  1339. if (leaf->utf8nfdicf)
  1340. printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf);
  1341. else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
  1342. printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
  1343. else if (leaf->utf8nfdi)
  1344. printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
  1345. printf("\n");
  1346. }
  1347. static int nfdi_mark(void *l)
  1348. {
  1349. return 1;
  1350. }
  1351. static int nfdicf_mark(void *l)
  1352. {
  1353. struct unicode_data *leaf = l;
  1354. if (leaf->utf8nfdicf)
  1355. return 1;
  1356. return 0;
  1357. }
  1358. static int correction_mark(void *l)
  1359. {
  1360. struct unicode_data *leaf = l;
  1361. return leaf->correction;
  1362. }
  1363. static int nfdi_size(void *l)
  1364. {
  1365. struct unicode_data *leaf = l;
  1366. int size = 2;
  1367. if (HANGUL_SYLLABLE(leaf->code))
  1368. size += 1;
  1369. else if (leaf->utf8nfdi)
  1370. size += strlen(leaf->utf8nfdi) + 1;
  1371. return size;
  1372. }
  1373. static int nfdicf_size(void *l)
  1374. {
  1375. struct unicode_data *leaf = l;
  1376. int size = 2;
  1377. if (HANGUL_SYLLABLE(leaf->code))
  1378. size += 1;
  1379. else if (leaf->utf8nfdicf)
  1380. size += strlen(leaf->utf8nfdicf) + 1;
  1381. else if (leaf->utf8nfdi)
  1382. size += strlen(leaf->utf8nfdi) + 1;
  1383. return size;
  1384. }
  1385. static int *nfdi_index(struct tree *tree, void *l)
  1386. {
  1387. struct unicode_data *leaf = l;
  1388. return &tree->leafindex[leaf->code];
  1389. }
  1390. static int *nfdicf_index(struct tree *tree, void *l)
  1391. {
  1392. struct unicode_data *leaf = l;
  1393. return &tree->leafindex[leaf->code];
  1394. }
  1395. static unsigned char *nfdi_emit(void *l, unsigned char *data)
  1396. {
  1397. struct unicode_data *leaf = l;
  1398. unsigned char *s;
  1399. *data++ = leaf->gen;
  1400. if (HANGUL_SYLLABLE(leaf->code)) {
  1401. *data++ = DECOMPOSE;
  1402. *data++ = HANGUL;
  1403. } else if (leaf->utf8nfdi) {
  1404. *data++ = DECOMPOSE;
  1405. s = (unsigned char*)leaf->utf8nfdi;
  1406. while ((*data++ = *s++) != 0)
  1407. ;
  1408. } else {
  1409. *data++ = leaf->ccc;
  1410. }
  1411. return data;
  1412. }
  1413. static unsigned char *nfdicf_emit(void *l, unsigned char *data)
  1414. {
  1415. struct unicode_data *leaf = l;
  1416. unsigned char *s;
  1417. *data++ = leaf->gen;
  1418. if (HANGUL_SYLLABLE(leaf->code)) {
  1419. *data++ = DECOMPOSE;
  1420. *data++ = HANGUL;
  1421. } else if (leaf->utf8nfdicf) {
  1422. *data++ = DECOMPOSE;
  1423. s = (unsigned char*)leaf->utf8nfdicf;
  1424. while ((*data++ = *s++) != 0)
  1425. ;
  1426. } else if (leaf->utf8nfdi) {
  1427. *data++ = DECOMPOSE;
  1428. s = (unsigned char*)leaf->utf8nfdi;
  1429. while ((*data++ = *s++) != 0)
  1430. ;
  1431. } else {
  1432. *data++ = leaf->ccc;
  1433. }
  1434. return data;
  1435. }
  1436. static void utf8_create(struct unicode_data *data)
  1437. {
  1438. char utf[18*4+1];
  1439. char *u;
  1440. unsigned int *um;
  1441. int i;
  1442. if (data->utf8nfdi) {
  1443. assert(data->utf8nfdi[0] == HANGUL);
  1444. return;
  1445. }
  1446. u = utf;
  1447. um = data->utf32nfdi;
  1448. if (um) {
  1449. for (i = 0; um[i]; i++)
  1450. u += utf8encode(u, um[i]);
  1451. *u = '\0';
  1452. data->utf8nfdi = strdup(utf);
  1453. }
  1454. u = utf;
  1455. um = data->utf32nfdicf;
  1456. if (um) {
  1457. for (i = 0; um[i]; i++)
  1458. u += utf8encode(u, um[i]);
  1459. *u = '\0';
  1460. if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf))
  1461. data->utf8nfdicf = strdup(utf);
  1462. }
  1463. }
  1464. static void utf8_init(void)
  1465. {
  1466. unsigned int unichar;
  1467. int i;
  1468. for (unichar = 0; unichar != 0x110000; unichar++)
  1469. utf8_create(&unicode_data[unichar]);
  1470. for (i = 0; i != corrections_count; i++)
  1471. utf8_create(&corrections[i]);
  1472. }
  1473. static void trees_init(void)
  1474. {
  1475. struct unicode_data *data;
  1476. unsigned int maxage;
  1477. unsigned int nextage;
  1478. int count;
  1479. int i;
  1480. int j;
  1481. /* Count the number of different ages. */
  1482. count = 0;
  1483. nextage = (unsigned int)-1;
  1484. do {
  1485. maxage = nextage;
  1486. nextage = 0;
  1487. for (i = 0; i <= corrections_count; i++) {
  1488. data = &corrections[i];
  1489. if (nextage < data->correction &&
  1490. data->correction < maxage)
  1491. nextage = data->correction;
  1492. }
  1493. count++;
  1494. } while (nextage);
  1495. /* Two trees per age: nfdi and nfdicf */
  1496. trees_count = count * 2;
  1497. trees = calloc(trees_count, sizeof(struct tree));
  1498. /* Assign ages to the trees. */
  1499. count = trees_count;
  1500. nextage = (unsigned int)-1;
  1501. do {
  1502. maxage = nextage;
  1503. trees[--count].maxage = maxage;
  1504. trees[--count].maxage = maxage;
  1505. nextage = 0;
  1506. for (i = 0; i <= corrections_count; i++) {
  1507. data = &corrections[i];
  1508. if (nextage < data->correction &&
  1509. data->correction < maxage)
  1510. nextage = data->correction;
  1511. }
  1512. } while (nextage);
  1513. /* The ages assigned above are off by one. */
  1514. for (i = 0; i != trees_count; i++) {
  1515. j = 0;
  1516. while (ages[j] < trees[i].maxage)
  1517. j++;
  1518. trees[i].maxage = ages[j-1];
  1519. }
  1520. /* Set up the forwarding between trees. */
  1521. trees[trees_count-2].next = &trees[trees_count-1];
  1522. trees[trees_count-1].leaf_mark = nfdi_mark;
  1523. trees[trees_count-2].leaf_mark = nfdicf_mark;
  1524. for (i = 0; i != trees_count-2; i += 2) {
  1525. trees[i].next = &trees[trees_count-2];
  1526. trees[i].leaf_mark = correction_mark;
  1527. trees[i+1].next = &trees[trees_count-1];
  1528. trees[i+1].leaf_mark = correction_mark;
  1529. }
  1530. /* Assign the callouts. */
  1531. for (i = 0; i != trees_count; i += 2) {
  1532. trees[i].type = "nfdicf";
  1533. trees[i].leaf_equal = nfdicf_equal;
  1534. trees[i].leaf_print = nfdicf_print;
  1535. trees[i].leaf_size = nfdicf_size;
  1536. trees[i].leaf_index = nfdicf_index;
  1537. trees[i].leaf_emit = nfdicf_emit;
  1538. trees[i+1].type = "nfdi";
  1539. trees[i+1].leaf_equal = nfdi_equal;
  1540. trees[i+1].leaf_print = nfdi_print;
  1541. trees[i+1].leaf_size = nfdi_size;
  1542. trees[i+1].leaf_index = nfdi_index;
  1543. trees[i+1].leaf_emit = nfdi_emit;
  1544. }
  1545. /* Finish init. */
  1546. for (i = 0; i != trees_count; i++)
  1547. trees[i].childnode = NODE;
  1548. }
  1549. static void trees_populate(void)
  1550. {
  1551. struct unicode_data *data;
  1552. unsigned int unichar;
  1553. char keyval[4];
  1554. int keylen;
  1555. int i;
  1556. for (i = 0; i != trees_count; i++) {
  1557. if (verbose > 0) {
  1558. printf("Populating %s_%x\n",
  1559. trees[i].type, trees[i].maxage);
  1560. }
  1561. for (unichar = 0; unichar != 0x110000; unichar++) {
  1562. if (unicode_data[unichar].gen < 0)
  1563. continue;
  1564. keylen = utf8encode(keyval, unichar);
  1565. data = corrections_lookup(&unicode_data[unichar]);
  1566. if (data->correction <= trees[i].maxage)
  1567. data = &unicode_data[unichar];
  1568. insert(&trees[i], keyval, keylen, data);
  1569. }
  1570. }
  1571. }
  1572. static void trees_reduce(void)
  1573. {
  1574. int i;
  1575. int size;
  1576. int changed;
  1577. for (i = 0; i != trees_count; i++)
  1578. prune(&trees[i]);
  1579. for (i = 0; i != trees_count; i++)
  1580. mark_nodes(&trees[i]);
  1581. do {
  1582. size = 0;
  1583. for (i = 0; i != trees_count; i++)
  1584. size = index_nodes(&trees[i], size);
  1585. changed = 0;
  1586. for (i = 0; i != trees_count; i++)
  1587. changed += size_nodes(&trees[i]);
  1588. } while (changed);
  1589. utf8data = calloc(size, 1);
  1590. utf8data_size = size;
  1591. for (i = 0; i != trees_count; i++)
  1592. emit(&trees[i], utf8data);
  1593. if (verbose > 0) {
  1594. for (i = 0; i != trees_count; i++) {
  1595. printf("%s_%x idx %d\n",
  1596. trees[i].type, trees[i].maxage, trees[i].index);
  1597. }
  1598. }
  1599. nfdi = utf8data + trees[trees_count-1].index;
  1600. nfdicf = utf8data + trees[trees_count-2].index;
  1601. nfdi_tree = &trees[trees_count-1];
  1602. nfdicf_tree = &trees[trees_count-2];
  1603. }
  1604. static void verify(struct tree *tree)
  1605. {
  1606. struct unicode_data *data;
  1607. utf8leaf_t *leaf;
  1608. unsigned int unichar;
  1609. char key[4];
  1610. unsigned char hangul[UTF8HANGULLEAF];
  1611. int report;
  1612. int nocf;
  1613. if (verbose > 0)
  1614. printf("Verifying %s_%x\n", tree->type, tree->maxage);
  1615. nocf = strcmp(tree->type, "nfdicf");
  1616. for (unichar = 0; unichar != 0x110000; unichar++) {
  1617. report = 0;
  1618. data = corrections_lookup(&unicode_data[unichar]);
  1619. if (data->correction <= tree->maxage)
  1620. data = &unicode_data[unichar];
  1621. utf8encode(key,unichar);
  1622. leaf = utf8lookup(tree, hangul, key);
  1623. if (!leaf) {
  1624. if (data->gen != -1)
  1625. report++;
  1626. if (unichar < 0xd800 || unichar > 0xdfff)
  1627. report++;
  1628. } else {
  1629. if (unichar >= 0xd800 && unichar <= 0xdfff)
  1630. report++;
  1631. if (data->gen == -1)
  1632. report++;
  1633. if (data->gen != LEAF_GEN(leaf))
  1634. report++;
  1635. if (LEAF_CCC(leaf) == DECOMPOSE) {
  1636. if (HANGUL_SYLLABLE(data->code)) {
  1637. if (data->utf8nfdi[0] != HANGUL)
  1638. report++;
  1639. } else if (nocf) {
  1640. if (!data->utf8nfdi) {
  1641. report++;
  1642. } else if (strcmp(data->utf8nfdi,
  1643. LEAF_STR(leaf))) {
  1644. report++;
  1645. }
  1646. } else {
  1647. if (!data->utf8nfdicf &&
  1648. !data->utf8nfdi) {
  1649. report++;
  1650. } else if (data->utf8nfdicf) {
  1651. if (strcmp(data->utf8nfdicf,
  1652. LEAF_STR(leaf)))
  1653. report++;
  1654. } else if (strcmp(data->utf8nfdi,
  1655. LEAF_STR(leaf))) {
  1656. report++;
  1657. }
  1658. }
  1659. } else if (data->ccc != LEAF_CCC(leaf)) {
  1660. report++;
  1661. }
  1662. }
  1663. if (report) {
  1664. printf("%X code %X gen %d ccc %d"
  1665. " nfdi -> \"%s\"",
  1666. unichar, data->code, data->gen,
  1667. data->ccc,
  1668. data->utf8nfdi);
  1669. if (leaf) {
  1670. printf(" gen %d ccc %d"
  1671. " nfdi -> \"%s\"",
  1672. LEAF_GEN(leaf),
  1673. LEAF_CCC(leaf),
  1674. LEAF_CCC(leaf) == DECOMPOSE ?
  1675. LEAF_STR(leaf) : "");
  1676. }
  1677. printf("\n");
  1678. }
  1679. }
  1680. }
  1681. static void trees_verify(void)
  1682. {
  1683. int i;
  1684. for (i = 0; i != trees_count; i++)
  1685. verify(&trees[i]);
  1686. }
  1687. /* ------------------------------------------------------------------ */
  1688. static void help(void)
  1689. {
  1690. printf("Usage: %s [options]\n", argv0);
  1691. printf("\n");
  1692. printf("This program creates an a data trie used for parsing and\n");
  1693. printf("normalization of UTF-8 strings. The trie is derived from\n");
  1694. printf("a set of input files from the Unicode character database\n");
  1695. printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
  1696. printf("\n");
  1697. printf("The generated tree supports two normalization forms:\n");
  1698. printf("\n");
  1699. printf("\tnfdi:\n");
  1700. printf("\t- Apply unicode normalization form NFD.\n");
  1701. printf("\t- Remove any Default_Ignorable_Code_Point.\n");
  1702. printf("\n");
  1703. printf("\tnfdicf:\n");
  1704. printf("\t- Apply unicode normalization form NFD.\n");
  1705. printf("\t- Remove any Default_Ignorable_Code_Point.\n");
  1706. printf("\t- Apply a full casefold (C + F).\n");
  1707. printf("\n");
  1708. printf("These forms were chosen as being most useful when dealing\n");
  1709. printf("with file names: NFD catches most cases where characters\n");
  1710. printf("should be considered equivalent. The ignorables are mostly\n");
  1711. printf("invisible, making names hard to type.\n");
  1712. printf("\n");
  1713. printf("The options to specify the files to be used are listed\n");
  1714. printf("below with their default values, which are the names used\n");
  1715. printf("by version 11.0.0 of the Unicode Character Database.\n");
  1716. printf("\n");
  1717. printf("The input files:\n");
  1718. printf("\t-a %s\n", AGE_NAME);
  1719. printf("\t-c %s\n", CCC_NAME);
  1720. printf("\t-p %s\n", PROP_NAME);
  1721. printf("\t-d %s\n", DATA_NAME);
  1722. printf("\t-f %s\n", FOLD_NAME);
  1723. printf("\t-n %s\n", NORM_NAME);
  1724. printf("\n");
  1725. printf("Additionally, the generated tables are tested using:\n");
  1726. printf("\t-t %s\n", TEST_NAME);
  1727. printf("\n");
  1728. printf("Finally, the output file:\n");
  1729. printf("\t-o %s\n", UTF8_NAME);
  1730. printf("\n");
  1731. }
  1732. static void usage(void)
  1733. {
  1734. help();
  1735. exit(1);
  1736. }
  1737. static void open_fail(const char *name, int error)
  1738. {
  1739. printf("Error %d opening %s: %s\n", error, name, strerror(error));
  1740. exit(1);
  1741. }
  1742. static void file_fail(const char *filename)
  1743. {
  1744. printf("Error parsing %s\n", filename);
  1745. exit(1);
  1746. }
  1747. static void line_fail(const char *filename, const char *line)
  1748. {
  1749. printf("Error parsing %s:%s\n", filename, line);
  1750. exit(1);
  1751. }
  1752. /* ------------------------------------------------------------------ */
  1753. static void print_utf32(unsigned int *utf32str)
  1754. {
  1755. int i;
  1756. for (i = 0; utf32str[i]; i++)
  1757. printf(" %X", utf32str[i]);
  1758. }
  1759. static void print_utf32nfdi(unsigned int unichar)
  1760. {
  1761. printf(" %X ->", unichar);
  1762. print_utf32(unicode_data[unichar].utf32nfdi);
  1763. printf("\n");
  1764. }
  1765. static void print_utf32nfdicf(unsigned int unichar)
  1766. {
  1767. printf(" %X ->", unichar);
  1768. print_utf32(unicode_data[unichar].utf32nfdicf);
  1769. printf("\n");
  1770. }
  1771. /* ------------------------------------------------------------------ */
  1772. static void age_init(void)
  1773. {
  1774. FILE *file;
  1775. unsigned int first;
  1776. unsigned int last;
  1777. unsigned int unichar;
  1778. unsigned int major;
  1779. unsigned int minor;
  1780. unsigned int revision;
  1781. int gen;
  1782. int count;
  1783. int ret;
  1784. if (verbose > 0)
  1785. printf("Parsing %s\n", age_name);
  1786. file = fopen(age_name, "r");
  1787. if (!file)
  1788. open_fail(age_name, errno);
  1789. count = 0;
  1790. gen = 0;
  1791. while (fgets(line, LINESIZE, file)) {
  1792. ret = sscanf(line, "# Age=V%d_%d_%d",
  1793. &major, &minor, &revision);
  1794. if (ret == 3) {
  1795. ages_count++;
  1796. if (verbose > 1)
  1797. printf(" Age V%d_%d_%d\n",
  1798. major, minor, revision);
  1799. if (!age_valid(major, minor, revision))
  1800. line_fail(age_name, line);
  1801. continue;
  1802. }
  1803. ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
  1804. if (ret == 2) {
  1805. ages_count++;
  1806. if (verbose > 1)
  1807. printf(" Age V%d_%d\n", major, minor);
  1808. if (!age_valid(major, minor, 0))
  1809. line_fail(age_name, line);
  1810. continue;
  1811. }
  1812. }
  1813. /* We must have found something above. */
  1814. if (verbose > 1)
  1815. printf("%d age entries\n", ages_count);
  1816. if (ages_count == 0 || ages_count > MAXGEN)
  1817. file_fail(age_name);
  1818. /* There is a 0 entry. */
  1819. ages_count++;
  1820. ages = calloc(ages_count + 1, sizeof(*ages));
  1821. /* And a guard entry. */
  1822. ages[ages_count] = (unsigned int)-1;
  1823. rewind(file);
  1824. count = 0;
  1825. gen = 0;
  1826. while (fgets(line, LINESIZE, file)) {
  1827. ret = sscanf(line, "# Age=V%d_%d_%d",
  1828. &major, &minor, &revision);
  1829. if (ret == 3) {
  1830. ages[++gen] =
  1831. UNICODE_AGE(major, minor, revision);
  1832. if (verbose > 1)
  1833. printf(" Age V%d_%d_%d = gen %d\n",
  1834. major, minor, revision, gen);
  1835. if (!age_valid(major, minor, revision))
  1836. line_fail(age_name, line);
  1837. continue;
  1838. }
  1839. ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
  1840. if (ret == 2) {
  1841. ages[++gen] = UNICODE_AGE(major, minor, 0);
  1842. if (verbose > 1)
  1843. printf(" Age V%d_%d = %d\n",
  1844. major, minor, gen);
  1845. if (!age_valid(major, minor, 0))
  1846. line_fail(age_name, line);
  1847. continue;
  1848. }
  1849. ret = sscanf(line, "%X..%X ; %d.%d #",
  1850. &first, &last, &major, &minor);
  1851. if (ret == 4) {
  1852. for (unichar = first; unichar <= last; unichar++)
  1853. unicode_data[unichar].gen = gen;
  1854. count += 1 + last - first;
  1855. if (verbose > 1)
  1856. printf(" %X..%X gen %d\n", first, last, gen);
  1857. if (!utf32valid(first) || !utf32valid(last))
  1858. line_fail(age_name, line);
  1859. continue;
  1860. }
  1861. ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
  1862. if (ret == 3) {
  1863. unicode_data[unichar].gen = gen;
  1864. count++;
  1865. if (verbose > 1)
  1866. printf(" %X gen %d\n", unichar, gen);
  1867. if (!utf32valid(unichar))
  1868. line_fail(age_name, line);
  1869. continue;
  1870. }
  1871. }
  1872. unicode_maxage = ages[gen];
  1873. fclose(file);
  1874. /* Nix surrogate block */
  1875. if (verbose > 1)
  1876. printf(" Removing surrogate block D800..DFFF\n");
  1877. for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
  1878. unicode_data[unichar].gen = -1;
  1879. if (verbose > 0)
  1880. printf("Found %d entries\n", count);
  1881. if (count == 0)
  1882. file_fail(age_name);
  1883. }
  1884. static void ccc_init(void)
  1885. {
  1886. FILE *file;
  1887. unsigned int first;
  1888. unsigned int last;
  1889. unsigned int unichar;
  1890. unsigned int value;
  1891. int count;
  1892. int ret;
  1893. if (verbose > 0)
  1894. printf("Parsing %s\n", ccc_name);
  1895. file = fopen(ccc_name, "r");
  1896. if (!file)
  1897. open_fail(ccc_name, errno);
  1898. count = 0;
  1899. while (fgets(line, LINESIZE, file)) {
  1900. ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
  1901. if (ret == 3) {
  1902. for (unichar = first; unichar <= last; unichar++) {
  1903. unicode_data[unichar].ccc = value;
  1904. count++;
  1905. }
  1906. if (verbose > 1)
  1907. printf(" %X..%X ccc %d\n", first, last, value);
  1908. if (!utf32valid(first) || !utf32valid(last))
  1909. line_fail(ccc_name, line);
  1910. continue;
  1911. }
  1912. ret = sscanf(line, "%X ; %d #", &unichar, &value);
  1913. if (ret == 2) {
  1914. unicode_data[unichar].ccc = value;
  1915. count++;
  1916. if (verbose > 1)
  1917. printf(" %X ccc %d\n", unichar, value);
  1918. if (!utf32valid(unichar))
  1919. line_fail(ccc_name, line);
  1920. continue;
  1921. }
  1922. }
  1923. fclose(file);
  1924. if (verbose > 0)
  1925. printf("Found %d entries\n", count);
  1926. if (count == 0)
  1927. file_fail(ccc_name);
  1928. }
  1929. static int ignore_compatibility_form(char *type)
  1930. {
  1931. int i;
  1932. char *ignored_types[] = {"font", "noBreak", "initial", "medial",
  1933. "final", "isolated", "circle", "super",
  1934. "sub", "vertical", "wide", "narrow",
  1935. "small", "square", "fraction", "compat"};
  1936. for (i = 0 ; i < ARRAY_SIZE(ignored_types); i++)
  1937. if (strcmp(type, ignored_types[i]) == 0)
  1938. return 1;
  1939. return 0;
  1940. }
  1941. static void nfdi_init(void)
  1942. {
  1943. FILE *file;
  1944. unsigned int unichar;
  1945. unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
  1946. char *s;
  1947. char *type;
  1948. unsigned int *um;
  1949. int count;
  1950. int i;
  1951. int ret;
  1952. if (verbose > 0)
  1953. printf("Parsing %s\n", data_name);
  1954. file = fopen(data_name, "r");
  1955. if (!file)
  1956. open_fail(data_name, errno);
  1957. count = 0;
  1958. while (fgets(line, LINESIZE, file)) {
  1959. ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
  1960. &unichar, buf0);
  1961. if (ret != 2)
  1962. continue;
  1963. if (!utf32valid(unichar))
  1964. line_fail(data_name, line);
  1965. s = buf0;
  1966. /* skip over <tag> */
  1967. if (*s == '<') {
  1968. type = ++s;
  1969. while (*++s != '>');
  1970. *s++ = '\0';
  1971. if(ignore_compatibility_form(type))
  1972. continue;
  1973. }
  1974. /* decode the decomposition into UTF-32 */
  1975. i = 0;
  1976. while (*s) {
  1977. mapping[i] = strtoul(s, &s, 16);
  1978. if (!utf32valid(mapping[i]))
  1979. line_fail(data_name, line);
  1980. i++;
  1981. }
  1982. mapping[i++] = 0;
  1983. um = malloc(i * sizeof(unsigned int));
  1984. memcpy(um, mapping, i * sizeof(unsigned int));
  1985. unicode_data[unichar].utf32nfdi = um;
  1986. if (verbose > 1)
  1987. print_utf32nfdi(unichar);
  1988. count++;
  1989. }
  1990. fclose(file);
  1991. if (verbose > 0)
  1992. printf("Found %d entries\n", count);
  1993. if (count == 0)
  1994. file_fail(data_name);
  1995. }
  1996. static void nfdicf_init(void)
  1997. {
  1998. FILE *file;
  1999. unsigned int unichar;
  2000. unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
  2001. char status;
  2002. char *s;
  2003. unsigned int *um;
  2004. int i;
  2005. int count;
  2006. int ret;
  2007. if (verbose > 0)
  2008. printf("Parsing %s\n", fold_name);
  2009. file = fopen(fold_name, "r");
  2010. if (!file)
  2011. open_fail(fold_name, errno);
  2012. count = 0;
  2013. while (fgets(line, LINESIZE, file)) {
  2014. ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
  2015. if (ret != 3)
  2016. continue;
  2017. if (!utf32valid(unichar))
  2018. line_fail(fold_name, line);
  2019. /* Use the C+F casefold. */
  2020. if (status != 'C' && status != 'F')
  2021. continue;
  2022. s = buf0;
  2023. if (*s == '<')
  2024. while (*s++ != ' ')
  2025. ;
  2026. i = 0;
  2027. while (*s) {
  2028. mapping[i] = strtoul(s, &s, 16);
  2029. if (!utf32valid(mapping[i]))
  2030. line_fail(fold_name, line);
  2031. i++;
  2032. }
  2033. mapping[i++] = 0;
  2034. um = malloc(i * sizeof(unsigned int));
  2035. memcpy(um, mapping, i * sizeof(unsigned int));
  2036. unicode_data[unichar].utf32nfdicf = um;
  2037. if (verbose > 1)
  2038. print_utf32nfdicf(unichar);
  2039. count++;
  2040. }
  2041. fclose(file);
  2042. if (verbose > 0)
  2043. printf("Found %d entries\n", count);
  2044. if (count == 0)
  2045. file_fail(fold_name);
  2046. }
  2047. static void ignore_init(void)
  2048. {
  2049. FILE *file;
  2050. unsigned int unichar;
  2051. unsigned int first;
  2052. unsigned int last;
  2053. unsigned int *um;
  2054. int count;
  2055. int ret;
  2056. if (verbose > 0)
  2057. printf("Parsing %s\n", prop_name);
  2058. file = fopen(prop_name, "r");
  2059. if (!file)
  2060. open_fail(prop_name, errno);
  2061. assert(file);
  2062. count = 0;
  2063. while (fgets(line, LINESIZE, file)) {
  2064. ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
  2065. if (ret == 3) {
  2066. if (strcmp(buf0, "Default_Ignorable_Code_Point"))
  2067. continue;
  2068. if (!utf32valid(first) || !utf32valid(last))
  2069. line_fail(prop_name, line);
  2070. for (unichar = first; unichar <= last; unichar++) {
  2071. free(unicode_data[unichar].utf32nfdi);
  2072. um = malloc(sizeof(unsigned int));
  2073. *um = 0;
  2074. unicode_data[unichar].utf32nfdi = um;
  2075. free(unicode_data[unichar].utf32nfdicf);
  2076. um = malloc(sizeof(unsigned int));
  2077. *um = 0;
  2078. unicode_data[unichar].utf32nfdicf = um;
  2079. count++;
  2080. }
  2081. if (verbose > 1)
  2082. printf(" %X..%X Default_Ignorable_Code_Point\n",
  2083. first, last);
  2084. continue;
  2085. }
  2086. ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
  2087. if (ret == 2) {
  2088. if (strcmp(buf0, "Default_Ignorable_Code_Point"))
  2089. continue;
  2090. if (!utf32valid(unichar))
  2091. line_fail(prop_name, line);
  2092. free(unicode_data[unichar].utf32nfdi);
  2093. um = malloc(sizeof(unsigned int));
  2094. *um = 0;
  2095. unicode_data[unichar].utf32nfdi = um;
  2096. free(unicode_data[unichar].utf32nfdicf);
  2097. um = malloc(sizeof(unsigned int));
  2098. *um = 0;
  2099. unicode_data[unichar].utf32nfdicf = um;
  2100. if (verbose > 1)
  2101. printf(" %X Default_Ignorable_Code_Point\n",
  2102. unichar);
  2103. count++;
  2104. continue;
  2105. }
  2106. }
  2107. fclose(file);
  2108. if (verbose > 0)
  2109. printf("Found %d entries\n", count);
  2110. if (count == 0)
  2111. file_fail(prop_name);
  2112. }
  2113. static void corrections_init(void)
  2114. {
  2115. FILE *file;
  2116. unsigned int unichar;
  2117. unsigned int major;
  2118. unsigned int minor;
  2119. unsigned int revision;
  2120. unsigned int age;
  2121. unsigned int *um;
  2122. unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
  2123. char *s;
  2124. int i;
  2125. int count;
  2126. int ret;
  2127. if (verbose > 0)
  2128. printf("Parsing %s\n", norm_name);
  2129. file = fopen(norm_name, "r");
  2130. if (!file)
  2131. open_fail(norm_name, errno);
  2132. count = 0;
  2133. while (fgets(line, LINESIZE, file)) {
  2134. ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
  2135. &unichar, buf0, buf1,
  2136. &major, &minor, &revision);
  2137. if (ret != 6)
  2138. continue;
  2139. if (!utf32valid(unichar) || !age_valid(major, minor, revision))
  2140. line_fail(norm_name, line);
  2141. count++;
  2142. }
  2143. corrections = calloc(count, sizeof(struct unicode_data));
  2144. corrections_count = count;
  2145. rewind(file);
  2146. count = 0;
  2147. while (fgets(line, LINESIZE, file)) {
  2148. ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
  2149. &unichar, buf0, buf1,
  2150. &major, &minor, &revision);
  2151. if (ret != 6)
  2152. continue;
  2153. if (!utf32valid(unichar) || !age_valid(major, minor, revision))
  2154. line_fail(norm_name, line);
  2155. corrections[count] = unicode_data[unichar];
  2156. assert(corrections[count].code == unichar);
  2157. age = UNICODE_AGE(major, minor, revision);
  2158. corrections[count].correction = age;
  2159. i = 0;
  2160. s = buf0;
  2161. while (*s) {
  2162. mapping[i] = strtoul(s, &s, 16);
  2163. if (!utf32valid(mapping[i]))
  2164. line_fail(norm_name, line);
  2165. i++;
  2166. }
  2167. mapping[i++] = 0;
  2168. um = malloc(i * sizeof(unsigned int));
  2169. memcpy(um, mapping, i * sizeof(unsigned int));
  2170. corrections[count].utf32nfdi = um;
  2171. if (verbose > 1)
  2172. printf(" %X -> %s -> %s V%d_%d_%d\n",
  2173. unichar, buf0, buf1, major, minor, revision);
  2174. count++;
  2175. }
  2176. fclose(file);
  2177. if (verbose > 0)
  2178. printf("Found %d entries\n", count);
  2179. if (count == 0)
  2180. file_fail(norm_name);
  2181. }
  2182. /* ------------------------------------------------------------------ */
  2183. /*
  2184. * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
  2185. *
  2186. * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
  2187. * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
  2188. *
  2189. * SBase = 0xAC00
  2190. * LBase = 0x1100
  2191. * VBase = 0x1161
  2192. * TBase = 0x11A7
  2193. * LCount = 19
  2194. * VCount = 21
  2195. * TCount = 28
  2196. * NCount = 588 (VCount * TCount)
  2197. * SCount = 11172 (LCount * NCount)
  2198. *
  2199. * Decomposition:
  2200. * SIndex = s - SBase
  2201. *
  2202. * LV (Canonical/Full)
  2203. * LIndex = SIndex / NCount
  2204. * VIndex = (Sindex % NCount) / TCount
  2205. * LPart = LBase + LIndex
  2206. * VPart = VBase + VIndex
  2207. *
  2208. * LVT (Canonical)
  2209. * LVIndex = (SIndex / TCount) * TCount
  2210. * TIndex = (Sindex % TCount)
  2211. * LVPart = SBase + LVIndex
  2212. * TPart = TBase + TIndex
  2213. *
  2214. * LVT (Full)
  2215. * LIndex = SIndex / NCount
  2216. * VIndex = (Sindex % NCount) / TCount
  2217. * TIndex = (Sindex % TCount)
  2218. * LPart = LBase + LIndex
  2219. * VPart = VBase + VIndex
  2220. * if (TIndex == 0) {
  2221. * d = <LPart, VPart>
  2222. * } else {
  2223. * TPart = TBase + TIndex
  2224. * d = <LPart, VPart, TPart>
  2225. * }
  2226. *
  2227. */
  2228. static void hangul_decompose(void)
  2229. {
  2230. unsigned int sb = 0xAC00;
  2231. unsigned int lb = 0x1100;
  2232. unsigned int vb = 0x1161;
  2233. unsigned int tb = 0x11a7;
  2234. /* unsigned int lc = 19; */
  2235. unsigned int vc = 21;
  2236. unsigned int tc = 28;
  2237. unsigned int nc = (vc * tc);
  2238. /* unsigned int sc = (lc * nc); */
  2239. unsigned int unichar;
  2240. unsigned int mapping[4];
  2241. unsigned int *um;
  2242. int count;
  2243. int i;
  2244. if (verbose > 0)
  2245. printf("Decomposing hangul\n");
  2246. /* Hangul */
  2247. count = 0;
  2248. for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
  2249. unsigned int si = unichar - sb;
  2250. unsigned int li = si / nc;
  2251. unsigned int vi = (si % nc) / tc;
  2252. unsigned int ti = si % tc;
  2253. i = 0;
  2254. mapping[i++] = lb + li;
  2255. mapping[i++] = vb + vi;
  2256. if (ti)
  2257. mapping[i++] = tb + ti;
  2258. mapping[i++] = 0;
  2259. assert(!unicode_data[unichar].utf32nfdi);
  2260. um = malloc(i * sizeof(unsigned int));
  2261. memcpy(um, mapping, i * sizeof(unsigned int));
  2262. unicode_data[unichar].utf32nfdi = um;
  2263. assert(!unicode_data[unichar].utf32nfdicf);
  2264. um = malloc(i * sizeof(unsigned int));
  2265. memcpy(um, mapping, i * sizeof(unsigned int));
  2266. unicode_data[unichar].utf32nfdicf = um;
  2267. /*
  2268. * Add a cookie as a reminder that the hangul syllable
  2269. * decompositions must not be stored in the generated
  2270. * trie.
  2271. */
  2272. unicode_data[unichar].utf8nfdi = malloc(2);
  2273. unicode_data[unichar].utf8nfdi[0] = HANGUL;
  2274. unicode_data[unichar].utf8nfdi[1] = '\0';
  2275. if (verbose > 1)
  2276. print_utf32nfdi(unichar);
  2277. count++;
  2278. }
  2279. if (verbose > 0)
  2280. printf("Created %d entries\n", count);
  2281. }
  2282. static void nfdi_decompose(void)
  2283. {
  2284. unsigned int unichar;
  2285. unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
  2286. unsigned int *um;
  2287. unsigned int *dc;
  2288. int count;
  2289. int i;
  2290. int j;
  2291. int ret;
  2292. if (verbose > 0)
  2293. printf("Decomposing nfdi\n");
  2294. count = 0;
  2295. for (unichar = 0; unichar != 0x110000; unichar++) {
  2296. if (!unicode_data[unichar].utf32nfdi)
  2297. continue;
  2298. for (;;) {
  2299. ret = 1;
  2300. i = 0;
  2301. um = unicode_data[unichar].utf32nfdi;
  2302. while (*um) {
  2303. dc = unicode_data[*um].utf32nfdi;
  2304. if (dc) {
  2305. for (j = 0; dc[j]; j++)
  2306. mapping[i++] = dc[j];
  2307. ret = 0;
  2308. } else {
  2309. mapping[i++] = *um;
  2310. }
  2311. um++;
  2312. }
  2313. mapping[i++] = 0;
  2314. if (ret)
  2315. break;
  2316. free(unicode_data[unichar].utf32nfdi);
  2317. um = malloc(i * sizeof(unsigned int));
  2318. memcpy(um, mapping, i * sizeof(unsigned int));
  2319. unicode_data[unichar].utf32nfdi = um;
  2320. }
  2321. /* Add this decomposition to nfdicf if there is no entry. */
  2322. if (!unicode_data[unichar].utf32nfdicf) {
  2323. um = malloc(i * sizeof(unsigned int));
  2324. memcpy(um, mapping, i * sizeof(unsigned int));
  2325. unicode_data[unichar].utf32nfdicf = um;
  2326. }
  2327. if (verbose > 1)
  2328. print_utf32nfdi(unichar);
  2329. count++;
  2330. }
  2331. if (verbose > 0)
  2332. printf("Processed %d entries\n", count);
  2333. }
  2334. static void nfdicf_decompose(void)
  2335. {
  2336. unsigned int unichar;
  2337. unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
  2338. unsigned int *um;
  2339. unsigned int *dc;
  2340. int count;
  2341. int i;
  2342. int j;
  2343. int ret;
  2344. if (verbose > 0)
  2345. printf("Decomposing nfdicf\n");
  2346. count = 0;
  2347. for (unichar = 0; unichar != 0x110000; unichar++) {
  2348. if (!unicode_data[unichar].utf32nfdicf)
  2349. continue;
  2350. for (;;) {
  2351. ret = 1;
  2352. i = 0;
  2353. um = unicode_data[unichar].utf32nfdicf;
  2354. while (*um) {
  2355. dc = unicode_data[*um].utf32nfdicf;
  2356. if (dc) {
  2357. for (j = 0; dc[j]; j++)
  2358. mapping[i++] = dc[j];
  2359. ret = 0;
  2360. } else {
  2361. mapping[i++] = *um;
  2362. }
  2363. um++;
  2364. }
  2365. mapping[i++] = 0;
  2366. if (ret)
  2367. break;
  2368. free(unicode_data[unichar].utf32nfdicf);
  2369. um = malloc(i * sizeof(unsigned int));
  2370. memcpy(um, mapping, i * sizeof(unsigned int));
  2371. unicode_data[unichar].utf32nfdicf = um;
  2372. }
  2373. if (verbose > 1)
  2374. print_utf32nfdicf(unichar);
  2375. count++;
  2376. }
  2377. if (verbose > 0)
  2378. printf("Processed %d entries\n", count);
  2379. }
  2380. /* ------------------------------------------------------------------ */
  2381. int utf8agemax(struct tree *, const char *);
  2382. int utf8nagemax(struct tree *, const char *, size_t);
  2383. int utf8agemin(struct tree *, const char *);
  2384. int utf8nagemin(struct tree *, const char *, size_t);
  2385. ssize_t utf8len(struct tree *, const char *);
  2386. ssize_t utf8nlen(struct tree *, const char *, size_t);
  2387. struct utf8cursor;
  2388. int utf8cursor(struct utf8cursor *, struct tree *, const char *);
  2389. int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t);
  2390. int utf8byte(struct utf8cursor *);
/*
 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
 *
 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
 *
 *	SBase = 0xAC00
 *	LBase = 0x1100
 *	VBase = 0x1161
 *	TBase = 0x11A7
 *	LCount = 19
 *	VCount = 21
 *	TCount = 28
 *	NCount = 588 (VCount * TCount)
 *	SCount = 11172 (LCount * NCount)
 *
 * Decomposition:
 *   SIndex = s - SBase
 *
 * LV (Canonical/Full)
 *   LIndex = SIndex / NCount
 *   VIndex = (SIndex % NCount) / TCount
 *   LPart = LBase + LIndex
 *   VPart = VBase + VIndex
 *
 * LVT (Canonical)
 *   LVIndex = (SIndex / TCount) * TCount
 *   TIndex = (SIndex % TCount)
 *   LVPart = SBase + LVIndex
 *   TPart = TBase + TIndex
 *
 * LVT (Full)
 *   LIndex = SIndex / NCount
 *   VIndex = (SIndex % NCount) / TCount
 *   TIndex = (SIndex % TCount)
 *   LPart = LBase + LIndex
 *   VPart = VBase + VIndex
 *   if (TIndex == 0) {
 *          d = <LPart, VPart>
 *   } else {
 *          TPart = TBase + TIndex
 *          d = <LPart, VPart, TPart>
 *   }
 */

/* Constants */
/* The names abbreviate SBase, LBase, ... from the comment above. */
#define SB	(0xAC00)	/* SBase */
#define LB	(0x1100)	/* LBase */
#define VB	(0x1161)	/* VBase */
#define TB	(0x11A7)	/* TBase */
#define LC	(19)		/* LCount */
#define VC	(21)		/* VCount */
#define TC	(28)		/* TCount */
#define NC	(VC * TC)	/* NCount */
#define SC	(LC * NC)	/* SCount */
  2445. /* Algorithmic decomposition of hangul syllable. */
  2446. static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul)
  2447. {
  2448. unsigned int si;
  2449. unsigned int li;
  2450. unsigned int vi;
  2451. unsigned int ti;
  2452. unsigned char *h;
  2453. /* Calculate the SI, LI, VI, and TI values. */
  2454. si = utf8decode(str) - SB;
  2455. li = si / NC;
  2456. vi = (si % NC) / TC;
  2457. ti = si % TC;
  2458. /* Fill in base of leaf. */
  2459. h = hangul;
  2460. LEAF_GEN(h) = 2;
  2461. LEAF_CCC(h) = DECOMPOSE;
  2462. h += 2;
  2463. /* Add LPart, a 3-byte UTF-8 sequence. */
  2464. h += utf8encode((char *)h, li + LB);
  2465. /* Add VPart, a 3-byte UTF-8 sequence. */
  2466. h += utf8encode((char *)h, vi + VB);
  2467. /* Add TPart if required, also a 3-byte UTF-8 sequence. */
  2468. if (ti)
  2469. h += utf8encode((char *)h, ti + TB);
  2470. /* Terminate string. */
  2471. h[0] = '\0';
  2472. return hangul;
  2473. }
/*
 * Use trie to scan s, touching at most len bytes.
 * Returns the leaf if one exists, NULL otherwise.
 *
 * A non-NULL return guarantees that the UTF-8 sequence starting at s
 * is well-formed and corresponds to a known unicode code point.  The
 * shorthand for this will be "is valid UTF-8 unicode".
 *
 * tree   : tree to search; NULL yields NULL.
 * hangul : scratch buffer that holds the returned leaf when the code
 *          point turns out to be a hangul syllable.
 * s      : first byte of the sequence to look up.
 * len    : number of bytes that may be read from s; 0 yields NULL.
 */
static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul,
			       const char *s, size_t len)
{
	utf8trie_t	*trie;
	int		offlen;
	int		offset;
	int		mask;
	int		node;

	if (!tree)
		return NULL;
	if (len == 0)
		return NULL;
	/* The walk starts on an internal node at the tree's root. */
	node = 1;
	trie = utf8data + tree->index;
	while (node) {
		/* Number of offset bytes stored after this node byte. */
		offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
		if (*trie & NEXTBYTE) {
			/* This node tests the following input byte. */
			if (--len == 0)
				return NULL;
			s++;
		}
		/* Select the bit of the current input byte to branch on. */
		mask = 1 << (*trie & BITNUM);
		if (*s & mask) {
			/* Right leg */
			if (offlen) {
				/* Right node at offset of trie */
				node = (*trie & RIGHTNODE);
				/*
				 * Assemble the offset from the offlen bytes
				 * after the node byte; trie[offlen] is the
				 * most significant one.
				 */
				offset = trie[offlen];
				while (--offlen) {
					offset <<= 8;
					offset |= trie[offlen];
				}
				trie += offset;
			} else if (*trie & RIGHTPATH) {
				/* Right node after this node */
				node = (*trie & TRIENODE);
				trie++;
			} else {
				/* No right node. */
				return NULL;
			}
		} else {
			/* Left leg */
			if (offlen) {
				/* Left node after this node. */
				node = (*trie & LEFTNODE);
				/* Skip the node byte and its offset bytes. */
				trie += offlen + 1;
			} else if (*trie & RIGHTPATH) {
				/* No left node. */
				return NULL;
			} else {
				/* Left node after this node */
				node = (*trie & TRIENODE);
				trie++;
			}
		}
	}
	/* node == 0: trie now points at a leaf rather than a node. */
	/*
	 * Hangul decomposition is done algorithmically. These are the
	 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
	 * always 3 bytes long, so s has been advanced twice, and the
	 * start of the sequence is at s-2.
	 */
	if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
		trie = utf8hangul(s - 2, hangul);
	return trie;
}
  2549. /*
  2550. * Use trie to scan s.
  2551. * Returns the leaf if one exists, NULL otherwise.
  2552. *
  2553. * Forwards to trie_nlookup().
  2554. */
  2555. static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul,
  2556. const char *s)
  2557. {
  2558. return utf8nlookup(tree, hangul, s, (size_t)-1);
  2559. }
  2560. /*
  2561. * Return the number of bytes used by the current UTF-8 sequence.
  2562. * Assumes the input points to the first byte of a valid UTF-8
  2563. * sequence.
  2564. */
  2565. static inline int utf8clen(const char *s)
  2566. {
  2567. unsigned char c = *s;
  2568. return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
  2569. }
  2570. /*
  2571. * Maximum age of any character in s.
  2572. * Return -1 if s is not valid UTF-8 unicode.
  2573. * Return 0 if only non-assigned code points are used.
  2574. */
  2575. int utf8agemax(struct tree *tree, const char *s)
  2576. {
  2577. utf8leaf_t *leaf;
  2578. int age = 0;
  2579. int leaf_age;
  2580. unsigned char hangul[UTF8HANGULLEAF];
  2581. if (!tree)
  2582. return -1;
  2583. while (*s) {
  2584. leaf = utf8lookup(tree, hangul, s);
  2585. if (!leaf)
  2586. return -1;
  2587. leaf_age = ages[LEAF_GEN(leaf)];
  2588. if (leaf_age <= tree->maxage && leaf_age > age)
  2589. age = leaf_age;
  2590. s += utf8clen(s);
  2591. }
  2592. return age;
  2593. }
  2594. /*
  2595. * Minimum age of any character in s.
  2596. * Return -1 if s is not valid UTF-8 unicode.
  2597. * Return 0 if non-assigned code points are used.
  2598. */
  2599. int utf8agemin(struct tree *tree, const char *s)
  2600. {
  2601. utf8leaf_t *leaf;
  2602. int age;
  2603. int leaf_age;
  2604. unsigned char hangul[UTF8HANGULLEAF];
  2605. if (!tree)
  2606. return -1;
  2607. age = tree->maxage;
  2608. while (*s) {
  2609. leaf = utf8lookup(tree, hangul, s);
  2610. if (!leaf)
  2611. return -1;
  2612. leaf_age = ages[LEAF_GEN(leaf)];
  2613. if (leaf_age <= tree->maxage && leaf_age < age)
  2614. age = leaf_age;
  2615. s += utf8clen(s);
  2616. }
  2617. return age;
  2618. }
  2619. /*
  2620. * Maximum age of any character in s, touch at most len bytes.
  2621. * Return -1 if s is not valid UTF-8 unicode.
  2622. */
  2623. int utf8nagemax(struct tree *tree, const char *s, size_t len)
  2624. {
  2625. utf8leaf_t *leaf;
  2626. int age = 0;
  2627. int leaf_age;
  2628. unsigned char hangul[UTF8HANGULLEAF];
  2629. if (!tree)
  2630. return -1;
  2631. while (len && *s) {
  2632. leaf = utf8nlookup(tree, hangul, s, len);
  2633. if (!leaf)
  2634. return -1;
  2635. leaf_age = ages[LEAF_GEN(leaf)];
  2636. if (leaf_age <= tree->maxage && leaf_age > age)
  2637. age = leaf_age;
  2638. len -= utf8clen(s);
  2639. s += utf8clen(s);
  2640. }
  2641. return age;
  2642. }
  2643. /*
  2644. * Maximum age of any character in s, touch at most len bytes.
  2645. * Return -1 if s is not valid UTF-8 unicode.
  2646. */
  2647. int utf8nagemin(struct tree *tree, const char *s, size_t len)
  2648. {
  2649. utf8leaf_t *leaf;
  2650. int leaf_age;
  2651. int age;
  2652. unsigned char hangul[UTF8HANGULLEAF];
  2653. if (!tree)
  2654. return -1;
  2655. age = tree->maxage;
  2656. while (len && *s) {
  2657. leaf = utf8nlookup(tree, hangul, s, len);
  2658. if (!leaf)
  2659. return -1;
  2660. leaf_age = ages[LEAF_GEN(leaf)];
  2661. if (leaf_age <= tree->maxage && leaf_age < age)
  2662. age = leaf_age;
  2663. len -= utf8clen(s);
  2664. s += utf8clen(s);
  2665. }
  2666. return age;
  2667. }
  2668. /*
  2669. * Length of the normalization of s.
  2670. * Return -1 if s is not valid UTF-8 unicode.
  2671. *
  2672. * A string of Default_Ignorable_Code_Point has length 0.
  2673. */
  2674. ssize_t utf8len(struct tree *tree, const char *s)
  2675. {
  2676. utf8leaf_t *leaf;
  2677. size_t ret = 0;
  2678. unsigned char hangul[UTF8HANGULLEAF];
  2679. if (!tree)
  2680. return -1;
  2681. while (*s) {
  2682. leaf = utf8lookup(tree, hangul, s);
  2683. if (!leaf)
  2684. return -1;
  2685. if (ages[LEAF_GEN(leaf)] > tree->maxage)
  2686. ret += utf8clen(s);
  2687. else if (LEAF_CCC(leaf) == DECOMPOSE)
  2688. ret += strlen(LEAF_STR(leaf));
  2689. else
  2690. ret += utf8clen(s);
  2691. s += utf8clen(s);
  2692. }
  2693. return ret;
  2694. }
  2695. /*
  2696. * Length of the normalization of s, touch at most len bytes.
  2697. * Return -1 if s is not valid UTF-8 unicode.
  2698. */
  2699. ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
  2700. {
  2701. utf8leaf_t *leaf;
  2702. size_t ret = 0;
  2703. unsigned char hangul[UTF8HANGULLEAF];
  2704. if (!tree)
  2705. return -1;
  2706. while (len && *s) {
  2707. leaf = utf8nlookup(tree, hangul, s, len);
  2708. if (!leaf)
  2709. return -1;
  2710. if (ages[LEAF_GEN(leaf)] > tree->maxage)
  2711. ret += utf8clen(s);
  2712. else if (LEAF_CCC(leaf) == DECOMPOSE)
  2713. ret += strlen(LEAF_STR(leaf));
  2714. else
  2715. ret += utf8clen(s);
  2716. len -= utf8clen(s);
  2717. s += utf8clen(s);
  2718. }
  2719. return ret;
  2720. }
  2721. /*
  2722. * Cursor structure used by the normalizer.
  2723. */
  2724. struct utf8cursor {
  2725. struct tree *tree;
  2726. const char *s;
  2727. const char *p;
  2728. const char *ss;
  2729. const char *sp;
  2730. unsigned int len;
  2731. unsigned int slen;
  2732. short int ccc;
  2733. short int nccc;
  2734. unsigned int unichar;
  2735. unsigned char hangul[UTF8HANGULLEAF];
  2736. };
  2737. /*
  2738. * Set up an utf8cursor for use by utf8byte().
  2739. *
  2740. * s : string.
  2741. * len : length of s.
  2742. * u8c : pointer to cursor.
  2743. * trie : utf8trie_t to use for normalization.
  2744. *
  2745. * Returns -1 on error, 0 on success.
  2746. */
  2747. int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
  2748. size_t len)
  2749. {
  2750. if (!tree)
  2751. return -1;
  2752. if (!s)
  2753. return -1;
  2754. u8c->tree = tree;
  2755. u8c->s = s;
  2756. u8c->p = NULL;
  2757. u8c->ss = NULL;
  2758. u8c->sp = NULL;
  2759. u8c->len = len;
  2760. u8c->slen = 0;
  2761. u8c->ccc = STOPPER;
  2762. u8c->nccc = STOPPER;
  2763. u8c->unichar = 0;
  2764. /* Check we didn't clobber the maximum length. */
  2765. if (u8c->len != len)
  2766. return -1;
  2767. /* The first byte of s may not be an utf8 continuation. */
  2768. if (len > 0 && (*s & 0xC0) == 0x80)
  2769. return -1;
  2770. return 0;
  2771. }
  /*
   * Set up an utf8cursor for use by utf8byte().
   *
   *   u8c  : pointer to cursor.
   *   tree : tree to use for normalization.
   *   s    : NUL-terminated string.
   *
   * Returns -1 on error, 0 on success.
   */
  int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
  {
      /*
       * Largest length the cursor can hold; utf8byte() also stops at the
       * terminating NUL of s.
       */
      const size_t no_limit = (unsigned int)-1;

      return utf8ncursor(u8c, tree, s, no_limit);
  }
  2785. /*
  2786. * Get one byte from the normalized form of the string described by u8c.
  2787. *
  2788. * Returns the byte cast to an unsigned char on succes, and -1 on failure.
  2789. *
  2790. * The cursor keeps track of the location in the string in u8c->s.
  2791. * When a character is decomposed, the current location is stored in
  2792. * u8c->p, and u8c->s is set to the start of the decomposition. Note
  2793. * that bytes from a decomposition do not count against u8c->len.
  2794. *
  2795. * Characters are emitted if they match the current CCC in u8c->ccc.
  2796. * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
  2797. * and the function returns 0 in that case.
  2798. *
  2799. * Sorting by CCC is done by repeatedly scanning the string. The
  2800. * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
  2801. * the start of the scan. The first pass finds the lowest CCC to be
  2802. * emitted and stores it in u8c->nccc, the second pass emits the
  2803. * characters with this CCC and finds the next lowest CCC. This limits
  2804. * the number of passes to 1 + the number of different CCCs in the
  2805. * sequence being scanned.
  2806. *
  2807. * Therefore:
  2808. * u8c->p != NULL -> a decomposition is being scanned.
  2809. * u8c->ss != NULL -> this is a repeating scan.
  2810. * u8c->ccc == -1 -> this is the first scan of a repeating scan.
  2811. */
  2812. int utf8byte(struct utf8cursor *u8c)
  2813. {
  2814. utf8leaf_t *leaf;
  2815. int ccc;
  2816. for (;;) {
  2817. /* Check for the end of a decomposed character. */
  2818. if (u8c->p && *u8c->s == '\0') {
  2819. u8c->s = u8c->p;
  2820. u8c->p = NULL;
  2821. }
  2822. /* Check for end-of-string. */
  2823. if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
  2824. /* There is no next byte. */
  2825. if (u8c->ccc == STOPPER)
  2826. return 0;
  2827. /* End-of-string during a scan counts as a stopper. */
  2828. ccc = STOPPER;
  2829. goto ccc_mismatch;
  2830. } else if ((*u8c->s & 0xC0) == 0x80) {
  2831. /* This is a continuation of the current character. */
  2832. if (!u8c->p)
  2833. u8c->len--;
  2834. return (unsigned char)*u8c->s++;
  2835. }
  2836. /* Look up the data for the current character. */
  2837. if (u8c->p) {
  2838. leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
  2839. } else {
  2840. leaf = utf8nlookup(u8c->tree, u8c->hangul,
  2841. u8c->s, u8c->len);
  2842. }
  2843. /* No leaf found implies that the input is a binary blob. */
  2844. if (!leaf)
  2845. return -1;
  2846. /* Characters that are too new have CCC 0. */
  2847. if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
  2848. ccc = STOPPER;
  2849. } else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
  2850. u8c->len -= utf8clen(u8c->s);
  2851. u8c->p = u8c->s + utf8clen(u8c->s);
  2852. u8c->s = LEAF_STR(leaf);
  2853. /* Empty decomposition implies CCC 0. */
  2854. if (*u8c->s == '\0') {
  2855. if (u8c->ccc == STOPPER)
  2856. continue;
  2857. ccc = STOPPER;
  2858. goto ccc_mismatch;
  2859. }
  2860. leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
  2861. ccc = LEAF_CCC(leaf);
  2862. }
  2863. u8c->unichar = utf8decode(u8c->s);
  2864. /*
  2865. * If this is not a stopper, then see if it updates
  2866. * the next canonical class to be emitted.
  2867. */
  2868. if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
  2869. u8c->nccc = ccc;
  2870. /*
  2871. * Return the current byte if this is the current
  2872. * combining class.
  2873. */
  2874. if (ccc == u8c->ccc) {
  2875. if (!u8c->p)
  2876. u8c->len--;
  2877. return (unsigned char)*u8c->s++;
  2878. }
  2879. /* Current combining class mismatch. */
  2880. ccc_mismatch:
  2881. if (u8c->nccc == STOPPER) {
  2882. /*
  2883. * Scan forward for the first canonical class
  2884. * to be emitted. Save the position from
  2885. * which to restart.
  2886. */
  2887. assert(u8c->ccc == STOPPER);
  2888. u8c->ccc = MINCCC - 1;
  2889. u8c->nccc = ccc;
  2890. u8c->sp = u8c->p;
  2891. u8c->ss = u8c->s;
  2892. u8c->slen = u8c->len;
  2893. if (!u8c->p)
  2894. u8c->len -= utf8clen(u8c->s);
  2895. u8c->s += utf8clen(u8c->s);
  2896. } else if (ccc != STOPPER) {
  2897. /* Not a stopper, and not the ccc we're emitting. */
  2898. if (!u8c->p)
  2899. u8c->len -= utf8clen(u8c->s);
  2900. u8c->s += utf8clen(u8c->s);
  2901. } else if (u8c->nccc != MAXCCC + 1) {
  2902. /* At a stopper, restart for next ccc. */
  2903. u8c->ccc = u8c->nccc;
  2904. u8c->nccc = MAXCCC + 1;
  2905. u8c->s = u8c->ss;
  2906. u8c->p = u8c->sp;
  2907. u8c->len = u8c->slen;
  2908. } else {
  2909. /* All done, proceed from here. */
  2910. u8c->ccc = STOPPER;
  2911. u8c->nccc = STOPPER;
  2912. u8c->sp = NULL;
  2913. u8c->ss = NULL;
  2914. u8c->slen = 0;
  2915. }
  2916. }
  2917. }
  2918. /* ------------------------------------------------------------------ */
  2919. static int normalize_line(struct tree *tree)
  2920. {
  2921. char *s;
  2922. char *t;
  2923. int c;
  2924. struct utf8cursor u8c;
  2925. /* First test: null-terminated string. */
  2926. s = buf2;
  2927. t = buf3;
  2928. if (utf8cursor(&u8c, tree, s))
  2929. return -1;
  2930. while ((c = utf8byte(&u8c)) > 0)
  2931. if (c != (unsigned char)*t++)
  2932. return -1;
  2933. if (c < 0)
  2934. return -1;
  2935. if (*t != 0)
  2936. return -1;
  2937. /* Second test: length-limited string. */
  2938. s = buf2;
  2939. /* Replace NUL with a value that will cause an error if seen. */
  2940. s[strlen(s) + 1] = -1;
  2941. t = buf3;
  2942. if (utf8cursor(&u8c, tree, s))
  2943. return -1;
  2944. while ((c = utf8byte(&u8c)) > 0)
  2945. if (c != (unsigned char)*t++)
  2946. return -1;
  2947. if (c < 0)
  2948. return -1;
  2949. if (*t != 0)
  2950. return -1;
  2951. return 0;
  2952. }
  2953. static void normalization_test(void)
  2954. {
  2955. FILE *file;
  2956. unsigned int unichar;
  2957. struct unicode_data *data;
  2958. char *s;
  2959. char *t;
  2960. int ret;
  2961. int ignorables;
  2962. int tests = 0;
  2963. int failures = 0;
  2964. if (verbose > 0)
  2965. printf("Parsing %s\n", test_name);
  2966. /* Step one, read data from file. */
  2967. file = fopen(test_name, "r");
  2968. if (!file)
  2969. open_fail(test_name, errno);
  2970. while (fgets(line, LINESIZE, file)) {
  2971. ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];",
  2972. buf0, buf1);
  2973. if (ret != 2 || *line == '#')
  2974. continue;
  2975. s = buf0;
  2976. t = buf2;
  2977. while (*s) {
  2978. unichar = strtoul(s, &s, 16);
  2979. t += utf8encode(t, unichar);
  2980. }
  2981. *t = '\0';
  2982. ignorables = 0;
  2983. s = buf1;
  2984. t = buf3;
  2985. while (*s) {
  2986. unichar = strtoul(s, &s, 16);
  2987. data = &unicode_data[unichar];
  2988. if (data->utf8nfdi && !*data->utf8nfdi)
  2989. ignorables = 1;
  2990. else
  2991. t += utf8encode(t, unichar);
  2992. }
  2993. *t = '\0';
  2994. tests++;
  2995. if (normalize_line(nfdi_tree) < 0) {
  2996. printf("Line %s -> %s", buf0, buf1);
  2997. if (ignorables)
  2998. printf(" (ignorables removed)");
  2999. printf(" failure\n");
  3000. failures++;
  3001. }
  3002. }
  3003. fclose(file);
  3004. if (verbose > 0)
  3005. printf("Ran %d tests with %d failures\n", tests, failures);
  3006. if (failures)
  3007. file_fail(test_name);
  3008. }
  3009. /* ------------------------------------------------------------------ */
  3010. static void write_file(void)
  3011. {
  3012. FILE *file;
  3013. int i;
  3014. int j;
  3015. int t;
  3016. int gen;
  3017. if (verbose > 0)
  3018. printf("Writing %s\n", utf8_name);
  3019. file = fopen(utf8_name, "w");
  3020. if (!file)
  3021. open_fail(utf8_name, errno);
  3022. fprintf(file, "/* This file is generated code, do not edit. */\n");
  3023. fprintf(file, "\n");
  3024. fprintf(file, "#include <linux/module.h>\n");
  3025. fprintf(file, "#include <linux/kernel.h>\n");
  3026. fprintf(file, "#include \"utf8n.h\"\n");
  3027. fprintf(file, "\n");
  3028. fprintf(file, "static const unsigned int utf8agetab[] = {\n");
  3029. for (i = 0; i != ages_count; i++)
  3030. fprintf(file, "\t%#x%s\n", ages[i],
  3031. ages[i] == unicode_maxage ? "" : ",");
  3032. fprintf(file, "};\n");
  3033. fprintf(file, "\n");
  3034. fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n");
  3035. t = 0;
  3036. for (gen = 0; gen < ages_count; gen++) {
  3037. fprintf(file, "\t{ %#x, %d }%s\n",
  3038. ages[gen], trees[t].index,
  3039. ages[gen] == unicode_maxage ? "" : ",");
  3040. if (trees[t].maxage == ages[gen])
  3041. t += 2;
  3042. }
  3043. fprintf(file, "};\n");
  3044. fprintf(file, "\n");
  3045. fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n");
  3046. t = 1;
  3047. for (gen = 0; gen < ages_count; gen++) {
  3048. fprintf(file, "\t{ %#x, %d }%s\n",
  3049. ages[gen], trees[t].index,
  3050. ages[gen] == unicode_maxage ? "" : ",");
  3051. if (trees[t].maxage == ages[gen])
  3052. t += 2;
  3053. }
  3054. fprintf(file, "};\n");
  3055. fprintf(file, "\n");
  3056. fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
  3057. utf8data_size);
  3058. t = 0;
  3059. for (i = 0; i != utf8data_size; i += 16) {
  3060. if (i == trees[t].index) {
  3061. fprintf(file, "\t/* %s_%x */\n",
  3062. trees[t].type, trees[t].maxage);
  3063. if (t < trees_count-1)
  3064. t++;
  3065. }
  3066. fprintf(file, "\t");
  3067. for (j = i; j != i + 16; j++)
  3068. fprintf(file, "0x%.2x%s", utf8data[j],
  3069. (j < utf8data_size -1 ? "," : ""));
  3070. fprintf(file, "\n");
  3071. }
  3072. fprintf(file, "};\n");
  3073. fprintf(file, "\n");
  3074. fprintf(file, "struct utf8data_table utf8_data_table = {\n");
  3075. fprintf(file, "\t.utf8agetab = utf8agetab,\n");
  3076. fprintf(file, "\t.utf8agetab_size = ARRAY_SIZE(utf8agetab),\n");
  3077. fprintf(file, "\n");
  3078. fprintf(file, "\t.utf8nfdicfdata = utf8nfdicfdata,\n");
  3079. fprintf(file, "\t.utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata),\n");
  3080. fprintf(file, "\n");
  3081. fprintf(file, "\t.utf8nfdidata = utf8nfdidata,\n");
  3082. fprintf(file, "\t.utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata),\n");
  3083. fprintf(file, "\n");
  3084. fprintf(file, "\t.utf8data = utf8data,\n");
  3085. fprintf(file, "};\n");
  3086. fprintf(file, "EXPORT_SYMBOL_GPL(utf8_data_table);");
  3087. fprintf(file, "\n");
  3088. fprintf(file, "MODULE_LICENSE(\"GPL v2\");\n");
  3089. fclose(file);
  3090. }
  3091. /* ------------------------------------------------------------------ */
  3092. int main(int argc, char *argv[])
  3093. {
  3094. unsigned int unichar;
  3095. int opt;
  3096. argv0 = argv[0];
  3097. while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
  3098. switch (opt) {
  3099. case 'a':
  3100. age_name = optarg;
  3101. break;
  3102. case 'c':
  3103. ccc_name = optarg;
  3104. break;
  3105. case 'd':
  3106. data_name = optarg;
  3107. break;
  3108. case 'f':
  3109. fold_name = optarg;
  3110. break;
  3111. case 'n':
  3112. norm_name = optarg;
  3113. break;
  3114. case 'o':
  3115. utf8_name = optarg;
  3116. break;
  3117. case 'p':
  3118. prop_name = optarg;
  3119. break;
  3120. case 't':
  3121. test_name = optarg;
  3122. break;
  3123. case 'v':
  3124. verbose++;
  3125. break;
  3126. case 'h':
  3127. help();
  3128. exit(0);
  3129. default:
  3130. usage();
  3131. }
  3132. }
  3133. if (verbose > 1)
  3134. help();
  3135. for (unichar = 0; unichar != 0x110000; unichar++)
  3136. unicode_data[unichar].code = unichar;
  3137. age_init();
  3138. ccc_init();
  3139. nfdi_init();
  3140. nfdicf_init();
  3141. ignore_init();
  3142. corrections_init();
  3143. hangul_decompose();
  3144. nfdi_decompose();
  3145. nfdicf_decompose();
  3146. utf8_init();
  3147. trees_init();
  3148. trees_populate();
  3149. trees_reduce();
  3150. trees_verify();
  3151. /* Prevent "unused function" warning. */
  3152. (void)lookup(nfdi_tree, " ");
  3153. if (verbose > 2)
  3154. tree_walk(nfdi_tree);
  3155. if (verbose > 2)
  3156. tree_walk(nfdicf_tree);
  3157. normalization_test();
  3158. write_file();
  3159. return 0;
  3160. }