fib_trie.c

// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *
 * Robert Olsson <[email protected]> Uppsala Universitet
 * & Swedish University of Agricultural Sciences.
 *
 * Jens Laas <[email protected]> Swedish University of
 * Agricultural Sciences.
 *
 * Hans Liss <[email protected]> Uppsala Universitet
 *
 * This work is based on the LPC-trie which is originally described in:
 *
 * An experimental study of compression methods for dynamic tries
 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
 * https://www.csc.kth.se/~snilsson/software/dyntrie2/
 *
 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
 *
 * Code from fib_hash has been reused which includes the following header:
 *
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system. INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 * IPv4 FIB: lookup engine and maintenance routines.
 *
 * Authors:     Alexey Kuznetsov, <[email protected]>
 *
 * Substantial contributions to this work come from:
 *
 * David S. Miller, <[email protected]>
 * Stephen Hemminger <[email protected]>
 * Paul E. McKenney <[email protected]>
 * Patrick McHardy <[email protected]>
 */
#include <linux/cache.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/notifier.h>
#include <net/net_namespace.h>
#include <net/inet_dscp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/fib_notifier.h>
#include <trace/events/fib.h>
#include "fib_lookup.h"

static int call_fib_entry_notifier(struct notifier_block *nb,
                                   enum fib_event_type event_type, u32 dst,
                                   int dst_len, struct fib_alias *fa,
                                   struct netlink_ext_ack *extack)
{
        struct fib_entry_notifier_info info = {
                .info.extack = extack,
                .dst = dst,
                .dst_len = dst_len,
                .fi = fa->fa_info,
                .dscp = fa->fa_dscp,
                .type = fa->fa_type,
                .tb_id = fa->tb_id,
        };

        return call_fib4_notifier(nb, event_type, &info.info);
}

static int call_fib_entry_notifiers(struct net *net,
                                    enum fib_event_type event_type, u32 dst,
                                    int dst_len, struct fib_alias *fa,
                                    struct netlink_ext_ack *extack)
{
        struct fib_entry_notifier_info info = {
                .info.extack = extack,
                .dst = dst,
                .dst_len = dst_len,
                .fi = fa->fa_info,
                .dscp = fa->fa_dscp,
                .type = fa->fa_type,
                .tb_id = fa->tb_id,
        };

        return call_fib4_notifiers(net, event_type, &info.info);
}

#define MAX_STAT_DEPTH 32

#define KEYLENGTH       (8*sizeof(t_key))
#define KEY_MAX         ((t_key)~0)

typedef unsigned int t_key;

#define IS_TRIE(n)      ((n)->pos >= KEYLENGTH)
#define IS_TNODE(n)     ((n)->bits)
#define IS_LEAF(n)      (!(n)->bits)

struct key_vector {
        t_key key;
        unsigned char pos;              /* 2log(KEYLENGTH) bits needed */
        unsigned char bits;             /* 2log(KEYLENGTH) bits needed */
        unsigned char slen;
        union {
                /* This list pointer is valid if (pos | bits) == 0 (LEAF) */
                struct hlist_head leaf;
                /* This array is valid if (pos | bits) > 0 (TNODE) */
                DECLARE_FLEX_ARRAY(struct key_vector __rcu *, tnode);
        };
};

struct tnode {
        struct rcu_head rcu;
        t_key empty_children;           /* KEYLENGTH bits needed */
        t_key full_children;            /* KEYLENGTH bits needed */
        struct key_vector __rcu *parent;
        struct key_vector kv[1];
#define tn_bits kv[0].bits
};

#define TNODE_SIZE(n)   offsetof(struct tnode, kv[0].tnode[n])
#define LEAF_SIZE       TNODE_SIZE(1)

#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats {
        unsigned int gets;
        unsigned int backtrack;
        unsigned int semantic_match_passed;
        unsigned int semantic_match_miss;
        unsigned int null_node_hit;
        unsigned int resize_node_skipped;
};
#endif

struct trie_stat {
        unsigned int totdepth;
        unsigned int maxdepth;
        unsigned int tnodes;
        unsigned int leaves;
        unsigned int nullpointers;
        unsigned int prefixes;
        unsigned int nodesizes[MAX_STAT_DEPTH];
};

struct trie {
        struct key_vector kv[1];
#ifdef CONFIG_IP_FIB_TRIE_STATS
        struct trie_use_stats __percpu *stats;
#endif
};

static struct key_vector *resize(struct trie *t, struct key_vector *tn);
static unsigned int tnode_free_size;

/*
 * synchronize_rcu after call_rcu for outstanding dirty memory; it should be
 * especially useful before resizing the root node with PREEMPT_NONE configs;
 * the value was obtained experimentally, aiming to avoid visible slowdown.
 */
unsigned int sysctl_fib_sync_mem = 512 * 1024;
unsigned int sysctl_fib_sync_mem_min = 64 * 1024;
unsigned int sysctl_fib_sync_mem_max = 64 * 1024 * 1024;

static struct kmem_cache *fn_alias_kmem __ro_after_init;
static struct kmem_cache *trie_leaf_kmem __ro_after_init;

static inline struct tnode *tn_info(struct key_vector *kv)
{
        return container_of(kv, struct tnode, kv[0]);
}

/* caller must hold RTNL */
#define node_parent(tn) rtnl_dereference(tn_info(tn)->parent)
#define get_child(tn, i) rtnl_dereference((tn)->tnode[i])

/* caller must hold RCU read lock or RTNL */
#define node_parent_rcu(tn) rcu_dereference_rtnl(tn_info(tn)->parent)
#define get_child_rcu(tn, i) rcu_dereference_rtnl((tn)->tnode[i])

/* wrapper for rcu_assign_pointer */
static inline void node_set_parent(struct key_vector *n, struct key_vector *tp)
{
        if (n)
                rcu_assign_pointer(tn_info(n)->parent, tp);
}

#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER(tn_info(n)->parent, p)

/* This provides us with the number of children in this node. In the case of
 * a leaf, this will return 0, meaning none of the children are accessible.
 */
static inline unsigned long child_length(const struct key_vector *tn)
{
        return (1ul << tn->bits) & ~(1ul);
}
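
/* Editor's worked example (illustrative, not from the original source):
 * for a leaf, bits == 0, so (1ul << 0) & ~1ul == 0 -- no child slots;
 * for a tnode with bits == 4, (1ul << 4) & ~1ul == 16 child slots.
 * The & ~(1ul) mask exists only to turn the leaf value 1 into 0 without
 * a branch; for any bits >= 1 the low bit is already clear.
 */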

#define get_cindex(key, kv) (((key) ^ (kv)->key) >> (kv)->pos)

static inline unsigned long get_index(t_key key, struct key_vector *kv)
{
        unsigned long index = key ^ kv->key;

        if ((BITS_PER_LONG <= KEYLENGTH) && (KEYLENGTH == kv->pos))
                return 0;

        return index >> kv->pos;
}

/* To understand this stuff, an understanding of keys and all their bits is
 * necessary. Every node in the trie has a key associated with it, but not
 * all of the bits in that key are significant.
 *
 * Consider a node 'n' and its parent 'tp'.
 *
 * If n is a leaf, every bit in its key is significant. Its presence is
 * necessitated by path compression, since during a tree traversal (when
 * searching for a leaf - unless we are doing an insertion) we will completely
 * ignore all skipped bits we encounter. Thus we need to verify, at the end of
 * a potentially successful search, that we have indeed been walking the
 * correct key path.
 *
 * Note that we can never "miss" the correct key in the tree if present by
 * following the wrong path. Path compression ensures that segments of the key
 * that are the same for all keys with a given prefix are skipped, but the
 * skipped part *is* identical for each node in the subtrie below the skipped
 * bit! trie_insert() in this implementation takes care of that.
 *
 * If n is an internal node - a 'tnode' here - the various parts of its key
 * have many different meanings.
 *
 * Example:
 * _________________________________________________________________
 * | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
 * -----------------------------------------------------------------
 *  31  30  29  28  27  26  25  24  23  22  21  20  19  18  17  16
 *
 * _________________________________________________________________
 * | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
 * -----------------------------------------------------------------
 *  15  14  13  12  11  10   9   8   7   6   5   4   3   2   1   0
 *
 * tp->pos = 22
 * tp->bits = 3
 * n->pos = 13
 * n->bits = 4
 *
 * First, let's just ignore the bits that come before the parent tp, that is
 * the bits from (tp->pos + tp->bits) to 31. They are *known* but at this
 * point we do not use them for anything.
 *
 * The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
 * index into the parent's child array. That is, they will be used to find
 * 'n' among tp's children.
 *
 * The bits from (n->pos + n->bits) to (tp->pos - 1) - "S" - are skipped bits
 * for the node n.
 *
 * All the bits we have seen so far are significant to the node n. The rest
 * of the bits are really not needed or indeed known in n->key.
 *
 * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
 * n's child array, and will of course be different for each child.
 *
 * The rest of the bits, from 0 to (n->pos - 1) - "u" - are completely unknown
 * at this point.
 */
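
/* Editor's worked example of the layout above (illustrative): with
 * tp->pos = 22 and tp->bits = 3, node n is located in tp's child array at
 * get_index(key, tp) = (key ^ tp->key) >> 22, i.e. the "N" bits 24..22.
 * The skipped "S" bits 21..17 are not examined on the way down; they are
 * only validated against n->key (or the leaf key) once the walk stops.
 * With n->pos = 13 and n->bits = 4, the "C" bits 16..13 then select one
 * of n's 16 children. Any disagreement in a skipped bit makes the shifted
 * XOR overflow the child index range, which is how lookups detect it.
 */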

static const int halve_threshold = 25;
static const int inflate_threshold = 50;
static const int halve_threshold_root = 15;
static const int inflate_threshold_root = 30;

static void __alias_free_mem(struct rcu_head *head)
{
        struct fib_alias *fa = container_of(head, struct fib_alias, rcu);

        kmem_cache_free(fn_alias_kmem, fa);
}

static inline void alias_free_mem_rcu(struct fib_alias *fa)
{
        call_rcu(&fa->rcu, __alias_free_mem);
}

#define TNODE_VMALLOC_MAX \
        ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *))

static void __node_free_rcu(struct rcu_head *head)
{
        struct tnode *n = container_of(head, struct tnode, rcu);

        if (!n->tn_bits)
                kmem_cache_free(trie_leaf_kmem, n);
        else
                kvfree(n);
}

#define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu)

static struct tnode *tnode_alloc(int bits)
{
        size_t size;

        /* verify bits is within bounds */
        if (bits > TNODE_VMALLOC_MAX)
                return NULL;

        /* determine size and verify it is non-zero and didn't overflow */
        size = TNODE_SIZE(1ul << bits);

        if (size <= PAGE_SIZE)
                return kzalloc(size, GFP_KERNEL);
        else
                return vzalloc(size);
}
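
/* Editor's note (illustrative; sizes assume a typical 64-bit build): the
 * allocation is TNODE_SIZE(0) header bytes plus (1ul << bits) child
 * pointers of 8 bytes each. With a header of roughly 40 bytes, bits == 8
 * needs about 2 KB and stays on the kzalloc() path, while bits == 9 needs
 * just over 4 KB and falls through to vzalloc() on 4 KB-page systems.
 */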

static inline void empty_child_inc(struct key_vector *n)
{
        tn_info(n)->empty_children++;

        if (!tn_info(n)->empty_children)
                tn_info(n)->full_children++;
}

static inline void empty_child_dec(struct key_vector *n)
{
        if (!tn_info(n)->empty_children)
                tn_info(n)->full_children--;

        tn_info(n)->empty_children--;
}
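
/* Editor's note on the carry trick above (illustrative): empty_children is
 * only KEYLENGTH bits wide, so for a tnode with bits == KEYLENGTH the true
 * count of 1ul << KEYLENGTH wraps to 0. empty_child_inc() detects the wrap
 * (the counter reads 0 after ++) and parks the carry in full_children;
 * empty_child_dec() borrows it back symmetrically. tnode_new() and
 * should_collapse() contain the matching special cases for this encoding.
 */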

static struct key_vector *leaf_new(t_key key, struct fib_alias *fa)
{
        struct key_vector *l;
        struct tnode *kv;

        kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
        if (!kv)
                return NULL;

        /* initialize key vector */
        l = kv->kv;
        l->key = key;
        l->pos = 0;
        l->bits = 0;
        l->slen = fa->fa_slen;

        /* link leaf to fib alias */
        INIT_HLIST_HEAD(&l->leaf);
        hlist_add_head(&fa->fa_list, &l->leaf);

        return l;
}

static struct key_vector *tnode_new(t_key key, int pos, int bits)
{
        unsigned int shift = pos + bits;
        struct key_vector *tn;
        struct tnode *tnode;

        /* verify that bits and pos have their MSBs clear and that the
         * resulting shift is valid
         */
        BUG_ON(!bits || (shift > KEYLENGTH));

        tnode = tnode_alloc(bits);
        if (!tnode)
                return NULL;

        pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0),
                 sizeof(struct key_vector *) << bits);

        if (bits == KEYLENGTH)
                tnode->full_children = 1;
        else
                tnode->empty_children = 1ul << bits;

        tn = tnode->kv;
        tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
        tn->pos = pos;
        tn->bits = bits;
        tn->slen = pos;

        return tn;
}

/* Check whether a tnode 'n' is "full", i.e. it is an internal node
 * and no bits are skipped. See discussion in dyntree paper p. 6
 */
static inline int tnode_full(struct key_vector *tn, struct key_vector *n)
{
        return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
}
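
/* Editor's worked example (illustrative): take a parent tn with
 * tn->pos = 13. A child tnode n with n->pos = 10 and n->bits = 3 has
 * n->pos + n->bits == 13 == tn->pos, so no key bits are skipped between
 * the two levels and n counts as "full". If instead n->pos + n->bits
 * were 11, bits 12..11 would be skipped and n would not be full; leaves
 * (bits == 0) are never full. full_children feeds should_inflate(),
 * since only full children are split and doubled by inflate().
 */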

/* Add a child at position i overwriting the old value.
 * Update the value of full_children and empty_children.
 */
static void put_child(struct key_vector *tn, unsigned long i,
                      struct key_vector *n)
{
        struct key_vector *chi = get_child(tn, i);
        int isfull, wasfull;

        BUG_ON(i >= child_length(tn));

        /* update emptyChildren, overflow into fullChildren */
        if (!n && chi)
                empty_child_inc(tn);
        if (n && !chi)
                empty_child_dec(tn);

        /* update fullChildren */
        wasfull = tnode_full(tn, chi);
        isfull = tnode_full(tn, n);

        if (wasfull && !isfull)
                tn_info(tn)->full_children--;
        else if (!wasfull && isfull)
                tn_info(tn)->full_children++;

        if (n && (tn->slen < n->slen))
                tn->slen = n->slen;

        rcu_assign_pointer(tn->tnode[i], n);
}

static void update_children(struct key_vector *tn)
{
        unsigned long i;

        /* update all of the child parent pointers */
        for (i = child_length(tn); i;) {
                struct key_vector *inode = get_child(tn, --i);

                if (!inode)
                        continue;

                /* Either update the children of a tnode that
                 * already belongs to us or update the child
                 * to point to ourselves.
                 */
                if (node_parent(inode) == tn)
                        update_children(inode);
                else
                        node_set_parent(inode, tn);
        }
}

static inline void put_child_root(struct key_vector *tp, t_key key,
                                  struct key_vector *n)
{
        if (IS_TRIE(tp))
                rcu_assign_pointer(tp->tnode[0], n);
        else
                put_child(tp, get_index(key, tp), n);
}

static inline void tnode_free_init(struct key_vector *tn)
{
        tn_info(tn)->rcu.next = NULL;
}

static inline void tnode_free_append(struct key_vector *tn,
                                     struct key_vector *n)
{
        tn_info(n)->rcu.next = tn_info(tn)->rcu.next;
        tn_info(tn)->rcu.next = &tn_info(n)->rcu;
}

static void tnode_free(struct key_vector *tn)
{
        struct callback_head *head = &tn_info(tn)->rcu;

        while (head) {
                head = head->next;
                tnode_free_size += TNODE_SIZE(1ul << tn->bits);
                node_free(tn);

                tn = container_of(head, struct tnode, rcu)->kv;
        }

        if (tnode_free_size >= READ_ONCE(sysctl_fib_sync_mem)) {
                tnode_free_size = 0;
                synchronize_rcu();
        }
}

static struct key_vector *replace(struct trie *t,
                                  struct key_vector *oldtnode,
                                  struct key_vector *tn)
{
        struct key_vector *tp = node_parent(oldtnode);
        unsigned long i;

        /* setup the parent pointer out of and back into this node */
        NODE_INIT_PARENT(tn, tp);
        put_child_root(tp, tn->key, tn);

        /* update all of the child parent pointers */
        update_children(tn);

        /* all pointers should be clean so we are done */
        tnode_free(oldtnode);

        /* resize children now that oldtnode is freed */
        for (i = child_length(tn); i;) {
                struct key_vector *inode = get_child(tn, --i);

                /* resize child node */
                if (tnode_full(tn, inode))
                        tn = resize(t, inode);
        }

        return tp;
}

static struct key_vector *inflate(struct trie *t,
                                  struct key_vector *oldtnode)
{
        struct key_vector *tn;
        unsigned long i;
        t_key m;

        pr_debug("In inflate\n");

        tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1);
        if (!tn)
                goto notnode;

        /* prepare oldtnode to be freed */
        tnode_free_init(oldtnode);

        /* Assemble all of the pointers in our cluster, in this case that
         * represents all of the pointers out of our allocated nodes that
         * point to existing tnodes and the links between our allocated
         * nodes.
         */
        for (i = child_length(oldtnode), m = 1u << tn->pos; i;) {
                struct key_vector *inode = get_child(oldtnode, --i);
                struct key_vector *node0, *node1;
                unsigned long j, k;

                /* An empty child */
                if (!inode)
                        continue;

                /* A leaf or an internal node with skipped bits */
                if (!tnode_full(oldtnode, inode)) {
                        put_child(tn, get_index(inode->key, tn), inode);
                        continue;
                }

                /* drop the node in the old tnode free list */
                tnode_free_append(oldtnode, inode);

                /* An internal node with two children */
                if (inode->bits == 1) {
                        put_child(tn, 2 * i + 1, get_child(inode, 1));
                        put_child(tn, 2 * i, get_child(inode, 0));
                        continue;
                }

                /* We will replace this node 'inode' with two new
                 * ones, 'node0' and 'node1', each with half of the
                 * original children. The two new nodes will have
                 * a position one bit further down the key and this
                 * means that the "significant" part of their keys
                 * (see the discussion near the top of this file)
                 * will differ by one bit, which will be "0" in
                 * node0's key and "1" in node1's key. Since we are
                 * moving the key position by one step, the bit that
                 * we are moving away from - the bit at position
                 * (tn->pos) - is the one that will differ between
                 * node0 and node1. So... we synthesize that bit in the
                 * two new keys.
                 */
                node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1);
                if (!node1)
                        goto nomem;
                node0 = tnode_new(inode->key, inode->pos, inode->bits - 1);

                tnode_free_append(tn, node1);
                if (!node0)
                        goto nomem;
                tnode_free_append(tn, node0);

                /* populate child pointers in new nodes */
                for (k = child_length(inode), j = k / 2; j;) {
                        put_child(node1, --j, get_child(inode, --k));
                        put_child(node0, j, get_child(inode, j));
                        put_child(node1, --j, get_child(inode, --k));
                        put_child(node0, j, get_child(inode, j));
                }

                /* link new nodes to parent */
                NODE_INIT_PARENT(node1, tn);
                NODE_INIT_PARENT(node0, tn);

                /* link parent to nodes */
                put_child(tn, 2 * i + 1, node1);
                put_child(tn, 2 * i, node0);
        }

        /* setup the parent pointers into and out of this node */
        return replace(t, oldtnode, tn);
nomem:
        /* all pointers should be clean so we are done */
        tnode_free(tn);
notnode:
        return NULL;
}

static struct key_vector *halve(struct trie *t,
                                struct key_vector *oldtnode)
{
        struct key_vector *tn;
        unsigned long i;

        pr_debug("In halve\n");

        tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1);
        if (!tn)
                goto notnode;

        /* prepare oldtnode to be freed */
        tnode_free_init(oldtnode);

        /* Assemble all of the pointers in our cluster, in this case that
         * represents all of the pointers out of our allocated nodes that
         * point to existing tnodes and the links between our allocated
         * nodes.
         */
        for (i = child_length(oldtnode); i;) {
                struct key_vector *node1 = get_child(oldtnode, --i);
                struct key_vector *node0 = get_child(oldtnode, --i);
                struct key_vector *inode;

                /* At least one of the children is empty */
                if (!node1 || !node0) {
                        put_child(tn, i / 2, node1 ? : node0);
                        continue;
                }

                /* Two nonempty children */
                inode = tnode_new(node0->key, oldtnode->pos, 1);
                if (!inode)
                        goto nomem;
                tnode_free_append(tn, inode);

                /* initialize pointers out of node */
                put_child(inode, 1, node1);
                put_child(inode, 0, node0);
                NODE_INIT_PARENT(inode, tn);

                /* link parent to node */
                put_child(tn, i / 2, inode);
        }

        /* setup the parent pointers into and out of this node */
        return replace(t, oldtnode, tn);
nomem:
        /* all pointers should be clean so we are done */
        tnode_free(tn);
notnode:
        return NULL;
}

static struct key_vector *collapse(struct trie *t,
                                   struct key_vector *oldtnode)
{
        struct key_vector *n, *tp;
        unsigned long i;

        /* scan the tnode looking for that one child that might still exist */
        for (n = NULL, i = child_length(oldtnode); !n && i;)
                n = get_child(oldtnode, --i);

        /* compress one level */
        tp = node_parent(oldtnode);
        put_child_root(tp, oldtnode->key, n);
        node_set_parent(n, tp);

        /* drop dead node */
        node_free(oldtnode);

        return tp;
}

static unsigned char update_suffix(struct key_vector *tn)
{
        unsigned char slen = tn->pos;
        unsigned long stride, i;
        unsigned char slen_max;

        /* only vector 0 can have a suffix length greater than or equal to
         * tn->pos + tn->bits; the second highest node will have a suffix
         * length of at most tn->pos + tn->bits - 1
         */
        slen_max = min_t(unsigned char, tn->pos + tn->bits - 1, tn->slen);

        /* search through the list of children looking for nodes that might
         * have a suffix greater than the one we currently have. This is
         * why we start with a stride of 2 since a stride of 1 would
         * represent the nodes with suffix length equal to tn->pos
         */
        for (i = 0, stride = 0x2ul; i < child_length(tn); i += stride) {
                struct key_vector *n = get_child(tn, i);

                if (!n || (n->slen <= slen))
                        continue;

                /* update stride and slen based on new value */
                stride <<= (n->slen - slen);
                slen = n->slen;
                i &= ~(stride - 1);

                /* stop searching if we have hit the maximum possible value */
                if (slen >= slen_max)
                        break;
        }

        tn->slen = slen;

        return slen;
}
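
/* Editor's note on the stride walk above (illustrative): a child holding a
 * suffix of length s > tn->pos must, by construction, have zeros in key
 * bits tn->pos..s-1, so it can only sit at a child index that is a
 * multiple of 1ul << (s - tn->pos). Once a child with slen == tn->pos + 2
 * has been seen, for example, only indexes that are multiples of 4 can
 * still improve on it, which is why stride doubles each time slen grows
 * by one and i is realigned with i &= ~(stride - 1).
 */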

/* From "Implementing a dynamic compressed trie" by Stefan Nilsson of
 * the Helsinki University of Technology and Matti Tikkanen of Nokia
 * Telecommunications, page 6:
 * "A node is doubled if the ratio of non-empty children to all
 * children in the *doubled* node is at least 'high'."
 *
 * 'high' in this instance is the variable 'inflate_threshold'. It
 * is expressed as a percentage, so we multiply it with
 * child_length() and instead of multiplying by 2 (since the
 * child array will be doubled by inflate()) and multiplying
 * the left-hand side by 100 (to handle the percentage thing) we
 * multiply the left-hand side by 50.
 *
 * The left-hand side may look a bit weird: child_length(tn)
 * - tn->empty_children is of course the number of non-null children
 * in the current node. tn->full_children is the number of "full"
 * children, that is non-null tnodes with a skip value of 0.
 * All of those will be doubled in the resulting inflated tnode, so
 * we just count them one extra time here.
 *
 * A clearer way to write this would be:
 *
 * to_be_doubled = tn->full_children;
 * not_to_be_doubled = child_length(tn) - tn->empty_children -
 *     tn->full_children;
 *
 * new_child_length = child_length(tn) * 2;
 *
 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
 *     new_child_length;
 * if (new_fill_factor >= inflate_threshold)
 *
 * ...and so on, though it would mess up the while () loop.
 *
 * anyway,
 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
 *     inflate_threshold
 *
 * avoid a division:
 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
 *     inflate_threshold * new_child_length
 *
 * expand not_to_be_doubled and to_be_doubled, and shorten:
 * 100 * (child_length(tn) - tn->empty_children +
 *     tn->full_children) >= inflate_threshold * new_child_length
 *
 * expand new_child_length:
 * 100 * (child_length(tn) - tn->empty_children +
 *     tn->full_children) >=
 *     inflate_threshold * child_length(tn) * 2
 *
 * shorten again:
 * 50 * (tn->full_children + child_length(tn) -
 *     tn->empty_children) >= inflate_threshold *
 *     child_length(tn)
 *
 */
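
/* Editor's worked example of the final inequality (illustrative): take a
 * non-root tnode with bits == 4, so child_length(tn) == 16, and suppose
 * empty_children == 4 and full_children == 6. The left-hand side is
 * 50 * (6 + 16 - 4) == 900 and the right-hand side is 50 * 16 == 800,
 * so the node is inflated: the doubled node would be filled to
 * 900 / (16 * 2) ~= 56%, above the 50% inflate_threshold.
 */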

static inline bool should_inflate(struct key_vector *tp, struct key_vector *tn)
{
        unsigned long used = child_length(tn);
        unsigned long threshold = used;

        /* Keep root node larger */
        threshold *= IS_TRIE(tp) ? inflate_threshold_root : inflate_threshold;
        used -= tn_info(tn)->empty_children;
        used += tn_info(tn)->full_children;

        /* if bits == KEYLENGTH then pos = 0, and will fail below */
        return (used > 1) && tn->pos && ((50 * used) >= threshold);
}

static inline bool should_halve(struct key_vector *tp, struct key_vector *tn)
{
        unsigned long used = child_length(tn);
        unsigned long threshold = used;

        /* Keep root node larger */
        threshold *= IS_TRIE(tp) ? halve_threshold_root : halve_threshold;
        used -= tn_info(tn)->empty_children;

        /* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */
        return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold);
}

static inline bool should_collapse(struct key_vector *tn)
{
        unsigned long used = child_length(tn);

        used -= tn_info(tn)->empty_children;

        /* account for bits == KEYLENGTH case */
        if ((tn->bits == KEYLENGTH) && tn_info(tn)->full_children)
                used -= KEY_MAX;

        /* One child or none, time to drop us from the trie */
        return used < 2;
}
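
/* Editor's worked examples (illustrative): for a non-root tnode with
 * bits == 4 (16 slots) and 14 empty children, should_halve() compares
 * 100 * 2 == 200 against 16 * 25 == 400 and halves the node. In
 * should_collapse(), a bits == KEYLENGTH node whose empty_children
 * counter wrapped to 0 with the carry parked in full_children (see
 * empty_child_inc() above) gets KEY_MAX subtracted so that 'used'
 * again reflects the true number of occupied slots.
 */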

#define MAX_WORK 10
static struct key_vector *resize(struct trie *t, struct key_vector *tn)
{
#ifdef CONFIG_IP_FIB_TRIE_STATS
        struct trie_use_stats __percpu *stats = t->stats;
#endif
        struct key_vector *tp = node_parent(tn);
        unsigned long cindex = get_index(tn->key, tp);
        int max_work = MAX_WORK;

        pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
                 tn, inflate_threshold, halve_threshold);

        /* track the tnode via the pointer from the parent instead of
         * doing it ourselves. This way we can let RCU fully do its
         * thing without us interfering
         */
        BUG_ON(tn != get_child(tp, cindex));

        /* Double as long as the resulting node has a number of
         * nonempty nodes that are above the threshold.
         */
        while (should_inflate(tp, tn) && max_work) {
                tp = inflate(t, tn);
                if (!tp) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
                        this_cpu_inc(stats->resize_node_skipped);
#endif
                        break;
                }

                max_work--;
                tn = get_child(tp, cindex);
        }

        /* update parent in case inflate failed */
        tp = node_parent(tn);

        /* Return if at least one inflate was run */
        if (max_work != MAX_WORK)
                return tp;

        /* Halve as long as the number of empty children in this
         * node is above threshold.
         */
        while (should_halve(tp, tn) && max_work) {
                tp = halve(t, tn);
                if (!tp) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
                        this_cpu_inc(stats->resize_node_skipped);
#endif
                        break;
                }

                max_work--;
                tn = get_child(tp, cindex);
        }

        /* Only one child remains */
        if (should_collapse(tn))
                return collapse(t, tn);

        /* update parent in case halve failed */
        return node_parent(tn);
}

static void node_pull_suffix(struct key_vector *tn, unsigned char slen)
{
        unsigned char node_slen = tn->slen;

        while ((node_slen > tn->pos) && (node_slen > slen)) {
                slen = update_suffix(tn);
                if (node_slen == slen)
                        break;

                tn = node_parent(tn);
                node_slen = tn->slen;
        }
}

static void node_push_suffix(struct key_vector *tn, unsigned char slen)
{
        while (tn->slen < slen) {
                tn->slen = slen;
                tn = node_parent(tn);
        }
}

/* rcu_read_lock needs to be held by the caller from the read side */
static struct key_vector *fib_find_node(struct trie *t,
                                        struct key_vector **tp, u32 key)
{
        struct key_vector *pn, *n = t->kv;
        unsigned long index = 0;

        do {
                pn = n;
                n = get_child_rcu(n, index);

                if (!n)
                        break;

                index = get_cindex(key, n);

                /* This bit of code is a bit tricky but it combines multiple
                 * checks into a single check. The prefix consists of the
                 * prefix plus zeros for the bits in the cindex. The index
                 * is the difference between the key and this value. From
                 * this we can actually derive several pieces of data.
                 *   if (index >= (1ul << bits))
                 *     we have a mismatch in skip bits and failed
                 *   else
                 *     we know the value is cindex
                 *
                 * This check is safe even if bits == KEYLENGTH due to the
                 * fact that we can only allocate a node with 32 bits if a
                 * long is greater than 32 bits.
                 */
                if (index >= (1ul << n->bits)) {
                        n = NULL;
                        break;
                }

                /* keep searching until we find a perfect match leaf or NULL */
        } while (IS_TNODE(n));

        *tp = pn;

        return n;
}
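
/* Editor's worked example of the combined check above (illustrative):
 * suppose n->pos = 13 and n->bits = 4, so n->key has bits 16..0 zeroed.
 * Then index = (key ^ n->key) >> 13. If key agrees with n->key on bits
 * 31..17 (including any skipped bits), index is just the child index
 * taken from bits 16..13 and is < 16. Any disagreement above bit 16
 * makes index >= 16, so one comparison against 1ul << n->bits validates
 * the skipped bits and extracts the child index at the same time.
 */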

/* Return the first fib alias matching DSCP with
 * priority less than or equal to PRIO.
 * If 'find_first' is set, return the first matching
 * fib alias, regardless of DSCP and priority.
 */
static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
                                        dscp_t dscp, u32 prio, u32 tb_id,
                                        bool find_first)
{
        struct fib_alias *fa;

        if (!fah)
                return NULL;

        hlist_for_each_entry(fa, fah, fa_list) {
                /* Avoid Sparse warning when using dscp_t in inequalities */
                u8 __fa_dscp = inet_dscp_to_dsfield(fa->fa_dscp);
                u8 __dscp = inet_dscp_to_dsfield(dscp);

                if (fa->fa_slen < slen)
                        continue;
                if (fa->fa_slen != slen)
                        break;
                if (fa->tb_id > tb_id)
                        continue;
                if (fa->tb_id != tb_id)
                        break;
                if (find_first)
                        return fa;
                if (__fa_dscp > __dscp)
                        continue;
                if (fa->fa_info->fib_priority >= prio || __fa_dscp < __dscp)
                        return fa;
        }

        return NULL;
}

static struct fib_alias *
fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri)
{
        u8 slen = KEYLENGTH - fri->dst_len;
        struct key_vector *l, *tp;
        struct fib_table *tb;
        struct fib_alias *fa;
        struct trie *t;

        tb = fib_get_table(net, fri->tb_id);
        if (!tb)
                return NULL;

        t = (struct trie *)tb->tb_data;
        l = fib_find_node(t, &tp, be32_to_cpu(fri->dst));
        if (!l)
                return NULL;

        hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
                if (fa->fa_slen == slen && fa->tb_id == fri->tb_id &&
                    fa->fa_dscp == fri->dscp && fa->fa_info == fri->fi &&
                    fa->fa_type == fri->type)
                        return fa;
        }

        return NULL;
}

void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri)
{
        u8 fib_notify_on_flag_change;
        struct fib_alias *fa_match;
        struct sk_buff *skb;
        int err;

        rcu_read_lock();

        fa_match = fib_find_matching_alias(net, fri);
        if (!fa_match)
                goto out;

        /* These are paired with the WRITE_ONCE() happening in this function.
         * The reason is that we are only protected by RCU at this point.
         */
        if (READ_ONCE(fa_match->offload) == fri->offload &&
            READ_ONCE(fa_match->trap) == fri->trap &&
            READ_ONCE(fa_match->offload_failed) == fri->offload_failed)
                goto out;

        WRITE_ONCE(fa_match->offload, fri->offload);
        WRITE_ONCE(fa_match->trap, fri->trap);

        fib_notify_on_flag_change = READ_ONCE(net->ipv4.sysctl_fib_notify_on_flag_change);

        /* 2 means send notifications only if offload_failed was changed. */
        if (fib_notify_on_flag_change == 2 &&
            READ_ONCE(fa_match->offload_failed) == fri->offload_failed)
                goto out;

        WRITE_ONCE(fa_match->offload_failed, fri->offload_failed);

        if (!fib_notify_on_flag_change)
                goto out;

        skb = nlmsg_new(fib_nlmsg_size(fa_match->fa_info), GFP_ATOMIC);
        if (!skb) {
                err = -ENOBUFS;
                goto errout;
        }

        err = fib_dump_info(skb, 0, 0, RTM_NEWROUTE, fri, 0);
        if (err < 0) {
                /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
                WARN_ON(err == -EMSGSIZE);
                kfree_skb(skb);
                goto errout;
        }

        rtnl_notify(skb, net, 0, RTNLGRP_IPV4_ROUTE, NULL, GFP_ATOMIC);
        goto out;

errout:
        rtnl_set_sk_err(net, RTNLGRP_IPV4_ROUTE, err);
out:
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(fib_alias_hw_flags_set);

static void trie_rebalance(struct trie *t, struct key_vector *tn)
{
        while (!IS_TRIE(tn))
                tn = resize(t, tn);
}

static int fib_insert_node(struct trie *t, struct key_vector *tp,
                           struct fib_alias *new, t_key key)
{
        struct key_vector *n, *l;

        l = leaf_new(key, new);
        if (!l)
                goto noleaf;

        /* retrieve child from parent node */
        n = get_child(tp, get_index(key, tp));

        /* Case 2: n is a LEAF or a TNODE and the key doesn't match.
         *
         * Add a new tnode here. The first tnode needs some special
         * handling and leaves us in position for handling as case 3.
         */
        if (n) {
                struct key_vector *tn;

                tn = tnode_new(key, __fls(key ^ n->key), 1);
                if (!tn)
                        goto notnode;

                /* initialize routes out of node */
                NODE_INIT_PARENT(tn, tp);
                put_child(tn, get_index(key, tn) ^ 1, n);

                /* start adding routes into the node */
                put_child_root(tp, key, tn);
                node_set_parent(n, tn);

                /* parent now has a NULL spot where the leaf can go */
                tp = tn;
        }

        /* Case 3: n is NULL, and will just insert a new leaf */
        node_push_suffix(tp, new->fa_slen);
        NODE_INIT_PARENT(l, tp);
        put_child_root(tp, key, l);
        trie_rebalance(t, tp);

        return 0;
notnode:
        node_free(l);
noleaf:
        return -ENOMEM;
}

static int fib_insert_alias(struct trie *t, struct key_vector *tp,
                            struct key_vector *l, struct fib_alias *new,
                            struct fib_alias *fa, t_key key)
{
        if (!l)
                return fib_insert_node(t, tp, new, key);

        if (fa) {
                hlist_add_before_rcu(&new->fa_list, &fa->fa_list);
        } else {
                struct fib_alias *last;

                hlist_for_each_entry(last, &l->leaf, fa_list) {
                        if (new->fa_slen < last->fa_slen)
                                break;
                        if ((new->fa_slen == last->fa_slen) &&
                            (new->tb_id > last->tb_id))
                                break;
                        fa = last;
                }

                if (fa)
                        hlist_add_behind_rcu(&new->fa_list, &fa->fa_list);
                else
                        hlist_add_head_rcu(&new->fa_list, &l->leaf);
        }

        /* if we added to the tail node then we need to update slen */
        if (l->slen < new->fa_slen) {
                l->slen = new->fa_slen;
                node_push_suffix(tp, new->fa_slen);
        }

        return 0;
}

static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack)
{
        if (plen > KEYLENGTH) {
                NL_SET_ERR_MSG(extack, "Invalid prefix length");
                return false;
        }

        if ((plen < KEYLENGTH) && (key << plen)) {
                NL_SET_ERR_MSG(extack,
                               "Invalid prefix for given prefix length");
                return false;
        }

        return true;
}
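
/* Editor's worked example (illustrative): for plen = 24 and
 * key = 0xC0A80001 (192.168.0.1), key << 24 == 0x01000000 != 0, so the
 * host bits are set and the prefix is rejected; key = 0xC0A80000 shifts
 * to 0 and passes. The (plen < KEYLENGTH) guard also keeps the shift
 * well-defined, since shifting a 32-bit value by 32 is undefined in C.
 */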

static void fib_remove_alias(struct trie *t, struct key_vector *tp,
                             struct key_vector *l, struct fib_alias *old);

/* Caller must hold RTNL. */
int fib_table_insert(struct net *net, struct fib_table *tb,
                     struct fib_config *cfg, struct netlink_ext_ack *extack)
{
        struct trie *t = (struct trie *)tb->tb_data;
        struct fib_alias *fa, *new_fa;
        struct key_vector *l, *tp;
        u16 nlflags = NLM_F_EXCL;
        struct fib_info *fi;
        u8 plen = cfg->fc_dst_len;
        u8 slen = KEYLENGTH - plen;
        dscp_t dscp;
        u32 key;
        int err;

        key = ntohl(cfg->fc_dst);

        if (!fib_valid_key_len(key, plen, extack))
                return -EINVAL;

        pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);

        fi = fib_create_info(cfg, extack);
        if (IS_ERR(fi)) {
                err = PTR_ERR(fi);
                goto err;
        }

        dscp = cfg->fc_dscp;
        l = fib_find_node(t, &tp, key);
        fa = l ? fib_find_alias(&l->leaf, slen, dscp, fi->fib_priority,
                                tb->tb_id, false) : NULL;

        /* Now fa, if non-NULL, points to the first fib alias
         * with the same keys [prefix,dscp,priority], if such a key already
         * exists, or to the node before which we will insert the new one.
         *
         * If fa is NULL, we will need to allocate a new one and
         * insert to the tail of the section matching the suffix length
         * of the new alias.
         */

        if (fa && fa->fa_dscp == dscp &&
            fa->fa_info->fib_priority == fi->fib_priority) {
                struct fib_alias *fa_first, *fa_match;

                err = -EEXIST;
                if (cfg->fc_nlflags & NLM_F_EXCL)
                        goto out;

                nlflags &= ~NLM_F_EXCL;

                /* We have 2 goals:
                 * 1. Find exact match for type, scope, fib_info to avoid
                 *    duplicate routes
                 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
                 */
                fa_match = NULL;
                fa_first = fa;
                hlist_for_each_entry_from(fa, fa_list) {
                        if ((fa->fa_slen != slen) ||
                            (fa->tb_id != tb->tb_id) ||
                            (fa->fa_dscp != dscp))
                                break;
                        if (fa->fa_info->fib_priority != fi->fib_priority)
                                break;
                        if (fa->fa_type == cfg->fc_type &&
                            fa->fa_info == fi) {
                                fa_match = fa;
                                break;
                        }
                }

                if (cfg->fc_nlflags & NLM_F_REPLACE) {
                        struct fib_info *fi_drop;
                        u8 state;

                        nlflags |= NLM_F_REPLACE;
                        fa = fa_first;
                        if (fa_match) {
                                if (fa == fa_match)
                                        err = 0;
                                goto out;
                        }
                        err = -ENOBUFS;
                        new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
                        if (!new_fa)
                                goto out;

                        fi_drop = fa->fa_info;
                        new_fa->fa_dscp = fa->fa_dscp;
                        new_fa->fa_info = fi;
                        new_fa->fa_type = cfg->fc_type;
                        state = fa->fa_state;
                        new_fa->fa_state = state & ~FA_S_ACCESSED;
                        new_fa->fa_slen = fa->fa_slen;
                        new_fa->tb_id = tb->tb_id;
                        new_fa->fa_default = -1;
                        new_fa->offload = 0;
                        new_fa->trap = 0;
                        new_fa->offload_failed = 0;

                        hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);

                        if (fib_find_alias(&l->leaf, fa->fa_slen, 0, 0,
                                           tb->tb_id, true) == new_fa) {
                                enum fib_event_type fib_event;

                                fib_event = FIB_EVENT_ENTRY_REPLACE;
                                err = call_fib_entry_notifiers(net, fib_event,
                                                               key, plen,
                                                               new_fa, extack);
                                if (err) {
                                        hlist_replace_rcu(&new_fa->fa_list,
                                                          &fa->fa_list);
                                        goto out_free_new_fa;
                                }
                        }

                        rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
                                  tb->tb_id, &cfg->fc_nlinfo, nlflags);

                        alias_free_mem_rcu(fa);

                        fib_release_info(fi_drop);
                        if (state & FA_S_ACCESSED)
                                rt_cache_flush(cfg->fc_nlinfo.nl_net);

                        goto succeeded;
                }
                /* Error if we find a perfect match which
                 * uses the same scope, type, and nexthop
                 * information.
                 */
                if (fa_match)
                        goto out;

                if (cfg->fc_nlflags & NLM_F_APPEND)
                        nlflags |= NLM_F_APPEND;
                else
                        fa = fa_first;
        }
        err = -ENOENT;
        if (!(cfg->fc_nlflags & NLM_F_CREATE))
                goto out;

        nlflags |= NLM_F_CREATE;
        err = -ENOBUFS;
        new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
        if (!new_fa)
                goto out;

        new_fa->fa_info = fi;
        new_fa->fa_dscp = dscp;
        new_fa->fa_type = cfg->fc_type;
        new_fa->fa_state = 0;
        new_fa->fa_slen = slen;
        new_fa->tb_id = tb->tb_id;
        new_fa->fa_default = -1;
        new_fa->offload = 0;
        new_fa->trap = 0;
        new_fa->offload_failed = 0;

        /* Insert new entry to the list. */
        err = fib_insert_alias(t, tp, l, new_fa, fa, key);
        if (err)
                goto out_free_new_fa;

        /* The alias was already inserted, so the node must exist. */
        l = l ? l : fib_find_node(t, &tp, key);
        if (WARN_ON_ONCE(!l)) {
                err = -ENOENT;
                goto out_free_new_fa;
        }

        if (fib_find_alias(&l->leaf, new_fa->fa_slen, 0, 0, tb->tb_id, true) ==
            new_fa) {
                enum fib_event_type fib_event;

                fib_event = FIB_EVENT_ENTRY_REPLACE;
                err = call_fib_entry_notifiers(net, fib_event, key, plen,
                                               new_fa, extack);
                if (err)
                        goto out_remove_new_fa;
        }

        if (!plen)
                tb->tb_num_default++;

        rt_cache_flush(cfg->fc_nlinfo.nl_net);
        rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
                  &cfg->fc_nlinfo, nlflags);
succeeded:
        return 0;

out_remove_new_fa:
        fib_remove_alias(t, tp, l, new_fa);
out_free_new_fa:
        kmem_cache_free(fn_alias_kmem, new_fa);
out:
        fib_release_info(fi);
err:
        return err;
}

static inline t_key prefix_mismatch(t_key key, struct key_vector *n)
{
        t_key prefix = n->key;

        return (key ^ prefix) & (prefix | -prefix);
}
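
/* Editor's worked example (illustrative): for n->key = 0xC0A80000 the
 * lowest set bit is bit 19, so -prefix = 0x3F580000 and
 * (prefix | -prefix) = 0xFFF80000, a mask of bits 31..19. The function
 * therefore returns nonzero iff the lookup key disagrees with the node's
 * prefix somewhere at or above the prefix's lowest set bit, which is the
 * region that must match for the path taken so far to be valid.
 */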
  1196. bool fib_lookup_good_nhc(const struct fib_nh_common *nhc, int fib_flags,
  1197. const struct flowi4 *flp)
  1198. {
  1199. if (nhc->nhc_flags & RTNH_F_DEAD)
  1200. return false;
  1201. if (ip_ignore_linkdown(nhc->nhc_dev) &&
  1202. nhc->nhc_flags & RTNH_F_LINKDOWN &&
  1203. !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
  1204. return false;
  1205. if (flp->flowi4_oif && flp->flowi4_oif != nhc->nhc_oif)
  1206. return false;
  1207. return true;
  1208. }
/* should be called with rcu_read_lock */
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
		     struct fib_result *res, int fib_flags)
{
	struct trie *t = (struct trie *) tb->tb_data;
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats __percpu *stats = t->stats;
#endif
	const t_key key = ntohl(flp->daddr);
	struct key_vector *n, *pn;
	struct fib_alias *fa;
	unsigned long index;
	t_key cindex;

	pn = t->kv;
	cindex = 0;

	n = get_child_rcu(pn, cindex);
	if (!n) {
		trace_fib_table_lookup(tb->tb_id, flp, NULL, -EAGAIN);
		return -EAGAIN;
	}

#ifdef CONFIG_IP_FIB_TRIE_STATS
	this_cpu_inc(stats->gets);
#endif

	/* Step 1: Travel to the longest prefix match in the trie */
	for (;;) {
		index = get_cindex(key, n);

		/* This bit of code is a bit tricky but it combines multiple
		 * checks into a single check. The node's key consists of the
		 * prefix bits plus zeros for the bits below them. The index
		 * is the difference between the key and this value. From
		 * this we can actually derive several pieces of data.
		 *	if (index >= (1ul << bits))
		 *		we have a mismatch in skip bits and failed
		 *	else
		 *		we know the value is cindex
		 *
		 * This check is safe even if bits == KEYLENGTH due to the
		 * fact that we can only allocate a node with 32 bits if a
		 * long is greater than 32 bits.
		 */
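		/* An illustrative example (not part of the original source):
		 * for a node with key 0xc0a80000, pos 16 and bits 2, the
		 * lookup key 0xc0a90000 yields index 0x00010000 >> 16 == 1,
		 * a valid child slot, while 0xc1a80000 yields 0x100, which
		 * is >= (1ul << 2) and therefore a skip-bit mismatch.
		 */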
		if (index >= (1ul << n->bits))
			break;

		/* we have found a leaf. Prefixes have already been compared */
		if (IS_LEAF(n))
			goto found;

		/* only record pn and cindex if we are going to be chopping
		 * bits later. Otherwise we are just wasting cycles.
		 */
		if (n->slen > n->pos) {
			pn = n;
			cindex = index;
		}

		n = get_child_rcu(n, index);
		if (unlikely(!n))
			goto backtrace;
	}

	/* Step 2: Sort out leaves and begin backtracing for longest prefix */
	for (;;) {
		/* record the pointer where our next node pointer is stored */
		struct key_vector __rcu **cptr = n->tnode;

		/* This test verifies that none of the bits that differ
		 * between the key and the prefix exist in the region of
		 * the lsb and higher in the prefix.
		 */
		if (unlikely(prefix_mismatch(key, n)) || (n->slen == n->pos))
			goto backtrace;

		/* exit out and process leaf */
		if (unlikely(IS_LEAF(n)))
			break;

		/* Don't bother recording parent info. Since we are in
		 * prefix match mode we will have to come back to wherever
		 * we started this traversal anyway
		 */
		while ((n = rcu_dereference(*cptr)) == NULL) {
backtrace:
#ifdef CONFIG_IP_FIB_TRIE_STATS
			if (!n)
				this_cpu_inc(stats->null_node_hit);
#endif
			/* If we are at cindex 0 there are no more bits for
			 * us to strip at this level so we must ascend back
			 * up one level to see if there are any more bits to
			 * be stripped there.
			 */
			while (!cindex) {
				t_key pkey = pn->key;

				/* If we don't have a parent then there is
				 * nothing for us to do as we do not have any
				 * further nodes to parse.
				 */
				if (IS_TRIE(pn)) {
					trace_fib_table_lookup(tb->tb_id, flp,
							       NULL, -EAGAIN);
					return -EAGAIN;
				}
#ifdef CONFIG_IP_FIB_TRIE_STATS
				this_cpu_inc(stats->backtrack);
#endif
				/* Get Child's index */
				pn = node_parent_rcu(pn);
				cindex = get_index(pkey, pn);
			}

			/* strip the least significant bit from the cindex */
			cindex &= cindex - 1;

			/* grab pointer for next child node */
			cptr = &pn->tnode[cindex];
		}
	}

found:
	/* this line carries forward the xor from earlier in the function */
	index = key ^ n->key;

	/* Step 3: Process the leaf, if that fails fall back to backtracing */
	hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
		struct fib_info *fi = fa->fa_info;
		struct fib_nh_common *nhc;
		int nhsel, err;

		if ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen < KEYLENGTH)) {
			if (index >= (1ul << fa->fa_slen))
				continue;
		}
		if (fa->fa_dscp &&
		    inet_dscp_to_dsfield(fa->fa_dscp) != flp->flowi4_tos)
			continue;
		/* Paired with WRITE_ONCE() in fib_release_info() */
		if (READ_ONCE(fi->fib_dead))
			continue;
		if (fa->fa_info->fib_scope < flp->flowi4_scope)
			continue;
		fib_alias_accessed(fa);
		err = fib_props[fa->fa_type].error;
		if (unlikely(err < 0)) {
out_reject:
#ifdef CONFIG_IP_FIB_TRIE_STATS
			this_cpu_inc(stats->semantic_match_passed);
#endif
			trace_fib_table_lookup(tb->tb_id, flp, NULL, err);
			return err;
		}
		if (fi->fib_flags & RTNH_F_DEAD)
			continue;

		if (unlikely(fi->nh)) {
			if (nexthop_is_blackhole(fi->nh)) {
				err = fib_props[RTN_BLACKHOLE].error;
				goto out_reject;
			}

			nhc = nexthop_get_nhc_lookup(fi->nh, fib_flags, flp,
						     &nhsel);
			if (nhc)
				goto set_result;
			goto miss;
		}

		for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
			nhc = fib_info_nhc(fi, nhsel);

			if (!fib_lookup_good_nhc(nhc, fib_flags, flp))
				continue;
set_result:
			if (!(fib_flags & FIB_LOOKUP_NOREF))
				refcount_inc(&fi->fib_clntref);

			res->prefix = htonl(n->key);
			res->prefixlen = KEYLENGTH - fa->fa_slen;
			res->nh_sel = nhsel;
			res->nhc = nhc;
			res->type = fa->fa_type;
			res->scope = fi->fib_scope;
			res->fi = fi;
			res->table = tb;
			res->fa_head = &n->leaf;
#ifdef CONFIG_IP_FIB_TRIE_STATS
			this_cpu_inc(stats->semantic_match_passed);
#endif
			trace_fib_table_lookup(tb->tb_id, flp, nhc, err);

			return err;
		}
	}
miss:
#ifdef CONFIG_IP_FIB_TRIE_STATS
	this_cpu_inc(stats->semantic_match_miss);
#endif
	goto backtrace;
}
EXPORT_SYMBOL_GPL(fib_table_lookup);
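
/* Removing an alias has three possible outcomes: the leaf becomes empty
 * and is freed (with suffix lengths fixed up during trie_rebalance), the
 * removed alias was not the tail of the list and nothing else changes,
 * or the removed alias was the tail and the leaf's suffix length must be
 * recomputed from the new tail.
 */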
static void fib_remove_alias(struct trie *t, struct key_vector *tp,
			     struct key_vector *l, struct fib_alias *old)
{
	/* record the location of the previous fib_alias entry */
	struct hlist_node **pprev = old->fa_list.pprev;
	struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next);

	/* remove the fib_alias from the list */
	hlist_del_rcu(&old->fa_list);

	/* if we emptied the list this leaf will be freed and we can sort
	 * out parent suffix lengths as a part of trie_rebalance
	 */
	if (hlist_empty(&l->leaf)) {
		if (tp->slen == l->slen)
			node_pull_suffix(tp, tp->pos);
		put_child_root(tp, l->key, NULL);
		node_free(l);
		trie_rebalance(t, tp);
		return;
	}

	/* only access fa if old was the tail, i.e. pprev now points at the
	 * last valid hlist_node
	 */
	if (*pprev)
		return;

	/* update the trie with the latest suffix length */
	l->slen = fa->fa_slen;
	node_pull_suffix(tp, fa->fa_slen);
}
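
/* Notifier semantics for a deletion: if the alias being removed is the
 * one listeners currently see (the first match for its slen and table),
 * either announce a REPLACE with the next equivalent alias that takes
 * over, or a DEL when no such alias exists. Deleting any other alias is
 * invisible to listeners, so no event is sent.
 */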
static void fib_notify_alias_delete(struct net *net, u32 key,
				    struct hlist_head *fah,
				    struct fib_alias *fa_to_delete,
				    struct netlink_ext_ack *extack)
{
	struct fib_alias *fa_next, *fa_to_notify;
	u32 tb_id = fa_to_delete->tb_id;
	u8 slen = fa_to_delete->fa_slen;
	enum fib_event_type fib_event;

	/* Do not notify if we do not care about the route. */
	if (fib_find_alias(fah, slen, 0, 0, tb_id, true) != fa_to_delete)
		return;

	/* Determine if the route should be replaced by the next route in the
	 * list.
	 */
	fa_next = hlist_entry_safe(fa_to_delete->fa_list.next,
				   struct fib_alias, fa_list);
	if (fa_next && fa_next->fa_slen == slen && fa_next->tb_id == tb_id) {
		fib_event = FIB_EVENT_ENTRY_REPLACE;
		fa_to_notify = fa_next;
	} else {
		fib_event = FIB_EVENT_ENTRY_DEL;
		fa_to_notify = fa_to_delete;
	}
	call_fib_entry_notifiers(net, fib_event, key, KEYLENGTH - slen,
				 fa_to_notify, extack);
}
/* Caller must hold RTNL. */
int fib_table_delete(struct net *net, struct fib_table *tb,
		     struct fib_config *cfg, struct netlink_ext_ack *extack)
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct fib_alias *fa, *fa_to_delete;
	struct key_vector *l, *tp;
	u8 plen = cfg->fc_dst_len;
	u8 slen = KEYLENGTH - plen;
	dscp_t dscp;
	u32 key;

	key = ntohl(cfg->fc_dst);

	if (!fib_valid_key_len(key, plen, extack))
		return -EINVAL;

	l = fib_find_node(t, &tp, key);
	if (!l)
		return -ESRCH;

	dscp = cfg->fc_dscp;
	fa = fib_find_alias(&l->leaf, slen, dscp, 0, tb->tb_id, false);
	if (!fa)
		return -ESRCH;

	pr_debug("Deleting %08x/%d dsfield=0x%02x t=%p\n", key, plen,
		 inet_dscp_to_dsfield(dscp), t);

	fa_to_delete = NULL;
	hlist_for_each_entry_from(fa, fa_list) {
		struct fib_info *fi = fa->fa_info;

		if ((fa->fa_slen != slen) ||
		    (fa->tb_id != tb->tb_id) ||
		    (fa->fa_dscp != dscp))
			break;

		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
		     fa->fa_info->fib_scope == cfg->fc_scope) &&
		    (!cfg->fc_prefsrc ||
		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
		    (!cfg->fc_protocol ||
		     fi->fib_protocol == cfg->fc_protocol) &&
		    fib_nh_match(net, cfg, fi, extack) == 0 &&
		    fib_metrics_match(cfg, fi)) {
			fa_to_delete = fa;
			break;
		}
	}

	if (!fa_to_delete)
		return -ESRCH;

	fib_notify_alias_delete(net, key, &l->leaf, fa_to_delete, extack);
	rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
		  &cfg->fc_nlinfo, 0);

	if (!plen)
		tb->tb_num_default--;

	fib_remove_alias(t, tp, l, fa_to_delete);

	if (fa_to_delete->fa_state & FA_S_ACCESSED)
		rt_cache_flush(cfg->fc_nlinfo.nl_net);

	fib_release_info(fa_to_delete->fa_info);
	alias_free_mem_rcu(fa_to_delete);
	return 0;
}

/* Scan for the next leaf starting at the provided key value */
static struct key_vector *leaf_walk_rcu(struct key_vector **tn, t_key key)
{
	struct key_vector *pn, *n = *tn;
	unsigned long cindex;

	/* this loop is meant to try and find the key in the trie */
	do {
		/* record parent and next child index */
		pn = n;
		cindex = (key > pn->key) ? get_index(key, pn) : 0;

		if (cindex >> pn->bits)
			break;

		/* descend into the next child */
		n = get_child_rcu(pn, cindex++);
		if (!n)
			break;

		/* guarantee forward progress on the keys */
		if (IS_LEAF(n) && (n->key >= key))
			goto found;
	} while (IS_TNODE(n));

	/* this loop will search for the next leaf with a greater key */
	while (!IS_TRIE(pn)) {
		/* if we exhausted the parent node we will need to climb */
		if (cindex >= (1ul << pn->bits)) {
			t_key pkey = pn->key;

			pn = node_parent_rcu(pn);
			cindex = get_index(pkey, pn) + 1;
			continue;
		}

		/* grab the next available node */
		n = get_child_rcu(pn, cindex++);
		if (!n)
			continue;

		/* no need to compare keys since we bumped the index */
		if (IS_LEAF(n))
			goto found;

		/* restart scanning in the new node */
		pn = n;
		cindex = 0;
	}

	*tn = pn;
	return NULL; /* Root of trie */
found:
	/* if we are at the limit for keys just return NULL for the tnode */
	*tn = pn;
	return n;
}
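
/* A sketch of the iteration pattern used by the callers of
 * leaf_walk_rcu() below: visit the leaves in ascending key order,
 * resuming just past each leaf's key, and stop once incrementing the
 * key wraps back to zero.
 *
 *	struct key_vector *l, *tp = t->kv;
 *	t_key key = 0;
 *
 *	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
 *		... process leaf l ...
 *		key = l->key + 1;
 *		if (key < l->key)
 *			break;	(key wrapped, the walk is complete)
 *	}
 */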
static void fib_trie_free(struct fib_table *tb)
{
	struct trie *t = (struct trie *)tb->tb_data;
	struct key_vector *pn = t->kv;
	unsigned long cindex = 1;
	struct hlist_node *tmp;
	struct fib_alias *fa;

	/* walk trie in reverse order and free everything */
	for (;;) {
		struct key_vector *n;

		if (!(cindex--)) {
			t_key pkey = pn->key;

			if (IS_TRIE(pn))
				break;

			n = pn;
			pn = node_parent(pn);

			/* drop emptied tnode */
			put_child_root(pn, n->key, NULL);
			node_free(n);

			cindex = get_index(pkey, pn);

			continue;
		}

		/* grab the next available node */
		n = get_child(pn, cindex);
		if (!n)
			continue;

		if (IS_TNODE(n)) {
			/* record pn and cindex for leaf walking */
			pn = n;
			cindex = 1ul << n->bits;

			continue;
		}

		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
			hlist_del_rcu(&fa->fa_list);
			alias_free_mem_rcu(fa);
		}

		put_child_root(pn, n->key, NULL);
		node_free(n);
	}

#ifdef CONFIG_IP_FIB_TRIE_STATS
	free_percpu(t->stats);
#endif
	kfree(tb);
}

struct fib_table *fib_trie_unmerge(struct fib_table *oldtb)
{
	struct trie *ot = (struct trie *)oldtb->tb_data;
	struct key_vector *l, *tp = ot->kv;
	struct fib_table *local_tb;
	struct fib_alias *fa;
	struct trie *lt;
	t_key key = 0;

	if (oldtb->tb_data == oldtb->__data)
		return oldtb;

	local_tb = fib_trie_table(RT_TABLE_LOCAL, NULL);
	if (!local_tb)
		return NULL;

	lt = (struct trie *)local_tb->tb_data;

	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
		struct key_vector *local_l = NULL, *local_tp;

		hlist_for_each_entry(fa, &l->leaf, fa_list) {
			struct fib_alias *new_fa;

			if (local_tb->tb_id != fa->tb_id)
				continue;

			/* clone fa for new local table */
			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
			if (!new_fa)
				goto out;

			memcpy(new_fa, fa, sizeof(*fa));

			/* insert clone into table */
			if (!local_l)
				local_l = fib_find_node(lt, &local_tp, l->key);

			if (fib_insert_alias(lt, local_tp, local_l, new_fa,
					     NULL, l->key)) {
				kmem_cache_free(fn_alias_kmem, new_fa);
				goto out;
			}
		}

		/* stop loop if key wrapped back to 0 */
		key = l->key + 1;
		if (key < l->key)
			break;
	}

	return local_tb;
out:
	fib_trie_free(local_tb);

	return NULL;
}
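
/* After fib_trie_unmerge() has cloned the local-table aliases into a
 * trie of their own, this pass appears to be the complement: it walks
 * the previously shared trie, strips every alias that does not belong
 * to the table itself, resizes the touched nodes and prunes leaves that
 * become empty.
 */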
/* Caller must hold RTNL */
void fib_table_flush_external(struct fib_table *tb)
{
	struct trie *t = (struct trie *)tb->tb_data;
	struct key_vector *pn = t->kv;
	unsigned long cindex = 1;
	struct hlist_node *tmp;
	struct fib_alias *fa;

	/* walk trie in reverse order */
	for (;;) {
		unsigned char slen = 0;
		struct key_vector *n;

		if (!(cindex--)) {
			t_key pkey = pn->key;

			/* cannot resize the trie vector */
			if (IS_TRIE(pn))
				break;

			/* update the suffix to address pulled leaves */
			if (pn->slen > pn->pos)
				update_suffix(pn);

			/* resize completed node */
			pn = resize(t, pn);
			cindex = get_index(pkey, pn);

			continue;
		}

		/* grab the next available node */
		n = get_child(pn, cindex);
		if (!n)
			continue;

		if (IS_TNODE(n)) {
			/* record pn and cindex for leaf walking */
			pn = n;
			cindex = 1ul << n->bits;

			continue;
		}

		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
			/* if alias was cloned to local then we just
			 * need to remove the local copy from main
			 */
			if (tb->tb_id != fa->tb_id) {
				hlist_del_rcu(&fa->fa_list);
				alias_free_mem_rcu(fa);
				continue;
			}

			/* record local slen */
			slen = fa->fa_slen;
		}

		/* update leaf slen */
		n->slen = slen;

		if (hlist_empty(&n->leaf)) {
			put_child_root(pn, n->key, NULL);
			node_free(n);
		}
	}
}

/* Caller must hold RTNL. */
int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all)
{
	struct trie *t = (struct trie *)tb->tb_data;
	struct key_vector *pn = t->kv;
	unsigned long cindex = 1;
	struct hlist_node *tmp;
	struct fib_alias *fa;
	int found = 0;

	/* walk trie in reverse order */
	for (;;) {
		unsigned char slen = 0;
		struct key_vector *n;

		if (!(cindex--)) {
			t_key pkey = pn->key;

			/* cannot resize the trie vector */
			if (IS_TRIE(pn))
				break;

			/* update the suffix to address pulled leaves */
			if (pn->slen > pn->pos)
				update_suffix(pn);

			/* resize completed node */
			pn = resize(t, pn);
			cindex = get_index(pkey, pn);

			continue;
		}

		/* grab the next available node */
		n = get_child(pn, cindex);
		if (!n)
			continue;

		if (IS_TNODE(n)) {
			/* record pn and cindex for leaf walking */
			pn = n;
			cindex = 1ul << n->bits;

			continue;
		}

		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
			struct fib_info *fi = fa->fa_info;

			if (!fi || tb->tb_id != fa->tb_id ||
			    (!(fi->fib_flags & RTNH_F_DEAD) &&
			     !fib_props[fa->fa_type].error)) {
				slen = fa->fa_slen;
				continue;
			}

			/* Do not flush error routes if network namespace is
			 * not being dismantled
			 */
			if (!flush_all && fib_props[fa->fa_type].error) {
				slen = fa->fa_slen;
				continue;
			}

			fib_notify_alias_delete(net, n->key, &n->leaf, fa,
						NULL);
			hlist_del_rcu(&fa->fa_list);
			fib_release_info(fa->fa_info);
			alias_free_mem_rcu(fa);
			found++;
		}

		/* update leaf slen */
		n->slen = slen;

		if (hlist_empty(&n->leaf)) {
			put_child_root(pn, n->key, NULL);
			node_free(n);
		}
	}

	pr_debug("trie_flush found=%d\n", found);
	return found;
}

/* derived from fib_trie_free */
static void __fib_info_notify_update(struct net *net, struct fib_table *tb,
				     struct nl_info *info)
{
	struct trie *t = (struct trie *)tb->tb_data;
	struct key_vector *pn = t->kv;
	unsigned long cindex = 1;
	struct fib_alias *fa;

	for (;;) {
		struct key_vector *n;

		if (!(cindex--)) {
			t_key pkey = pn->key;

			if (IS_TRIE(pn))
				break;

			pn = node_parent(pn);
			cindex = get_index(pkey, pn);
			continue;
		}

		/* grab the next available node */
		n = get_child(pn, cindex);
		if (!n)
			continue;

		if (IS_TNODE(n)) {
			/* record pn and cindex for leaf walking */
			pn = n;
			cindex = 1ul << n->bits;
			continue;
		}

		hlist_for_each_entry(fa, &n->leaf, fa_list) {
			struct fib_info *fi = fa->fa_info;

			if (!fi || !fi->nh_updated || fa->tb_id != tb->tb_id)
				continue;

			rtmsg_fib(RTM_NEWROUTE, htonl(n->key), fa,
				  KEYLENGTH - fa->fa_slen, tb->tb_id,
				  info, NLM_F_REPLACE);
		}
	}
}

void fib_info_notify_update(struct net *net, struct nl_info *info)
{
	unsigned int h;

	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;

		hlist_for_each_entry_rcu(tb, head, tb_hlist,
					 lockdep_rtnl_is_held())
			__fib_info_notify_update(net, tb, info);
	}
}

static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb,
			   struct notifier_block *nb,
			   struct netlink_ext_ack *extack)
{
	struct fib_alias *fa;
	int last_slen = -1;
	int err;

	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
		struct fib_info *fi = fa->fa_info;

		if (!fi)
			continue;

		/* local and main table can share the same trie,
		 * so don't notify twice for the same entry.
		 */
		if (tb->tb_id != fa->tb_id)
			continue;

		if (fa->fa_slen == last_slen)
			continue;

		last_slen = fa->fa_slen;
		err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_REPLACE,
					      l->key, KEYLENGTH - fa->fa_slen,
					      fa, extack);
		if (err)
			return err;
	}
	return 0;
}

static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb,
			    struct netlink_ext_ack *extack)
{
	struct trie *t = (struct trie *)tb->tb_data;
	struct key_vector *l, *tp = t->kv;
	t_key key = 0;
	int err;

	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
		err = fib_leaf_notify(l, tb, nb, extack);
		if (err)
			return err;

		key = l->key + 1;
		/* stop in case of wrap around */
		if (key < l->key)
			break;
	}
	return 0;
}

int fib_notify(struct net *net, struct notifier_block *nb,
	       struct netlink_ext_ack *extack)
{
	unsigned int h;
	int err;

	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;

		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
			err = fib_table_notify(tb, nb, extack);
			if (err)
				return err;
		}
	}
	return 0;
}

static void __trie_free_rcu(struct rcu_head *head)
{
	struct fib_table *tb = container_of(head, struct fib_table, rcu);
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie *t = (struct trie *)tb->tb_data;

	if (tb->tb_data == tb->__data)
		free_percpu(t->stats);
#endif /* CONFIG_IP_FIB_TRIE_STATS */
	kfree(tb);
}

void fib_free_table(struct fib_table *tb)
{
	call_rcu(&tb->rcu, __trie_free_rcu);
}
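
/* The netlink dump below is resumable. fn_trie_dump_leaf() keeps its
 * cursor in cb->args[4] (index of the alias within the current leaf)
 * and cb->args[5] (progress within that alias: the route itself plus
 * any dumped exceptions), while fib_table_dump() keeps the leaf count
 * in cb->args[2] and the next key in cb->args[3].
 */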
static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
			     struct sk_buff *skb, struct netlink_callback *cb,
			     struct fib_dump_filter *filter)
{
	unsigned int flags = NLM_F_MULTI;
	__be32 xkey = htonl(l->key);
	int i, s_i, i_fa, s_fa, err;
	struct fib_alias *fa;

	if (filter->filter_set ||
	    !filter->dump_exceptions || !filter->dump_routes)
		flags |= NLM_F_DUMP_FILTERED;

	s_i = cb->args[4];
	s_fa = cb->args[5];
	i = 0;

	/* rcu_read_lock is held by caller */
	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
		struct fib_info *fi = fa->fa_info;

		if (i < s_i)
			goto next;

		i_fa = 0;

		if (tb->tb_id != fa->tb_id)
			goto next;

		if (filter->filter_set) {
			if (filter->rt_type && fa->fa_type != filter->rt_type)
				goto next;

			if ((filter->protocol &&
			     fi->fib_protocol != filter->protocol))
				goto next;

			if (filter->dev &&
			    !fib_info_nh_uses_dev(fi, filter->dev))
				goto next;
		}

		if (filter->dump_routes) {
			if (!s_fa) {
				struct fib_rt_info fri;

				fri.fi = fi;
				fri.tb_id = tb->tb_id;
				fri.dst = xkey;
				fri.dst_len = KEYLENGTH - fa->fa_slen;
				fri.dscp = fa->fa_dscp;
				fri.type = fa->fa_type;
				fri.offload = READ_ONCE(fa->offload);
				fri.trap = READ_ONCE(fa->trap);
				fri.offload_failed = READ_ONCE(fa->offload_failed);
				err = fib_dump_info(skb,
						    NETLINK_CB(cb->skb).portid,
						    cb->nlh->nlmsg_seq,
						    RTM_NEWROUTE, &fri, flags);
				if (err < 0)
					goto stop;
			}

			i_fa++;
		}

		if (filter->dump_exceptions) {
			err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi,
						 &i_fa, s_fa, flags);
			if (err < 0)
				goto stop;
		}

next:
		i++;
	}

	cb->args[4] = i;
	return skb->len;

stop:
	cb->args[4] = i;
	cb->args[5] = i_fa;
	return err;
}

/* rcu_read_lock must be held by the caller on the read side */
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
		   struct netlink_callback *cb, struct fib_dump_filter *filter)
{
	struct trie *t = (struct trie *)tb->tb_data;
	struct key_vector *l, *tp = t->kv;
	/* Dump starting at last key.
	 * Note: 0.0.0.0/0 (ie default) is first key.
	 */
	int count = cb->args[2];
	t_key key = cb->args[3];

	/* First time here, count and key are both always 0. Count > 0
	 * and key == 0 means the dump has wrapped around and we are done.
	 */
	if (count && !key)
		return skb->len;

	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
		int err;

		err = fn_trie_dump_leaf(l, tb, skb, cb, filter);
		if (err < 0) {
			cb->args[3] = key;
			cb->args[2] = count;
			return err;
		}

		++count;
		key = l->key + 1;

		memset(&cb->args[4], 0,
		       sizeof(cb->args) - 4*sizeof(cb->args[0]));

		/* stop loop if key wrapped back to 0 */
		if (key < l->key)
			break;
	}

	cb->args[3] = key;
	cb->args[2] = count;

	return skb->len;
}

void __init fib_trie_init(void)
{
	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
					  sizeof(struct fib_alias),
					  0, SLAB_PANIC | SLAB_ACCOUNT, NULL);

	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
					   LEAF_SIZE,
					   0, SLAB_PANIC | SLAB_ACCOUNT, NULL);
}
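
/* A table created with a non-NULL alias does not get a trie of its own:
 * its tb_data points into the alias table's __data, so the two tables
 * (in practice local and main) share a single trie and are told apart
 * by the tb_id stored in each alias.
 */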
struct fib_table *fib_trie_table(u32 id, struct fib_table *alias)
{
	struct fib_table *tb;
	struct trie *t;
	size_t sz = sizeof(*tb);

	if (!alias)
		sz += sizeof(struct trie);

	tb = kzalloc(sz, GFP_KERNEL);
	if (!tb)
		return NULL;

	tb->tb_id = id;
	tb->tb_num_default = 0;
	tb->tb_data = (alias ? alias->__data : tb->__data);

	if (alias)
		return tb;

	t = (struct trie *) tb->tb_data;
	t->kv[0].pos = KEYLENGTH;
	t->kv[0].slen = KEYLENGTH;
#ifdef CONFIG_IP_FIB_TRIE_STATS
	t->stats = alloc_percpu(struct trie_use_stats);
	if (!t->stats) {
		kfree(tb);
		tb = NULL;
	}
#endif

	return tb;
}

#ifdef CONFIG_PROC_FS
/* Depth first Trie walk iterator */
struct fib_trie_iter {
	struct seq_net_private p;
	struct fib_table *tb;
	struct key_vector *tnode;
	unsigned int index;
	unsigned int depth;
};

static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter)
{
	unsigned long cindex = iter->index;
	struct key_vector *pn = iter->tnode;
	t_key pkey;

	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
		 iter->tnode, iter->index, iter->depth);

	while (!IS_TRIE(pn)) {
		while (cindex < child_length(pn)) {
			struct key_vector *n = get_child_rcu(pn, cindex++);

			if (!n)
				continue;

			if (IS_LEAF(n)) {
				iter->tnode = pn;
				iter->index = cindex;
			} else {
				/* push down one level */
				iter->tnode = n;
				iter->index = 0;
				++iter->depth;
			}

			return n;
		}

		/* Current node exhausted, pop back up */
		pkey = pn->key;
		pn = node_parent_rcu(pn);
		cindex = get_index(pkey, pn) + 1;
		--iter->depth;
	}

	/* record root node so further searches know we are done */
	iter->tnode = pn;
	iter->index = 0;

	return NULL;
}

static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter,
					     struct trie *t)
{
	struct key_vector *n, *pn;

	if (!t)
		return NULL;

	pn = t->kv;
	n = rcu_dereference(pn->tnode[0]);
	if (!n)
		return NULL;

	if (IS_TNODE(n)) {
		iter->tnode = n;
		iter->index = 0;
		iter->depth = 1;
	} else {
		iter->tnode = pn;
		iter->index = 0;
		iter->depth = 0;
	}

	return n;
}

static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
	struct key_vector *n;
	struct fib_trie_iter iter;

	memset(s, 0, sizeof(*s));

	rcu_read_lock();
	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
		if (IS_LEAF(n)) {
			struct fib_alias *fa;

			s->leaves++;
			s->totdepth += iter.depth;
			if (iter.depth > s->maxdepth)
				s->maxdepth = iter.depth;

			hlist_for_each_entry_rcu(fa, &n->leaf, fa_list)
				++s->prefixes;
		} else {
			s->tnodes++;
			if (n->bits < MAX_STAT_DEPTH)
				s->nodesizes[n->bits]++;
			s->nullpointers += tn_info(n)->empty_children;
		}
	}
	rcu_read_unlock();
}

/*
 *	This outputs /proc/net/fib_triestats
 */
static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
{
	unsigned int i, max, pointers, bytes, avdepth;

	if (stat->leaves)
		avdepth = stat->totdepth*100 / stat->leaves;
	else
		avdepth = 0;

	seq_printf(seq, "\tAver depth: %u.%02d\n",
		   avdepth / 100, avdepth % 100);
	seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
	seq_printf(seq, "\tLeaves: %u\n", stat->leaves);

	bytes = LEAF_SIZE * stat->leaves;

	seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
	bytes += sizeof(struct fib_alias) * stat->prefixes;

	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
	bytes += TNODE_SIZE(0) * stat->tnodes;

	max = MAX_STAT_DEPTH;
	while (max > 0 && stat->nodesizes[max-1] == 0)
		max--;

	pointers = 0;
	for (i = 1; i < max; i++)
		if (stat->nodesizes[i] != 0) {
			seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
			pointers += (1<<i) * stat->nodesizes[i];
		}
	seq_putc(seq, '\n');
	seq_printf(seq, "\tPointers: %u\n", pointers);

	bytes += sizeof(struct key_vector *) * pointers;
	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
	seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
}

#ifdef CONFIG_IP_FIB_TRIE_STATS
static void trie_show_usage(struct seq_file *seq,
			    const struct trie_use_stats __percpu *stats)
{
	struct trie_use_stats s = { 0 };
	int cpu;

	/* loop through all of the CPUs and gather up the stats */
	for_each_possible_cpu(cpu) {
		const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);

		s.gets += pcpu->gets;
		s.backtrack += pcpu->backtrack;
		s.semantic_match_passed += pcpu->semantic_match_passed;
		s.semantic_match_miss += pcpu->semantic_match_miss;
		s.null_node_hit += pcpu->null_node_hit;
		s.resize_node_skipped += pcpu->resize_node_skipped;
	}

	seq_printf(seq, "\nCounters:\n---------\n");
	seq_printf(seq, "gets = %u\n", s.gets);
	seq_printf(seq, "backtracks = %u\n", s.backtrack);
	seq_printf(seq, "semantic match passed = %u\n",
		   s.semantic_match_passed);
	seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
	seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
	seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
}
#endif /* CONFIG_IP_FIB_TRIE_STATS */

static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
{
	if (tb->tb_id == RT_TABLE_LOCAL)
		seq_puts(seq, "Local:\n");
	else if (tb->tb_id == RT_TABLE_MAIN)
		seq_puts(seq, "Main:\n");
	else
		seq_printf(seq, "Id %d:\n", tb->tb_id);
}

static int fib_triestat_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = seq->private;
	unsigned int h;

	seq_printf(seq,
		   "Basic info: size of leaf:"
		   " %zd bytes, size of tnode: %zd bytes.\n",
		   LEAF_SIZE, TNODE_SIZE(0));

	rcu_read_lock();
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;

		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
			struct trie *t = (struct trie *) tb->tb_data;
			struct trie_stat stat;

			if (!t)
				continue;

			fib_table_print(seq, tb);

			trie_collect_stats(t, &stat);
			trie_show_stats(seq, &stat);
#ifdef CONFIG_IP_FIB_TRIE_STATS
			trie_show_usage(seq, t->stats);
#endif
		}
		cond_resched_rcu();
	}
	rcu_read_unlock();

	return 0;
}

static struct key_vector *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
{
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	loff_t idx = 0;
	unsigned int h;

	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;

		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
			struct key_vector *n;

			for (n = fib_trie_get_first(iter,
						    (struct trie *) tb->tb_data);
			     n; n = fib_trie_get_next(iter))
				if (pos == idx++) {
					iter->tb = tb;
					return n;
				}
		}
	}

	return NULL;
}

static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	rcu_read_lock();
	return fib_trie_get_idx(seq, *pos);
}

static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	struct fib_table *tb = iter->tb;
	struct hlist_node *tb_node;
	unsigned int h;
	struct key_vector *n;

	++*pos;
	/* next node in same table */
	n = fib_trie_get_next(iter);
	if (n)
		return n;

	/* walk rest of this hash chain */
	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
	while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
		if (n)
			goto found;
	}

	/* new hash chain */
	while (++h < FIB_TABLE_HASHSZ) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];

		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
			if (n)
				goto found;
		}
	}
	return NULL;

found:
	iter->tb = tb;
	return n;
}

static void fib_trie_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static void seq_indent(struct seq_file *seq, int n)
{
	while (n-- > 0)
		seq_puts(seq, " ");
}

static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
{
	switch (s) {
	case RT_SCOPE_UNIVERSE: return "universe";
	case RT_SCOPE_SITE: return "site";
	case RT_SCOPE_LINK: return "link";
	case RT_SCOPE_HOST: return "host";
	case RT_SCOPE_NOWHERE: return "nowhere";
	default:
		snprintf(buf, len, "scope=%d", s);
		return buf;
	}
}

static const char *const rtn_type_names[__RTN_MAX] = {
	[RTN_UNSPEC] = "UNSPEC",
	[RTN_UNICAST] = "UNICAST",
	[RTN_LOCAL] = "LOCAL",
	[RTN_BROADCAST] = "BROADCAST",
	[RTN_ANYCAST] = "ANYCAST",
	[RTN_MULTICAST] = "MULTICAST",
	[RTN_BLACKHOLE] = "BLACKHOLE",
	[RTN_UNREACHABLE] = "UNREACHABLE",
	[RTN_PROHIBIT] = "PROHIBIT",
	[RTN_THROW] = "THROW",
	[RTN_NAT] = "NAT",
	[RTN_XRESOLVE] = "XRESOLVE",
};

static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
{
	if (t < __RTN_MAX && rtn_type_names[t])
		return rtn_type_names[t];
	snprintf(buf, len, "type %u", t);
	return buf;
}

/* Pretty print the trie */
static int fib_trie_seq_show(struct seq_file *seq, void *v)
{
	const struct fib_trie_iter *iter = seq->private;
	struct key_vector *n = v;

	if (IS_TRIE(node_parent_rcu(n)))
		fib_table_print(seq, iter->tb);

	if (IS_TNODE(n)) {
		__be32 prf = htonl(n->key);

		seq_indent(seq, iter->depth-1);
		seq_printf(seq, " +-- %pI4/%zu %u %u %u\n",
			   &prf, KEYLENGTH - n->pos - n->bits, n->bits,
			   tn_info(n)->full_children,
			   tn_info(n)->empty_children);
	} else {
		__be32 val = htonl(n->key);
		struct fib_alias *fa;

		seq_indent(seq, iter->depth);
		seq_printf(seq, " |-- %pI4\n", &val);

		hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
			char buf1[32], buf2[32];

			seq_indent(seq, iter->depth + 1);
			seq_printf(seq, " /%zu %s %s",
				   KEYLENGTH - fa->fa_slen,
				   rtn_scope(buf1, sizeof(buf1),
					     fa->fa_info->fib_scope),
				   rtn_type(buf2, sizeof(buf2),
					    fa->fa_type));
			if (fa->fa_dscp)
				seq_printf(seq, " tos=%d",
					   inet_dscp_to_dsfield(fa->fa_dscp));
			seq_putc(seq, '\n');
		}
	}

	return 0;
}

static const struct seq_operations fib_trie_seq_ops = {
	.start = fib_trie_seq_start,
	.next = fib_trie_seq_next,
	.stop = fib_trie_seq_stop,
	.show = fib_trie_seq_show,
};

struct fib_route_iter {
	struct seq_net_private p;
	struct fib_table *main_tb;
	struct key_vector *tnode;
	loff_t pos;
	t_key key;
};

static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter,
					    loff_t pos)
{
	struct key_vector *l, **tp = &iter->tnode;
	t_key key;

	/* use cached location of previously found key */
	if (iter->pos > 0 && pos >= iter->pos) {
		key = iter->key;
	} else {
		iter->pos = 1;
		key = 0;
	}

	pos -= iter->pos;

	while ((l = leaf_walk_rcu(tp, key)) && (pos-- > 0)) {
		key = l->key + 1;
		iter->pos++;
		l = NULL;

		/* handle unlikely case of a key wrap */
		if (!key)
			break;
	}

	if (l)
		iter->key = l->key;	/* remember it */
	else
		iter->pos = 0;		/* forget it */

	return l;
}

static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct fib_route_iter *iter = seq->private;
	struct fib_table *tb;
	struct trie *t;

	rcu_read_lock();

	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
	if (!tb)
		return NULL;

	iter->main_tb = tb;
	t = (struct trie *)tb->tb_data;
	iter->tnode = t->kv;

	if (*pos != 0)
		return fib_route_get_idx(iter, *pos);

	iter->pos = 0;
	iter->key = KEY_MAX;

	return SEQ_START_TOKEN;
}

static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_route_iter *iter = seq->private;
	struct key_vector *l = NULL;
	t_key key = iter->key + 1;

	++*pos;

	/* only allow key of 0 for start of sequence */
	if ((v == SEQ_START_TOKEN) || key)
		l = leaf_walk_rcu(&iter->tnode, key);

	if (l) {
		iter->key = l->key;
		iter->pos++;
	} else {
		iter->pos = 0;
	}

	return l;
}

static void fib_route_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
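
/* Translate a route's type, mask and first nexthop into the legacy
 * RTF_* flag set expected by /proc/net/route readers: reject route
 * types map to RTF_REJECT, a gateway nexthop adds RTF_GATEWAY, a /32
 * mask adds RTF_HOST, and every printed route carries RTF_UP.
 */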
static unsigned int fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
{
	unsigned int flags = 0;

	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
		flags = RTF_REJECT;
	if (fi) {
		const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);

		if (nhc->nhc_gw.ipv4)
			flags |= RTF_GATEWAY;
	}
	if (mask == htonl(0xFFFFFFFF))
		flags |= RTF_HOST;
	flags |= RTF_UP;
	return flags;
}

/*
 *	This outputs /proc/net/route.
 *	The format of the file is not supposed to be changed
 *	and needs to be same as fib_hash output to avoid breaking
 *	legacy utilities.
 */
static int fib_route_seq_show(struct seq_file *seq, void *v)
{
	struct fib_route_iter *iter = seq->private;
	struct fib_table *tb = iter->main_tb;
	struct fib_alias *fa;
	struct key_vector *l = v;
	__be32 prefix;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
			   "\tWindow\tIRTT");
		return 0;
	}

	prefix = htonl(l->key);

	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
		struct fib_info *fi = fa->fa_info;
		__be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen);
		unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);

		if ((fa->fa_type == RTN_BROADCAST) ||
		    (fa->fa_type == RTN_MULTICAST))
			continue;

		if (fa->tb_id != tb->tb_id)
			continue;

		seq_setwidth(seq, 127);

		if (fi) {
			struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
			__be32 gw = 0;

			if (nhc->nhc_gw_family == AF_INET)
				gw = nhc->nhc_gw.ipv4;

			seq_printf(seq,
				   "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
				   "%d\t%08X\t%d\t%u\t%u",
				   nhc->nhc_dev ? nhc->nhc_dev->name : "*",
				   prefix, gw, flags, 0, 0,
				   fi->fib_priority,
				   mask,
				   (fi->fib_advmss ?
				    fi->fib_advmss + 40 : 0),
				   fi->fib_window,
				   fi->fib_rtt >> 3);
		} else {
			seq_printf(seq,
				   "*\t%08X\t%08X\t%04X\t%d\t%u\t"
				   "%d\t%08X\t%d\t%u\t%u",
				   prefix, 0, flags, 0, 0, 0,
				   mask, 0, 0, 0);
		}
		seq_pad(seq, '\n');
	}

	return 0;
}

static const struct seq_operations fib_route_seq_ops = {
	.start = fib_route_seq_start,
	.next = fib_route_seq_next,
	.stop = fib_route_seq_stop,
	.show = fib_route_seq_show,
};

int __net_init fib_proc_init(struct net *net)
{
	if (!proc_create_net("fib_trie", 0444, net->proc_net, &fib_trie_seq_ops,
			     sizeof(struct fib_trie_iter)))
		goto out1;

	if (!proc_create_net_single("fib_triestat", 0444, net->proc_net,
				    fib_triestat_seq_show, NULL))
		goto out2;

	if (!proc_create_net("route", 0444, net->proc_net, &fib_route_seq_ops,
			     sizeof(struct fib_route_iter)))
		goto out3;

	return 0;

out3:
	remove_proc_entry("fib_triestat", net->proc_net);
out2:
	remove_proc_entry("fib_trie", net->proc_net);
out1:
	return -ENOMEM;
}

void __net_exit fib_proc_exit(struct net *net)
{
	remove_proc_entry("fib_trie", net->proc_net);
	remove_proc_entry("fib_triestat", net->proc_net);
	remove_proc_entry("route", net->proc_net);
}

#endif /* CONFIG_PROC_FS */