route.c

  1. // SPDX-License-Identifier: GPL-2.0-or-later
  2. /*
  3. * INET An implementation of the TCP/IP protocol suite for the LINUX
  4. * operating system. INET is implemented using the BSD Socket
  5. * interface as the means of communication with the user level.
  6. *
  7. * ROUTE - implementation of the IP router.
  8. *
  9. * Authors: Ross Biro
  10. * Fred N. van Kempen, <[email protected]>
  11. * Alan Cox, <[email protected]>
  12. * Linus Torvalds, <[email protected]>
  13. * Alexey Kuznetsov, <[email protected]>
  14. *
  15. * Fixes:
  16. * Alan Cox : Verify area fixes.
  17. * Alan Cox : cli() protects routing changes
  18. * Rui Oliveira : ICMP routing table updates
  19. * ([email protected]) Routing table insertion and update
  20. * Linus Torvalds : Rewrote bits to be sensible
  21. * Alan Cox : Added BSD route gw semantics
  22. * Alan Cox : Super /proc >4K
  23. * Alan Cox : MTU in route table
  24. * Alan Cox : MSS actually. Also added the window
  25. * clamper.
  26. * Sam Lantinga : Fixed route matching in rt_del()
  27. * Alan Cox : Routing cache support.
  28. * Alan Cox : Removed compatibility cruft.
  29. * Alan Cox : RTF_REJECT support.
  30. * Alan Cox : TCP irtt support.
  31. * Jonathan Naylor : Added Metric support.
  32. * Miquel van Smoorenburg : BSD API fixes.
  33. * Miquel van Smoorenburg : Metrics.
  34. * Alan Cox : Use __u32 properly
  35. * Alan Cox : Aligned routing errors more closely with BSD,
  36. * though our system is still very different.
  37. * Alan Cox : Faster /proc handling
  38. * Alexey Kuznetsov : Massive rework to support tree based routing,
  39. * routing caches and better behaviour.
  40. *
  41. * Olaf Erb : irtt wasn't being copied right.
  42. * Bjorn Ekwall : Kerneld route support.
  43. * Alan Cox : Multicast fixed (I hope)
  44. * Pavel Krauz : Limited broadcast fixed
  45. * Mike McLagan : Routing by source
  46. * Alexey Kuznetsov : End of old history. Split to fib.c and
  47. * route.c and rewritten from scratch.
  48. * Andi Kleen : Load-limit warning messages.
  49. * Vitaly E. Lavrov : Transparent proxy revived after year coma.
  50. * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
  51. * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
  52. * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
  53. * Marc Boucher : routing by fwmark
  54. * Robert Olsson : Added rt_cache statistics
  55. * Arnaldo C. Melo : Convert proc stuff to seq_file
  56. * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
  57. * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
  58. * Ilia Sotnikov : Removed TOS from hash calculations
  59. */
  60. #define pr_fmt(fmt) "IPv4: " fmt
  61. #include <linux/module.h>
  62. #include <linux/bitops.h>
  63. #include <linux/kernel.h>
  64. #include <linux/mm.h>
  65. #include <linux/memblock.h>
  66. #include <linux/socket.h>
  67. #include <linux/errno.h>
  68. #include <linux/in.h>
  69. #include <linux/inet.h>
  70. #include <linux/netdevice.h>
  71. #include <linux/proc_fs.h>
  72. #include <linux/init.h>
  73. #include <linux/skbuff.h>
  74. #include <linux/inetdevice.h>
  75. #include <linux/igmp.h>
  76. #include <linux/pkt_sched.h>
  77. #include <linux/mroute.h>
  78. #include <linux/netfilter_ipv4.h>
  79. #include <linux/random.h>
  80. #include <linux/rcupdate.h>
  81. #include <linux/slab.h>
  82. #include <linux/jhash.h>
  83. #include <net/dst.h>
  84. #include <net/dst_metadata.h>
  85. #include <net/inet_dscp.h>
  86. #include <net/net_namespace.h>
  87. #include <net/ip.h>
  88. #include <net/route.h>
  89. #include <net/inetpeer.h>
  90. #include <net/sock.h>
  91. #include <net/ip_fib.h>
  92. #include <net/nexthop.h>
  93. #include <net/tcp.h>
  94. #include <net/icmp.h>
  95. #include <net/xfrm.h>
  96. #include <net/lwtunnel.h>
  97. #include <net/netevent.h>
  98. #include <net/rtnetlink.h>
  99. #ifdef CONFIG_SYSCTL
  100. #include <linux/sysctl.h>
  101. #endif
  102. #include <net/secure_seq.h>
  103. #include <net/ip_tunnels.h>
  104. #include "fib_lookup.h"
  105. #define RT_FL_TOS(oldflp4) \
  106. ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
  107. #define RT_GC_TIMEOUT (300*HZ)
  108. #define DEFAULT_MIN_PMTU (512 + 20 + 20)
  109. #define DEFAULT_MTU_EXPIRES (10 * 60 * HZ)
  110. #define DEFAULT_MIN_ADVMSS 256
  111. static int ip_rt_max_size;
  112. static int ip_rt_redirect_number __read_mostly = 9;
  113. static int ip_rt_redirect_load __read_mostly = HZ / 50;
  114. static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
  115. static int ip_rt_error_cost __read_mostly = HZ;
  116. static int ip_rt_error_burst __read_mostly = 5 * HZ;
  117. static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
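/* The ip_rt_redirect_* and ip_rt_error_* values above (times in jiffies)
 * pace the ICMP traffic this file generates: per-destination redirect
 * throttling in ip_rt_send_redirect() and destination-unreachable rate
 * limiting in ip_error().  RT_GC_TIMEOUT (5 minutes) also serves as the
 * lifetime of nexthop exceptions learned from ICMP redirects.
 */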
  118. /*
  119. * Interface to generic destination cache.
  120. */
  121. INDIRECT_CALLABLE_SCOPE
  122. struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
  123. static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
  124. INDIRECT_CALLABLE_SCOPE
  125. unsigned int ipv4_mtu(const struct dst_entry *dst);
  126. static void ipv4_negative_advice(struct sock *sk,
  127. struct dst_entry *dst);
  128. static void ipv4_link_failure(struct sk_buff *skb);
  129. static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
  130. struct sk_buff *skb, u32 mtu,
  131. bool confirm_neigh);
  132. static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
  133. struct sk_buff *skb);
  134. static void ipv4_dst_destroy(struct dst_entry *dst);
  135. static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
  136. {
  137. WARN_ON(1);
  138. return NULL;
  139. }
  140. static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
  141. struct sk_buff *skb,
  142. const void *daddr);
  143. static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
  144. static struct dst_ops ipv4_dst_ops = {
  145. .family = AF_INET,
  146. .check = ipv4_dst_check,
  147. .default_advmss = ipv4_default_advmss,
  148. .mtu = ipv4_mtu,
  149. .cow_metrics = ipv4_cow_metrics,
  150. .destroy = ipv4_dst_destroy,
  151. .negative_advice = (void *)ipv4_negative_advice,
  152. .link_failure = ipv4_link_failure,
  153. .update_pmtu = ip_rt_update_pmtu,
  154. .redirect = ip_do_redirect,
  155. .local_out = __ip_local_out,
  156. .neigh_lookup = ipv4_neigh_lookup,
  157. .confirm_neigh = ipv4_confirm_neigh,
  158. };
  159. #define ECN_OR_COST(class) TC_PRIO_##class
  160. const __u8 ip_tos2prio[16] = {
  161. TC_PRIO_BESTEFFORT,
  162. ECN_OR_COST(BESTEFFORT),
  163. TC_PRIO_BESTEFFORT,
  164. ECN_OR_COST(BESTEFFORT),
  165. TC_PRIO_BULK,
  166. ECN_OR_COST(BULK),
  167. TC_PRIO_BULK,
  168. ECN_OR_COST(BULK),
  169. TC_PRIO_INTERACTIVE,
  170. ECN_OR_COST(INTERACTIVE),
  171. TC_PRIO_INTERACTIVE,
  172. ECN_OR_COST(INTERACTIVE),
  173. TC_PRIO_INTERACTIVE_BULK,
  174. ECN_OR_COST(INTERACTIVE_BULK),
  175. TC_PRIO_INTERACTIVE_BULK,
  176. ECN_OR_COST(INTERACTIVE_BULK)
  177. };
  178. EXPORT_SYMBOL(ip_tos2prio);
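/* ip_tos2prio is indexed by the four RFC 1349 TOS bits of the IPv4 TOS
 * byte ((tos & 0x1E) >> 1, see rt_tos2priority()) and maps them to
 * traffic-control priority bands.  The ECN_OR_COST() entries simply give
 * the odd-numbered slots the same band as their base class.
 */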
  179. static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
  180. #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
  181. #ifdef CONFIG_PROC_FS
  182. static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
  183. {
  184. if (*pos)
  185. return NULL;
  186. return SEQ_START_TOKEN;
  187. }
  188. static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
  189. {
  190. ++*pos;
  191. return NULL;
  192. }
  193. static void rt_cache_seq_stop(struct seq_file *seq, void *v)
  194. {
  195. }
  196. static int rt_cache_seq_show(struct seq_file *seq, void *v)
  197. {
  198. if (v == SEQ_START_TOKEN)
  199. seq_printf(seq, "%-127s\n",
  200. "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
  201. "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
  202. "HHUptod\tSpecDst");
  203. return 0;
  204. }
  205. static const struct seq_operations rt_cache_seq_ops = {
  206. .start = rt_cache_seq_start,
  207. .next = rt_cache_seq_next,
  208. .stop = rt_cache_seq_stop,
  209. .show = rt_cache_seq_show,
  210. };
  211. static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
  212. {
  213. int cpu;
  214. if (*pos == 0)
  215. return SEQ_START_TOKEN;
  216. for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
  217. if (!cpu_possible(cpu))
  218. continue;
  219. *pos = cpu+1;
  220. return &per_cpu(rt_cache_stat, cpu);
  221. }
  222. return NULL;
  223. }
  224. static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
  225. {
  226. int cpu;
  227. for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
  228. if (!cpu_possible(cpu))
  229. continue;
  230. *pos = cpu+1;
  231. return &per_cpu(rt_cache_stat, cpu);
  232. }
  233. (*pos)++;
  234. return NULL;
  235. }
  236. static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
  237. {
  238. }
  239. static int rt_cpu_seq_show(struct seq_file *seq, void *v)
  240. {
  241. struct rt_cache_stat *st = v;
  242. if (v == SEQ_START_TOKEN) {
  243. seq_puts(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
  244. return 0;
  245. }
  246. seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x "
  247. "%08x %08x %08x %08x %08x %08x "
  248. "%08x %08x %08x %08x\n",
  249. dst_entries_get_slow(&ipv4_dst_ops),
  250. 0, /* st->in_hit */
  251. st->in_slow_tot,
  252. st->in_slow_mc,
  253. st->in_no_route,
  254. st->in_brd,
  255. st->in_martian_dst,
  256. st->in_martian_src,
  257. 0, /* st->out_hit */
  258. st->out_slow_tot,
  259. st->out_slow_mc,
  260. 0, /* st->gc_total */
  261. 0, /* st->gc_ignored */
  262. 0, /* st->gc_goal_miss */
  263. 0, /* st->gc_dst_overflow */
  264. 0, /* st->in_hlist_search */
  265. 0 /* st->out_hlist_search */
  266. );
  267. return 0;
  268. }
  269. static const struct seq_operations rt_cpu_seq_ops = {
  270. .start = rt_cpu_seq_start,
  271. .next = rt_cpu_seq_next,
  272. .stop = rt_cpu_seq_stop,
  273. .show = rt_cpu_seq_show,
  274. };
  275. #ifdef CONFIG_IP_ROUTE_CLASSID
  276. static int rt_acct_proc_show(struct seq_file *m, void *v)
  277. {
  278. struct ip_rt_acct *dst, *src;
  279. unsigned int i, j;
  280. dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
  281. if (!dst)
  282. return -ENOMEM;
  283. for_each_possible_cpu(i) {
  284. src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
  285. for (j = 0; j < 256; j++) {
  286. dst[j].o_bytes += src[j].o_bytes;
  287. dst[j].o_packets += src[j].o_packets;
  288. dst[j].i_bytes += src[j].i_bytes;
  289. dst[j].i_packets += src[j].i_packets;
  290. }
  291. }
  292. seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
  293. kfree(dst);
  294. return 0;
  295. }
  296. #endif
  297. static int __net_init ip_rt_do_proc_init(struct net *net)
  298. {
  299. struct proc_dir_entry *pde;
  300. pde = proc_create_seq("rt_cache", 0444, net->proc_net,
  301. &rt_cache_seq_ops);
  302. if (!pde)
  303. goto err1;
  304. pde = proc_create_seq("rt_cache", 0444, net->proc_net_stat,
  305. &rt_cpu_seq_ops);
  306. if (!pde)
  307. goto err2;
  308. #ifdef CONFIG_IP_ROUTE_CLASSID
  309. pde = proc_create_single("rt_acct", 0, net->proc_net,
  310. rt_acct_proc_show);
  311. if (!pde)
  312. goto err3;
  313. #endif
  314. return 0;
  315. #ifdef CONFIG_IP_ROUTE_CLASSID
  316. err3:
  317. remove_proc_entry("rt_cache", net->proc_net_stat);
  318. #endif
  319. err2:
  320. remove_proc_entry("rt_cache", net->proc_net);
  321. err1:
  322. return -ENOMEM;
  323. }
  324. static void __net_exit ip_rt_do_proc_exit(struct net *net)
  325. {
  326. remove_proc_entry("rt_cache", net->proc_net_stat);
  327. remove_proc_entry("rt_cache", net->proc_net);
  328. #ifdef CONFIG_IP_ROUTE_CLASSID
  329. remove_proc_entry("rt_acct", net->proc_net);
  330. #endif
  331. }
  332. static struct pernet_operations ip_rt_proc_ops __net_initdata = {
  333. .init = ip_rt_do_proc_init,
  334. .exit = ip_rt_do_proc_exit,
  335. };
  336. static int __init ip_rt_proc_init(void)
  337. {
  338. return register_pernet_subsys(&ip_rt_proc_ops);
  339. }
  340. #else
  341. static inline int ip_rt_proc_init(void)
  342. {
  343. return 0;
  344. }
  345. #endif /* CONFIG_PROC_FS */
  346. static inline bool rt_is_expired(const struct rtable *rth)
  347. {
  348. return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
  349. }
  350. void rt_cache_flush(struct net *net)
  351. {
  352. rt_genid_bump_ipv4(net);
  353. }
  354. static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
  355. struct sk_buff *skb,
  356. const void *daddr)
  357. {
  358. const struct rtable *rt = container_of(dst, struct rtable, dst);
  359. struct net_device *dev = dst->dev;
  360. struct neighbour *n;
  361. rcu_read_lock();
  362. if (likely(rt->rt_gw_family == AF_INET)) {
  363. n = ip_neigh_gw4(dev, rt->rt_gw4);
  364. } else if (rt->rt_gw_family == AF_INET6) {
  365. n = ip_neigh_gw6(dev, &rt->rt_gw6);
  366. } else {
  367. __be32 pkey;
  368. pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
  369. n = ip_neigh_gw4(dev, pkey);
  370. }
  371. if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
  372. n = NULL;
  373. rcu_read_unlock();
  374. return n;
  375. }
  376. static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
  377. {
  378. const struct rtable *rt = container_of(dst, struct rtable, dst);
  379. struct net_device *dev = dst->dev;
  380. const __be32 *pkey = daddr;
  381. if (rt->rt_gw_family == AF_INET) {
  382. pkey = (const __be32 *)&rt->rt_gw4;
  383. } else if (rt->rt_gw_family == AF_INET6) {
  384. return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
  385. } else if (!daddr ||
  386. (rt->rt_flags &
  387. (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
  388. return;
  389. }
  390. __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
  391. }
  392. /* Hash tables of size 2048..262144 depending on RAM size.
  393. * Each bucket uses 8 bytes.
  394. */
  395. static u32 ip_idents_mask __read_mostly;
  396. static atomic_t *ip_idents __read_mostly;
  397. static u32 *ip_tstamps __read_mostly;
  398. /* In order to protect privacy, we add a perturbation to identifiers
  399. * if one generator is seldom used. This makes it hard for an attacker
  400. * to infer how many packets were sent between two points in time.
  401. */
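/* ip_idents_reserve() reserves 'segs' consecutive IP IDs from the bucket
 * selected by 'hash' and returns the first of them.  If the bucket has
 * been idle, a random delta of up to the idle time (in jiffies) is added
 * first, implementing the perturbation described above.
 */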
  402. static u32 ip_idents_reserve(u32 hash, int segs)
  403. {
  404. u32 bucket, old, now = (u32)jiffies;
  405. atomic_t *p_id;
  406. u32 *p_tstamp;
  407. u32 delta = 0;
  408. bucket = hash & ip_idents_mask;
  409. p_tstamp = ip_tstamps + bucket;
  410. p_id = ip_idents + bucket;
  411. old = READ_ONCE(*p_tstamp);
  412. if (old != now && cmpxchg(p_tstamp, old, now) == old)
  413. delta = prandom_u32_max(now - old);
  414. /* If UBSAN reports an error there, please make sure your compiler
  415. * supports -fno-strict-overflow before reporting it; that was a bug
  416. * in UBSAN, and it has been fixed in GCC-8.
  417. */
  418. return atomic_add_return(segs + delta, p_id) - segs;
  419. }
  420. void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
  421. {
  422. u32 hash, id;
  423. /* Note the following code is not safe, but this is okay. */
  424. if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
  425. get_random_bytes(&net->ipv4.ip_id_key,
  426. sizeof(net->ipv4.ip_id_key));
  427. hash = siphash_3u32((__force u32)iph->daddr,
  428. (__force u32)iph->saddr,
  429. iph->protocol,
  430. &net->ipv4.ip_id_key);
  431. id = ip_idents_reserve(hash, segs);
  432. iph->id = htons(id);
  433. }
  434. EXPORT_SYMBOL(__ip_select_ident);
  435. static void ip_rt_fix_tos(struct flowi4 *fl4)
  436. {
  437. __u8 tos = RT_FL_TOS(fl4);
  438. fl4->flowi4_tos = tos & IPTOS_RT_MASK;
  439. if (tos & RTO_ONLINK)
  440. fl4->flowi4_scope = RT_SCOPE_LINK;
  441. }
  442. static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
  443. const struct sock *sk, const struct iphdr *iph,
  444. int oif, __u8 tos, u8 prot, u32 mark,
  445. int flow_flags)
  446. {
  447. __u8 scope = RT_SCOPE_UNIVERSE;
  448. if (sk) {
  449. const struct inet_sock *inet = inet_sk(sk);
  450. oif = sk->sk_bound_dev_if;
  451. mark = READ_ONCE(sk->sk_mark);
  452. tos = ip_sock_rt_tos(sk);
  453. scope = ip_sock_rt_scope(sk);
  454. prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
  455. }
  456. flowi4_init_output(fl4, oif, mark, tos & IPTOS_RT_MASK, scope,
  457. prot, flow_flags, iph->daddr, iph->saddr, 0, 0,
  458. sock_net_uid(net, sk));
  459. }
  460. static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
  461. const struct sock *sk)
  462. {
  463. const struct net *net = dev_net(skb->dev);
  464. const struct iphdr *iph = ip_hdr(skb);
  465. int oif = skb->dev->ifindex;
  466. u8 prot = iph->protocol;
  467. u32 mark = skb->mark;
  468. __u8 tos = iph->tos;
  469. __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
  470. }
  471. static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
  472. {
  473. const struct inet_sock *inet = inet_sk(sk);
  474. const struct ip_options_rcu *inet_opt;
  475. __be32 daddr = inet->inet_daddr;
  476. rcu_read_lock();
  477. inet_opt = rcu_dereference(inet->inet_opt);
  478. if (inet_opt && inet_opt->opt.srr)
  479. daddr = inet_opt->opt.faddr;
  480. flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark),
  481. ip_sock_rt_tos(sk) & IPTOS_RT_MASK,
  482. ip_sock_rt_scope(sk),
  483. inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
  484. inet_sk_flowi_flags(sk),
  485. daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
  486. rcu_read_unlock();
  487. }
  488. static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
  489. const struct sk_buff *skb)
  490. {
  491. if (skb)
  492. build_skb_flow_key(fl4, skb, sk);
  493. else
  494. build_sk_flow_key(fl4, sk);
  495. }
  496. static DEFINE_SPINLOCK(fnhe_lock);
  497. static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
  498. {
  499. struct rtable *rt;
  500. rt = rcu_dereference(fnhe->fnhe_rth_input);
  501. if (rt) {
  502. RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
  503. dst_dev_put(&rt->dst);
  504. dst_release(&rt->dst);
  505. }
  506. rt = rcu_dereference(fnhe->fnhe_rth_output);
  507. if (rt) {
  508. RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
  509. dst_dev_put(&rt->dst);
  510. dst_release(&rt->dst);
  511. }
  512. }
  513. static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
  514. {
  515. struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
  516. struct fib_nh_exception *fnhe, *oldest = NULL;
  517. for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
  518. fnhe = rcu_dereference_protected(*fnhe_p,
  519. lockdep_is_held(&fnhe_lock));
  520. if (!fnhe)
  521. break;
  522. if (!oldest ||
  523. time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
  524. oldest = fnhe;
  525. oldest_p = fnhe_p;
  526. }
  527. }
  528. fnhe_flush_routes(oldest);
  529. *oldest_p = oldest->fnhe_next;
  530. kfree_rcu(oldest, rcu);
  531. }
  532. static u32 fnhe_hashfun(__be32 daddr)
  533. {
  534. static siphash_aligned_key_t fnhe_hash_key;
  535. u64 hval;
  536. net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
  537. hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
  538. return hash_64(hval, FNHE_HASH_SHIFT);
  539. }
  540. static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
  541. {
  542. rt->rt_pmtu = fnhe->fnhe_pmtu;
  543. rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
  544. rt->dst.expires = fnhe->fnhe_expires;
  545. if (fnhe->fnhe_gw) {
  546. rt->rt_flags |= RTCF_REDIRECTED;
  547. rt->rt_uses_gateway = 1;
  548. rt->rt_gw_family = AF_INET;
  549. rt->rt_gw4 = fnhe->fnhe_gw;
  550. }
  551. }
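/* update_or_create_fnhe() refreshes the nexthop exception for @daddr with
 * a new gateway, PMTU and/or expiry, creating it if needed.  The chain
 * depth limit is randomized between FNHE_RECLAIM_DEPTH and
 * 2 * FNHE_RECLAIM_DEPTH - 1 to blunt side-channel probing, and creating
 * a new entry marks the nexthop's cached input and per-cpu output routes
 * DST_OBSOLETE_KILL so later users re-check for the exception.
 */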
  552. static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
  553. __be32 gw, u32 pmtu, bool lock,
  554. unsigned long expires)
  555. {
  556. struct fnhe_hash_bucket *hash;
  557. struct fib_nh_exception *fnhe;
  558. struct rtable *rt;
  559. u32 genid, hval;
  560. unsigned int i;
  561. int depth;
  562. genid = fnhe_genid(dev_net(nhc->nhc_dev));
  563. hval = fnhe_hashfun(daddr);
  564. spin_lock_bh(&fnhe_lock);
  565. hash = rcu_dereference(nhc->nhc_exceptions);
  566. if (!hash) {
  567. hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
  568. if (!hash)
  569. goto out_unlock;
  570. rcu_assign_pointer(nhc->nhc_exceptions, hash);
  571. }
  572. hash += hval;
  573. depth = 0;
  574. for (fnhe = rcu_dereference(hash->chain); fnhe;
  575. fnhe = rcu_dereference(fnhe->fnhe_next)) {
  576. if (fnhe->fnhe_daddr == daddr)
  577. break;
  578. depth++;
  579. }
  580. if (fnhe) {
  581. if (fnhe->fnhe_genid != genid)
  582. fnhe->fnhe_genid = genid;
  583. if (gw)
  584. fnhe->fnhe_gw = gw;
  585. if (pmtu) {
  586. fnhe->fnhe_pmtu = pmtu;
  587. fnhe->fnhe_mtu_locked = lock;
  588. }
  589. fnhe->fnhe_expires = max(1UL, expires);
  590. /* Update all cached dsts too */
  591. rt = rcu_dereference(fnhe->fnhe_rth_input);
  592. if (rt)
  593. fill_route_from_fnhe(rt, fnhe);
  594. rt = rcu_dereference(fnhe->fnhe_rth_output);
  595. if (rt)
  596. fill_route_from_fnhe(rt, fnhe);
  597. } else {
  598. /* Randomize max depth to avoid some side channels attacks. */
  599. int max_depth = FNHE_RECLAIM_DEPTH +
  600. prandom_u32_max(FNHE_RECLAIM_DEPTH);
  601. while (depth > max_depth) {
  602. fnhe_remove_oldest(hash);
  603. depth--;
  604. }
  605. fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
  606. if (!fnhe)
  607. goto out_unlock;
  608. fnhe->fnhe_next = hash->chain;
  609. fnhe->fnhe_genid = genid;
  610. fnhe->fnhe_daddr = daddr;
  611. fnhe->fnhe_gw = gw;
  612. fnhe->fnhe_pmtu = pmtu;
  613. fnhe->fnhe_mtu_locked = lock;
  614. fnhe->fnhe_expires = max(1UL, expires);
  615. rcu_assign_pointer(hash->chain, fnhe);
  616. /* Exception created; mark the cached routes for the nexthop
  617. * stale, so anyone caching it rechecks if this exception
  618. * applies to them.
  619. */
  620. rt = rcu_dereference(nhc->nhc_rth_input);
  621. if (rt)
  622. rt->dst.obsolete = DST_OBSOLETE_KILL;
  623. for_each_possible_cpu(i) {
  624. struct rtable __rcu **prt;
  625. prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
  626. rt = rcu_dereference(*prt);
  627. if (rt)
  628. rt->dst.obsolete = DST_OBSOLETE_KILL;
  629. }
  630. }
  631. fnhe->fnhe_stamp = jiffies;
  632. out_unlock:
  633. spin_unlock_bh(&fnhe_lock);
  634. }
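/* __ip_do_redirect() validates an ICMP redirect before acting on it: the
 * route must currently use the announcing gateway, the device must accept
 * redirects, and the new gateway must be a sensible unicast address
 * (on-link unless the medium is shared).  Accepted redirects are recorded
 * as a nexthop exception with an ip_rt_gc_timeout expiry rather than by
 * rewriting the FIB.
 */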
  635. static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
  636. bool kill_route)
  637. {
  638. __be32 new_gw = icmp_hdr(skb)->un.gateway;
  639. __be32 old_gw = ip_hdr(skb)->saddr;
  640. struct net_device *dev = skb->dev;
  641. struct in_device *in_dev;
  642. struct fib_result res;
  643. struct neighbour *n;
  644. struct net *net;
  645. switch (icmp_hdr(skb)->code & 7) {
  646. case ICMP_REDIR_NET:
  647. case ICMP_REDIR_NETTOS:
  648. case ICMP_REDIR_HOST:
  649. case ICMP_REDIR_HOSTTOS:
  650. break;
  651. default:
  652. return;
  653. }
  654. if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
  655. return;
  656. in_dev = __in_dev_get_rcu(dev);
  657. if (!in_dev)
  658. return;
  659. net = dev_net(dev);
  660. if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
  661. ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
  662. ipv4_is_zeronet(new_gw))
  663. goto reject_redirect;
  664. if (!IN_DEV_SHARED_MEDIA(in_dev)) {
  665. if (!inet_addr_onlink(in_dev, new_gw, old_gw))
  666. goto reject_redirect;
  667. if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
  668. goto reject_redirect;
  669. } else {
  670. if (inet_addr_type(net, new_gw) != RTN_UNICAST)
  671. goto reject_redirect;
  672. }
  673. n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw);
  674. if (!n)
  675. n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
  676. if (!IS_ERR(n)) {
  677. if (!(READ_ONCE(n->nud_state) & NUD_VALID)) {
  678. neigh_event_send(n, NULL);
  679. } else {
  680. if (fib_lookup(net, fl4, &res, 0) == 0) {
  681. struct fib_nh_common *nhc;
  682. fib_select_path(net, &res, fl4, skb);
  683. nhc = FIB_RES_NHC(res);
  684. update_or_create_fnhe(nhc, fl4->daddr, new_gw,
  685. 0, false,
  686. jiffies + ip_rt_gc_timeout);
  687. }
  688. if (kill_route)
  689. rt->dst.obsolete = DST_OBSOLETE_KILL;
  690. call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
  691. }
  692. neigh_release(n);
  693. }
  694. return;
  695. reject_redirect:
  696. #ifdef CONFIG_IP_ROUTE_VERBOSE
  697. if (IN_DEV_LOG_MARTIANS(in_dev)) {
  698. const struct iphdr *iph = (const struct iphdr *) skb->data;
  699. __be32 daddr = iph->daddr;
  700. __be32 saddr = iph->saddr;
  701. net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
  702. " Advised path = %pI4 -> %pI4\n",
  703. &old_gw, dev->name, &new_gw,
  704. &saddr, &daddr);
  705. }
  706. #endif
  707. ;
  708. }
  709. static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
  710. {
  711. struct rtable *rt;
  712. struct flowi4 fl4;
  713. const struct iphdr *iph = (const struct iphdr *) skb->data;
  714. struct net *net = dev_net(skb->dev);
  715. int oif = skb->dev->ifindex;
  716. u8 prot = iph->protocol;
  717. u32 mark = skb->mark;
  718. __u8 tos = iph->tos;
  719. rt = (struct rtable *) dst;
  720. __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
  721. __ip_do_redirect(rt, skb, &fl4, true);
  722. }
  723. static void ipv4_negative_advice(struct sock *sk,
  724. struct dst_entry *dst)
  725. {
  726. struct rtable *rt = (struct rtable *)dst;
  727. if ((dst->obsolete > 0) ||
  728. (rt->rt_flags & RTCF_REDIRECTED) ||
  729. rt->dst.expires)
  730. sk_dst_reset(sk);
  731. }
  732. /*
  733. * Algorithm:
  734. * 1. The first ip_rt_redirect_number redirects are sent
  735. * with exponential backoff, then we stop sending them at all,
  736. * assuming that the host ignores our redirects.
  737. * 2. If we did not see packets requiring redirects
  738. * during ip_rt_redirect_silence, we assume that the host
  739. * forgot redirected route and start to send redirects again.
  740. *
  741. * This algorithm is much cheaper and more intelligent than dumb load limiting
  742. * in icmp.c.
  743. *
  744. * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
  745. * and "frag. need" (breaks PMTU discovery) in icmp.c.
  746. */
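/* With the defaults above, ip_rt_redirect_load is HZ/50 (20 ms worth of
 * jiffies): the second redirect to a peer is sent only 40 ms after the
 * first, the third 80 ms later, and so on, doubling each time.  After
 * ip_rt_redirect_number (9) redirects the peer is assumed deaf and
 * nothing more is sent until it has triggered no redirects for
 * ip_rt_redirect_silence, (HZ/50) << 10, i.e. roughly 20.5 seconds.
 */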
  747. void ip_rt_send_redirect(struct sk_buff *skb)
  748. {
  749. struct rtable *rt = skb_rtable(skb);
  750. struct in_device *in_dev;
  751. struct inet_peer *peer;
  752. struct net *net;
  753. int log_martians;
  754. int vif;
  755. rcu_read_lock();
  756. in_dev = __in_dev_get_rcu(rt->dst.dev);
  757. if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
  758. rcu_read_unlock();
  759. return;
  760. }
  761. log_martians = IN_DEV_LOG_MARTIANS(in_dev);
  762. vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
  763. rcu_read_unlock();
  764. net = dev_net(rt->dst.dev);
  765. peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
  766. if (!peer) {
  767. icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
  768. rt_nexthop(rt, ip_hdr(skb)->daddr));
  769. return;
  770. }
  771. /* No redirected packets during ip_rt_redirect_silence;
  772. * reset the algorithm.
  773. */
  774. if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
  775. peer->rate_tokens = 0;
  776. peer->n_redirects = 0;
  777. }
  778. /* Too many ignored redirects; do not send anything.
  779. * Set peer->rate_last to the time of the last seen redirected packet.
  780. */
  781. if (peer->n_redirects >= ip_rt_redirect_number) {
  782. peer->rate_last = jiffies;
  783. goto out_put_peer;
  784. }
  785. /* Check for load limit; set rate_last to the latest sent
  786. * redirect.
  787. */
  788. if (peer->n_redirects == 0 ||
  789. time_after(jiffies,
  790. (peer->rate_last +
  791. (ip_rt_redirect_load << peer->n_redirects)))) {
  792. __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
  793. icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
  794. peer->rate_last = jiffies;
  795. ++peer->n_redirects;
  796. #ifdef CONFIG_IP_ROUTE_VERBOSE
  797. if (log_martians &&
  798. peer->n_redirects == ip_rt_redirect_number)
  799. net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
  800. &ip_hdr(skb)->saddr, inet_iif(skb),
  801. &ip_hdr(skb)->daddr, &gw);
  802. #endif
  803. }
  804. out_put_peer:
  805. inet_putpeer(peer);
  806. }
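/* ICMP destination-unreachable errors are paced per peer by a token
 * bucket: tokens accrue at one per elapsed jiffy, capped at
 * ip_rt_error_burst (5 * HZ), and each error sent costs ip_rt_error_cost
 * (HZ).  With the defaults this allows a burst of five errors and a
 * sustained rate of about one per second for each source address.
 */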
  807. static int ip_error(struct sk_buff *skb)
  808. {
  809. struct rtable *rt = skb_rtable(skb);
  810. struct net_device *dev = skb->dev;
  811. struct in_device *in_dev;
  812. struct inet_peer *peer;
  813. unsigned long now;
  814. struct net *net;
  815. SKB_DR(reason);
  816. bool send;
  817. int code;
  818. if (netif_is_l3_master(skb->dev)) {
  819. dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
  820. if (!dev)
  821. goto out;
  822. }
  823. in_dev = __in_dev_get_rcu(dev);
  824. /* IP on this device is disabled. */
  825. if (!in_dev)
  826. goto out;
  827. net = dev_net(rt->dst.dev);
  828. if (!IN_DEV_FORWARD(in_dev)) {
  829. switch (rt->dst.error) {
  830. case EHOSTUNREACH:
  831. SKB_DR_SET(reason, IP_INADDRERRORS);
  832. __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
  833. break;
  834. case ENETUNREACH:
  835. SKB_DR_SET(reason, IP_INNOROUTES);
  836. __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
  837. break;
  838. }
  839. goto out;
  840. }
  841. switch (rt->dst.error) {
  842. case EINVAL:
  843. default:
  844. goto out;
  845. case EHOSTUNREACH:
  846. code = ICMP_HOST_UNREACH;
  847. break;
  848. case ENETUNREACH:
  849. code = ICMP_NET_UNREACH;
  850. SKB_DR_SET(reason, IP_INNOROUTES);
  851. __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
  852. break;
  853. case EACCES:
  854. code = ICMP_PKT_FILTERED;
  855. break;
  856. }
  857. peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
  858. l3mdev_master_ifindex(skb->dev), 1);
  859. send = true;
  860. if (peer) {
  861. now = jiffies;
  862. peer->rate_tokens += now - peer->rate_last;
  863. if (peer->rate_tokens > ip_rt_error_burst)
  864. peer->rate_tokens = ip_rt_error_burst;
  865. peer->rate_last = now;
  866. if (peer->rate_tokens >= ip_rt_error_cost)
  867. peer->rate_tokens -= ip_rt_error_cost;
  868. else
  869. send = false;
  870. inet_putpeer(peer);
  871. }
  872. if (send)
  873. icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
  874. out: kfree_skb_reason(skb, reason);
  875. return 0;
  876. }
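/* __ip_rt_update_pmtu() records a learned path MTU as a nexthop exception.
 * Locked routes and reports larger than the current MTU are ignored; a
 * report below net->ipv4.ip_rt_min_pmtu (512 + 20 + 20 = 552 by default)
 * is raised to at most that minimum (never above the old MTU) and the
 * exception is created mtu_locked so later reports cannot shrink it
 * further.  The exception expires after ip_rt_mtu_expires (10 minutes by
 * default).
 */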
  877. static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
  878. {
  879. struct dst_entry *dst = &rt->dst;
  880. struct net *net = dev_net(dst->dev);
  881. struct fib_result res;
  882. bool lock = false;
  883. u32 old_mtu;
  884. if (ip_mtu_locked(dst))
  885. return;
  886. old_mtu = ipv4_mtu(dst);
  887. if (old_mtu < mtu)
  888. return;
  889. if (mtu < net->ipv4.ip_rt_min_pmtu) {
  890. lock = true;
  891. mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu);
  892. }
  893. if (rt->rt_pmtu == mtu && !lock &&
  894. time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2))
  895. return;
  896. rcu_read_lock();
  897. if (fib_lookup(net, fl4, &res, 0) == 0) {
  898. struct fib_nh_common *nhc;
  899. fib_select_path(net, &res, fl4, NULL);
  900. nhc = FIB_RES_NHC(res);
  901. update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
  902. jiffies + net->ipv4.ip_rt_mtu_expires);
  903. }
  904. rcu_read_unlock();
  905. }
  906. static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
  907. struct sk_buff *skb, u32 mtu,
  908. bool confirm_neigh)
  909. {
  910. struct rtable *rt = (struct rtable *) dst;
  911. struct flowi4 fl4;
  912. ip_rt_build_flow_key(&fl4, sk, skb);
  913. /* Don't make lookup fail for bridged encapsulations */
  914. if (skb && netif_is_any_bridge_port(skb->dev))
  915. fl4.flowi4_oif = 0;
  916. __ip_rt_update_pmtu(rt, &fl4, mtu);
  917. }
  918. void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
  919. int oif, u8 protocol)
  920. {
  921. const struct iphdr *iph = (const struct iphdr *)skb->data;
  922. struct flowi4 fl4;
  923. struct rtable *rt;
  924. u32 mark = IP4_REPLY_MARK(net, skb->mark);
  925. __build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, mark,
  926. 0);
  927. rt = __ip_route_output_key(net, &fl4);
  928. if (!IS_ERR(rt)) {
  929. __ip_rt_update_pmtu(rt, &fl4, mtu);
  930. ip_rt_put(rt);
  931. }
  932. }
  933. EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
  934. static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
  935. {
  936. const struct iphdr *iph = (const struct iphdr *)skb->data;
  937. struct flowi4 fl4;
  938. struct rtable *rt;
  939. __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
  940. if (!fl4.flowi4_mark)
  941. fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
  942. rt = __ip_route_output_key(sock_net(sk), &fl4);
  943. if (!IS_ERR(rt)) {
  944. __ip_rt_update_pmtu(rt, &fl4, mtu);
  945. ip_rt_put(rt);
  946. }
  947. }
  948. void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
  949. {
  950. const struct iphdr *iph = (const struct iphdr *)skb->data;
  951. struct flowi4 fl4;
  952. struct rtable *rt;
  953. struct dst_entry *odst = NULL;
  954. bool new = false;
  955. struct net *net = sock_net(sk);
  956. bh_lock_sock(sk);
  957. if (!ip_sk_accept_pmtu(sk))
  958. goto out;
  959. odst = sk_dst_get(sk);
  960. if (sock_owned_by_user(sk) || !odst) {
  961. __ipv4_sk_update_pmtu(skb, sk, mtu);
  962. goto out;
  963. }
  964. __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
  965. rt = (struct rtable *)odst;
  966. if (odst->obsolete && !odst->ops->check(odst, 0)) {
  967. rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
  968. if (IS_ERR(rt))
  969. goto out;
  970. new = true;
  971. }
  972. __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
  973. if (!dst_check(&rt->dst, 0)) {
  974. if (new)
  975. dst_release(&rt->dst);
  976. rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
  977. if (IS_ERR(rt))
  978. goto out;
  979. new = true;
  980. }
  981. if (new)
  982. sk_dst_set(sk, &rt->dst);
  983. out:
  984. bh_unlock_sock(sk);
  985. dst_release(odst);
  986. }
  987. EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
  988. void ipv4_redirect(struct sk_buff *skb, struct net *net,
  989. int oif, u8 protocol)
  990. {
  991. const struct iphdr *iph = (const struct iphdr *)skb->data;
  992. struct flowi4 fl4;
  993. struct rtable *rt;
  994. __build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, 0, 0);
  995. rt = __ip_route_output_key(net, &fl4);
  996. if (!IS_ERR(rt)) {
  997. __ip_do_redirect(rt, skb, &fl4, false);
  998. ip_rt_put(rt);
  999. }
  1000. }
  1001. EXPORT_SYMBOL_GPL(ipv4_redirect);
  1002. void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
  1003. {
  1004. const struct iphdr *iph = (const struct iphdr *)skb->data;
  1005. struct flowi4 fl4;
  1006. struct rtable *rt;
  1007. struct net *net = sock_net(sk);
  1008. __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
  1009. rt = __ip_route_output_key(net, &fl4);
  1010. if (!IS_ERR(rt)) {
  1011. __ip_do_redirect(rt, skb, &fl4, false);
  1012. ip_rt_put(rt);
  1013. }
  1014. }
  1015. EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
  1016. INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst,
  1017. u32 cookie)
  1018. {
  1019. struct rtable *rt = (struct rtable *) dst;
  1020. /* All IPV4 dsts are created with ->obsolete set to the value
  1021. * DST_OBSOLETE_FORCE_CHK which forces validation calls down
  1022. * into this function always.
  1023. *
  1024. * When a PMTU/redirect information update invalidates a route,
  1025. * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
  1026. * DST_OBSOLETE_DEAD.
  1027. */
  1028. if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
  1029. return NULL;
  1030. return dst;
  1031. }
  1032. EXPORT_INDIRECT_CALLABLE(ipv4_dst_check);
  1033. static void ipv4_send_dest_unreach(struct sk_buff *skb)
  1034. {
  1035. struct net_device *dev;
  1036. struct ip_options opt;
  1037. int res;
  1038. /* Recompile ip options since IPCB may not be valid anymore.
  1039. * Also check we have a reasonable ipv4 header.
  1040. */
  1041. if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
  1042. ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
  1043. return;
  1044. memset(&opt, 0, sizeof(opt));
  1045. if (ip_hdr(skb)->ihl > 5) {
  1046. if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
  1047. return;
  1048. opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
  1049. rcu_read_lock();
  1050. dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
  1051. res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
  1052. rcu_read_unlock();
  1053. if (res)
  1054. return;
  1055. }
  1056. __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
  1057. }
  1058. static void ipv4_link_failure(struct sk_buff *skb)
  1059. {
  1060. struct rtable *rt;
  1061. ipv4_send_dest_unreach(skb);
  1062. rt = skb_rtable(skb);
  1063. if (rt)
  1064. dst_set_expires(&rt->dst, 0);
  1065. }
  1066. static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
  1067. {
  1068. pr_debug("%s: %pI4 -> %pI4, %s\n",
  1069. __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
  1070. skb->dev ? skb->dev->name : "?");
  1071. kfree_skb(skb);
  1072. WARN_ON(1);
  1073. return 0;
  1074. }
  1075. /*
  1076. * We do not cache source address of outgoing interface,
  1077. * because it is used only by IP RR, TS and SRR options,
  1078. * so it stays out of the fast path.
  1079. *
  1080. * BTW remember: "addr" is allowed to be not aligned
  1081. * in IP options!
  1082. */
  1083. void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
  1084. {
  1085. __be32 src;
  1086. if (rt_is_output_route(rt))
  1087. src = ip_hdr(skb)->saddr;
  1088. else {
  1089. struct fib_result res;
  1090. struct iphdr *iph = ip_hdr(skb);
  1091. struct flowi4 fl4 = {
  1092. .daddr = iph->daddr,
  1093. .saddr = iph->saddr,
  1094. .flowi4_tos = RT_TOS(iph->tos),
  1095. .flowi4_oif = rt->dst.dev->ifindex,
  1096. .flowi4_iif = skb->dev->ifindex,
  1097. .flowi4_mark = skb->mark,
  1098. };
  1099. rcu_read_lock();
  1100. if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
  1101. src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
  1102. else
  1103. src = inet_select_addr(rt->dst.dev,
  1104. rt_nexthop(rt, iph->daddr),
  1105. RT_SCOPE_UNIVERSE);
  1106. rcu_read_unlock();
  1107. }
  1108. memcpy(addr, &src, 4);
  1109. }
  1110. #ifdef CONFIG_IP_ROUTE_CLASSID
  1111. static void set_class_tag(struct rtable *rt, u32 tag)
  1112. {
  1113. if (!(rt->dst.tclassid & 0xFFFF))
  1114. rt->dst.tclassid |= tag & 0xFFFF;
  1115. if (!(rt->dst.tclassid & 0xFFFF0000))
  1116. rt->dst.tclassid |= tag & 0xFFFF0000;
  1117. }
  1118. #endif
  1119. static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
  1120. {
  1121. struct net *net = dev_net(dst->dev);
  1122. unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
  1123. unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
  1124. net->ipv4.ip_rt_min_advmss);
  1125. return min(advmss, IPV4_MAX_PMTU - header_size);
  1126. }
  1127. INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst)
  1128. {
  1129. return ip_dst_mtu_maybe_forward(dst, false);
  1130. }
  1131. EXPORT_INDIRECT_CALLABLE(ipv4_mtu);
  1132. static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
  1133. {
  1134. struct fnhe_hash_bucket *hash;
  1135. struct fib_nh_exception *fnhe, __rcu **fnhe_p;
  1136. u32 hval = fnhe_hashfun(daddr);
  1137. spin_lock_bh(&fnhe_lock);
  1138. hash = rcu_dereference_protected(nhc->nhc_exceptions,
  1139. lockdep_is_held(&fnhe_lock));
  1140. hash += hval;
  1141. fnhe_p = &hash->chain;
  1142. fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
  1143. while (fnhe) {
  1144. if (fnhe->fnhe_daddr == daddr) {
  1145. rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
  1146. fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
  1147. /* set fnhe_daddr to 0 to ensure it won't bind with
  1148. * new dsts in rt_bind_exception().
  1149. */
  1150. fnhe->fnhe_daddr = 0;
  1151. fnhe_flush_routes(fnhe);
  1152. kfree_rcu(fnhe, rcu);
  1153. break;
  1154. }
  1155. fnhe_p = &fnhe->fnhe_next;
  1156. fnhe = rcu_dereference_protected(fnhe->fnhe_next,
  1157. lockdep_is_held(&fnhe_lock));
  1158. }
  1159. spin_unlock_bh(&fnhe_lock);
  1160. }
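/* find_exception() returns the nexthop exception for @daddr, lazily
 * deleting it if it has already expired, so callers only ever see live
 * entries.
 */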
  1161. static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
  1162. __be32 daddr)
  1163. {
  1164. struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
  1165. struct fib_nh_exception *fnhe;
  1166. u32 hval;
  1167. if (!hash)
  1168. return NULL;
  1169. hval = fnhe_hashfun(daddr);
  1170. for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
  1171. fnhe = rcu_dereference(fnhe->fnhe_next)) {
  1172. if (fnhe->fnhe_daddr == daddr) {
  1173. if (fnhe->fnhe_expires &&
  1174. time_after(jiffies, fnhe->fnhe_expires)) {
  1175. ip_del_fnhe(nhc, daddr);
  1176. break;
  1177. }
  1178. return fnhe;
  1179. }
  1180. }
  1181. return NULL;
  1182. }
  1183. /* MTU selection:
  1184. * 1. mtu on route is locked - use it
  1185. * 2. mtu from nexthop exception
  1186. * 3. mtu from egress device
  1187. */
  1188. u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
  1189. {
  1190. struct fib_nh_common *nhc = res->nhc;
  1191. struct net_device *dev = nhc->nhc_dev;
  1192. struct fib_info *fi = res->fi;
  1193. u32 mtu = 0;
  1194. if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
  1195. fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
  1196. mtu = fi->fib_mtu;
  1197. if (likely(!mtu)) {
  1198. struct fib_nh_exception *fnhe;
  1199. fnhe = find_exception(nhc, daddr);
  1200. if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
  1201. mtu = fnhe->fnhe_pmtu;
  1202. }
  1203. if (likely(!mtu))
  1204. mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
  1205. return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
  1206. }
  1207. static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
  1208. __be32 daddr, const bool do_cache)
  1209. {
  1210. bool ret = false;
  1211. spin_lock_bh(&fnhe_lock);
  1212. if (daddr == fnhe->fnhe_daddr) {
  1213. struct rtable __rcu **porig;
  1214. struct rtable *orig;
  1215. int genid = fnhe_genid(dev_net(rt->dst.dev));
  1216. if (rt_is_input_route(rt))
  1217. porig = &fnhe->fnhe_rth_input;
  1218. else
  1219. porig = &fnhe->fnhe_rth_output;
  1220. orig = rcu_dereference(*porig);
  1221. if (fnhe->fnhe_genid != genid) {
  1222. fnhe->fnhe_genid = genid;
  1223. fnhe->fnhe_gw = 0;
  1224. fnhe->fnhe_pmtu = 0;
  1225. fnhe->fnhe_expires = 0;
  1226. fnhe->fnhe_mtu_locked = false;
  1227. fnhe_flush_routes(fnhe);
  1228. orig = NULL;
  1229. }
  1230. fill_route_from_fnhe(rt, fnhe);
  1231. if (!rt->rt_gw4) {
  1232. rt->rt_gw4 = daddr;
  1233. rt->rt_gw_family = AF_INET;
  1234. }
  1235. if (do_cache) {
  1236. dst_hold(&rt->dst);
  1237. rcu_assign_pointer(*porig, rt);
  1238. if (orig) {
  1239. dst_dev_put(&orig->dst);
  1240. dst_release(&orig->dst);
  1241. }
  1242. ret = true;
  1243. }
  1244. fnhe->fnhe_stamp = jiffies;
  1245. }
  1246. spin_unlock_bh(&fnhe_lock);
  1247. return ret;
  1248. }
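/* Generation handling (illustrative summary): fnhe_genid() is a per-netns
 * counter; when the stored fnhe_genid no longer matches it, the stale
 * gateway, PMTU, expiry and cached dsts are wiped above before the new dst
 * is bound.  The caller's choice between exception and nexthop caching has
 * the shape used by rt_set_nexthop() further down:
 *
 *	if (unlikely(fnhe))
 *		cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
 *	else if (do_cache)
 *		cached = rt_cache_route(nhc, rt);
 */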
  1249. static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
  1250. {
  1251. struct rtable *orig, *prev, **p;
  1252. bool ret = true;
  1253. if (rt_is_input_route(rt)) {
  1254. p = (struct rtable **)&nhc->nhc_rth_input;
  1255. } else {
  1256. p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
  1257. }
  1258. orig = *p;
  1259. /* hold dst before doing cmpxchg() to avoid race condition
  1260. * on this dst
  1261. */
  1262. dst_hold(&rt->dst);
  1263. prev = cmpxchg(p, orig, rt);
  1264. if (prev == orig) {
  1265. if (orig) {
  1266. rt_add_uncached_list(orig);
  1267. dst_release(&orig->dst);
  1268. }
  1269. } else {
  1270. dst_release(&rt->dst);
  1271. ret = false;
  1272. }
  1273. return ret;
  1274. }
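/* The insert above is lock-free: (1) dst_hold() takes a reference on behalf
 * of the cache slot, (2) cmpxchg() publishes the new route, (3) on success
 * the previous entry is retired via rt_add_uncached_list() + dst_release().
 * Losing the race is harmless: the extra hold is dropped again and the route
 * simply stays uncached (ret == false), so the caller falls back to the
 * per-cpu uncached list.
 */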
  1275. struct uncached_list {
  1276. spinlock_t lock;
  1277. struct list_head head;
  1278. struct list_head quarantine;
  1279. };
  1280. static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
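/* Routes that could not be (or must not be) stored in the FIB nexthop or in
 * a nexthop exception are parked on a per-cpu "uncached" list instead, so
 * that rt_flush_dev() can still find and neutralise them when their device
 * goes away.  The quarantine list holds entries that have already been
 * re-pointed at blackhole_netdev (see rt_flush_dev() below).
 */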
  1281. void rt_add_uncached_list(struct rtable *rt)
  1282. {
  1283. struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
  1284. rt->rt_uncached_list = ul;
  1285. spin_lock_bh(&ul->lock);
  1286. list_add_tail(&rt->rt_uncached, &ul->head);
  1287. spin_unlock_bh(&ul->lock);
  1288. }
  1289. void rt_del_uncached_list(struct rtable *rt)
  1290. {
  1291. if (!list_empty(&rt->rt_uncached)) {
  1292. struct uncached_list *ul = rt->rt_uncached_list;
  1293. spin_lock_bh(&ul->lock);
  1294. list_del_init(&rt->rt_uncached);
  1295. spin_unlock_bh(&ul->lock);
  1296. }
  1297. }
  1298. static void ipv4_dst_destroy(struct dst_entry *dst)
  1299. {
  1300. struct rtable *rt = (struct rtable *)dst;
  1301. ip_dst_metrics_put(dst);
  1302. rt_del_uncached_list(rt);
  1303. }
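/* Called on device unregistration: any uncached route still pointing at the
 * dying device is re-targeted at blackhole_netdev (the device reference is
 * moved with netdev_ref_replace()) and moved to the quarantine list, so the
 * device can go away without waiting for every dst to be released.
 */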
  1304. void rt_flush_dev(struct net_device *dev)
  1305. {
  1306. struct rtable *rt, *safe;
  1307. int cpu;
  1308. for_each_possible_cpu(cpu) {
  1309. struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
  1310. if (list_empty(&ul->head))
  1311. continue;
  1312. spin_lock_bh(&ul->lock);
  1313. list_for_each_entry_safe(rt, safe, &ul->head, rt_uncached) {
  1314. if (rt->dst.dev != dev)
  1315. continue;
  1316. rt->dst.dev = blackhole_netdev;
  1317. netdev_ref_replace(dev, blackhole_netdev,
  1318. &rt->dst.dev_tracker, GFP_ATOMIC);
  1319. list_move(&rt->rt_uncached, &ul->quarantine);
  1320. }
  1321. spin_unlock_bh(&ul->lock);
  1322. }
  1323. }
  1324. static bool rt_cache_valid(const struct rtable *rt)
  1325. {
  1326. return rt &&
  1327. rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
  1328. !rt_is_expired(rt);
  1329. }
  1330. static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
  1331. const struct fib_result *res,
  1332. struct fib_nh_exception *fnhe,
  1333. struct fib_info *fi, u16 type, u32 itag,
  1334. const bool do_cache)
  1335. {
  1336. bool cached = false;
  1337. if (fi) {
  1338. struct fib_nh_common *nhc = FIB_RES_NHC(*res);
  1339. if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
  1340. rt->rt_uses_gateway = 1;
  1341. rt->rt_gw_family = nhc->nhc_gw_family;
  1342. /* only INET and INET6 are supported */
  1343. if (likely(nhc->nhc_gw_family == AF_INET))
  1344. rt->rt_gw4 = nhc->nhc_gw.ipv4;
  1345. else
  1346. rt->rt_gw6 = nhc->nhc_gw.ipv6;
  1347. }
  1348. ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
  1349. #ifdef CONFIG_IP_ROUTE_CLASSID
  1350. if (nhc->nhc_family == AF_INET) {
  1351. struct fib_nh *nh;
  1352. nh = container_of(nhc, struct fib_nh, nh_common);
  1353. rt->dst.tclassid = nh->nh_tclassid;
  1354. }
  1355. #endif
  1356. rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
  1357. if (unlikely(fnhe))
  1358. cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
  1359. else if (do_cache)
  1360. cached = rt_cache_route(nhc, rt);
  1361. if (unlikely(!cached)) {
  1362. /* Routes we intend to cache in nexthop exception or
  1363. * FIB nexthop have the DST_NOCACHE bit clear.
  1364. * However, if we are unsuccessful at storing this
  1365. * route into the cache we really need to set it.
  1366. */
  1367. if (!rt->rt_gw4) {
  1368. rt->rt_gw_family = AF_INET;
  1369. rt->rt_gw4 = daddr;
  1370. }
  1371. rt_add_uncached_list(rt);
  1372. }
  1373. } else
  1374. rt_add_uncached_list(rt);
  1375. #ifdef CONFIG_IP_ROUTE_CLASSID
  1376. #ifdef CONFIG_IP_MULTIPLE_TABLES
  1377. set_class_tag(rt, res->tclassid);
  1378. #endif
  1379. set_class_tag(rt, itag);
  1380. #endif
  1381. }
  1382. struct rtable *rt_dst_alloc(struct net_device *dev,
  1383. unsigned int flags, u16 type,
  1384. bool noxfrm)
  1385. {
  1386. struct rtable *rt;
  1387. rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
  1388. (noxfrm ? DST_NOXFRM : 0));
  1389. if (rt) {
  1390. rt->rt_genid = rt_genid_ipv4(dev_net(dev));
  1391. rt->rt_flags = flags;
  1392. rt->rt_type = type;
  1393. rt->rt_is_input = 0;
  1394. rt->rt_iif = 0;
  1395. rt->rt_pmtu = 0;
  1396. rt->rt_mtu_locked = 0;
  1397. rt->rt_uses_gateway = 0;
  1398. rt->rt_gw_family = 0;
  1399. rt->rt_gw4 = 0;
  1400. INIT_LIST_HEAD(&rt->rt_uncached);
  1401. rt->dst.output = ip_output;
  1402. if (flags & RTCF_LOCAL)
  1403. rt->dst.input = ip_local_deliver;
  1404. }
  1405. return rt;
  1406. }
  1407. EXPORT_SYMBOL(rt_dst_alloc);
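/* Illustrative use: callers pass the egress (or loopback) device, the RTCF_*
 * flags and the RTN_* type, e.g. as ip_route_input_mc() does below:
 *
 *	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
 *			   false);
 *	if (!rth)
 *		return -ENOBUFS;
 */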
  1408. struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
  1409. {
  1410. struct rtable *new_rt;
  1411. new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
  1412. rt->dst.flags);
  1413. if (new_rt) {
  1414. new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
  1415. new_rt->rt_flags = rt->rt_flags;
  1416. new_rt->rt_type = rt->rt_type;
  1417. new_rt->rt_is_input = rt->rt_is_input;
  1418. new_rt->rt_iif = rt->rt_iif;
  1419. new_rt->rt_pmtu = rt->rt_pmtu;
  1420. new_rt->rt_mtu_locked = rt->rt_mtu_locked;
  1421. new_rt->rt_gw_family = rt->rt_gw_family;
  1422. if (rt->rt_gw_family == AF_INET)
  1423. new_rt->rt_gw4 = rt->rt_gw4;
  1424. else if (rt->rt_gw_family == AF_INET6)
  1425. new_rt->rt_gw6 = rt->rt_gw6;
  1426. INIT_LIST_HEAD(&new_rt->rt_uncached);
  1427. new_rt->dst.input = rt->dst.input;
  1428. new_rt->dst.output = rt->dst.output;
  1429. new_rt->dst.error = rt->dst.error;
  1430. new_rt->dst.lastuse = jiffies;
  1431. new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
  1432. }
  1433. return new_rt;
  1434. }
  1435. EXPORT_SYMBOL(rt_dst_clone);
  1436. /* called in rcu_read_lock() section */
  1437. int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
  1438. u8 tos, struct net_device *dev,
  1439. struct in_device *in_dev, u32 *itag)
  1440. {
  1441. int err;
  1442. /* Primary sanity checks. */
  1443. if (!in_dev)
  1444. return -EINVAL;
  1445. if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
  1446. skb->protocol != htons(ETH_P_IP))
  1447. return -EINVAL;
  1448. if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
  1449. return -EINVAL;
  1450. if (ipv4_is_zeronet(saddr)) {
  1451. if (!ipv4_is_local_multicast(daddr) &&
  1452. ip_hdr(skb)->protocol != IPPROTO_IGMP)
  1453. return -EINVAL;
  1454. } else {
  1455. err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
  1456. in_dev, itag);
  1457. if (err < 0)
  1458. return err;
  1459. }
  1460. return 0;
  1461. }
  1462. /* called in rcu_read_lock() section */
  1463. static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
  1464. u8 tos, struct net_device *dev, int our)
  1465. {
  1466. struct in_device *in_dev = __in_dev_get_rcu(dev);
  1467. unsigned int flags = RTCF_MULTICAST;
  1468. struct rtable *rth;
  1469. u32 itag = 0;
  1470. int err;
  1471. err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
  1472. if (err)
  1473. return err;
  1474. if (our)
  1475. flags |= RTCF_LOCAL;
  1476. if (IN_DEV_ORCONF(in_dev, NOPOLICY))
  1477. IPCB(skb)->flags |= IPSKB_NOPOLICY;
  1478. rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
  1479. false);
  1480. if (!rth)
  1481. return -ENOBUFS;
  1482. #ifdef CONFIG_IP_ROUTE_CLASSID
  1483. rth->dst.tclassid = itag;
  1484. #endif
  1485. rth->dst.output = ip_rt_bug;
1486. rth->rt_is_input = 1;
  1487. #ifdef CONFIG_IP_MROUTE
  1488. if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
  1489. rth->dst.input = ip_mr_input;
  1490. #endif
  1491. RT_CACHE_STAT_INC(in_slow_mc);
  1492. skb_dst_drop(skb);
  1493. skb_dst_set(skb, &rth->dst);
  1494. return 0;
  1495. }
  1496. static void ip_handle_martian_source(struct net_device *dev,
  1497. struct in_device *in_dev,
  1498. struct sk_buff *skb,
  1499. __be32 daddr,
  1500. __be32 saddr)
  1501. {
  1502. RT_CACHE_STAT_INC(in_martian_src);
  1503. #ifdef CONFIG_IP_ROUTE_VERBOSE
  1504. if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
  1505. /*
  1506. * RFC1812 recommendation, if source is martian,
  1507. * the only hint is MAC header.
  1508. */
  1509. pr_warn("martian source %pI4 from %pI4, on dev %s\n",
  1510. &daddr, &saddr, dev->name);
  1511. if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
  1512. print_hex_dump(KERN_WARNING, "ll header: ",
  1513. DUMP_PREFIX_OFFSET, 16, 1,
  1514. skb_mac_header(skb),
  1515. dev->hard_header_len, false);
  1516. }
  1517. }
  1518. #endif
  1519. }
  1520. /* called in rcu_read_lock() section */
  1521. static int __mkroute_input(struct sk_buff *skb,
  1522. const struct fib_result *res,
  1523. struct in_device *in_dev,
  1524. __be32 daddr, __be32 saddr, u32 tos)
  1525. {
  1526. struct fib_nh_common *nhc = FIB_RES_NHC(*res);
  1527. struct net_device *dev = nhc->nhc_dev;
  1528. struct fib_nh_exception *fnhe;
  1529. struct rtable *rth;
  1530. int err;
  1531. struct in_device *out_dev;
  1532. bool do_cache;
  1533. u32 itag = 0;
  1534. /* get a working reference to the output device */
  1535. out_dev = __in_dev_get_rcu(dev);
  1536. if (!out_dev) {
  1537. net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
  1538. return -EINVAL;
  1539. }
  1540. err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
  1541. in_dev->dev, in_dev, &itag);
  1542. if (err < 0) {
  1543. ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
  1544. saddr);
  1545. goto cleanup;
  1546. }
  1547. do_cache = res->fi && !itag;
  1548. if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
  1549. skb->protocol == htons(ETH_P_IP)) {
  1550. __be32 gw;
  1551. gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
  1552. if (IN_DEV_SHARED_MEDIA(out_dev) ||
  1553. inet_addr_onlink(out_dev, saddr, gw))
  1554. IPCB(skb)->flags |= IPSKB_DOREDIRECT;
  1555. }
  1556. if (skb->protocol != htons(ETH_P_IP)) {
1557. /* Not IP (i.e. ARP). Do not create a route if it is
1558. * invalid for proxy arp. DNAT routes are always valid.
1559. *
1560. * The proxy arp feature has been extended to allow ARP
1561. * replies back on the same interface, to support
1562. * Private VLAN switch technologies. See arp.c.
1563. */
  1564. if (out_dev == in_dev &&
  1565. IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
  1566. err = -EINVAL;
  1567. goto cleanup;
  1568. }
  1569. }
  1570. if (IN_DEV_ORCONF(in_dev, NOPOLICY))
  1571. IPCB(skb)->flags |= IPSKB_NOPOLICY;
  1572. fnhe = find_exception(nhc, daddr);
  1573. if (do_cache) {
  1574. if (fnhe)
  1575. rth = rcu_dereference(fnhe->fnhe_rth_input);
  1576. else
  1577. rth = rcu_dereference(nhc->nhc_rth_input);
  1578. if (rt_cache_valid(rth)) {
  1579. skb_dst_set_noref(skb, &rth->dst);
  1580. goto out;
  1581. }
  1582. }
  1583. rth = rt_dst_alloc(out_dev->dev, 0, res->type,
  1584. IN_DEV_ORCONF(out_dev, NOXFRM));
  1585. if (!rth) {
  1586. err = -ENOBUFS;
  1587. goto cleanup;
  1588. }
  1589. rth->rt_is_input = 1;
  1590. RT_CACHE_STAT_INC(in_slow_tot);
  1591. rth->dst.input = ip_forward;
  1592. rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
  1593. do_cache);
  1594. lwtunnel_set_redirect(&rth->dst);
  1595. skb_dst_set(skb, &rth->dst);
  1596. out:
  1597. err = 0;
  1598. cleanup:
  1599. return err;
  1600. }
  1601. #ifdef CONFIG_IP_ROUTE_MULTIPATH
  1602. /* To make ICMP packets follow the right flow, the multipath hash is
  1603. * calculated from the inner IP addresses.
  1604. */
  1605. static void ip_multipath_l3_keys(const struct sk_buff *skb,
  1606. struct flow_keys *hash_keys)
  1607. {
  1608. const struct iphdr *outer_iph = ip_hdr(skb);
  1609. const struct iphdr *key_iph = outer_iph;
  1610. const struct iphdr *inner_iph;
  1611. const struct icmphdr *icmph;
  1612. struct iphdr _inner_iph;
  1613. struct icmphdr _icmph;
  1614. if (likely(outer_iph->protocol != IPPROTO_ICMP))
  1615. goto out;
  1616. if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
  1617. goto out;
  1618. icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
  1619. &_icmph);
  1620. if (!icmph)
  1621. goto out;
  1622. if (!icmp_is_err(icmph->type))
  1623. goto out;
  1624. inner_iph = skb_header_pointer(skb,
  1625. outer_iph->ihl * 4 + sizeof(_icmph),
  1626. sizeof(_inner_iph), &_inner_iph);
  1627. if (!inner_iph)
  1628. goto out;
  1629. key_iph = inner_iph;
  1630. out:
  1631. hash_keys->addrs.v4addrs.src = key_iph->saddr;
  1632. hash_keys->addrs.v4addrs.dst = key_iph->daddr;
  1633. }
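/* Example of why the inner header matters (illustrative): an ICMP
 * "fragmentation needed" error generated by an intermediate router embeds
 * the offending datagram's header.  Hashing on that embedded (inner)
 * saddr/daddr makes the error take the same multipath leg as the flow it
 * refers to, so PMTU discovery keeps working across ECMP.
 */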
  1634. static u32 fib_multipath_custom_hash_outer(const struct net *net,
  1635. const struct sk_buff *skb,
  1636. bool *p_has_inner)
  1637. {
  1638. u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
  1639. struct flow_keys keys, hash_keys;
  1640. if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
  1641. return 0;
  1642. memset(&hash_keys, 0, sizeof(hash_keys));
  1643. skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP);
  1644. hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
  1645. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
  1646. hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
  1647. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
  1648. hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
  1649. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
  1650. hash_keys.basic.ip_proto = keys.basic.ip_proto;
  1651. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
  1652. hash_keys.ports.src = keys.ports.src;
  1653. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
  1654. hash_keys.ports.dst = keys.ports.dst;
  1655. *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION);
  1656. return flow_hash_from_keys(&hash_keys);
  1657. }
  1658. static u32 fib_multipath_custom_hash_inner(const struct net *net,
  1659. const struct sk_buff *skb,
  1660. bool has_inner)
  1661. {
  1662. u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
  1663. struct flow_keys keys, hash_keys;
  1664. /* We assume the packet carries an encapsulation, but if none was
  1665. * encountered during dissection of the outer flow, then there is no
  1666. * point in calling the flow dissector again.
  1667. */
  1668. if (!has_inner)
  1669. return 0;
  1670. if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK))
  1671. return 0;
  1672. memset(&hash_keys, 0, sizeof(hash_keys));
  1673. skb_flow_dissect_flow_keys(skb, &keys, 0);
  1674. if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION))
  1675. return 0;
  1676. if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
  1677. hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
  1678. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
  1679. hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
  1680. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
  1681. hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
  1682. } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
  1683. hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
  1684. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
  1685. hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
  1686. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
  1687. hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
  1688. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL)
  1689. hash_keys.tags.flow_label = keys.tags.flow_label;
  1690. }
  1691. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO)
  1692. hash_keys.basic.ip_proto = keys.basic.ip_proto;
  1693. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT)
  1694. hash_keys.ports.src = keys.ports.src;
  1695. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
  1696. hash_keys.ports.dst = keys.ports.dst;
  1697. return flow_hash_from_keys(&hash_keys);
  1698. }
  1699. static u32 fib_multipath_custom_hash_skb(const struct net *net,
  1700. const struct sk_buff *skb)
  1701. {
  1702. u32 mhash, mhash_inner;
  1703. bool has_inner = true;
  1704. mhash = fib_multipath_custom_hash_outer(net, skb, &has_inner);
  1705. mhash_inner = fib_multipath_custom_hash_inner(net, skb, has_inner);
  1706. return jhash_2words(mhash, mhash_inner, 0);
  1707. }
  1708. static u32 fib_multipath_custom_hash_fl4(const struct net *net,
  1709. const struct flowi4 *fl4)
  1710. {
  1711. u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
  1712. struct flow_keys hash_keys;
  1713. if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
  1714. return 0;
  1715. memset(&hash_keys, 0, sizeof(hash_keys));
  1716. hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
  1717. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
  1718. hash_keys.addrs.v4addrs.src = fl4->saddr;
  1719. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
  1720. hash_keys.addrs.v4addrs.dst = fl4->daddr;
  1721. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
  1722. hash_keys.basic.ip_proto = fl4->flowi4_proto;
  1723. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
  1724. hash_keys.ports.src = fl4->fl4_sport;
  1725. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
  1726. hash_keys.ports.dst = fl4->fl4_dport;
  1727. return flow_hash_from_keys(&hash_keys);
  1728. }
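/* Illustrative configuration: with fib_multipath_hash_policy set to 3, the
 * bitmask in fib_multipath_hash_fields selects which of the fields above
 * feed the hash; e.g. a mask covering FIB_MULTIPATH_HASH_FIELD_SRC_IP |
 * FIB_MULTIPATH_HASH_FIELD_DST_IP | FIB_MULTIPATH_HASH_FIELD_IP_PROTO
 * reproduces an L3-plus-protocol hash while ignoring the ports.
 */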
  1729. /* if skb is set it will be used and fl4 can be NULL */
  1730. int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
  1731. const struct sk_buff *skb, struct flow_keys *flkeys)
  1732. {
  1733. u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
  1734. struct flow_keys hash_keys;
  1735. u32 mhash = 0;
  1736. switch (READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_policy)) {
  1737. case 0:
  1738. memset(&hash_keys, 0, sizeof(hash_keys));
  1739. hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
  1740. if (skb) {
  1741. ip_multipath_l3_keys(skb, &hash_keys);
  1742. } else {
  1743. hash_keys.addrs.v4addrs.src = fl4->saddr;
  1744. hash_keys.addrs.v4addrs.dst = fl4->daddr;
  1745. }
  1746. mhash = flow_hash_from_keys(&hash_keys);
  1747. break;
  1748. case 1:
  1749. /* skb is currently provided only when forwarding */
  1750. if (skb) {
  1751. unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
  1752. struct flow_keys keys;
  1753. /* short-circuit if we already have L4 hash present */
  1754. if (skb->l4_hash)
  1755. return skb_get_hash_raw(skb) >> 1;
  1756. memset(&hash_keys, 0, sizeof(hash_keys));
  1757. if (!flkeys) {
  1758. skb_flow_dissect_flow_keys(skb, &keys, flag);
  1759. flkeys = &keys;
  1760. }
  1761. hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
  1762. hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
  1763. hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
  1764. hash_keys.ports.src = flkeys->ports.src;
  1765. hash_keys.ports.dst = flkeys->ports.dst;
  1766. hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
  1767. } else {
  1768. memset(&hash_keys, 0, sizeof(hash_keys));
  1769. hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
  1770. hash_keys.addrs.v4addrs.src = fl4->saddr;
  1771. hash_keys.addrs.v4addrs.dst = fl4->daddr;
  1772. hash_keys.ports.src = fl4->fl4_sport;
  1773. hash_keys.ports.dst = fl4->fl4_dport;
  1774. hash_keys.basic.ip_proto = fl4->flowi4_proto;
  1775. }
  1776. mhash = flow_hash_from_keys(&hash_keys);
  1777. break;
  1778. case 2:
  1779. memset(&hash_keys, 0, sizeof(hash_keys));
  1780. /* skb is currently provided only when forwarding */
  1781. if (skb) {
  1782. struct flow_keys keys;
  1783. skb_flow_dissect_flow_keys(skb, &keys, 0);
  1784. /* Inner can be v4 or v6 */
  1785. if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
  1786. hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
  1787. hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
  1788. hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
  1789. } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
  1790. hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
  1791. hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
  1792. hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
  1793. hash_keys.tags.flow_label = keys.tags.flow_label;
  1794. hash_keys.basic.ip_proto = keys.basic.ip_proto;
  1795. } else {
  1796. /* Same as case 0 */
  1797. hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
  1798. ip_multipath_l3_keys(skb, &hash_keys);
  1799. }
  1800. } else {
  1801. /* Same as case 0 */
  1802. hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
  1803. hash_keys.addrs.v4addrs.src = fl4->saddr;
  1804. hash_keys.addrs.v4addrs.dst = fl4->daddr;
  1805. }
  1806. mhash = flow_hash_from_keys(&hash_keys);
  1807. break;
  1808. case 3:
  1809. if (skb)
  1810. mhash = fib_multipath_custom_hash_skb(net, skb);
  1811. else
  1812. mhash = fib_multipath_custom_hash_fl4(net, fl4);
  1813. break;
  1814. }
  1815. if (multipath_hash)
  1816. mhash = jhash_2words(mhash, multipath_hash, 0);
  1817. return mhash >> 1;
  1818. }
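/* Summary of the policies handled above (matching the switch cases):
 *   0 - L3 only: source and destination address (the inner header is used
 *       for ICMP errors via ip_multipath_l3_keys()).
 *   1 - L4: addresses, ports and protocol, reusing a precomputed skb L4
 *       hash when one is present.
 *   2 - L3, taken from the inner packet when an encapsulation is found.
 *   3 - custom: fields chosen by fib_multipath_hash_fields.
 * The final right shift keeps the result within 31 bits so it can be
 * compared against the per-nexthop upper bounds in fib_select_multipath().
 */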
  1819. #endif /* CONFIG_IP_ROUTE_MULTIPATH */
  1820. static int ip_mkroute_input(struct sk_buff *skb,
  1821. struct fib_result *res,
  1822. struct in_device *in_dev,
  1823. __be32 daddr, __be32 saddr, u32 tos,
  1824. struct flow_keys *hkeys)
  1825. {
  1826. #ifdef CONFIG_IP_ROUTE_MULTIPATH
  1827. if (res->fi && fib_info_num_path(res->fi) > 1) {
  1828. int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
  1829. fib_select_multipath(res, h);
  1830. IPCB(skb)->flags |= IPSKB_MULTIPATH;
  1831. }
  1832. #endif
  1833. /* create a routing cache entry */
  1834. return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
  1835. }
1836. /* Performs the same saddr-related checks as ip_route_input_slow(),
1837. * assuming daddr is valid and the destination is not a local broadcast address.
1838. * Uses the provided hint instead of performing a full route lookup.
  1839. */
  1840. int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
  1841. u8 tos, struct net_device *dev,
  1842. const struct sk_buff *hint)
  1843. {
  1844. struct in_device *in_dev = __in_dev_get_rcu(dev);
  1845. struct rtable *rt = skb_rtable(hint);
  1846. struct net *net = dev_net(dev);
  1847. int err = -EINVAL;
  1848. u32 tag = 0;
  1849. if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
  1850. goto martian_source;
  1851. if (ipv4_is_zeronet(saddr))
  1852. goto martian_source;
  1853. if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
  1854. goto martian_source;
  1855. if (rt->rt_type != RTN_LOCAL)
  1856. goto skip_validate_source;
  1857. tos &= IPTOS_RT_MASK;
  1858. err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
  1859. if (err < 0)
  1860. goto martian_source;
  1861. skip_validate_source:
  1862. skb_dst_copy(skb, hint);
  1863. return 0;
  1864. martian_source:
  1865. ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
  1866. return err;
  1867. }
  1868. /* get device for dst_alloc with local routes */
  1869. static struct net_device *ip_rt_get_dev(struct net *net,
  1870. const struct fib_result *res)
  1871. {
  1872. struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
  1873. struct net_device *dev = NULL;
  1874. if (nhc)
  1875. dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
  1876. return dev ? : net->loopback_dev;
  1877. }
  1878. /*
1879. * NOTE. We drop all packets that have a local source
1880. * address, because every properly looped-back packet
1881. * must already have the correct destination attached by the output routine.
1882. * Changes in the enforced policies must also be applied to
1883. * ip_route_use_hint().
1884. *
1885. * This approach solves two big problems:
1886. * 1. Non-simplex devices are handled properly.
1887. * 2. IP spoofing attempts are filtered with a 100% guarantee.
  1888. * called with rcu_read_lock()
  1889. */
  1890. static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
  1891. u8 tos, struct net_device *dev,
  1892. struct fib_result *res)
  1893. {
  1894. struct in_device *in_dev = __in_dev_get_rcu(dev);
  1895. struct flow_keys *flkeys = NULL, _flkeys;
  1896. struct net *net = dev_net(dev);
  1897. struct ip_tunnel_info *tun_info;
  1898. int err = -EINVAL;
  1899. unsigned int flags = 0;
  1900. u32 itag = 0;
  1901. struct rtable *rth;
  1902. struct flowi4 fl4;
  1903. bool do_cache = true;
  1904. /* IP on this device is disabled. */
  1905. if (!in_dev)
  1906. goto out;
  1907. /* Check for the most weird martians, which can be not detected
  1908. * by fib_lookup.
  1909. */
  1910. tun_info = skb_tunnel_info(skb);
  1911. if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
  1912. fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
  1913. else
  1914. fl4.flowi4_tun_key.tun_id = 0;
  1915. skb_dst_drop(skb);
  1916. if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
  1917. goto martian_source;
  1918. res->fi = NULL;
  1919. res->table = NULL;
  1920. if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
  1921. goto brd_input;
1922. /* Accept zero addresses only for limited broadcast;
1923. * I am not even sure whether to fix this or not. Waiting for complaints :-)
  1924. */
  1925. if (ipv4_is_zeronet(saddr))
  1926. goto martian_source;
  1927. if (ipv4_is_zeronet(daddr))
  1928. goto martian_destination;
1929. /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
1930. * more than once, and calls it only if daddr and/or saddr is a loopback address
  1931. */
  1932. if (ipv4_is_loopback(daddr)) {
  1933. if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
  1934. goto martian_destination;
  1935. } else if (ipv4_is_loopback(saddr)) {
  1936. if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
  1937. goto martian_source;
  1938. }
  1939. /*
  1940. * Now we are ready to route packet.
  1941. */
  1942. fl4.flowi4_l3mdev = 0;
  1943. fl4.flowi4_oif = 0;
  1944. fl4.flowi4_iif = dev->ifindex;
  1945. fl4.flowi4_mark = skb->mark;
  1946. fl4.flowi4_tos = tos;
  1947. fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
  1948. fl4.flowi4_flags = 0;
  1949. fl4.daddr = daddr;
  1950. fl4.saddr = saddr;
  1951. fl4.flowi4_uid = sock_net_uid(net, NULL);
  1952. fl4.flowi4_multipath_hash = 0;
  1953. if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
  1954. flkeys = &_flkeys;
  1955. } else {
  1956. fl4.flowi4_proto = 0;
  1957. fl4.fl4_sport = 0;
  1958. fl4.fl4_dport = 0;
  1959. }
  1960. err = fib_lookup(net, &fl4, res, 0);
  1961. if (err != 0) {
  1962. if (!IN_DEV_FORWARD(in_dev))
  1963. err = -EHOSTUNREACH;
  1964. goto no_route;
  1965. }
  1966. if (res->type == RTN_BROADCAST) {
  1967. if (IN_DEV_BFORWARD(in_dev))
  1968. goto make_route;
1969. /* do not cache if bc_forwarding is enabled */
  1970. if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
  1971. do_cache = false;
  1972. goto brd_input;
  1973. }
  1974. if (res->type == RTN_LOCAL) {
  1975. err = fib_validate_source(skb, saddr, daddr, tos,
  1976. 0, dev, in_dev, &itag);
  1977. if (err < 0)
  1978. goto martian_source;
  1979. goto local_input;
  1980. }
  1981. if (!IN_DEV_FORWARD(in_dev)) {
  1982. err = -EHOSTUNREACH;
  1983. goto no_route;
  1984. }
  1985. if (res->type != RTN_UNICAST)
  1986. goto martian_destination;
  1987. make_route:
  1988. err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
  1989. out: return err;
  1990. brd_input:
  1991. if (skb->protocol != htons(ETH_P_IP))
  1992. goto e_inval;
  1993. if (!ipv4_is_zeronet(saddr)) {
  1994. err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
  1995. in_dev, &itag);
  1996. if (err < 0)
  1997. goto martian_source;
  1998. }
  1999. flags |= RTCF_BROADCAST;
  2000. res->type = RTN_BROADCAST;
  2001. RT_CACHE_STAT_INC(in_brd);
  2002. local_input:
  2003. if (IN_DEV_ORCONF(in_dev, NOPOLICY))
  2004. IPCB(skb)->flags |= IPSKB_NOPOLICY;
  2005. do_cache &= res->fi && !itag;
  2006. if (do_cache) {
  2007. struct fib_nh_common *nhc = FIB_RES_NHC(*res);
  2008. rth = rcu_dereference(nhc->nhc_rth_input);
  2009. if (rt_cache_valid(rth)) {
  2010. skb_dst_set_noref(skb, &rth->dst);
  2011. err = 0;
  2012. goto out;
  2013. }
  2014. }
  2015. rth = rt_dst_alloc(ip_rt_get_dev(net, res),
  2016. flags | RTCF_LOCAL, res->type, false);
  2017. if (!rth)
  2018. goto e_nobufs;
2019. rth->dst.output = ip_rt_bug;
  2020. #ifdef CONFIG_IP_ROUTE_CLASSID
  2021. rth->dst.tclassid = itag;
  2022. #endif
  2023. rth->rt_is_input = 1;
  2024. RT_CACHE_STAT_INC(in_slow_tot);
  2025. if (res->type == RTN_UNREACHABLE) {
2026. rth->dst.input = ip_error;
2027. rth->dst.error = -err;
  2028. rth->rt_flags &= ~RTCF_LOCAL;
  2029. }
  2030. if (do_cache) {
  2031. struct fib_nh_common *nhc = FIB_RES_NHC(*res);
  2032. rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
  2033. if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
  2034. WARN_ON(rth->dst.input == lwtunnel_input);
  2035. rth->dst.lwtstate->orig_input = rth->dst.input;
  2036. rth->dst.input = lwtunnel_input;
  2037. }
  2038. if (unlikely(!rt_cache_route(nhc, rth)))
  2039. rt_add_uncached_list(rth);
  2040. }
  2041. skb_dst_set(skb, &rth->dst);
  2042. err = 0;
  2043. goto out;
  2044. no_route:
  2045. RT_CACHE_STAT_INC(in_no_route);
  2046. res->type = RTN_UNREACHABLE;
  2047. res->fi = NULL;
  2048. res->table = NULL;
  2049. goto local_input;
  2050. /*
  2051. * Do not cache martian addresses: they should be logged (RFC1812)
  2052. */
  2053. martian_destination:
  2054. RT_CACHE_STAT_INC(in_martian_dst);
  2055. #ifdef CONFIG_IP_ROUTE_VERBOSE
  2056. if (IN_DEV_LOG_MARTIANS(in_dev))
  2057. net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
  2058. &daddr, &saddr, dev->name);
  2059. #endif
  2060. e_inval:
  2061. err = -EINVAL;
  2062. goto out;
  2063. e_nobufs:
  2064. err = -ENOBUFS;
  2065. goto out;
  2066. martian_source:
  2067. ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
  2068. goto out;
  2069. }
  2070. /* called with rcu_read_lock held */
  2071. static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
  2072. u8 tos, struct net_device *dev, struct fib_result *res)
  2073. {
2074. /* Multicast recognition logic was moved from the route cache to here.
2075. * The problem was that too many Ethernet cards have broken/missing
2076. * hardware multicast filters :-( As a result, a host on a multicast
2077. * network acquired a lot of useless route cache entries, e.g. for
2078. * SDR messages from all over the world. Now we try to get rid of them.
2079. * Really, provided the software IP multicast filter is organized
2080. * reasonably (at least, hashed), this does not result in a slowdown
2081. * compared with route cache reject entries.
2082. * Note that multicast routers are not affected, because a
2083. * route cache entry is created for them eventually.
  2084. */
  2085. if (ipv4_is_multicast(daddr)) {
  2086. struct in_device *in_dev = __in_dev_get_rcu(dev);
  2087. int our = 0;
  2088. int err = -EINVAL;
  2089. if (!in_dev)
  2090. return err;
  2091. our = ip_check_mc_rcu(in_dev, daddr, saddr,
  2092. ip_hdr(skb)->protocol);
  2093. /* check l3 master if no match yet */
  2094. if (!our && netif_is_l3_slave(dev)) {
  2095. struct in_device *l3_in_dev;
  2096. l3_in_dev = __in_dev_get_rcu(skb->dev);
  2097. if (l3_in_dev)
  2098. our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
  2099. ip_hdr(skb)->protocol);
  2100. }
  2101. if (our
  2102. #ifdef CONFIG_IP_MROUTE
  2103. ||
  2104. (!ipv4_is_local_multicast(daddr) &&
  2105. IN_DEV_MFORWARD(in_dev))
  2106. #endif
  2107. ) {
  2108. err = ip_route_input_mc(skb, daddr, saddr,
  2109. tos, dev, our);
  2110. }
  2111. return err;
  2112. }
  2113. return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
  2114. }
  2115. int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
  2116. u8 tos, struct net_device *dev)
  2117. {
  2118. struct fib_result res;
  2119. int err;
  2120. tos &= IPTOS_RT_MASK;
  2121. rcu_read_lock();
  2122. err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
  2123. rcu_read_unlock();
  2124. return err;
  2125. }
  2126. EXPORT_SYMBOL(ip_route_input_noref);
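/* Illustrative caller sketch (an assumption about the receive path, not part
 * of this file): the input path resolves the route for a freshly received
 * packet roughly as
 *
 *	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				   iph->tos, dev);
 *
 * and drops the packet on error; the resulting dst is attached to the skb
 * without taking an extra reference, hence the "_noref" suffix.
 */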
  2127. /* called with rcu_read_lock() */
  2128. static struct rtable *__mkroute_output(const struct fib_result *res,
  2129. const struct flowi4 *fl4, int orig_oif,
  2130. struct net_device *dev_out,
  2131. unsigned int flags)
  2132. {
  2133. struct fib_info *fi = res->fi;
  2134. struct fib_nh_exception *fnhe;
  2135. struct in_device *in_dev;
  2136. u16 type = res->type;
  2137. struct rtable *rth;
  2138. bool do_cache;
  2139. in_dev = __in_dev_get_rcu(dev_out);
  2140. if (!in_dev)
  2141. return ERR_PTR(-EINVAL);
  2142. if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
  2143. if (ipv4_is_loopback(fl4->saddr) &&
  2144. !(dev_out->flags & IFF_LOOPBACK) &&
  2145. !netif_is_l3_master(dev_out))
  2146. return ERR_PTR(-EINVAL);
  2147. if (ipv4_is_lbcast(fl4->daddr))
  2148. type = RTN_BROADCAST;
  2149. else if (ipv4_is_multicast(fl4->daddr))
  2150. type = RTN_MULTICAST;
  2151. else if (ipv4_is_zeronet(fl4->daddr))
  2152. return ERR_PTR(-EINVAL);
  2153. if (dev_out->flags & IFF_LOOPBACK)
  2154. flags |= RTCF_LOCAL;
  2155. do_cache = true;
  2156. if (type == RTN_BROADCAST) {
  2157. flags |= RTCF_BROADCAST | RTCF_LOCAL;
  2158. fi = NULL;
  2159. } else if (type == RTN_MULTICAST) {
  2160. flags |= RTCF_MULTICAST | RTCF_LOCAL;
  2161. if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
  2162. fl4->flowi4_proto))
  2163. flags &= ~RTCF_LOCAL;
  2164. else
  2165. do_cache = false;
2166. /* If a multicast route does not exist, use the
2167. * default one, but do not use a gateway in this case.
2168. * Yes, it is a hack.
  2169. */
  2170. if (fi && res->prefixlen < 4)
  2171. fi = NULL;
  2172. } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
  2173. (orig_oif != dev_out->ifindex)) {
  2174. /* For local routes that require a particular output interface
  2175. * we do not want to cache the result. Caching the result
  2176. * causes incorrect behaviour when there are multiple source
  2177. * addresses on the interface, the end result being that if the
  2178. * intended recipient is waiting on that interface for the
  2179. * packet he won't receive it because it will be delivered on
  2180. * the loopback interface and the IP_PKTINFO ipi_ifindex will
  2181. * be set to the loopback interface as well.
  2182. */
  2183. do_cache = false;
  2184. }
  2185. fnhe = NULL;
  2186. do_cache &= fi != NULL;
  2187. if (fi) {
  2188. struct fib_nh_common *nhc = FIB_RES_NHC(*res);
  2189. struct rtable __rcu **prth;
  2190. fnhe = find_exception(nhc, fl4->daddr);
  2191. if (!do_cache)
  2192. goto add;
  2193. if (fnhe) {
  2194. prth = &fnhe->fnhe_rth_output;
  2195. } else {
  2196. if (unlikely(fl4->flowi4_flags &
  2197. FLOWI_FLAG_KNOWN_NH &&
  2198. !(nhc->nhc_gw_family &&
  2199. nhc->nhc_scope == RT_SCOPE_LINK))) {
  2200. do_cache = false;
  2201. goto add;
  2202. }
  2203. prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
  2204. }
  2205. rth = rcu_dereference(*prth);
  2206. if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
  2207. return rth;
  2208. }
  2209. add:
  2210. rth = rt_dst_alloc(dev_out, flags, type,
  2211. IN_DEV_ORCONF(in_dev, NOXFRM));
  2212. if (!rth)
  2213. return ERR_PTR(-ENOBUFS);
  2214. rth->rt_iif = orig_oif;
  2215. RT_CACHE_STAT_INC(out_slow_tot);
  2216. if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
  2217. if (flags & RTCF_LOCAL &&
  2218. !(dev_out->flags & IFF_LOOPBACK)) {
  2219. rth->dst.output = ip_mc_output;
  2220. RT_CACHE_STAT_INC(out_slow_mc);
  2221. }
  2222. #ifdef CONFIG_IP_MROUTE
  2223. if (type == RTN_MULTICAST) {
  2224. if (IN_DEV_MFORWARD(in_dev) &&
  2225. !ipv4_is_local_multicast(fl4->daddr)) {
  2226. rth->dst.input = ip_mr_input;
  2227. rth->dst.output = ip_mc_output;
  2228. }
  2229. }
  2230. #endif
  2231. }
  2232. rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
  2233. lwtunnel_set_redirect(&rth->dst);
  2234. return rth;
  2235. }
  2236. /*
  2237. * Major route resolver routine.
  2238. */
  2239. struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
  2240. const struct sk_buff *skb)
  2241. {
  2242. struct fib_result res = {
  2243. .type = RTN_UNSPEC,
  2244. .fi = NULL,
  2245. .table = NULL,
  2246. .tclassid = 0,
  2247. };
  2248. struct rtable *rth;
  2249. fl4->flowi4_iif = LOOPBACK_IFINDEX;
  2250. ip_rt_fix_tos(fl4);
  2251. rcu_read_lock();
  2252. rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
  2253. rcu_read_unlock();
  2254. return rth;
  2255. }
  2256. EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
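/* Illustrative output lookup (a minimal sketch, not taken from this file):
 *
 *	struct flowi4 fl4 = {
 *		.daddr       = daddr,
 *		.flowi4_oif  = 0,
 *		.flowi4_mark = mark,
 *	};
 *	struct rtable *rt = ip_route_output_key(net, &fl4);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 *
 * ip_route_output_key() is a thin wrapper that ends up here via
 * ip_route_output_key_hash().
 */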
  2257. struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
  2258. struct fib_result *res,
  2259. const struct sk_buff *skb)
  2260. {
  2261. struct net_device *dev_out = NULL;
  2262. int orig_oif = fl4->flowi4_oif;
  2263. unsigned int flags = 0;
  2264. struct rtable *rth;
  2265. int err;
  2266. if (fl4->saddr) {
  2267. if (ipv4_is_multicast(fl4->saddr) ||
  2268. ipv4_is_lbcast(fl4->saddr) ||
  2269. ipv4_is_zeronet(fl4->saddr)) {
  2270. rth = ERR_PTR(-EINVAL);
  2271. goto out;
  2272. }
  2273. rth = ERR_PTR(-ENETUNREACH);
2274. /* I removed the check for oif == dev_out->oif here.
2275. * It was wrong for two reasons:
2276. * 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2277. * is assigned to multiple interfaces.
2278. * 2. Moreover, we are allowed to send packets with the saddr
2279. * of another iface. --ANK
  2280. */
  2281. if (fl4->flowi4_oif == 0 &&
  2282. (ipv4_is_multicast(fl4->daddr) ||
  2283. ipv4_is_lbcast(fl4->daddr))) {
  2284. /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
  2285. dev_out = __ip_dev_find(net, fl4->saddr, false);
  2286. if (!dev_out)
  2287. goto out;
2288. /* Special hack: the user can direct multicasts
2289. * and limited broadcast via the necessary interface
2290. * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2291. * This hack is not just for fun; it allows
2292. * vic, vat and friends to work.
2293. * They bind the socket to loopback, set ttl to zero
2294. * and expect that it will work.
2295. * From the viewpoint of the routing cache they are broken,
2296. * because we are not allowed to build a multicast path
2297. * with a loopback source address (the routing cache
2298. * cannot know that ttl is zero, so that the packet
2299. * will not leave this host and the route is valid).
2300. * Luckily, this hack is a good workaround.
  2301. */
  2302. fl4->flowi4_oif = dev_out->ifindex;
  2303. goto make_route;
  2304. }
  2305. if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
  2306. /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
  2307. if (!__ip_dev_find(net, fl4->saddr, false))
  2308. goto out;
  2309. }
  2310. }
  2311. if (fl4->flowi4_oif) {
  2312. dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
  2313. rth = ERR_PTR(-ENODEV);
  2314. if (!dev_out)
  2315. goto out;
  2316. /* RACE: Check return value of inet_select_addr instead. */
  2317. if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
  2318. rth = ERR_PTR(-ENETUNREACH);
  2319. goto out;
  2320. }
  2321. if (ipv4_is_local_multicast(fl4->daddr) ||
  2322. ipv4_is_lbcast(fl4->daddr) ||
  2323. fl4->flowi4_proto == IPPROTO_IGMP) {
  2324. if (!fl4->saddr)
  2325. fl4->saddr = inet_select_addr(dev_out, 0,
  2326. RT_SCOPE_LINK);
  2327. goto make_route;
  2328. }
  2329. if (!fl4->saddr) {
  2330. if (ipv4_is_multicast(fl4->daddr))
  2331. fl4->saddr = inet_select_addr(dev_out, 0,
  2332. fl4->flowi4_scope);
  2333. else if (!fl4->daddr)
  2334. fl4->saddr = inet_select_addr(dev_out, 0,
  2335. RT_SCOPE_HOST);
  2336. }
  2337. }
  2338. if (!fl4->daddr) {
  2339. fl4->daddr = fl4->saddr;
  2340. if (!fl4->daddr)
  2341. fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
  2342. dev_out = net->loopback_dev;
  2343. fl4->flowi4_oif = LOOPBACK_IFINDEX;
  2344. res->type = RTN_LOCAL;
  2345. flags |= RTCF_LOCAL;
  2346. goto make_route;
  2347. }
  2348. err = fib_lookup(net, fl4, res, 0);
  2349. if (err) {
  2350. res->fi = NULL;
  2351. res->table = NULL;
  2352. if (fl4->flowi4_oif &&
  2353. (ipv4_is_multicast(fl4->daddr) || !fl4->flowi4_l3mdev)) {
2354. /* Apparently, the routing tables are wrong. Assume
2355. * that the destination is on-link.
2356. *
2357. * WHY? DW.
2358. * Because we are allowed to send to an iface
2359. * even if it has NO routes and NO assigned
2360. * addresses. When oif is specified, the routing
2361. * tables are looked up with only one purpose:
2362. * to check whether the destination is gatewayed, rather
2363. * than direct. Moreover, if MSG_DONTROUTE is set,
2364. * we send the packet, ignoring both the routing tables
2365. * and the ifaddr state. --ANK
  2366. *
  2367. *
  2368. * We could make it even if oif is unknown,
  2369. * likely IPv6, but we do not.
  2370. */
  2371. if (fl4->saddr == 0)
  2372. fl4->saddr = inet_select_addr(dev_out, 0,
  2373. RT_SCOPE_LINK);
  2374. res->type = RTN_UNICAST;
  2375. goto make_route;
  2376. }
  2377. rth = ERR_PTR(err);
  2378. goto out;
  2379. }
  2380. if (res->type == RTN_LOCAL) {
  2381. if (!fl4->saddr) {
  2382. if (res->fi->fib_prefsrc)
  2383. fl4->saddr = res->fi->fib_prefsrc;
  2384. else
  2385. fl4->saddr = fl4->daddr;
  2386. }
  2387. /* L3 master device is the loopback for that domain */
  2388. dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
  2389. net->loopback_dev;
  2390. /* make sure orig_oif points to fib result device even
  2391. * though packet rx/tx happens over loopback or l3mdev
  2392. */
  2393. orig_oif = FIB_RES_OIF(*res);
  2394. fl4->flowi4_oif = dev_out->ifindex;
  2395. flags |= RTCF_LOCAL;
  2396. goto make_route;
  2397. }
  2398. fib_select_path(net, res, fl4, skb);
  2399. dev_out = FIB_RES_DEV(*res);
  2400. make_route:
  2401. rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
  2402. out:
  2403. return rth;
  2404. }
  2405. static struct dst_ops ipv4_dst_blackhole_ops = {
  2406. .family = AF_INET,
  2407. .default_advmss = ipv4_default_advmss,
  2408. .neigh_lookup = ipv4_neigh_lookup,
  2409. .check = dst_blackhole_check,
  2410. .cow_metrics = dst_blackhole_cow_metrics,
  2411. .update_pmtu = dst_blackhole_update_pmtu,
  2412. .redirect = dst_blackhole_redirect,
  2413. .mtu = dst_blackhole_mtu,
  2414. };
  2415. struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
  2416. {
  2417. struct rtable *ort = (struct rtable *) dst_orig;
  2418. struct rtable *rt;
  2419. rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
  2420. if (rt) {
  2421. struct dst_entry *new = &rt->dst;
  2422. new->__use = 1;
  2423. new->input = dst_discard;
  2424. new->output = dst_discard_out;
  2425. new->dev = net->loopback_dev;
  2426. netdev_hold(new->dev, &new->dev_tracker, GFP_ATOMIC);
  2427. rt->rt_is_input = ort->rt_is_input;
  2428. rt->rt_iif = ort->rt_iif;
  2429. rt->rt_pmtu = ort->rt_pmtu;
  2430. rt->rt_mtu_locked = ort->rt_mtu_locked;
  2431. rt->rt_genid = rt_genid_ipv4(net);
  2432. rt->rt_flags = ort->rt_flags;
  2433. rt->rt_type = ort->rt_type;
  2434. rt->rt_uses_gateway = ort->rt_uses_gateway;
  2435. rt->rt_gw_family = ort->rt_gw_family;
  2436. if (rt->rt_gw_family == AF_INET)
  2437. rt->rt_gw4 = ort->rt_gw4;
  2438. else if (rt->rt_gw_family == AF_INET6)
  2439. rt->rt_gw6 = ort->rt_gw6;
  2440. INIT_LIST_HEAD(&rt->rt_uncached);
  2441. }
  2442. dst_release(dst_orig);
  2443. return rt ? &rt->dst : ERR_PTR(-ENOMEM);
  2444. }
  2445. struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
  2446. const struct sock *sk)
  2447. {
  2448. struct rtable *rt = __ip_route_output_key(net, flp4);
  2449. if (IS_ERR(rt))
  2450. return rt;
  2451. if (flp4->flowi4_proto) {
  2452. flp4->flowi4_oif = rt->dst.dev->ifindex;
  2453. rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
  2454. flowi4_to_flowi(flp4),
  2455. sk, 0);
  2456. }
  2457. return rt;
  2458. }
  2459. EXPORT_SYMBOL_GPL(ip_route_output_flow);
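/* Illustrative note: this is the variant used by socket code, where an xfrm
 * (IPsec) policy lookup may replace the plain route.  A hedged sketch of a
 * typical connected-socket style call:
 *
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	sk_setup_caps(sk, &rt->dst);
 *
 * (sk_setup_caps() is named purely to illustrate where the dst typically
 * ends up.)
 */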
  2460. struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
  2461. struct net_device *dev,
  2462. struct net *net, __be32 *saddr,
  2463. const struct ip_tunnel_info *info,
  2464. u8 protocol, bool use_cache)
  2465. {
  2466. #ifdef CONFIG_DST_CACHE
  2467. struct dst_cache *dst_cache;
  2468. #endif
  2469. struct rtable *rt = NULL;
  2470. struct flowi4 fl4;
  2471. __u8 tos;
  2472. #ifdef CONFIG_DST_CACHE
  2473. dst_cache = (struct dst_cache *)&info->dst_cache;
  2474. if (use_cache) {
  2475. rt = dst_cache_get_ip4(dst_cache, saddr);
  2476. if (rt)
  2477. return rt;
  2478. }
  2479. #endif
  2480. memset(&fl4, 0, sizeof(fl4));
  2481. fl4.flowi4_mark = skb->mark;
  2482. fl4.flowi4_proto = protocol;
  2483. fl4.daddr = info->key.u.ipv4.dst;
  2484. fl4.saddr = info->key.u.ipv4.src;
  2485. tos = info->key.tos;
  2486. fl4.flowi4_tos = RT_TOS(tos);
  2487. rt = ip_route_output_key(net, &fl4);
  2488. if (IS_ERR(rt)) {
  2489. netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
  2490. return ERR_PTR(-ENETUNREACH);
  2491. }
  2492. if (rt->dst.dev == dev) { /* is this necessary? */
  2493. netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
  2494. ip_rt_put(rt);
  2495. return ERR_PTR(-ELOOP);
  2496. }
  2497. #ifdef CONFIG_DST_CACHE
  2498. if (use_cache)
  2499. dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
  2500. #endif
  2501. *saddr = fl4.saddr;
  2502. return rt;
  2503. }
  2504. EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
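/* Illustrative note: collect_md style tunnel drivers call this on transmit
 * with the per-skb tunnel metadata; 'use_cache' lets them reuse the dst
 * stored in info->dst_cache when the key (and thus the route) is stable.
 * A hedged sketch of such a caller:
 *
 *	rt = ip_route_output_tunnel(skb, dev, net, &saddr, info,
 *				    IPPROTO_UDP, use_cache);
 *	if (IS_ERR(rt))
 *		goto tx_error;
 */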
  2505. /* called with rcu_read_lock held */
  2506. static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
  2507. struct rtable *rt, u32 table_id, struct flowi4 *fl4,
  2508. struct sk_buff *skb, u32 portid, u32 seq,
  2509. unsigned int flags)
  2510. {
  2511. struct rtmsg *r;
  2512. struct nlmsghdr *nlh;
  2513. unsigned long expires = 0;
  2514. u32 error;
  2515. u32 metrics[RTAX_MAX];
  2516. nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
  2517. if (!nlh)
  2518. return -EMSGSIZE;
  2519. r = nlmsg_data(nlh);
  2520. r->rtm_family = AF_INET;
  2521. r->rtm_dst_len = 32;
  2522. r->rtm_src_len = 0;
  2523. r->rtm_tos = fl4 ? fl4->flowi4_tos : 0;
  2524. r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
  2525. if (nla_put_u32(skb, RTA_TABLE, table_id))
  2526. goto nla_put_failure;
  2527. r->rtm_type = rt->rt_type;
  2528. r->rtm_scope = RT_SCOPE_UNIVERSE;
  2529. r->rtm_protocol = RTPROT_UNSPEC;
  2530. r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
  2531. if (rt->rt_flags & RTCF_NOTIFY)
  2532. r->rtm_flags |= RTM_F_NOTIFY;
  2533. if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
  2534. r->rtm_flags |= RTCF_DOREDIRECT;
  2535. if (nla_put_in_addr(skb, RTA_DST, dst))
  2536. goto nla_put_failure;
  2537. if (src) {
  2538. r->rtm_src_len = 32;
  2539. if (nla_put_in_addr(skb, RTA_SRC, src))
  2540. goto nla_put_failure;
  2541. }
  2542. if (rt->dst.dev &&
  2543. nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
  2544. goto nla_put_failure;
  2545. if (rt->dst.lwtstate &&
  2546. lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
  2547. goto nla_put_failure;
  2548. #ifdef CONFIG_IP_ROUTE_CLASSID
  2549. if (rt->dst.tclassid &&
  2550. nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
  2551. goto nla_put_failure;
  2552. #endif
  2553. if (fl4 && !rt_is_input_route(rt) &&
  2554. fl4->saddr != src) {
  2555. if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
  2556. goto nla_put_failure;
  2557. }
  2558. if (rt->rt_uses_gateway) {
  2559. if (rt->rt_gw_family == AF_INET &&
  2560. nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
  2561. goto nla_put_failure;
  2562. } else if (rt->rt_gw_family == AF_INET6) {
  2563. int alen = sizeof(struct in6_addr);
  2564. struct nlattr *nla;
  2565. struct rtvia *via;
  2566. nla = nla_reserve(skb, RTA_VIA, alen + 2);
  2567. if (!nla)
  2568. goto nla_put_failure;
  2569. via = nla_data(nla);
  2570. via->rtvia_family = AF_INET6;
  2571. memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
  2572. }
  2573. }
  2574. expires = rt->dst.expires;
  2575. if (expires) {
  2576. unsigned long now = jiffies;
  2577. if (time_before(now, expires))
  2578. expires -= now;
  2579. else
  2580. expires = 0;
  2581. }
  2582. memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
  2583. if (rt->rt_pmtu && expires)
  2584. metrics[RTAX_MTU - 1] = rt->rt_pmtu;
  2585. if (rt->rt_mtu_locked && expires)
  2586. metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
  2587. if (rtnetlink_put_metrics(skb, metrics) < 0)
  2588. goto nla_put_failure;
  2589. if (fl4) {
  2590. if (fl4->flowi4_mark &&
  2591. nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
  2592. goto nla_put_failure;
  2593. if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
  2594. nla_put_u32(skb, RTA_UID,
  2595. from_kuid_munged(current_user_ns(),
  2596. fl4->flowi4_uid)))
  2597. goto nla_put_failure;
  2598. if (rt_is_input_route(rt)) {
  2599. #ifdef CONFIG_IP_MROUTE
  2600. if (ipv4_is_multicast(dst) &&
  2601. !ipv4_is_local_multicast(dst) &&
  2602. IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
  2603. int err = ipmr_get_route(net, skb,
  2604. fl4->saddr, fl4->daddr,
  2605. r, portid);
  2606. if (err <= 0) {
  2607. if (err == 0)
  2608. return 0;
  2609. goto nla_put_failure;
  2610. }
  2611. } else
  2612. #endif
  2613. if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
  2614. goto nla_put_failure;
  2615. }
  2616. }
  2617. error = rt->dst.error;
  2618. if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
  2619. goto nla_put_failure;
  2620. nlmsg_end(skb, nlh);
  2621. return 0;
  2622. nla_put_failure:
  2623. nlmsg_cancel(skb, nlh);
  2624. return -EMSGSIZE;
  2625. }
  2626. static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
  2627. struct netlink_callback *cb, u32 table_id,
  2628. struct fnhe_hash_bucket *bucket, int genid,
  2629. int *fa_index, int fa_start, unsigned int flags)
  2630. {
  2631. int i;
  2632. for (i = 0; i < FNHE_HASH_SIZE; i++) {
  2633. struct fib_nh_exception *fnhe;
  2634. for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
  2635. fnhe = rcu_dereference(fnhe->fnhe_next)) {
  2636. struct rtable *rt;
  2637. int err;
  2638. if (*fa_index < fa_start)
  2639. goto next;
  2640. if (fnhe->fnhe_genid != genid)
  2641. goto next;
  2642. if (fnhe->fnhe_expires &&
  2643. time_after(jiffies, fnhe->fnhe_expires))
  2644. goto next;
  2645. rt = rcu_dereference(fnhe->fnhe_rth_input);
  2646. if (!rt)
  2647. rt = rcu_dereference(fnhe->fnhe_rth_output);
  2648. if (!rt)
  2649. goto next;
  2650. err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
  2651. table_id, NULL, skb,
  2652. NETLINK_CB(cb->skb).portid,
  2653. cb->nlh->nlmsg_seq, flags);
  2654. if (err)
  2655. return err;
  2656. next:
  2657. (*fa_index)++;
  2658. }
  2659. }
  2660. return 0;
  2661. }
  2662. int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
  2663. u32 table_id, struct fib_info *fi,
  2664. int *fa_index, int fa_start, unsigned int flags)
  2665. {
  2666. struct net *net = sock_net(cb->skb->sk);
  2667. int nhsel, genid = fnhe_genid(net);
  2668. for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
  2669. struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
  2670. struct fnhe_hash_bucket *bucket;
  2671. int err;
  2672. if (nhc->nhc_flags & RTNH_F_DEAD)
  2673. continue;
  2674. rcu_read_lock();
  2675. bucket = rcu_dereference(nhc->nhc_exceptions);
  2676. err = 0;
  2677. if (bucket)
  2678. err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
  2679. genid, fa_index, fa_start,
  2680. flags);
  2681. rcu_read_unlock();
  2682. if (err)
  2683. return err;
  2684. }
  2685. return 0;
  2686. }
  2687. static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
  2688. u8 ip_proto, __be16 sport,
  2689. __be16 dport)
  2690. {
  2691. struct sk_buff *skb;
  2692. struct iphdr *iph;
  2693. skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
  2694. if (!skb)
  2695. return NULL;
2696. /* Reserve room for dummy headers; this skb can pass
2697. * through a good chunk of the routing engine.
  2698. */
  2699. skb_reset_mac_header(skb);
  2700. skb_reset_network_header(skb);
  2701. skb->protocol = htons(ETH_P_IP);
  2702. iph = skb_put(skb, sizeof(struct iphdr));
  2703. iph->protocol = ip_proto;
  2704. iph->saddr = src;
  2705. iph->daddr = dst;
  2706. iph->version = 0x4;
  2707. iph->frag_off = 0;
  2708. iph->ihl = 0x5;
  2709. skb_set_transport_header(skb, skb->len);
  2710. switch (iph->protocol) {
  2711. case IPPROTO_UDP: {
  2712. struct udphdr *udph;
  2713. udph = skb_put_zero(skb, sizeof(struct udphdr));
  2714. udph->source = sport;
  2715. udph->dest = dport;
  2716. udph->len = htons(sizeof(struct udphdr));
  2717. udph->check = 0;
  2718. break;
  2719. }
  2720. case IPPROTO_TCP: {
  2721. struct tcphdr *tcph;
  2722. tcph = skb_put_zero(skb, sizeof(struct tcphdr));
  2723. tcph->source = sport;
  2724. tcph->dest = dport;
  2725. tcph->doff = sizeof(struct tcphdr) / 4;
  2726. tcph->rst = 1;
  2727. tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
  2728. src, dst, 0);
  2729. break;
  2730. }
  2731. case IPPROTO_ICMP: {
  2732. struct icmphdr *icmph;
  2733. icmph = skb_put_zero(skb, sizeof(struct icmphdr));
  2734. icmph->type = ICMP_ECHO;
  2735. icmph->code = 0;
  2736. }
  2737. }
  2738. return skb;
  2739. }
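/* The dummy skb built above carries just enough of an IPv4 + L4 header for
 * the input path to treat it like a real packet; it is what lets
 * "ip route get ... iif <dev>" exercise ip_route_input_rcu() below with the
 * requested protocol and ports instead of only doing an output lookup.
 */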

static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
				       const struct nlmsghdr *nlh,
				       struct nlattr **tb,
				       struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG(extack,
			       "ipv4: Invalid header for route get request");
		return -EINVAL;
	}

	if (!netlink_strict_get_check(skb))
		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
					      rtm_ipv4_policy, extack);

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
	    rtm->rtm_table || rtm->rtm_protocol ||
	    rtm->rtm_scope || rtm->rtm_type) {
		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
		return -EINVAL;
	}

	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
			       RTM_F_LOOKUP_TABLE |
			       RTM_F_FIB_MATCH)) {
		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
		return -EINVAL;
	}

	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
					    rtm_ipv4_policy, extack);
	if (err)
		return err;

	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
		return -EINVAL;
	}

	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_IIF:
		case RTA_OIF:
		case RTA_SRC:
		case RTA_DST:
		case RTA_IP_PROTO:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_MARK:
		case RTA_UID:
			break;
		default:
			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
			return -EINVAL;
		}
	}

	return 0;
}
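
/* doit handler for RTM_GETROUTE.  Builds a dummy packet, resolves the route
 * (input path when RTA_IIF is supplied, output path otherwise) and answers
 * with an RTM_NEWROUTE message.  Illustrative userspace usage via iproute2:
 *
 *	ip route get 192.0.2.1
 *	ip route get 192.0.2.1 fibmatch
 *
 * where "fibmatch" sets RTM_F_FIB_MATCH so the reply describes the matching
 * FIB entry rather than the resolved dst.
 */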

static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			     struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	u32 table_id = RT_TABLE_MAIN;
	__be16 sport = 0, dport = 0;
	struct fib_result res = {};
	u8 ip_proto = IPPROTO_UDP;
	struct rtable *rt = NULL;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi4 fl4 = {};
	__be32 dst = 0;
	__be32 src = 0;
	kuid_t uid;
	u32 iif;
	int err;
	int mark;

	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		return err;

	rtm = nlmsg_data(nlh);
	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
	if (tb[RTA_UID])
		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
	else
		uid = (iif ? INVALID_UID : current_uid());

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &ip_proto, AF_INET, extack);
		if (err)
			return err;
	}

	if (tb[RTA_SPORT])
		sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		dport = nla_get_be16(tb[RTA_DPORT]);

	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
	if (!skb)
		return -ENOBUFS;

	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;
	fl4.flowi4_uid = uid;
	if (sport)
		fl4.fl4_sport = sport;
	if (dport)
		fl4.fl4_dport = dport;
	fl4.flowi4_proto = ip_proto;

	rcu_read_lock();

	if (iif) {
		struct net_device *dev;

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout_rcu;
		}

		fl4.flowi4_iif = iif; /* for rt_fill_info */
		skb->dev = dev;
		skb->mark = mark;
		err = ip_route_input_rcu(skb, dst, src,
					 rtm->rtm_tos & IPTOS_RT_MASK, dev,
					 &res);

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		fl4.flowi4_iif = LOOPBACK_IFINDEX;
		skb->dev = net->loopback_dev;
		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
		else
			skb_dst_set(skb, &rt->dst);
	}

	if (err)
		goto errout_rcu;

	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
		table_id = res.table ? res.table->tb_id : 0;

	/* reset skb for netlink reply msg */
	skb_trim(skb, 0);
	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb_reset_mac_header(skb);

	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
		struct fib_rt_info fri;

		if (!res.fi) {
			err = fib_props[res.type].error;
			if (!err)
				err = -EHOSTUNREACH;
			goto errout_rcu;
		}
		fri.fi = res.fi;
		fri.tb_id = table_id;
		fri.dst = res.prefix;
		fri.dst_len = res.prefixlen;
		fri.dscp = inet_dsfield_to_dscp(fl4.flowi4_tos);
		fri.type = rt->rt_type;
		fri.offload = 0;
		fri.trap = 0;
		fri.offload_failed = 0;
		if (res.fa_head) {
			struct fib_alias *fa;

			hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
				u8 slen = 32 - fri.dst_len;

				if (fa->fa_slen == slen &&
				    fa->tb_id == fri.tb_id &&
				    fa->fa_dscp == fri.dscp &&
				    fa->fa_info == res.fi &&
				    fa->fa_type == fri.type) {
					fri.offload = READ_ONCE(fa->offload);
					fri.trap = READ_ONCE(fa->trap);
					fri.offload_failed =
						READ_ONCE(fa->offload_failed);
					break;
				}
			}
		}
		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
	} else {
		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
				   NETLINK_CB(in_skb).portid,
				   nlh->nlmsg_seq, 0);
	}
	if (err < 0)
		goto errout_rcu;

	rcu_read_unlock();

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

errout_free:
	return err;
errout_rcu:
	rcu_read_unlock();
	kfree_skb(skb);
	goto errout_free;
}
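
/* Flush the routing cache of the device's namespace; invoked when the
 * device's multicast configuration changes.
 */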

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}

#ifdef CONFIG_SYSCTL
static int ip_rt_gc_interval __read_mostly = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
static int ip_rt_gc_elasticity __read_mostly = 8;
static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;

static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
				     void *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net = (struct net *)__ctl->extra1;

	if (write) {
		rt_cache_flush(net);
		fnhe_genid_bump(net);
		return 0;
	}

	return -EINVAL;
}
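
/* Writing any value to the "flush" sysctl (handler above) drops cached
 * routing state for the namespace (rt_cache_flush()) and invalidates
 * next-hop exceptions (fnhe_genid_bump()).  Illustrative usage:
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * The table below holds the remaining, system-wide route sysctls; the
 * per-namespace ones live in ipv4_route_netns_table further down.
 */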

static struct ctl_table ipv4_route_table[] = {
	{
		.procname = "gc_thresh",
		.data = &ipv4_dst_ops.gc_thresh,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "max_size",
		.data = &ip_rt_max_size,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.procname = "gc_min_interval",
		.data = &ip_rt_gc_min_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "gc_min_interval_ms",
		.data = &ip_rt_gc_min_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_ms_jiffies,
	},
	{
		.procname = "gc_timeout",
		.data = &ip_rt_gc_timeout,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "gc_interval",
		.data = &ip_rt_gc_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "redirect_load",
		.data = &ip_rt_redirect_load,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "redirect_number",
		.data = &ip_rt_redirect_number,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "redirect_silence",
		.data = &ip_rt_redirect_silence,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "error_cost",
		.data = &ip_rt_error_cost,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "error_burst",
		.data = &ip_rt_error_burst,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "gc_elasticity",
		.data = &ip_rt_gc_elasticity,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{ }
};
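
/* Per-network-namespace route sysctls.  The table below is only a template:
 * sysctl_route_net_init() duplicates it for every non-initial namespace and
 * rebases the .data pointers into that namespace's struct net.  "flush" is
 * write-only (mode 0200) and carries no data of its own.
 */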

static const char ipv4_route_flush_procname[] = "flush";

static struct ctl_table ipv4_route_netns_table[] = {
	{
		.procname = ipv4_route_flush_procname,
		.maxlen = sizeof(int),
		.mode = 0200,
		.proc_handler = ipv4_sysctl_rtcache_flush,
	},
	{
		.procname = "min_pmtu",
		.data = &init_net.ipv4.ip_rt_min_pmtu,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_minmax,
		.extra1 = &ip_min_valid_pmtu,
	},
	{
		.procname = "mtu_expires",
		.data = &init_net.ipv4.ip_rt_mtu_expires,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "min_adv_mss",
		.data = &init_net.ipv4.ip_rt_min_advmss,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{ },
};
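
/* Register the per-namespace route sysctls.  For namespaces other than
 * init_net the template above is kmemdup()ed and every .data pointer
 * (except the dataless "flush" entry) is shifted from &init_net to the new
 * namespace; "flush"'s ->extra1 is pointed at the namespace so the handler
 * knows which namespace to flush.
 */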

static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_netns_table;
	if (!net_eq(net, &init_net)) {
		int i;

		tbl = kmemdup(tbl, sizeof(ipv4_route_netns_table), GFP_KERNEL);
		if (!tbl)
			goto err_dup;

		/* Don't export non-whitelisted sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns) {
			if (tbl[0].procname != ipv4_route_flush_procname)
				tbl[0].procname = NULL;
		}

		/* Update the variables to point into the current struct net
		 * except for the first element flush
		 */
		for (i = 1; i < ARRAY_SIZE(ipv4_route_netns_table) - 1; i++)
			tbl[i].data += (void *)net - (void *)&init_net;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (!net->ipv4.route_hdr)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_netns_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_netns_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif
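
/* Per-namespace defaults for the tunables above.  Registered unconditionally
 * (see ip_rt_init() below), so the values are initialized even when
 * CONFIG_SYSCTL is disabled.
 */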

static __net_init int netns_ip_rt_init(struct net *net)
{
	/* Set default value for namespaceified sysctls */
	net->ipv4.ip_rt_min_pmtu = DEFAULT_MIN_PMTU;
	net->ipv4.ip_rt_mtu_expires = DEFAULT_MTU_EXPIRES;
	net->ipv4.ip_rt_min_advmss = DEFAULT_MIN_ADVMSS;

	return 0;
}

static struct pernet_operations __net_initdata ip_rt_ops = {
	.init = netns_ip_rt_init,
};
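
/* Seed the per-namespace generation counters.  Cached dsts and next-hop
 * exceptions record the genid that was current when they were created;
 * bumping rt_genid or fnhe_genid therefore invalidates them lazily, without
 * walking and freeing them up front.
 */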

static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	atomic_set(&net->ipv4.dev_addr_genid, get_random_u32());
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
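
/* Each namespace gets its own inet_peer tree, which caches long-lived
 * per-remote-host state (used, among other things, for ICMP rate limiting).
 */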

static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init = ipv4_inetpeer_init,
	.exit = ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
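
/* One-time boot initialization of the IPv4 routing layer: the IP ident
 * hash, the per-CPU uncached-route lists, the dst slab caches, the /proc
 * files, the RTM_GETROUTE handler and the pernet subsystems defined above.
 */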

int __init ip_rt_init(void)
{
	void *idents_hash;
	int cpu;

	/* For modern hosts, this will use 2 MB of memory */
	idents_hash = alloc_large_system_hash("IP idents",
					      sizeof(*ip_idents) + sizeof(*ip_tstamps),
					      0,
					      16, /* one bucket per 64 KB */
					      HASH_ZERO,
					      NULL,
					      &ip_idents_mask,
					      2048,
					      256*1024);

	ip_idents = idents_hash;

	get_random_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));

	ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		INIT_LIST_HEAD(&ul->quarantine);
		spin_lock_init(&ul->lock);
	}
#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
		      RTNL_FLAG_DOIT_UNLOCKED);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&ip_rt_ops);
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);

	return 0;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif