bpf_jit_comp.c

  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * BPF JIT compiler
  4. *
  5. * Copyright (C) 2011-2013 Eric Dumazet (eric.dumazet@gmail.com)
  6. * Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
  7. */
  8. #include <linux/netdevice.h>
  9. #include <linux/filter.h>
  10. #include <linux/if_vlan.h>
  11. #include <linux/bpf.h>
  12. #include <linux/memory.h>
  13. #include <linux/sort.h>
  14. #include <asm/extable.h>
  15. #include <asm/set_memory.h>
  16. #include <asm/nospec-branch.h>
  17. #include <asm/text-patching.h>
  18. static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
  19. {
  20. if (len == 1)
  21. *ptr = bytes;
  22. else if (len == 2)
  23. *(u16 *)ptr = bytes;
  24. else {
  25. *(u32 *)ptr = bytes;
  26. barrier();
  27. }
  28. return ptr + len;
  29. }
  30. #define EMIT(bytes, len) \
  31. do { prog = emit_code(prog, bytes, len); } while (0)
  32. #define EMIT1(b1) EMIT(b1, 1)
  33. #define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2)
  34. #define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
  35. #define EMIT4(b1, b2, b3, b4) EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4)
  36. #define EMIT1_off32(b1, off) \
  37. do { EMIT1(b1); EMIT(off, 4); } while (0)
  38. #define EMIT2_off32(b1, b2, off) \
  39. do { EMIT2(b1, b2); EMIT(off, 4); } while (0)
  40. #define EMIT3_off32(b1, b2, b3, off) \
  41. do { EMIT3(b1, b2, b3); EMIT(off, 4); } while (0)
  42. #define EMIT4_off32(b1, b2, b3, b4, off) \
  43. do { EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
  44. #ifdef CONFIG_X86_KERNEL_IBT
  45. #define EMIT_ENDBR() EMIT(gen_endbr(), 4)
  46. #else
  47. #define EMIT_ENDBR()
  48. #endif
  49. static bool is_imm8(int value)
  50. {
  51. return value <= 127 && value >= -128;
  52. }
  53. static bool is_simm32(s64 value)
  54. {
  55. return value == (s64)(s32)value;
  56. }
  57. static bool is_uimm32(u64 value)
  58. {
  59. return value == (u64)(u32)value;
  60. }
  61. /* mov dst, src */
  62. #define EMIT_mov(DST, SRC) \
  63. do { \
  64. if (DST != SRC) \
  65. EMIT3(add_2mod(0x48, DST, SRC), 0x89, add_2reg(0xC0, DST, SRC)); \
  66. } while (0)
  67. static int bpf_size_to_x86_bytes(int bpf_size)
  68. {
  69. if (bpf_size == BPF_W)
  70. return 4;
  71. else if (bpf_size == BPF_H)
  72. return 2;
  73. else if (bpf_size == BPF_B)
  74. return 1;
  75. else if (bpf_size == BPF_DW)
  76. return 4; /* imm32 */
  77. else
  78. return 0;
  79. }
  80. /*
  81. * List of x86 cond jumps opcodes (. + s8)
  82. * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32)
  83. */
  84. #define X86_JB 0x72
  85. #define X86_JAE 0x73
  86. #define X86_JE 0x74
  87. #define X86_JNE 0x75
  88. #define X86_JBE 0x76
  89. #define X86_JA 0x77
  90. #define X86_JL 0x7C
  91. #define X86_JGE 0x7D
  92. #define X86_JLE 0x7E
  93. #define X86_JG 0x7F
  94. /* Pick a register outside of BPF range for JIT internal work */
  95. #define AUX_REG (MAX_BPF_JIT_REG + 1)
  96. #define X86_REG_R9 (MAX_BPF_JIT_REG + 2)
  97. /*
  98. * The following table maps BPF registers to x86-64 registers.
  99. *
  100. * x86-64 register R12 is unused, since if used as base address
  101. * register in load/store instructions, it always needs an
  102. * extra byte of encoding and is callee saved.
  103. *
  104. * x86-64 register R9 is not used by BPF programs, but can be used by BPF
  105. * trampoline. x86-64 register R10 is used for blinding (if enabled).
  106. */
  107. static const int reg2hex[] = {
  108. [BPF_REG_0] = 0, /* RAX */
  109. [BPF_REG_1] = 7, /* RDI */
  110. [BPF_REG_2] = 6, /* RSI */
  111. [BPF_REG_3] = 2, /* RDX */
  112. [BPF_REG_4] = 1, /* RCX */
  113. [BPF_REG_5] = 0, /* R8 */
  114. [BPF_REG_6] = 3, /* RBX callee saved */
  115. [BPF_REG_7] = 5, /* R13 callee saved */
  116. [BPF_REG_8] = 6, /* R14 callee saved */
  117. [BPF_REG_9] = 7, /* R15 callee saved */
  118. [BPF_REG_FP] = 5, /* RBP readonly */
  119. [BPF_REG_AX] = 2, /* R10 temp register */
  120. [AUX_REG] = 3, /* R11 temp register */
  121. [X86_REG_R9] = 1, /* R9 register, 6th function argument */
  122. };
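/* Offsets into struct pt_regs for each BPF register; ex_handler_bpf() uses
 * them to zero out the destination register of a faulting BPF_PROBE_MEM load.
 */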
  123. static const int reg2pt_regs[] = {
  124. [BPF_REG_0] = offsetof(struct pt_regs, ax),
  125. [BPF_REG_1] = offsetof(struct pt_regs, di),
  126. [BPF_REG_2] = offsetof(struct pt_regs, si),
  127. [BPF_REG_3] = offsetof(struct pt_regs, dx),
  128. [BPF_REG_4] = offsetof(struct pt_regs, cx),
  129. [BPF_REG_5] = offsetof(struct pt_regs, r8),
  130. [BPF_REG_6] = offsetof(struct pt_regs, bx),
  131. [BPF_REG_7] = offsetof(struct pt_regs, r13),
  132. [BPF_REG_8] = offsetof(struct pt_regs, r14),
  133. [BPF_REG_9] = offsetof(struct pt_regs, r15),
  134. };
  135. /*
  136. * is_ereg() == true if BPF register 'reg' maps to x86-64 r8..r15
  137. * which need extra byte of encoding.
  138. * rax,rcx,...,rbp have simpler encoding
  139. */
  140. static bool is_ereg(u32 reg)
  141. {
  142. return (1 << reg) & (BIT(BPF_REG_5) |
  143. BIT(AUX_REG) |
  144. BIT(BPF_REG_7) |
  145. BIT(BPF_REG_8) |
  146. BIT(BPF_REG_9) |
  147. BIT(X86_REG_R9) |
  148. BIT(BPF_REG_AX));
  149. }
  150. /*
  151. * is_ereg_8l() == true if BPF register 'reg' is mapped to access x86-64
  152. * lower 8-bit registers dil,sil,bpl,spl,r8b..r15b, which need extra byte
  153. * of encoding. al,cl,dl,bl have simpler encoding.
  154. */
  155. static bool is_ereg_8l(u32 reg)
  156. {
  157. return is_ereg(reg) ||
  158. (1 << reg) & (BIT(BPF_REG_1) |
  159. BIT(BPF_REG_2) |
  160. BIT(BPF_REG_FP));
  161. }
  162. static bool is_axreg(u32 reg)
  163. {
  164. return reg == BPF_REG_0;
  165. }
  166. /* Add modifiers if 'reg' maps to x86-64 registers R8..R15 */
  167. static u8 add_1mod(u8 byte, u32 reg)
  168. {
  169. if (is_ereg(reg))
  170. byte |= 1;
  171. return byte;
  172. }
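/* Bit 0 of the REX prefix (REX.B) extends the ModR/M r/m field; bit 2 (REX.R)
 * extends the ModR/M reg field. add_2mod() below sets them for r1 and r2
 * respectively, matching add_2reg() which puts r1 in r/m and r2 in reg.
 */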
  173. static u8 add_2mod(u8 byte, u32 r1, u32 r2)
  174. {
  175. if (is_ereg(r1))
  176. byte |= 1;
  177. if (is_ereg(r2))
  178. byte |= 4;
  179. return byte;
  180. }
  181. /* Encode 'dst_reg' register into x86-64 opcode 'byte' */
  182. static u8 add_1reg(u8 byte, u32 dst_reg)
  183. {
  184. return byte + reg2hex[dst_reg];
  185. }
  186. /* Encode 'dst_reg' and 'src_reg' registers into x86-64 opcode 'byte' */
  187. static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
  188. {
  189. return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3);
  190. }
  191. /* Some 1-byte opcodes for binary ALU operations */
  192. static u8 simple_alu_opcodes[] = {
  193. [BPF_ADD] = 0x01,
  194. [BPF_SUB] = 0x29,
  195. [BPF_AND] = 0x21,
  196. [BPF_OR] = 0x09,
  197. [BPF_XOR] = 0x31,
  198. [BPF_LSH] = 0xE0,
  199. [BPF_RSH] = 0xE8,
  200. [BPF_ARSH] = 0xF8,
  201. };
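/* The ADD/SUB/AND/OR/XOR entries are one-byte "op r/m, reg" opcodes. The
 * LSH/RSH/ARSH entries are ModR/M bytes (mod=11 plus the /4, /5, /7 reg-field
 * extension) combined with opcode 0xC1/0xD1/0xD3 via add_1reg() to form
 * shl/shr/sar.
 */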
  202. static void jit_fill_hole(void *area, unsigned int size)
  203. {
  204. /* Fill whole space with INT3 instructions */
  205. memset(area, 0xcc, size);
  206. }
  207. int bpf_arch_text_invalidate(void *dst, size_t len)
  208. {
  209. return IS_ERR_OR_NULL(text_poke_set(dst, 0xcc, len));
  210. }
  211. struct jit_context {
  212. int cleanup_addr; /* Epilogue code offset */
  213. /*
  214. * Program specific offsets of labels in the code; these rely on the
  215. * JIT doing at least 2 passes, recording the position on the first
  216. * pass, only to generate the correct offset on the second pass.
  217. */
  218. int tail_call_direct_label;
  219. int tail_call_indirect_label;
  220. };
  221. /* Maximum number of bytes emitted while JITing one eBPF insn */
  222. #define BPF_MAX_INSN_SIZE 128
  223. #define BPF_INSN_SAFETY 64
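/* do_jit() emits each insn into a temp[] buffer of
 * BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY bytes, so individual EMIT*() calls need
 * no bounds checks.
 */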
  224. /* Number of bytes emit_patch() needs to generate instructions */
  225. #define X86_PATCH_SIZE 5
  226. /* Number of bytes that will be skipped on tailcall */
  227. #define X86_TAIL_CALL_OFFSET (11 + ENDBR_INSN_SIZE)
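/* Skipped prologue bytes: endbr (if IBT) + 5-byte nop + 2 (xor eax,eax or
 * nop2) + 1 (push rbp) + 3 (mov rbp, rsp) = 11 + ENDBR_INSN_SIZE; see
 * emit_prologue().
 */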
  228. static void push_callee_regs(u8 **pprog, bool *callee_regs_used)
  229. {
  230. u8 *prog = *pprog;
  231. if (callee_regs_used[0])
  232. EMIT1(0x53); /* push rbx */
  233. if (callee_regs_used[1])
  234. EMIT2(0x41, 0x55); /* push r13 */
  235. if (callee_regs_used[2])
  236. EMIT2(0x41, 0x56); /* push r14 */
  237. if (callee_regs_used[3])
  238. EMIT2(0x41, 0x57); /* push r15 */
  239. *pprog = prog;
  240. }
  241. static void pop_callee_regs(u8 **pprog, bool *callee_regs_used)
  242. {
  243. u8 *prog = *pprog;
  244. if (callee_regs_used[3])
  245. EMIT2(0x41, 0x5F); /* pop r15 */
  246. if (callee_regs_used[2])
  247. EMIT2(0x41, 0x5E); /* pop r14 */
  248. if (callee_regs_used[1])
  249. EMIT2(0x41, 0x5D); /* pop r13 */
  250. if (callee_regs_used[0])
  251. EMIT1(0x5B); /* pop rbx */
  252. *pprog = prog;
  253. }
  254. /*
  255. * Emit x86-64 prologue code for BPF program.
  256. * bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes
  257. * while jumping to another program
  258. */
  259. static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
  260. bool tail_call_reachable, bool is_subprog)
  261. {
  262. u8 *prog = *pprog;
  263. /* BPF trampoline can be made to work without these nops,
  264. * but let's waste 5 bytes for now and optimize later
  265. */
  266. EMIT_ENDBR();
  267. memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
  268. prog += X86_PATCH_SIZE;
  269. if (!ebpf_from_cbpf) {
  270. if (tail_call_reachable && !is_subprog)
  271. EMIT2(0x31, 0xC0); /* xor eax, eax */
  272. else
  273. EMIT2(0x66, 0x90); /* nop2 */
  274. }
  275. EMIT1(0x55); /* push rbp */
  276. EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
  277. /* X86_TAIL_CALL_OFFSET is here */
  278. EMIT_ENDBR();
  279. /* sub rsp, rounded_stack_depth */
  280. if (stack_depth)
  281. EMIT3_off32(0x48, 0x81, 0xEC, round_up(stack_depth, 8));
  282. if (tail_call_reachable)
  283. EMIT1(0x50); /* push rax */
  284. *pprog = prog;
  285. }
  286. static int emit_patch(u8 **pprog, void *func, void *ip, u8 opcode)
  287. {
  288. u8 *prog = *pprog;
  289. s64 offset;
  290. offset = func - (ip + X86_PATCH_SIZE);
  291. if (!is_simm32(offset)) {
  292. pr_err("Target call %p is out of range\n", func);
  293. return -ERANGE;
  294. }
  295. EMIT1_off32(opcode, offset);
  296. *pprog = prog;
  297. return 0;
  298. }
  299. static int emit_call(u8 **pprog, void *func, void *ip)
  300. {
  301. return emit_patch(pprog, func, ip, 0xE8);
  302. }
  303. static int emit_jump(u8 **pprog, void *func, void *ip)
  304. {
  305. return emit_patch(pprog, func, ip, 0xE9);
  306. }
  307. static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
  308. void *old_addr, void *new_addr)
  309. {
  310. const u8 *nop_insn = x86_nops[5];
  311. u8 old_insn[X86_PATCH_SIZE];
  312. u8 new_insn[X86_PATCH_SIZE];
  313. u8 *prog;
  314. int ret;
  315. memcpy(old_insn, nop_insn, X86_PATCH_SIZE);
  316. if (old_addr) {
  317. prog = old_insn;
  318. ret = t == BPF_MOD_CALL ?
  319. emit_call(&prog, old_addr, ip) :
  320. emit_jump(&prog, old_addr, ip);
  321. if (ret)
  322. return ret;
  323. }
  324. memcpy(new_insn, nop_insn, X86_PATCH_SIZE);
  325. if (new_addr) {
  326. prog = new_insn;
  327. ret = t == BPF_MOD_CALL ?
  328. emit_call(&prog, new_addr, ip) :
  329. emit_jump(&prog, new_addr, ip);
  330. if (ret)
  331. return ret;
  332. }
  333. ret = -EBUSY;
  334. mutex_lock(&text_mutex);
  335. if (memcmp(ip, old_insn, X86_PATCH_SIZE))
  336. goto out;
  337. ret = 1;
  338. if (memcmp(ip, new_insn, X86_PATCH_SIZE)) {
  339. text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL);
  340. ret = 0;
  341. }
  342. out:
  343. mutex_unlock(&text_mutex);
  344. return ret;
  345. }
  346. int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
  347. void *old_addr, void *new_addr)
  348. {
  349. if (!is_kernel_text((long)ip) &&
  350. !is_bpf_text_address((long)ip))
  351. /* BPF poking in modules is not supported */
  352. return -EINVAL;
  353. /*
  354. * See emit_prologue(), for IBT builds the trampoline hook is preceded
  355. * with an ENDBR instruction.
  356. */
  357. if (is_endbr(*(u32 *)ip))
  358. ip += ENDBR_INSN_SIZE;
  359. return __bpf_arch_text_poke(ip, t, old_addr, new_addr);
  360. }
  361. #define EMIT_LFENCE() EMIT3(0x0F, 0xAE, 0xE8)
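/* 0F AE E8 encodes LFENCE, used below as a speculation barrier. */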
  362. static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip)
  363. {
  364. u8 *prog = *pprog;
  365. if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
  366. EMIT_LFENCE();
  367. EMIT2(0xFF, 0xE0 + reg);
  368. } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) {
  369. OPTIMIZER_HIDE_VAR(reg);
  370. emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip);
  371. } else {
  372. EMIT2(0xFF, 0xE0 + reg); /* jmp *%\reg */
  373. if (IS_ENABLED(CONFIG_RETPOLINE) || IS_ENABLED(CONFIG_SLS))
  374. EMIT1(0xCC); /* int3 */
  375. }
  376. *pprog = prog;
  377. }
  378. static void emit_return(u8 **pprog, u8 *ip)
  379. {
  380. u8 *prog = *pprog;
  381. if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) {
  382. emit_jump(&prog, &__x86_return_thunk, ip);
  383. } else {
  384. EMIT1(0xC3); /* ret */
  385. if (IS_ENABLED(CONFIG_SLS))
  386. EMIT1(0xCC); /* int3 */
  387. }
  388. *pprog = prog;
  389. }
  390. /*
  391. * Generate the following code:
  392. *
  393. * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
  394. * if (index >= array->map.max_entries)
  395. * goto out;
  396. * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
  397. * goto out;
  398. * prog = array->ptrs[index];
  399. * if (prog == NULL)
  400. * goto out;
  401. * goto *(prog->bpf_func + prologue_size);
  402. * out:
  403. */
  404. static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
  405. u32 stack_depth, u8 *ip,
  406. struct jit_context *ctx)
  407. {
  408. int tcc_off = -4 - round_up(stack_depth, 8);
  409. u8 *prog = *pprog, *start = *pprog;
  410. int offset;
  411. /*
  412. * rdi - pointer to ctx
  413. * rsi - pointer to bpf_array
  414. * rdx - index in bpf_array
  415. */
  416. /*
  417. * if (index >= array->map.max_entries)
  418. * goto out;
  419. */
  420. EMIT2(0x89, 0xD2); /* mov edx, edx */
  421. EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */
  422. offsetof(struct bpf_array, map.max_entries));
  423. offset = ctx->tail_call_indirect_label - (prog + 2 - start);
  424. EMIT2(X86_JBE, offset); /* jbe out */
  425. /*
  426. * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
  427. * goto out;
  428. */
  429. EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */
  430. EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */
  431. offset = ctx->tail_call_indirect_label - (prog + 2 - start);
  432. EMIT2(X86_JAE, offset); /* jae out */
  433. EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */
  434. EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */
  435. /* prog = array->ptrs[index]; */
  436. EMIT4_off32(0x48, 0x8B, 0x8C, 0xD6, /* mov rcx, [rsi + rdx * 8 + offsetof(...)] */
  437. offsetof(struct bpf_array, ptrs));
  438. /*
  439. * if (prog == NULL)
  440. * goto out;
  441. */
  442. EMIT3(0x48, 0x85, 0xC9); /* test rcx,rcx */
  443. offset = ctx->tail_call_indirect_label - (prog + 2 - start);
  444. EMIT2(X86_JE, offset); /* je out */
  445. pop_callee_regs(&prog, callee_regs_used);
  446. EMIT1(0x58); /* pop rax */
  447. if (stack_depth)
  448. EMIT3_off32(0x48, 0x81, 0xC4, /* add rsp, sd */
  449. round_up(stack_depth, 8));
  450. /* goto *(prog->bpf_func + X86_TAIL_CALL_OFFSET); */
  451. EMIT4(0x48, 0x8B, 0x49, /* mov rcx, qword ptr [rcx + 32] */
  452. offsetof(struct bpf_prog, bpf_func));
  453. EMIT4(0x48, 0x83, 0xC1, /* add rcx, X86_TAIL_CALL_OFFSET */
  454. X86_TAIL_CALL_OFFSET);
  455. /*
  456. * Now we're ready to jump into next BPF program
  457. * rdi == ctx (1st arg)
  458. * rcx == prog->bpf_func + X86_TAIL_CALL_OFFSET
  459. */
  460. emit_indirect_jump(&prog, 1 /* rcx */, ip + (prog - start));
  461. /* out: */
  462. ctx->tail_call_indirect_label = prog - start;
  463. *pprog = prog;
  464. }
  465. static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke,
  466. u8 **pprog, u8 *ip,
  467. bool *callee_regs_used, u32 stack_depth,
  468. struct jit_context *ctx)
  469. {
  470. int tcc_off = -4 - round_up(stack_depth, 8);
  471. u8 *prog = *pprog, *start = *pprog;
  472. int offset;
  473. /*
  474. * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
  475. * goto out;
  476. */
  477. EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */
  478. EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */
  479. offset = ctx->tail_call_direct_label - (prog + 2 - start);
  480. EMIT2(X86_JAE, offset); /* jae out */
  481. EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */
  482. EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */
  483. poke->tailcall_bypass = ip + (prog - start);
  484. poke->adj_off = X86_TAIL_CALL_OFFSET;
  485. poke->tailcall_target = ip + ctx->tail_call_direct_label - X86_PATCH_SIZE;
  486. poke->bypass_addr = (u8 *)poke->tailcall_target + X86_PATCH_SIZE;
  487. emit_jump(&prog, (u8 *)poke->tailcall_target + X86_PATCH_SIZE,
  488. poke->tailcall_bypass);
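/* Initially jump over the not-yet-poked tailcall target slot (the nop5 emitted
 * below). Once the target program is known, bpf_tail_call_direct_fixup()
 * turns that nop5 into a direct jmp and nops out this bypass jump.
 */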
  489. pop_callee_regs(&prog, callee_regs_used);
  490. EMIT1(0x58); /* pop rax */
  491. if (stack_depth)
  492. EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8));
  493. memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
  494. prog += X86_PATCH_SIZE;
  495. /* out: */
  496. ctx->tail_call_direct_label = prog - start;
  497. *pprog = prog;
  498. }
  499. static void bpf_tail_call_direct_fixup(struct bpf_prog *prog)
  500. {
  501. struct bpf_jit_poke_descriptor *poke;
  502. struct bpf_array *array;
  503. struct bpf_prog *target;
  504. int i, ret;
  505. for (i = 0; i < prog->aux->size_poke_tab; i++) {
  506. poke = &prog->aux->poke_tab[i];
  507. if (poke->aux && poke->aux != prog->aux)
  508. continue;
  509. WARN_ON_ONCE(READ_ONCE(poke->tailcall_target_stable));
  510. if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
  511. continue;
  512. array = container_of(poke->tail_call.map, struct bpf_array, map);
  513. mutex_lock(&array->aux->poke_mutex);
  514. target = array->ptrs[poke->tail_call.key];
  515. if (target) {
  516. ret = __bpf_arch_text_poke(poke->tailcall_target,
  517. BPF_MOD_JUMP, NULL,
  518. (u8 *)target->bpf_func +
  519. poke->adj_off);
  520. BUG_ON(ret < 0);
  521. ret = __bpf_arch_text_poke(poke->tailcall_bypass,
  522. BPF_MOD_JUMP,
  523. (u8 *)poke->tailcall_target +
  524. X86_PATCH_SIZE, NULL);
  525. BUG_ON(ret < 0);
  526. }
  527. WRITE_ONCE(poke->tailcall_target_stable, true);
  528. mutex_unlock(&array->aux->poke_mutex);
  529. }
  530. }
  531. static void emit_mov_imm32(u8 **pprog, bool sign_propagate,
  532. u32 dst_reg, const u32 imm32)
  533. {
  534. u8 *prog = *pprog;
  535. u8 b1, b2, b3;
  536. /*
  537. * Optimization: if imm32 is positive, use 'mov %eax, imm32'
  538. * (which zero-extends imm32) to save 2 bytes.
  539. */
  540. if (sign_propagate && (s32)imm32 < 0) {
  541. /* 'mov %rax, imm32' sign extends imm32 */
  542. b1 = add_1mod(0x48, dst_reg);
  543. b2 = 0xC7;
  544. b3 = 0xC0;
  545. EMIT3_off32(b1, b2, add_1reg(b3, dst_reg), imm32);
  546. goto done;
  547. }
  548. /*
  549. * Optimization: if imm32 is zero, use 'xor %eax, %eax'
  550. * to save 3 bytes.
  551. */
  552. if (imm32 == 0) {
  553. if (is_ereg(dst_reg))
  554. EMIT1(add_2mod(0x40, dst_reg, dst_reg));
  555. b2 = 0x31; /* xor */
  556. b3 = 0xC0;
  557. EMIT2(b2, add_2reg(b3, dst_reg, dst_reg));
  558. goto done;
  559. }
  560. /* mov %eax, imm32 */
  561. if (is_ereg(dst_reg))
  562. EMIT1(add_1mod(0x40, dst_reg));
  563. EMIT1_off32(add_1reg(0xB8, dst_reg), imm32);
  564. done:
  565. *pprog = prog;
  566. }
  567. static void emit_mov_imm64(u8 **pprog, u32 dst_reg,
  568. const u32 imm32_hi, const u32 imm32_lo)
  569. {
  570. u8 *prog = *pprog;
  571. if (is_uimm32(((u64)imm32_hi << 32) | (u32)imm32_lo)) {
  572. /*
  573. * For emitting plain u32, where sign bit must not be
  574. * propagated LLVM tends to load imm64 over mov32
  575. * directly, so save couple of bytes by just doing
  576. * 'mov %eax, imm32' instead.
  577. */
  578. emit_mov_imm32(&prog, false, dst_reg, imm32_lo);
  579. } else {
  580. /* movabsq rax, imm64 */
  581. EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg));
  582. EMIT(imm32_lo, 4);
  583. EMIT(imm32_hi, 4);
  584. }
  585. *pprog = prog;
  586. }
  587. static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg)
  588. {
  589. u8 *prog = *pprog;
  590. if (is64) {
  591. /* mov dst, src */
  592. EMIT_mov(dst_reg, src_reg);
  593. } else {
  594. /* mov32 dst, src */
  595. if (is_ereg(dst_reg) || is_ereg(src_reg))
  596. EMIT1(add_2mod(0x40, dst_reg, src_reg));
  597. EMIT2(0x89, add_2reg(0xC0, dst_reg, src_reg));
  598. }
  599. *pprog = prog;
  600. }
  601. /* Emit the suffix (ModR/M etc) for addressing *(ptr_reg + off) and val_reg */
  602. static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off)
  603. {
  604. u8 *prog = *pprog;
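/* ModR/M mod=01 (0x40 base) takes a disp8, mod=10 (0x80 base) a disp32. */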
  605. if (is_imm8(off)) {
  606. /* 1-byte signed displacement.
  607. *
  608. * If off == 0 we could skip this and save one extra byte, but
  609. * special case of x86 R13 which always needs an offset is not
  610. * worth the hassle
  611. */
  612. EMIT2(add_2reg(0x40, ptr_reg, val_reg), off);
  613. } else {
  614. /* 4-byte signed displacement */
  615. EMIT1_off32(add_2reg(0x80, ptr_reg, val_reg), off);
  616. }
  617. *pprog = prog;
  618. }
  619. /*
  620. * Emit a REX byte if it will be necessary to address these registers
  621. */
  622. static void maybe_emit_mod(u8 **pprog, u32 dst_reg, u32 src_reg, bool is64)
  623. {
  624. u8 *prog = *pprog;
  625. if (is64)
  626. EMIT1(add_2mod(0x48, dst_reg, src_reg));
  627. else if (is_ereg(dst_reg) || is_ereg(src_reg))
  628. EMIT1(add_2mod(0x40, dst_reg, src_reg));
  629. *pprog = prog;
  630. }
  631. /*
  632. * Similar version of maybe_emit_mod() for a single register
  633. */
  634. static void maybe_emit_1mod(u8 **pprog, u32 reg, bool is64)
  635. {
  636. u8 *prog = *pprog;
  637. if (is64)
  638. EMIT1(add_1mod(0x48, reg));
  639. else if (is_ereg(reg))
  640. EMIT1(add_1mod(0x40, reg));
  641. *pprog = prog;
  642. }
  643. /* LDX: dst_reg = *(u8*)(src_reg + off) */
  644. static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
  645. {
  646. u8 *prog = *pprog;
  647. switch (size) {
  648. case BPF_B:
  649. /* Emit 'movzx rax, byte ptr [rax + off]' */
  650. EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB6);
  651. break;
  652. case BPF_H:
  653. /* Emit 'movzx rax, word ptr [rax + off]' */
  654. EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7);
  655. break;
  656. case BPF_W:
  657. /* Emit 'mov eax, dword ptr [rax+0x14]' */
  658. if (is_ereg(dst_reg) || is_ereg(src_reg))
  659. EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B);
  660. else
  661. EMIT1(0x8B);
  662. break;
  663. case BPF_DW:
  664. /* Emit 'mov rax, qword ptr [rax+0x14]' */
  665. EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B);
  666. break;
  667. }
  668. emit_insn_suffix(&prog, src_reg, dst_reg, off);
  669. *pprog = prog;
  670. }
  671. /* STX: *(u8*)(dst_reg + off) = src_reg */
  672. static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
  673. {
  674. u8 *prog = *pprog;
  675. switch (size) {
  676. case BPF_B:
  677. /* Emit 'mov byte ptr [rax + off], al' */
  678. if (is_ereg(dst_reg) || is_ereg_8l(src_reg))
  679. /* Add extra byte for eregs or SIL,DIL,BPL in src_reg */
  680. EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88);
  681. else
  682. EMIT1(0x88);
  683. break;
  684. case BPF_H:
  685. if (is_ereg(dst_reg) || is_ereg(src_reg))
  686. EMIT3(0x66, add_2mod(0x40, dst_reg, src_reg), 0x89);
  687. else
  688. EMIT2(0x66, 0x89);
  689. break;
  690. case BPF_W:
  691. if (is_ereg(dst_reg) || is_ereg(src_reg))
  692. EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x89);
  693. else
  694. EMIT1(0x89);
  695. break;
  696. case BPF_DW:
  697. EMIT2(add_2mod(0x48, dst_reg, src_reg), 0x89);
  698. break;
  699. }
  700. emit_insn_suffix(&prog, dst_reg, src_reg, off);
  701. *pprog = prog;
  702. }
  703. static int emit_atomic(u8 **pprog, u8 atomic_op,
  704. u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size)
  705. {
  706. u8 *prog = *pprog;
  707. EMIT1(0xF0); /* lock prefix */
  708. maybe_emit_mod(&prog, dst_reg, src_reg, bpf_size == BPF_DW);
  709. /* emit opcode */
  710. switch (atomic_op) {
  711. case BPF_ADD:
  712. case BPF_AND:
  713. case BPF_OR:
  714. case BPF_XOR:
  715. /* lock *(u32/u64*)(dst_reg + off) <op>= src_reg */
  716. EMIT1(simple_alu_opcodes[atomic_op]);
  717. break;
  718. case BPF_ADD | BPF_FETCH:
  719. /* src_reg = atomic_fetch_add(dst_reg + off, src_reg); */
  720. EMIT2(0x0F, 0xC1);
  721. break;
  722. case BPF_XCHG:
  723. /* src_reg = atomic_xchg(dst_reg + off, src_reg); */
  724. EMIT1(0x87);
  725. break;
  726. case BPF_CMPXCHG:
  727. /* r0 = atomic_cmpxchg(dst_reg + off, r0, src_reg); */
  728. EMIT2(0x0F, 0xB1);
  729. break;
  730. default:
  731. pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op);
  732. return -EFAULT;
  733. }
  734. emit_insn_suffix(&prog, dst_reg, src_reg, off);
  735. *pprog = prog;
  736. return 0;
  737. }
  738. bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs)
  739. {
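/* Upper bits of x->fixup hold the pt_regs offset of the destination register;
 * the low 8 bits hold the length of the faulting load (set up in do_jit()).
 */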
  740. u32 reg = x->fixup >> 8;
  741. /* jump over faulting load and clear dest register */
  742. *(unsigned long *)((void *)regs + reg) = 0;
  743. regs->ip += x->fixup & 0xff;
  744. return true;
  745. }
  746. static void detect_reg_usage(struct bpf_insn *insn, int insn_cnt,
  747. bool *regs_used, bool *tail_call_seen)
  748. {
  749. int i;
  750. for (i = 1; i <= insn_cnt; i++, insn++) {
  751. if (insn->code == (BPF_JMP | BPF_TAIL_CALL))
  752. *tail_call_seen = true;
  753. if (insn->dst_reg == BPF_REG_6 || insn->src_reg == BPF_REG_6)
  754. regs_used[0] = true;
  755. if (insn->dst_reg == BPF_REG_7 || insn->src_reg == BPF_REG_7)
  756. regs_used[1] = true;
  757. if (insn->dst_reg == BPF_REG_8 || insn->src_reg == BPF_REG_8)
  758. regs_used[2] = true;
  759. if (insn->dst_reg == BPF_REG_9 || insn->src_reg == BPF_REG_9)
  760. regs_used[3] = true;
  761. }
  762. }
  763. static void emit_nops(u8 **pprog, int len)
  764. {
  765. u8 *prog = *pprog;
  766. int i, noplen;
  767. while (len > 0) {
  768. noplen = len;
  769. if (noplen > ASM_NOP_MAX)
  770. noplen = ASM_NOP_MAX;
  771. for (i = 0; i < noplen; i++)
  772. EMIT1(x86_nops[noplen][i]);
  773. len -= noplen;
  774. }
  775. *pprog = prog;
  776. }
  777. #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
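/* Size difference between this insn as emitted in the previous pass
 * (addrs[i] - addrs[i - 1]) and in the current pass (prog - temp); used to pad
 * shrinking jumps with NOPs when jmp_padding is enabled.
 */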
  778. static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
  779. int oldproglen, struct jit_context *ctx, bool jmp_padding)
  780. {
  781. bool tail_call_reachable = bpf_prog->aux->tail_call_reachable;
  782. struct bpf_insn *insn = bpf_prog->insnsi;
  783. bool callee_regs_used[4] = {};
  784. int insn_cnt = bpf_prog->len;
  785. bool tail_call_seen = false;
  786. bool seen_exit = false;
  787. u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
  788. int i, excnt = 0;
  789. int ilen, proglen = 0;
  790. u8 *prog = temp;
  791. int err;
  792. detect_reg_usage(insn, insn_cnt, callee_regs_used,
  793. &tail_call_seen);
  794. /* tail call's presence in current prog implies it is reachable */
  795. tail_call_reachable |= tail_call_seen;
  796. emit_prologue(&prog, bpf_prog->aux->stack_depth,
  797. bpf_prog_was_classic(bpf_prog), tail_call_reachable,
  798. bpf_prog->aux->func_idx != 0);
  799. push_callee_regs(&prog, callee_regs_used);
  800. ilen = prog - temp;
  801. if (rw_image)
  802. memcpy(rw_image + proglen, temp, ilen);
  803. proglen += ilen;
  804. addrs[0] = proglen;
  805. prog = temp;
  806. for (i = 1; i <= insn_cnt; i++, insn++) {
  807. const s32 imm32 = insn->imm;
  808. u32 dst_reg = insn->dst_reg;
  809. u32 src_reg = insn->src_reg;
  810. u8 b2 = 0, b3 = 0;
  811. u8 *start_of_ldx;
  812. s64 jmp_offset;
  813. u8 jmp_cond;
  814. u8 *func;
  815. int nops;
  816. switch (insn->code) {
  817. /* ALU */
  818. case BPF_ALU | BPF_ADD | BPF_X:
  819. case BPF_ALU | BPF_SUB | BPF_X:
  820. case BPF_ALU | BPF_AND | BPF_X:
  821. case BPF_ALU | BPF_OR | BPF_X:
  822. case BPF_ALU | BPF_XOR | BPF_X:
  823. case BPF_ALU64 | BPF_ADD | BPF_X:
  824. case BPF_ALU64 | BPF_SUB | BPF_X:
  825. case BPF_ALU64 | BPF_AND | BPF_X:
  826. case BPF_ALU64 | BPF_OR | BPF_X:
  827. case BPF_ALU64 | BPF_XOR | BPF_X:
  828. maybe_emit_mod(&prog, dst_reg, src_reg,
  829. BPF_CLASS(insn->code) == BPF_ALU64);
  830. b2 = simple_alu_opcodes[BPF_OP(insn->code)];
  831. EMIT2(b2, add_2reg(0xC0, dst_reg, src_reg));
  832. break;
  833. case BPF_ALU64 | BPF_MOV | BPF_X:
  834. case BPF_ALU | BPF_MOV | BPF_X:
  835. emit_mov_reg(&prog,
  836. BPF_CLASS(insn->code) == BPF_ALU64,
  837. dst_reg, src_reg);
  838. break;
  839. /* neg dst */
  840. case BPF_ALU | BPF_NEG:
  841. case BPF_ALU64 | BPF_NEG:
  842. maybe_emit_1mod(&prog, dst_reg,
  843. BPF_CLASS(insn->code) == BPF_ALU64);
  844. EMIT2(0xF7, add_1reg(0xD8, dst_reg));
  845. break;
  846. case BPF_ALU | BPF_ADD | BPF_K:
  847. case BPF_ALU | BPF_SUB | BPF_K:
  848. case BPF_ALU | BPF_AND | BPF_K:
  849. case BPF_ALU | BPF_OR | BPF_K:
  850. case BPF_ALU | BPF_XOR | BPF_K:
  851. case BPF_ALU64 | BPF_ADD | BPF_K:
  852. case BPF_ALU64 | BPF_SUB | BPF_K:
  853. case BPF_ALU64 | BPF_AND | BPF_K:
  854. case BPF_ALU64 | BPF_OR | BPF_K:
  855. case BPF_ALU64 | BPF_XOR | BPF_K:
  856. maybe_emit_1mod(&prog, dst_reg,
  857. BPF_CLASS(insn->code) == BPF_ALU64);
  858. /*
  859. * b3 holds 'normal' opcode, b2 short form only valid
  860. * in case dst is eax/rax.
  861. */
  862. switch (BPF_OP(insn->code)) {
  863. case BPF_ADD:
  864. b3 = 0xC0;
  865. b2 = 0x05;
  866. break;
  867. case BPF_SUB:
  868. b3 = 0xE8;
  869. b2 = 0x2D;
  870. break;
  871. case BPF_AND:
  872. b3 = 0xE0;
  873. b2 = 0x25;
  874. break;
  875. case BPF_OR:
  876. b3 = 0xC8;
  877. b2 = 0x0D;
  878. break;
  879. case BPF_XOR:
  880. b3 = 0xF0;
  881. b2 = 0x35;
  882. break;
  883. }
  884. if (is_imm8(imm32))
  885. EMIT3(0x83, add_1reg(b3, dst_reg), imm32);
  886. else if (is_axreg(dst_reg))
  887. EMIT1_off32(b2, imm32);
  888. else
  889. EMIT2_off32(0x81, add_1reg(b3, dst_reg), imm32);
  890. break;
  891. case BPF_ALU64 | BPF_MOV | BPF_K:
  892. case BPF_ALU | BPF_MOV | BPF_K:
  893. emit_mov_imm32(&prog, BPF_CLASS(insn->code) == BPF_ALU64,
  894. dst_reg, imm32);
  895. break;
  896. case BPF_LD | BPF_IMM | BPF_DW:
  897. emit_mov_imm64(&prog, dst_reg, insn[1].imm, insn[0].imm);
  898. insn++;
  899. i++;
  900. break;
  901. /* dst %= src, dst /= src, dst %= imm32, dst /= imm32 */
  902. case BPF_ALU | BPF_MOD | BPF_X:
  903. case BPF_ALU | BPF_DIV | BPF_X:
  904. case BPF_ALU | BPF_MOD | BPF_K:
  905. case BPF_ALU | BPF_DIV | BPF_K:
  906. case BPF_ALU64 | BPF_MOD | BPF_X:
  907. case BPF_ALU64 | BPF_DIV | BPF_X:
  908. case BPF_ALU64 | BPF_MOD | BPF_K:
  909. case BPF_ALU64 | BPF_DIV | BPF_K: {
  910. bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
  911. if (dst_reg != BPF_REG_0)
  912. EMIT1(0x50); /* push rax */
  913. if (dst_reg != BPF_REG_3)
  914. EMIT1(0x52); /* push rdx */
  915. if (BPF_SRC(insn->code) == BPF_X) {
  916. if (src_reg == BPF_REG_0 ||
  917. src_reg == BPF_REG_3) {
  918. /* mov r11, src_reg */
  919. EMIT_mov(AUX_REG, src_reg);
  920. src_reg = AUX_REG;
  921. }
  922. } else {
  923. /* mov r11, imm32 */
  924. EMIT3_off32(0x49, 0xC7, 0xC3, imm32);
  925. src_reg = AUX_REG;
  926. }
  927. if (dst_reg != BPF_REG_0)
  928. /* mov rax, dst_reg */
  929. emit_mov_reg(&prog, is64, BPF_REG_0, dst_reg);
  930. /*
  931. * xor edx, edx
  932. * equivalent to 'xor rdx, rdx', but one byte less
  933. */
  934. EMIT2(0x31, 0xd2);
  935. /* div src_reg */
  936. maybe_emit_1mod(&prog, src_reg, is64);
  937. EMIT2(0xF7, add_1reg(0xF0, src_reg));
  938. if (BPF_OP(insn->code) == BPF_MOD &&
  939. dst_reg != BPF_REG_3)
  940. /* mov dst_reg, rdx */
  941. emit_mov_reg(&prog, is64, dst_reg, BPF_REG_3);
  942. else if (BPF_OP(insn->code) == BPF_DIV &&
  943. dst_reg != BPF_REG_0)
  944. /* mov dst_reg, rax */
  945. emit_mov_reg(&prog, is64, dst_reg, BPF_REG_0);
  946. if (dst_reg != BPF_REG_3)
  947. EMIT1(0x5A); /* pop rdx */
  948. if (dst_reg != BPF_REG_0)
  949. EMIT1(0x58); /* pop rax */
  950. break;
  951. }
  952. case BPF_ALU | BPF_MUL | BPF_K:
  953. case BPF_ALU64 | BPF_MUL | BPF_K:
  954. maybe_emit_mod(&prog, dst_reg, dst_reg,
  955. BPF_CLASS(insn->code) == BPF_ALU64);
  956. if (is_imm8(imm32))
  957. /* imul dst_reg, dst_reg, imm8 */
  958. EMIT3(0x6B, add_2reg(0xC0, dst_reg, dst_reg),
  959. imm32);
  960. else
  961. /* imul dst_reg, dst_reg, imm32 */
  962. EMIT2_off32(0x69,
  963. add_2reg(0xC0, dst_reg, dst_reg),
  964. imm32);
  965. break;
  966. case BPF_ALU | BPF_MUL | BPF_X:
  967. case BPF_ALU64 | BPF_MUL | BPF_X:
  968. maybe_emit_mod(&prog, src_reg, dst_reg,
  969. BPF_CLASS(insn->code) == BPF_ALU64);
  970. /* imul dst_reg, src_reg */
  971. EMIT3(0x0F, 0xAF, add_2reg(0xC0, src_reg, dst_reg));
  972. break;
  973. /* Shifts */
  974. case BPF_ALU | BPF_LSH | BPF_K:
  975. case BPF_ALU | BPF_RSH | BPF_K:
  976. case BPF_ALU | BPF_ARSH | BPF_K:
  977. case BPF_ALU64 | BPF_LSH | BPF_K:
  978. case BPF_ALU64 | BPF_RSH | BPF_K:
  979. case BPF_ALU64 | BPF_ARSH | BPF_K:
  980. maybe_emit_1mod(&prog, dst_reg,
  981. BPF_CLASS(insn->code) == BPF_ALU64);
  982. b3 = simple_alu_opcodes[BPF_OP(insn->code)];
  983. if (imm32 == 1)
  984. EMIT2(0xD1, add_1reg(b3, dst_reg));
  985. else
  986. EMIT3(0xC1, add_1reg(b3, dst_reg), imm32);
  987. break;
  988. case BPF_ALU | BPF_LSH | BPF_X:
  989. case BPF_ALU | BPF_RSH | BPF_X:
  990. case BPF_ALU | BPF_ARSH | BPF_X:
  991. case BPF_ALU64 | BPF_LSH | BPF_X:
  992. case BPF_ALU64 | BPF_RSH | BPF_X:
  993. case BPF_ALU64 | BPF_ARSH | BPF_X:
  994. /* Check for bad case when dst_reg == rcx */
  995. if (dst_reg == BPF_REG_4) {
  996. /* mov r11, dst_reg */
  997. EMIT_mov(AUX_REG, dst_reg);
  998. dst_reg = AUX_REG;
  999. }
  1000. if (src_reg != BPF_REG_4) { /* common case */
  1001. EMIT1(0x51); /* push rcx */
  1002. /* mov rcx, src_reg */
  1003. EMIT_mov(BPF_REG_4, src_reg);
  1004. }
  1005. /* shl %rax, %cl | shr %rax, %cl | sar %rax, %cl */
  1006. maybe_emit_1mod(&prog, dst_reg,
  1007. BPF_CLASS(insn->code) == BPF_ALU64);
  1008. b3 = simple_alu_opcodes[BPF_OP(insn->code)];
  1009. EMIT2(0xD3, add_1reg(b3, dst_reg));
  1010. if (src_reg != BPF_REG_4)
  1011. EMIT1(0x59); /* pop rcx */
  1012. if (insn->dst_reg == BPF_REG_4)
  1013. /* mov dst_reg, r11 */
  1014. EMIT_mov(insn->dst_reg, AUX_REG);
  1015. break;
  1016. case BPF_ALU | BPF_END | BPF_FROM_BE:
  1017. switch (imm32) {
  1018. case 16:
  1019. /* Emit 'ror %ax, 8' to swap lower 2 bytes */
  1020. EMIT1(0x66);
  1021. if (is_ereg(dst_reg))
  1022. EMIT1(0x41);
  1023. EMIT3(0xC1, add_1reg(0xC8, dst_reg), 8);
  1024. /* Emit 'movzwl eax, ax' */
  1025. if (is_ereg(dst_reg))
  1026. EMIT3(0x45, 0x0F, 0xB7);
  1027. else
  1028. EMIT2(0x0F, 0xB7);
  1029. EMIT1(add_2reg(0xC0, dst_reg, dst_reg));
  1030. break;
  1031. case 32:
  1032. /* Emit 'bswap eax' to swap lower 4 bytes */
  1033. if (is_ereg(dst_reg))
  1034. EMIT2(0x41, 0x0F);
  1035. else
  1036. EMIT1(0x0F);
  1037. EMIT1(add_1reg(0xC8, dst_reg));
  1038. break;
  1039. case 64:
  1040. /* Emit 'bswap rax' to swap 8 bytes */
  1041. EMIT3(add_1mod(0x48, dst_reg), 0x0F,
  1042. add_1reg(0xC8, dst_reg));
  1043. break;
  1044. }
  1045. break;
  1046. case BPF_ALU | BPF_END | BPF_FROM_LE:
  1047. switch (imm32) {
  1048. case 16:
  1049. /*
  1050. * Emit 'movzwl eax, ax' to zero extend 16-bit
  1051. * into 64 bit
  1052. */
  1053. if (is_ereg(dst_reg))
  1054. EMIT3(0x45, 0x0F, 0xB7);
  1055. else
  1056. EMIT2(0x0F, 0xB7);
  1057. EMIT1(add_2reg(0xC0, dst_reg, dst_reg));
  1058. break;
  1059. case 32:
  1060. /* Emit 'mov eax, eax' to clear upper 32-bits */
  1061. if (is_ereg(dst_reg))
  1062. EMIT1(0x45);
  1063. EMIT2(0x89, add_2reg(0xC0, dst_reg, dst_reg));
  1064. break;
  1065. case 64:
  1066. /* nop */
  1067. break;
  1068. }
  1069. break;
  1070. /* speculation barrier */
  1071. case BPF_ST | BPF_NOSPEC:
  1072. if (boot_cpu_has(X86_FEATURE_XMM2))
  1073. EMIT_LFENCE();
  1074. break;
  1075. /* ST: *(u8*)(dst_reg + off) = imm */
  1076. case BPF_ST | BPF_MEM | BPF_B:
  1077. if (is_ereg(dst_reg))
  1078. EMIT2(0x41, 0xC6);
  1079. else
  1080. EMIT1(0xC6);
  1081. goto st;
  1082. case BPF_ST | BPF_MEM | BPF_H:
  1083. if (is_ereg(dst_reg))
  1084. EMIT3(0x66, 0x41, 0xC7);
  1085. else
  1086. EMIT2(0x66, 0xC7);
  1087. goto st;
  1088. case BPF_ST | BPF_MEM | BPF_W:
  1089. if (is_ereg(dst_reg))
  1090. EMIT2(0x41, 0xC7);
  1091. else
  1092. EMIT1(0xC7);
  1093. goto st;
  1094. case BPF_ST | BPF_MEM | BPF_DW:
  1095. EMIT2(add_1mod(0x48, dst_reg), 0xC7);
  1096. st: if (is_imm8(insn->off))
  1097. EMIT2(add_1reg(0x40, dst_reg), insn->off);
  1098. else
  1099. EMIT1_off32(add_1reg(0x80, dst_reg), insn->off);
  1100. EMIT(imm32, bpf_size_to_x86_bytes(BPF_SIZE(insn->code)));
  1101. break;
  1102. /* STX: *(u8*)(dst_reg + off) = src_reg */
  1103. case BPF_STX | BPF_MEM | BPF_B:
  1104. case BPF_STX | BPF_MEM | BPF_H:
  1105. case BPF_STX | BPF_MEM | BPF_W:
  1106. case BPF_STX | BPF_MEM | BPF_DW:
  1107. emit_stx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
  1108. break;
  1109. /* LDX: dst_reg = *(u8*)(src_reg + off) */
  1110. case BPF_LDX | BPF_MEM | BPF_B:
  1111. case BPF_LDX | BPF_PROBE_MEM | BPF_B:
  1112. case BPF_LDX | BPF_MEM | BPF_H:
  1113. case BPF_LDX | BPF_PROBE_MEM | BPF_H:
  1114. case BPF_LDX | BPF_MEM | BPF_W:
  1115. case BPF_LDX | BPF_PROBE_MEM | BPF_W:
  1116. case BPF_LDX | BPF_MEM | BPF_DW:
  1117. case BPF_LDX | BPF_PROBE_MEM | BPF_DW:
  1118. if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
  1119. /* Though the verifier prevents negative insn->off in BPF_PROBE_MEM
  1120. * add abs(insn->off) to the limit to make sure that negative
  1121. * offset won't be an issue.
  1122. * insn->off is s16, so it won't affect valid pointers.
  1123. */
  1124. u64 limit = TASK_SIZE_MAX + PAGE_SIZE + abs(insn->off);
  1125. u8 *end_of_jmp1, *end_of_jmp2;
  1126. /* Conservatively check that src_reg + insn->off is a kernel address:
  1127. * 1. src_reg + insn->off >= limit
  1128. * 2. src_reg + insn->off doesn't become small positive.
  1129. * Cannot do src_reg + insn->off >= limit in one branch,
  1130. * since it needs two spare registers, but JIT has only one.
  1131. */
  1132. /* movabsq r11, limit */
  1133. EMIT2(add_1mod(0x48, AUX_REG), add_1reg(0xB8, AUX_REG));
  1134. EMIT((u32)limit, 4);
  1135. EMIT(limit >> 32, 4);
  1136. /* cmp src_reg, r11 */
  1137. maybe_emit_mod(&prog, src_reg, AUX_REG, true);
  1138. EMIT2(0x39, add_2reg(0xC0, src_reg, AUX_REG));
  1139. /* if unsigned '<' goto end_of_jmp2 */
  1140. EMIT2(X86_JB, 0);
  1141. end_of_jmp1 = prog;
  1142. /* mov r11, src_reg */
  1143. emit_mov_reg(&prog, true, AUX_REG, src_reg);
  1144. /* add r11, insn->off */
  1145. maybe_emit_1mod(&prog, AUX_REG, true);
  1146. EMIT2_off32(0x81, add_1reg(0xC0, AUX_REG), insn->off);
  1147. /* jmp if not carry to start_of_ldx
  1148. * Otherwise ERR_PTR(-EINVAL) + 128 will be the user addr
  1149. * that has to be rejected.
  1150. */
  1151. EMIT2(0x73 /* JNC */, 0);
  1152. end_of_jmp2 = prog;
  1153. /* xor dst_reg, dst_reg */
  1154. emit_mov_imm32(&prog, false, dst_reg, 0);
  1155. /* jmp byte_after_ldx */
  1156. EMIT2(0xEB, 0);
  1157. /* populate jmp_offset for JB above to jump to xor dst_reg */
  1158. end_of_jmp1[-1] = end_of_jmp2 - end_of_jmp1;
  1159. /* populate jmp_offset for JNC above to jump to start_of_ldx */
  1160. start_of_ldx = prog;
  1161. end_of_jmp2[-1] = start_of_ldx - end_of_jmp2;
  1162. }
  1163. emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
  1164. if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
  1165. struct exception_table_entry *ex;
  1166. u8 *_insn = image + proglen + (start_of_ldx - temp);
  1167. s64 delta;
  1168. /* populate jmp_offset for JMP above */
  1169. start_of_ldx[-1] = prog - start_of_ldx;
  1170. if (!bpf_prog->aux->extable)
  1171. break;
  1172. if (excnt >= bpf_prog->aux->num_exentries) {
  1173. pr_err("ex gen bug\n");
  1174. return -EFAULT;
  1175. }
  1176. ex = &bpf_prog->aux->extable[excnt++];
  1177. delta = _insn - (u8 *)&ex->insn;
  1178. if (!is_simm32(delta)) {
  1179. pr_err("extable->insn doesn't fit into 32-bit\n");
  1180. return -EFAULT;
  1181. }
  1182. /* switch ex to rw buffer for writes */
  1183. ex = (void *)rw_image + ((void *)ex - (void *)image);
  1184. ex->insn = delta;
  1185. ex->data = EX_TYPE_BPF;
  1186. if (dst_reg > BPF_REG_9) {
  1187. pr_err("verifier error\n");
  1188. return -EFAULT;
  1189. }
  1190. /*
  1191. * Compute size of x86 insn and its target dest x86 register.
  1192. * ex_handler_bpf() will use lower 8 bits to adjust
  1193. * pt_regs->ip to jump over this x86 instruction
  1194. * and upper bits to figure out which pt_regs to zero out.
  1195. * End result: x86 insn "mov rbx, qword ptr [rax+0x14]"
  1196. * of 4 bytes will be ignored and rbx will be zero inited.
  1197. */
  1198. ex->fixup = (prog - start_of_ldx) | (reg2pt_regs[dst_reg] << 8);
  1199. }
  1200. break;
  1201. case BPF_STX | BPF_ATOMIC | BPF_W:
  1202. case BPF_STX | BPF_ATOMIC | BPF_DW:
  1203. if (insn->imm == (BPF_AND | BPF_FETCH) ||
  1204. insn->imm == (BPF_OR | BPF_FETCH) ||
  1205. insn->imm == (BPF_XOR | BPF_FETCH)) {
  1206. bool is64 = BPF_SIZE(insn->code) == BPF_DW;
  1207. u32 real_src_reg = src_reg;
  1208. u32 real_dst_reg = dst_reg;
  1209. u8 *branch_target;
  1210. /*
  1211. * Can't be implemented with a single x86 insn.
  1212. * Need to do a CMPXCHG loop.
  1213. */
  1214. /* Will need RAX as a CMPXCHG operand so save R0 */
  1215. emit_mov_reg(&prog, true, BPF_REG_AX, BPF_REG_0);
  1216. if (src_reg == BPF_REG_0)
  1217. real_src_reg = BPF_REG_AX;
  1218. if (dst_reg == BPF_REG_0)
  1219. real_dst_reg = BPF_REG_AX;
  1220. branch_target = prog;
  1221. /* Load old value */
  1222. emit_ldx(&prog, BPF_SIZE(insn->code),
  1223. BPF_REG_0, real_dst_reg, insn->off);
  1224. /*
  1225. * Perform the (commutative) operation locally,
  1226. * put the result in the AUX_REG.
  1227. */
  1228. emit_mov_reg(&prog, is64, AUX_REG, BPF_REG_0);
  1229. maybe_emit_mod(&prog, AUX_REG, real_src_reg, is64);
  1230. EMIT2(simple_alu_opcodes[BPF_OP(insn->imm)],
  1231. add_2reg(0xC0, AUX_REG, real_src_reg));
  1232. /* Attempt to swap in new value */
  1233. err = emit_atomic(&prog, BPF_CMPXCHG,
  1234. real_dst_reg, AUX_REG,
  1235. insn->off,
  1236. BPF_SIZE(insn->code));
  1237. if (WARN_ON(err))
  1238. return err;
  1239. /*
  1240. * ZF tells us whether we won the race. If it's
  1241. * cleared we need to try again.
  1242. */
  1243. EMIT2(X86_JNE, -(prog - branch_target) - 2);
  1244. /* Return the pre-modification value */
  1245. emit_mov_reg(&prog, is64, real_src_reg, BPF_REG_0);
  1246. /* Restore R0 after clobbering RAX */
  1247. emit_mov_reg(&prog, true, BPF_REG_0, BPF_REG_AX);
  1248. break;
  1249. }
  1250. err = emit_atomic(&prog, insn->imm, dst_reg, src_reg,
  1251. insn->off, BPF_SIZE(insn->code));
  1252. if (err)
  1253. return err;
  1254. break;
  1255. /* call */
  1256. case BPF_JMP | BPF_CALL:
  1257. func = (u8 *) __bpf_call_base + imm32;
  1258. if (tail_call_reachable) {
  1259. /* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */
  1260. EMIT3_off32(0x48, 0x8B, 0x85,
  1261. -round_up(bpf_prog->aux->stack_depth, 8) - 8);
  1262. if (!imm32 || emit_call(&prog, func, image + addrs[i - 1] + 7))
  1263. return -EINVAL;
  1264. } else {
  1265. if (!imm32 || emit_call(&prog, func, image + addrs[i - 1]))
  1266. return -EINVAL;
  1267. }
  1268. break;
  1269. case BPF_JMP | BPF_TAIL_CALL:
  1270. if (imm32)
				emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1],
							  &prog, image + addrs[i - 1],
							  callee_regs_used,
							  bpf_prog->aux->stack_depth,
							  ctx);
			else
				emit_bpf_tail_call_indirect(&prog,
							    callee_regs_used,
							    bpf_prog->aux->stack_depth,
							    image + addrs[i - 1],
							    ctx);
			break;

			/* cond jump */
		case BPF_JMP | BPF_JEQ | BPF_X:
		case BPF_JMP | BPF_JNE | BPF_X:
		case BPF_JMP | BPF_JGT | BPF_X:
		case BPF_JMP | BPF_JLT | BPF_X:
		case BPF_JMP | BPF_JGE | BPF_X:
		case BPF_JMP | BPF_JLE | BPF_X:
		case BPF_JMP | BPF_JSGT | BPF_X:
		case BPF_JMP | BPF_JSLT | BPF_X:
		case BPF_JMP | BPF_JSGE | BPF_X:
		case BPF_JMP | BPF_JSLE | BPF_X:
		case BPF_JMP32 | BPF_JEQ | BPF_X:
		case BPF_JMP32 | BPF_JNE | BPF_X:
		case BPF_JMP32 | BPF_JGT | BPF_X:
		case BPF_JMP32 | BPF_JLT | BPF_X:
		case BPF_JMP32 | BPF_JGE | BPF_X:
		case BPF_JMP32 | BPF_JLE | BPF_X:
		case BPF_JMP32 | BPF_JSGT | BPF_X:
		case BPF_JMP32 | BPF_JSLT | BPF_X:
		case BPF_JMP32 | BPF_JSGE | BPF_X:
		case BPF_JMP32 | BPF_JSLE | BPF_X:
			/* cmp dst_reg, src_reg */
			maybe_emit_mod(&prog, dst_reg, src_reg,
				       BPF_CLASS(insn->code) == BPF_JMP);
			EMIT2(0x39, add_2reg(0xC0, dst_reg, src_reg));
			goto emit_cond_jmp;

		case BPF_JMP | BPF_JSET | BPF_X:
		case BPF_JMP32 | BPF_JSET | BPF_X:
			/* test dst_reg, src_reg */
			maybe_emit_mod(&prog, dst_reg, src_reg,
				       BPF_CLASS(insn->code) == BPF_JMP);
			EMIT2(0x85, add_2reg(0xC0, dst_reg, src_reg));
			goto emit_cond_jmp;

		case BPF_JMP | BPF_JSET | BPF_K:
		case BPF_JMP32 | BPF_JSET | BPF_K:
			/* test dst_reg, imm32 */
			maybe_emit_1mod(&prog, dst_reg,
					BPF_CLASS(insn->code) == BPF_JMP);
			EMIT2_off32(0xF7, add_1reg(0xC0, dst_reg), imm32);
			goto emit_cond_jmp;

		case BPF_JMP | BPF_JEQ | BPF_K:
		case BPF_JMP | BPF_JNE | BPF_K:
		case BPF_JMP | BPF_JGT | BPF_K:
		case BPF_JMP | BPF_JLT | BPF_K:
		case BPF_JMP | BPF_JGE | BPF_K:
		case BPF_JMP | BPF_JLE | BPF_K:
		case BPF_JMP | BPF_JSGT | BPF_K:
		case BPF_JMP | BPF_JSLT | BPF_K:
		case BPF_JMP | BPF_JSGE | BPF_K:
		case BPF_JMP | BPF_JSLE | BPF_K:
		case BPF_JMP32 | BPF_JEQ | BPF_K:
		case BPF_JMP32 | BPF_JNE | BPF_K:
		case BPF_JMP32 | BPF_JGT | BPF_K:
		case BPF_JMP32 | BPF_JLT | BPF_K:
		case BPF_JMP32 | BPF_JGE | BPF_K:
		case BPF_JMP32 | BPF_JLE | BPF_K:
		case BPF_JMP32 | BPF_JSGT | BPF_K:
		case BPF_JMP32 | BPF_JSLT | BPF_K:
		case BPF_JMP32 | BPF_JSGE | BPF_K:
		case BPF_JMP32 | BPF_JSLE | BPF_K:
			/* test dst_reg, dst_reg to save one extra byte */
			if (imm32 == 0) {
				maybe_emit_mod(&prog, dst_reg, dst_reg,
					       BPF_CLASS(insn->code) == BPF_JMP);
				EMIT2(0x85, add_2reg(0xC0, dst_reg, dst_reg));
				goto emit_cond_jmp;
			}

			/* cmp dst_reg, imm8/32 */
			maybe_emit_1mod(&prog, dst_reg,
					BPF_CLASS(insn->code) == BPF_JMP);

			if (is_imm8(imm32))
				EMIT3(0x83, add_1reg(0xF8, dst_reg), imm32);
			else
				EMIT2_off32(0x81, add_1reg(0xF8, dst_reg), imm32);

emit_cond_jmp:		/* Convert BPF opcode to x86 */
			switch (BPF_OP(insn->code)) {
			case BPF_JEQ:
				jmp_cond = X86_JE;
				break;
			case BPF_JSET:
			case BPF_JNE:
				jmp_cond = X86_JNE;
				break;
			case BPF_JGT:
				/* GT is unsigned '>', JA in x86 */
				jmp_cond = X86_JA;
				break;
			case BPF_JLT:
				/* LT is unsigned '<', JB in x86 */
				jmp_cond = X86_JB;
				break;
			case BPF_JGE:
				/* GE is unsigned '>=', JAE in x86 */
				jmp_cond = X86_JAE;
				break;
			case BPF_JLE:
				/* LE is unsigned '<=', JBE in x86 */
				jmp_cond = X86_JBE;
				break;
			case BPF_JSGT:
				/* Signed '>', GT in x86 */
				jmp_cond = X86_JG;
				break;
			case BPF_JSLT:
				/* Signed '<', LT in x86 */
				jmp_cond = X86_JL;
				break;
			case BPF_JSGE:
				/* Signed '>=', GE in x86 */
				jmp_cond = X86_JGE;
				break;
			case BPF_JSLE:
				/* Signed '<=', LE in x86 */
				jmp_cond = X86_JLE;
				break;
			default: /* to silence GCC warning */
				return -EFAULT;
			}
			jmp_offset = addrs[i + insn->off] - addrs[i];
			if (is_imm8(jmp_offset)) {
				if (jmp_padding) {
					/* To keep the jmp_offset valid, the extra bytes are
					 * padded before the jump insn, so we subtract the
					 * 2 bytes of jmp_cond insn from INSN_SZ_DIFF.
					 *
					 * If the previous pass already emits an imm8
					 * jmp_cond, then this BPF insn won't shrink, so
					 * "nops" is 0.
					 *
					 * On the other hand, if the previous pass emits an
					 * imm32 jmp_cond, the extra 4 bytes(*) are padded to
					 * keep the image from shrinking further.
					 *
					 * (*) imm32 jmp_cond is 6 bytes, and imm8 jmp_cond
					 * is 2 bytes, so the size difference is 4 bytes.
					 */
					nops = INSN_SZ_DIFF - 2;
					if (nops != 0 && nops != 4) {
						pr_err("unexpected jmp_cond padding: %d bytes\n",
						       nops);
						return -EFAULT;
					}
					emit_nops(&prog, nops);
				}
				EMIT2(jmp_cond, jmp_offset);
			} else if (is_simm32(jmp_offset)) {
				EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset);
			} else {
				pr_err("cond_jmp gen bug %llx\n", jmp_offset);
				return -EFAULT;
			}
			break;
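
			/* A minimal sketch of the jmp_cond padding arithmetic above,
			 * assuming a rel32 jcc is 6 bytes (0F 8x rel32) and a rel8 jcc
			 * is 2 bytes. cond_jmp_padding() and prev_len are illustrative
			 * names only, not kernel helpers; the block is compiled out.
			 */
#if 0
static int cond_jmp_padding(int prev_len)
{
	/* Length of the conditional jump emitted in the current pass. */
	int new_len = 2;		/* imm8 jcc: opcode + rel8          */
	int nops = prev_len - new_len;	/* corresponds to INSN_SZ_DIFF - 2  */

	/* Only two outcomes are legal: the insn was already imm8 in the
	 * previous pass (0 nops), or it shrank from an imm32 jcc, which
	 * needs 6 - 2 = 4 bytes of nops in front of it.
	 */
	return (nops == 0 || nops == 4) ? nops : -1;
}
#endif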

		case BPF_JMP | BPF_JA:
			if (insn->off == -1)
				/* -1 jmp instructions will always jump
				 * backwards two bytes. Explicitly handling
				 * this case avoids wasting too many passes
				 * when there are long sequences of replaced
				 * dead code.
				 */
				jmp_offset = -2;
			else
				jmp_offset = addrs[i + insn->off] - addrs[i];

			if (!jmp_offset) {
				/*
				 * If jmp_padding is enabled, the extra nops will
				 * be inserted. Otherwise, optimize out nop jumps.
				 */
				if (jmp_padding) {
					/* There are 3 possible conditions.
					 * (1) This BPF_JA is already optimized out in
					 *     the previous run, so there is no need
					 *     to pad any extra byte (0 byte).
					 * (2) The previous pass emits an imm8 jmp,
					 *     so we pad 2 bytes to match the previous
					 *     insn size.
					 * (3) Similarly, the previous pass emits an
					 *     imm32 jmp, and 5 bytes are padded.
					 */
					nops = INSN_SZ_DIFF;
					if (nops != 0 && nops != 2 && nops != 5) {
						pr_err("unexpected nop jump padding: %d bytes\n",
						       nops);
						return -EFAULT;
					}
					emit_nops(&prog, nops);
				}
				break;
			}
emit_jmp:
			if (is_imm8(jmp_offset)) {
				if (jmp_padding) {
					/* To avoid breaking jmp_offset, the extra bytes
					 * are padded before the actual jmp insn, so
					 * 2 bytes are subtracted from INSN_SZ_DIFF.
					 *
					 * If the previous pass already emits an imm8
					 * jmp, there is nothing to pad (0 byte).
					 *
					 * If it emits an imm32 jmp (5 bytes) previously
					 * and now an imm8 jmp (2 bytes), then we pad
					 * (5 - 2 = 3) bytes to stop the image from
					 * shrinking further.
					 */
					nops = INSN_SZ_DIFF - 2;
					if (nops != 0 && nops != 3) {
						pr_err("unexpected jump padding: %d bytes\n",
						       nops);
						return -EFAULT;
					}
					emit_nops(&prog, INSN_SZ_DIFF - 2);
				}
				EMIT2(0xEB, jmp_offset);
			} else if (is_simm32(jmp_offset)) {
				EMIT1_off32(0xE9, jmp_offset);
			} else {
				pr_err("jmp gen bug %llx\n", jmp_offset);
				return -EFAULT;
			}
			break;
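
			/* Companion sketch for the unconditional-jump path above,
			 * assuming EB rel8 is 2 bytes and E9 rel32 is 5 bytes.
			 * ja_padding() and prev_len are made-up names for
			 * illustration; the block is compiled out.
			 */
#if 0
static int ja_padding(int prev_len, bool optimized_out)
{
	if (optimized_out)
		/* Nothing is emitted this pass: pad whatever the previous
		 * pass emitted (0, 2 or 5 bytes of jmp).
		 */
		return (prev_len == 0 || prev_len == 2 || prev_len == 5) ?
		       prev_len : -1;

	/* An imm8 jmp (2 bytes) is emitted: pad 0 bytes (it was already
	 * imm8) or 5 - 2 = 3 bytes (it shrank from an imm32 jmp).
	 */
	return (prev_len == 2 || prev_len == 5) ? prev_len - 2 : -1;
}
#endif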

		case BPF_JMP | BPF_EXIT:
			if (seen_exit) {
				jmp_offset = ctx->cleanup_addr - addrs[i];
				goto emit_jmp;
			}
			seen_exit = true;
			/* Update cleanup_addr */
			ctx->cleanup_addr = proglen;
			pop_callee_regs(&prog, callee_regs_used);
			EMIT1(0xC9);	/* leave */
			emit_return(&prog, image + addrs[i - 1] + (prog - temp));
			break;

		default:
			/*
			 * By design x86-64 JIT should support all BPF instructions.
			 * This error will be seen if a new instruction was added
			 * to the interpreter, but not to the JIT, or if there is
			 * junk in bpf_prog.
			 */
			pr_err("bpf_jit: unknown opcode %02x\n", insn->code);
			return -EINVAL;
		}

		ilen = prog - temp;
		if (ilen > BPF_MAX_INSN_SIZE) {
			pr_err("bpf_jit: fatal insn size error\n");
			return -EFAULT;
		}

		if (image) {
			/*
			 * When populating the image, assert that:
			 *
			 *  i) We do not write beyond the allocated space, and
			 * ii) addrs[i] did not change from the prior run, in order
			 *     to validate assumptions made for computing branch
			 *     displacements.
			 */
			if (unlikely(proglen + ilen > oldproglen ||
				     proglen + ilen != addrs[i])) {
				pr_err("bpf_jit: fatal error\n");
				return -EFAULT;
			}
			memcpy(rw_image + proglen, temp, ilen);
		}
		proglen += ilen;
		addrs[i] = proglen;
		prog = temp;
	}

	if (image && excnt != bpf_prog->aux->num_exentries) {
		pr_err("extable is not populated\n");
		return -EFAULT;
	}
	return proglen;
}

static void save_regs(const struct btf_func_model *m, u8 **prog, int nr_args,
		      int stack_size)
{
	int i, j, arg_size, nr_regs;

	/* Store function arguments to stack.
	 * For a function that accepts two pointers the sequence will be:
	 * mov QWORD PTR [rbp-0x10],rdi
	 * mov QWORD PTR [rbp-0x8],rsi
	 */
	for (i = 0, j = 0; i < min(nr_args, 6); i++) {
		if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) {
			nr_regs = (m->arg_size[i] + 7) / 8;
			arg_size = 8;
		} else {
			nr_regs = 1;
			arg_size = m->arg_size[i];
		}

		while (nr_regs) {
			emit_stx(prog, bytes_to_bpf_size(arg_size),
				 BPF_REG_FP,
				 j == 5 ? X86_REG_R9 : BPF_REG_1 + j,
				 -(stack_size - j * 8));
			nr_regs--;
			j++;
		}
	}
}
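
/* A small illustrative sketch of the slot accounting used by save_regs() and
 * restore_regs() above, assuming at most six argument registers.
 * arg_regs_needed() and slot_offset() are made-up helpers, not kernel APIs;
 * the block is compiled out.
 */
#if 0
static int arg_regs_needed(bool struct_arg, int arg_size)
{
	/* A struct passed by value spans one register per 8 bytes;
	 * every other argument occupies exactly one register.
	 */
	return struct_arg ? (arg_size + 7) / 8 : 1;
}

static int slot_offset(int stack_size, int j)
{
	/* Argument register j (0..5) is spilled to [rbp - (stack_size - j * 8)]. */
	return -(stack_size - j * 8);
}
#endif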

static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args,
			 int stack_size)
{
	int i, j, arg_size, nr_regs;

	/* Restore function arguments from stack.
	 * For a function that accepts two pointers the sequence will be:
	 * EMIT4(0x48, 0x8B, 0x7D, 0xF0); mov rdi,QWORD PTR [rbp-0x10]
	 * EMIT4(0x48, 0x8B, 0x75, 0xF8); mov rsi,QWORD PTR [rbp-0x8]
	 */
	for (i = 0, j = 0; i < min(nr_args, 6); i++) {
		if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) {
			nr_regs = (m->arg_size[i] + 7) / 8;
			arg_size = 8;
		} else {
			nr_regs = 1;
			arg_size = m->arg_size[i];
		}

		while (nr_regs) {
			emit_ldx(prog, bytes_to_bpf_size(arg_size),
				 j == 5 ? X86_REG_R9 : BPF_REG_1 + j,
				 BPF_REG_FP,
				 -(stack_size - j * 8));
			nr_regs--;
			j++;
		}
	}
}

static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
			   struct bpf_tramp_link *l, int stack_size,
			   int run_ctx_off, bool save_ret)
{
	u8 *prog = *pprog;
	u8 *jmp_insn;
	int ctx_cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie);
	struct bpf_prog *p = l->link.prog;
	u64 cookie = l->cookie;

	/* mov rdi, cookie */
	emit_mov_imm64(&prog, BPF_REG_1, (long) cookie >> 32, (u32) (long) cookie);

	/* Prepare struct bpf_tramp_run_ctx.
	 *
	 * bpf_tramp_run_ctx is already preserved by
	 * arch_prepare_bpf_trampoline().
	 *
	 * mov QWORD PTR [rbp - run_ctx_off + ctx_cookie_off], rdi
	 */
	emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_1, -run_ctx_off + ctx_cookie_off);

	/* arg1: mov rdi, progs[i] */
	emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p);
	/* arg2: lea rsi, [rbp - run_ctx_off] */
	EMIT4(0x48, 0x8D, 0x75, -run_ctx_off);

	if (emit_call(&prog, bpf_trampoline_enter(p), prog))
		return -EINVAL;
	/* remember prog start time returned by __bpf_prog_enter */
	emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);

	/* if (__bpf_prog_enter*(prog) == 0)
	 *	goto skip_exec_of_prog;
	 */
	EMIT3(0x48, 0x85, 0xC0);	/* test rax,rax */
	/* emit 2 nops that will be replaced with JE insn */
	jmp_insn = prog;
	emit_nops(&prog, 2);

	/* arg1: lea rdi, [rbp - stack_size] */
	EMIT4(0x48, 0x8D, 0x7D, -stack_size);
	/* arg2: progs[i]->insnsi for interpreter */
	if (!p->jited)
		emit_mov_imm64(&prog, BPF_REG_2,
			       (long) p->insnsi >> 32,
			       (u32) (long) p->insnsi);
	/* call JITed bpf program or interpreter */
	if (emit_call(&prog, p->bpf_func, prog))
		return -EINVAL;

	/*
	 * BPF_TRAMP_MODIFY_RETURN trampolines can modify the return
	 * of the previous call which is then passed on the stack to
	 * the next BPF program.
	 *
	 * BPF_TRAMP_FENTRY trampoline may need to return the return
	 * value of BPF_PROG_TYPE_STRUCT_OPS prog.
	 */
	if (save_ret)
		emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);

	/* replace 2 nops with JE insn, since jmp target is known */
	jmp_insn[0] = X86_JE;
	jmp_insn[1] = prog - jmp_insn - 2;

	/* arg1: mov rdi, progs[i] */
	emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p);
	/* arg2: mov rsi, rbx <- start time in nsec */
	emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
	/* arg3: lea rdx, [rbp - run_ctx_off] */
	EMIT4(0x48, 0x8D, 0x55, -run_ctx_off);
	if (emit_call(&prog, bpf_trampoline_exit(p), prog))
		return -EINVAL;

	*pprog = prog;
	return 0;
}

static void emit_align(u8 **pprog, u32 align)
{
	u8 *target, *prog = *pprog;

	target = PTR_ALIGN(prog, align);
	if (target != prog)
		emit_nops(&prog, target - prog);

	*pprog = prog;
}
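
/* A tiny sketch of the arithmetic behind emit_align(): the number of nop
 * bytes needed to reach the next 'align' boundary, assuming align is a power
 * of two. align_pad() is a made-up name; the block is compiled out.
 */
#if 0
static unsigned long align_pad(unsigned long addr, unsigned long align)
{
	/* e.g. addr = 0x13, align = 16: 13 bytes of nops bring us to 0x20 */
	return (-addr) & (align - 1);
}
#endif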

static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond)
{
	u8 *prog = *pprog;
	s64 offset;

	offset = func - (ip + 2 + 4);
	if (!is_simm32(offset)) {
		pr_err("Target %p is out of range\n", func);
		return -EINVAL;
	}
	EMIT2_off32(0x0F, jmp_cond + 0x10, offset);
	*pprog = prog;
	return 0;
}
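
/* Minimal sketch of the displacement computed by emit_cond_near_jump(): a
 * near jcc is encoded as 0F 8x rel32 (6 bytes), and rel32 is measured from
 * the end of the instruction, i.e. ip + 2 + 4. cond_near_jump_rel32() is an
 * illustrative name, not a kernel helper; the block is compiled out.
 */
#if 0
static long long cond_near_jump_rel32(unsigned long target, unsigned long ip)
{
	/* Displacement relative to the first byte after the 6-byte jcc. */
	return (long long)target - (long long)(ip + 2 + 4);
}
#endif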

static int invoke_bpf(const struct btf_func_model *m, u8 **pprog,
		      struct bpf_tramp_links *tl, int stack_size,
		      int run_ctx_off, bool save_ret)
{
	int i;
	u8 *prog = *pprog;

	for (i = 0; i < tl->nr_links; i++) {
		if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size,
				    run_ctx_off, save_ret))
			return -EINVAL;
	}
	*pprog = prog;
	return 0;
}

static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
			      struct bpf_tramp_links *tl, int stack_size,
			      int run_ctx_off, u8 **branches)
{
	u8 *prog = *pprog;
	int i;

	/* The first fmod_ret program will receive a garbage return value.
	 * Set this to 0 to avoid confusing the program.
	 */
	emit_mov_imm32(&prog, false, BPF_REG_0, 0);
	emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
	for (i = 0; i < tl->nr_links; i++) {
		if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, run_ctx_off, true))
			return -EINVAL;

		/* mod_ret prog stored return value into [rbp - 8]. Emit:
		 * if (*(u64 *)(rbp - 8) != 0)
		 *	goto do_fexit;
		 */
		/* cmp QWORD PTR [rbp - 0x8], 0x0 */
		EMIT4(0x48, 0x83, 0x7d, 0xf8); EMIT1(0x00);

		/* Save the location of the branch and generate 6 nops
		 * (4 bytes for an offset and 2 bytes for the jump). These nops
		 * are replaced with a conditional jump once do_fexit (i.e. the
		 * start of the fexit invocation) is finalized.
		 */
		branches[i] = prog;
		emit_nops(&prog, 4 + 2);
	}

	*pprog = prog;
	return 0;
}

/* Example:
 * __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev);
 * its 'struct btf_func_model' will be nr_args=2
 * The assembly code when eth_type_trans is executing after trampoline:
 *
 * push rbp
 * mov rbp, rsp
 * sub rsp, 16                     // space for skb and dev
 * push rbx                        // temp regs to pass start time
 * mov qword ptr [rbp - 16], rdi   // save skb pointer to stack
 * mov qword ptr [rbp - 8], rsi    // save dev pointer to stack
 * call __bpf_prog_enter           // rcu_read_lock and preempt_disable
 * mov rbx, rax                    // remember start time if bpf stats are enabled
 * lea rdi, [rbp - 16]             // R1==ctx of bpf prog
 * call addr_of_jited_FENTRY_prog
 * movabsq rdi, 64bit_addr_of_struct_bpf_prog  // unused if bpf stats are off
 * mov rsi, rbx                    // prog start time
 * call __bpf_prog_exit            // rcu_read_unlock, preempt_enable and stats math
 * mov rdi, qword ptr [rbp - 16]   // restore skb pointer from stack
 * mov rsi, qword ptr [rbp - 8]    // restore dev pointer from stack
 * pop rbx
 * leave
 * ret
 *
 * eth_type_trans has 5 byte nop at the beginning. These 5 bytes will be
 * replaced with 'call generated_bpf_trampoline'. When it returns
 * eth_type_trans will continue executing with original skb and dev pointers.
 *
 * The assembly code when eth_type_trans is called from trampoline:
 *
 * push rbp
 * mov rbp, rsp
 * sub rsp, 24                     // space for skb, dev, return value
 * push rbx                        // temp regs to pass start time
 * mov qword ptr [rbp - 24], rdi   // save skb pointer to stack
 * mov qword ptr [rbp - 16], rsi   // save dev pointer to stack
 * call __bpf_prog_enter           // rcu_read_lock and preempt_disable
 * mov rbx, rax                    // remember start time if bpf stats are enabled
 * lea rdi, [rbp - 24]             // R1==ctx of bpf prog
 * call addr_of_jited_FENTRY_prog  // bpf prog can access skb and dev
 * movabsq rdi, 64bit_addr_of_struct_bpf_prog  // unused if bpf stats are off
 * mov rsi, rbx                    // prog start time
 * call __bpf_prog_exit            // rcu_read_unlock, preempt_enable and stats math
 * mov rdi, qword ptr [rbp - 24]   // restore skb pointer from stack
 * mov rsi, qword ptr [rbp - 16]   // restore dev pointer from stack
 * call eth_type_trans+5           // execute body of eth_type_trans
 * mov qword ptr [rbp - 8], rax    // save return value
 * call __bpf_prog_enter           // rcu_read_lock and preempt_disable
 * mov rbx, rax                    // remember start time if bpf stats are enabled
 * lea rdi, [rbp - 24]             // R1==ctx of bpf prog
 * call addr_of_jited_FEXIT_prog   // bpf prog can access skb, dev, return value
 * movabsq rdi, 64bit_addr_of_struct_bpf_prog  // unused if bpf stats are off
 * mov rsi, rbx                    // prog start time
 * call __bpf_prog_exit            // rcu_read_unlock, preempt_enable and stats math
 * mov rax, qword ptr [rbp - 8]    // restore eth_type_trans's return value
 * pop rbx
 * leave
 * add rsp, 8                      // skip eth_type_trans's frame
 * ret                             // return to its caller
 */
int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
				const struct btf_func_model *m, u32 flags,
				struct bpf_tramp_links *tlinks,
				void *func_addr)
{
	int ret, i, nr_args = m->nr_args, extra_nregs = 0;
	int regs_off, ip_off, args_off, stack_size = nr_args * 8, run_ctx_off;
	struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
	struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
	struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN];
	void *orig_call = func_addr;
	u8 **branches = NULL;
	u8 *prog;
	bool save_ret;

	/* x86-64 supports up to 6 arguments. 7+ can be added in the future */
	if (nr_args > 6)
		return -ENOTSUPP;

	for (i = 0; i < MAX_BPF_FUNC_ARGS; i++) {
		if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG)
			extra_nregs += (m->arg_size[i] + 7) / 8 - 1;
	}
	if (nr_args + extra_nregs > 6)
		return -ENOTSUPP;
	stack_size += extra_nregs * 8;

	/* Generated trampoline stack layout:
	 *
	 * RBP + 8           [ return address    ]
	 * RBP + 0           [ RBP               ]
	 *
	 * RBP - 8           [ return value      ]  BPF_TRAMP_F_CALL_ORIG or
	 *                                          BPF_TRAMP_F_RET_FENTRY_RET flags
	 *
	 *                   [ reg_argN          ]  always
	 *                   [ ...               ]
	 * RBP - regs_off    [ reg_arg1          ]  program's ctx pointer
	 *
	 * RBP - args_off    [ arg regs count    ]  always
	 *
	 * RBP - ip_off      [ traced function   ]  BPF_TRAMP_F_IP_ARG flag
	 *
	 * RBP - run_ctx_off [ bpf_tramp_run_ctx ]
	 */

	/* room for return value of orig_call or fentry prog */
	save_ret = flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET);
	if (save_ret)
		stack_size += 8;

	regs_off = stack_size;

	/* args count */
	stack_size += 8;
	args_off = stack_size;

	if (flags & BPF_TRAMP_F_IP_ARG)
		stack_size += 8; /* room for IP address argument */

	ip_off = stack_size;

	stack_size += (sizeof(struct bpf_tramp_run_ctx) + 7) & ~0x7;
	run_ctx_off = stack_size;

	if (flags & BPF_TRAMP_F_SKIP_FRAME) {
		/* skip patched call instruction and point orig_call to actual
		 * body of the kernel function.
		 */
		if (is_endbr(*(u32 *)orig_call))
			orig_call += ENDBR_INSN_SIZE;
		orig_call += X86_PATCH_SIZE;
	}

	prog = image;

	EMIT_ENDBR();
	EMIT1(0x55);		 /* push rbp */
	EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
	EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */
	EMIT1(0x53);		 /* push rbx */

	/* Store number of argument registers of the traced function:
	 *   mov rax, nr_args + extra_nregs
	 *   mov QWORD PTR [rbp - args_off], rax
	 */
	emit_mov_imm64(&prog, BPF_REG_0, 0, (u32) nr_args + extra_nregs);
	emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -args_off);

	if (flags & BPF_TRAMP_F_IP_ARG) {
		/* Store IP address of the traced function:
		 * movabsq rax, func_addr
		 * mov QWORD PTR [rbp - ip_off], rax
		 */
		emit_mov_imm64(&prog, BPF_REG_0, (long) func_addr >> 32, (u32) (long) func_addr);
		emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -ip_off);
	}

	save_regs(m, &prog, nr_args, regs_off);

	if (flags & BPF_TRAMP_F_CALL_ORIG) {
		/* arg1: mov rdi, im */
		emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im);
		if (emit_call(&prog, __bpf_tramp_enter, prog)) {
			ret = -EINVAL;
			goto cleanup;
		}
	}

	if (fentry->nr_links)
		if (invoke_bpf(m, &prog, fentry, regs_off, run_ctx_off,
			       flags & BPF_TRAMP_F_RET_FENTRY_RET))
			return -EINVAL;

	if (fmod_ret->nr_links) {
		branches = kcalloc(fmod_ret->nr_links, sizeof(u8 *),
				   GFP_KERNEL);
		if (!branches)
			return -ENOMEM;

		if (invoke_bpf_mod_ret(m, &prog, fmod_ret, regs_off,
				       run_ctx_off, branches)) {
			ret = -EINVAL;
			goto cleanup;
		}
	}

	if (flags & BPF_TRAMP_F_CALL_ORIG) {
		restore_regs(m, &prog, nr_args, regs_off);

		if (flags & BPF_TRAMP_F_ORIG_STACK) {
			emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 8);
			EMIT2(0xff, 0xd0); /* call *rax */
		} else {
			/* call original function */
			if (emit_call(&prog, orig_call, prog)) {
				ret = -EINVAL;
				goto cleanup;
			}
		}
		/* remember return value on the stack for bpf prog to access */
		emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
		im->ip_after_call = prog;
		memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
		prog += X86_PATCH_SIZE;
	}

	if (fmod_ret->nr_links) {
		/* From Intel 64 and IA-32 Architectures Optimization
		 * Reference Manual, 3.4.1.4 Code Alignment, Assembly/Compiler
		 * Coding Rule 11: All branch targets should be 16-byte
		 * aligned.
		 */
		emit_align(&prog, 16);
		/* Update the branches saved in invoke_bpf_mod_ret with the
		 * aligned address of do_fexit.
		 */
		for (i = 0; i < fmod_ret->nr_links; i++)
			emit_cond_near_jump(&branches[i], prog, branches[i],
					    X86_JNE);
	}

	if (fexit->nr_links)
		if (invoke_bpf(m, &prog, fexit, regs_off, run_ctx_off, false)) {
			ret = -EINVAL;
			goto cleanup;
		}

	if (flags & BPF_TRAMP_F_RESTORE_REGS)
		restore_regs(m, &prog, nr_args, regs_off);

	/* This needs to be done regardless. If there were fmod_ret programs,
	 * the return value is only updated on the stack and still needs to be
	 * restored to R0.
	 */
	if (flags & BPF_TRAMP_F_CALL_ORIG) {
		im->ip_epilogue = prog;
		/* arg1: mov rdi, im */
		emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im);
		if (emit_call(&prog, __bpf_tramp_exit, prog)) {
			ret = -EINVAL;
			goto cleanup;
		}
	}
	/* restore return value of orig_call or fentry prog back into RAX */
	if (save_ret)
		emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8);

	EMIT1(0x5B); /* pop rbx */
	EMIT1(0xC9); /* leave */
	if (flags & BPF_TRAMP_F_SKIP_FRAME)
		/* skip our return address and return to parent */
		EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */
	emit_return(&prog, prog);
	/* Make sure the trampoline generation logic doesn't overflow */
	if (WARN_ON_ONCE(prog > (u8 *)image_end - BPF_INSN_SAFETY)) {
		ret = -EFAULT;
		goto cleanup;
	}
	ret = prog - (u8 *)image;

cleanup:
	kfree(branches);
	return ret;
}
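
/* A worked sketch of the stack-size bookkeeping above, for a traced function
 * whose arguments span nr_regs argument registers (nr_args + extra_nregs,
 * at most 6). trampoline_stack_size() is a made-up helper, not a kernel API;
 * the block is compiled out.
 */
#if 0
static int trampoline_stack_size(int nr_regs, bool save_ret, bool ip_arg,
				 int run_ctx_sz)
{
	int stack_size = nr_regs * 8;	/* [ reg_arg1 .. reg_argN ]        */

	if (save_ret)
		stack_size += 8;	/* [ return value ] lives at rbp-8 */
	/* regs_off = stack_size at this point                             */
	stack_size += 8;		/* [ arg regs count ]              */
	/* args_off = stack_size at this point                             */
	if (ip_arg)
		stack_size += 8;	/* [ traced function IP ]          */
	/* ip_off = stack_size at this point                               */
	stack_size += (run_ctx_sz + 7) & ~0x7; /* [ bpf_tramp_run_ctx ]    */
	/* run_ctx_off = stack_size at this point                          */
	return stack_size;
}
#endif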

static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs,
			       u8 *image, u8 *buf)
{
	u8 *jg_reloc, *prog = *pprog;
	int pivot, err, jg_bytes = 1;
	s64 jg_offset;

	if (a == b) {
		/* Leaf node of recursion, i.e. not a range of indices
		 * anymore.
		 */
		EMIT1(add_1mod(0x48, BPF_REG_3));	/* cmp rdx,func */
		if (!is_simm32(progs[a]))
			return -1;
		EMIT2_off32(0x81, add_1reg(0xF8, BPF_REG_3),
			    progs[a]);
		err = emit_cond_near_jump(&prog,	/* je func */
					  (void *)progs[a], image + (prog - buf),
					  X86_JE);
		if (err)
			return err;

		emit_indirect_jump(&prog, 2 /* rdx */, image + (prog - buf));

		*pprog = prog;
		return 0;
	}

	/* Not a leaf node, so we pivot, and recursively descend into
	 * the lower and upper ranges.
	 */
	pivot = (b - a) / 2;
	EMIT1(add_1mod(0x48, BPF_REG_3));		/* cmp rdx,func */
	if (!is_simm32(progs[a + pivot]))
		return -1;
	EMIT2_off32(0x81, add_1reg(0xF8, BPF_REG_3), progs[a + pivot]);

	if (pivot > 2) {				/* jg upper_part */
		/* Require near jump. */
		jg_bytes = 4;
		EMIT2_off32(0x0F, X86_JG + 0x10, 0);
	} else {
		EMIT2(X86_JG, 0);
	}
	jg_reloc = prog;

	err = emit_bpf_dispatcher(&prog, a, a + pivot,	/* emit lower_part */
				  progs, image, buf);
	if (err)
		return err;

	/* From Intel 64 and IA-32 Architectures Optimization
	 * Reference Manual, 3.4.1.4 Code Alignment, Assembly/Compiler
	 * Coding Rule 11: All branch targets should be 16-byte
	 * aligned.
	 */
	emit_align(&prog, 16);
	jg_offset = prog - jg_reloc;
	emit_code(jg_reloc - jg_bytes, jg_offset, jg_bytes);

	err = emit_bpf_dispatcher(&prog, a + pivot + 1,	/* emit upper_part */
				  b, progs, image, buf);
	if (err)
		return err;

	*pprog = prog;
	return 0;
}
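
/* A plain-C model of the comparison tree that emit_bpf_dispatcher() unrolls
 * into machine code: a binary search over the sorted 'progs' addresses that
 * ends either on an exact match (the emitted "je func") or on the
 * indirect-jump fallback. dispatch_model() is an illustrative name, not a
 * kernel helper; the block is compiled out.
 */
#if 0
static s64 dispatch_model(const s64 *progs, int a, int b, s64 target)
{
	while (a != b) {
		int pivot = (b - a) / 2;

		if (target > progs[a + pivot])
			a = a + pivot + 1;	/* jg upper_part */
		else
			b = a + pivot;		/* fall through to lower_part */
	}
	/* Leaf: a match becomes a direct "je func"; anything else takes the
	 * indirect jump through rdx (modelled here as -1).
	 */
	return (progs[a] == target) ? progs[a] : -1;
}
#endif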

static int cmp_ips(const void *a, const void *b)
{
	const s64 *ipa = a;
	const s64 *ipb = b;

	if (*ipa > *ipb)
		return 1;
	if (*ipa < *ipb)
		return -1;
	return 0;
}

int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs)
{
	u8 *prog = buf;

	sort(funcs, num_funcs, sizeof(funcs[0]), cmp_ips, NULL);
	return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs, image, buf);
}

struct x64_jit_data {
	struct bpf_binary_header *rw_header;
	struct bpf_binary_header *header;
	int *addrs;
	u8 *image;
	int proglen;
	struct jit_context ctx;
};

#define MAX_PASSES 20
#define PADDING_PASSES (MAX_PASSES - 5)

struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
{
	struct bpf_binary_header *rw_header = NULL;
	struct bpf_binary_header *header = NULL;
	struct bpf_prog *tmp, *orig_prog = prog;
	struct x64_jit_data *jit_data;
	int proglen, oldproglen = 0;
	struct jit_context ctx = {};
	bool tmp_blinded = false;
	bool extra_pass = false;
	bool padding = false;
	u8 *rw_image = NULL;
	u8 *image = NULL;
	int *addrs;
	int pass;
	int i;

	if (!prog->jit_requested)
		return orig_prog;

	tmp = bpf_jit_blind_constants(prog);
	/*
	 * If blinding was requested and we failed during blinding,
	 * we must fall back to the interpreter.
	 */
	if (IS_ERR(tmp))
		return orig_prog;
	if (tmp != prog) {
		tmp_blinded = true;
		prog = tmp;
	}

	jit_data = prog->aux->jit_data;
	if (!jit_data) {
		jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL);
		if (!jit_data) {
			prog = orig_prog;
			goto out;
		}
		prog->aux->jit_data = jit_data;
	}
	addrs = jit_data->addrs;
	if (addrs) {
		ctx = jit_data->ctx;
		oldproglen = jit_data->proglen;
		image = jit_data->image;
		header = jit_data->header;
		rw_header = jit_data->rw_header;
		rw_image = (void *)rw_header + ((void *)image - (void *)header);
		extra_pass = true;
		padding = true;
		goto skip_init_addrs;
	}
	addrs = kvmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL);
	if (!addrs) {
		prog = orig_prog;
		goto out_addrs;
	}

	/*
	 * Before the first pass, make a rough estimation of addrs[]:
	 * each BPF instruction is translated to less than 64 bytes.
	 */
	for (proglen = 0, i = 0; i <= prog->len; i++) {
		proglen += 64;
		addrs[i] = proglen;
	}
	ctx.cleanup_addr = proglen;
skip_init_addrs:

	/*
	 * JITed image shrinks with every pass and the loop iterates
	 * until the image stops shrinking. Very large BPF programs
	 * may converge on the last pass. In such a case, do one more
	 * pass to emit the final image.
	 */
	for (pass = 0; pass < MAX_PASSES || image; pass++) {
		if (!padding && pass >= PADDING_PASSES)
			padding = true;
		proglen = do_jit(prog, addrs, image, rw_image, oldproglen, &ctx, padding);
		if (proglen <= 0) {
out_image:
			image = NULL;
			if (header) {
				bpf_arch_text_copy(&header->size, &rw_header->size,
						   sizeof(rw_header->size));
				bpf_jit_binary_pack_free(header, rw_header);
			}
			/* Fall back to interpreter mode */
			prog = orig_prog;
			if (extra_pass) {
				prog->bpf_func = NULL;
				prog->jited = 0;
				prog->jited_len = 0;
			}
			goto out_addrs;
		}
		if (image) {
			if (proglen != oldproglen) {
				pr_err("bpf_jit: proglen=%d != oldproglen=%d\n",
				       proglen, oldproglen);
				goto out_image;
			}
			break;
		}
		if (proglen == oldproglen) {
			/*
			 * The number of entries in extable is the number of BPF_LDX
			 * insns that access kernel memory via "pointer to BTF type".
			 * The verifier changed their opcode from LDX|MEM|size
			 * to LDX|PROBE_MEM|size to make JITing easier.
			 */
			u32 align = __alignof__(struct exception_table_entry);
			u32 extable_size = prog->aux->num_exentries *
				sizeof(struct exception_table_entry);

			/* allocate module memory for x86 insns and extable */
			header = bpf_jit_binary_pack_alloc(roundup(proglen, align) + extable_size,
							   &image, align, &rw_header, &rw_image,
							   jit_fill_hole);
			if (!header) {
				prog = orig_prog;
				goto out_addrs;
			}
			prog->aux->extable = (void *) image + roundup(proglen, align);
		}
		oldproglen = proglen;
		cond_resched();
	}

	if (bpf_jit_enable > 1)
		bpf_jit_dump(prog->len, proglen, pass + 1, rw_image);

	if (image) {
		if (!prog->is_func || extra_pass) {
			/*
			 * bpf_jit_binary_pack_finalize fails in two scenarios:
			 *   1) header is not pointing to proper module memory;
			 *   2) the arch doesn't support bpf_arch_text_copy().
			 *
			 * Both cases are serious bugs and justify WARN_ON.
			 */
			if (WARN_ON(bpf_jit_binary_pack_finalize(prog, header, rw_header))) {
				/* header has been freed */
				header = NULL;
				goto out_image;
			}

			bpf_tail_call_direct_fixup(prog);
		} else {
			jit_data->addrs = addrs;
			jit_data->ctx = ctx;
			jit_data->proglen = proglen;
			jit_data->image = image;
			jit_data->header = header;
			jit_data->rw_header = rw_header;
		}
		prog->bpf_func = (void *)image;
		prog->jited = 1;
		prog->jited_len = proglen;
	} else {
		prog = orig_prog;
	}

	if (!image || !prog->is_func || extra_pass) {
		if (image)
			bpf_prog_fill_jited_linfo(prog, addrs + 1);
out_addrs:
		kvfree(addrs);
		kfree(jit_data);
		prog->aux->jit_data = NULL;
	}
out:
	if (tmp_blinded)
		bpf_jit_prog_release_other(prog, prog == orig_prog ?
					   tmp : orig_prog);
	return prog;
}
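
/* A compact model of the multi-pass convergence strategy used by
 * bpf_int_jit_compile(): start from a pessimistic 64-bytes-per-insn estimate
 * and re-run code generation until the total length stops changing.
 * converge_len() and jit_pass() are made-up stand-ins, not kernel APIs;
 * the block is compiled out.
 */
#if 0
static int converge_len(int insn_cnt, int max_passes,
			int (*jit_pass)(int oldlen))
{
	int oldlen = insn_cnt * 64;	/* rough upper bound per insn */
	int pass, len;

	for (pass = 0; pass < max_passes; pass++) {
		len = jit_pass(oldlen);	/* shorter branches may shrink it */
		if (len <= 0)
			return -1;	/* give up, fall back to interpreter */
		if (len == oldlen)
			return len;	/* fixed point: image can be emitted */
		oldlen = len;
	}
	return -1;			/* did not converge within the budget */
}
#endif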

bool bpf_jit_supports_kfunc_call(void)
{
	return true;
}

void *bpf_arch_text_copy(void *dst, void *src, size_t len)
{
	if (text_poke_copy(dst, src, len) == NULL)
		return ERR_PTR(-EINVAL);
	return dst;
}

/* Indicate the JIT backend supports mixing bpf2bpf and tailcalls. */
bool bpf_jit_supports_subprog_tailcalls(void)
{
	return true;
}

void bpf_jit_free(struct bpf_prog *prog)
{
	if (prog->jited) {
		struct x64_jit_data *jit_data = prog->aux->jit_data;
		struct bpf_binary_header *hdr;

		/*
		 * If we fail the final pass of JIT (from jit_subprogs),
		 * the program may not be finalized yet. Call finalize here
		 * before freeing it.
		 */
		if (jit_data) {
			bpf_jit_binary_pack_finalize(prog, jit_data->header,
						     jit_data->rw_header);
			kvfree(jit_data->addrs);
			kfree(jit_data);
		}
		hdr = bpf_jit_binary_pack_hdr(prog);
		bpf_jit_binary_pack_free(hdr, NULL);
		WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));
	}

	bpf_prog_unlock_free(prog);
}

void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
			       struct bpf_prog *new, struct bpf_prog *old)
{
	u8 *old_addr, *new_addr, *old_bypass_addr;
	int ret;

	old_bypass_addr = old ? NULL : poke->bypass_addr;
	old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL;
	new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL;

	/*
	 * On program loading or teardown, the program's kallsym entry
	 * might not be in place, so we use __bpf_arch_text_poke to skip
	 * the kallsyms check.
	 */
	if (new) {
		ret = __bpf_arch_text_poke(poke->tailcall_target,
					   BPF_MOD_JUMP,
					   old_addr, new_addr);
		BUG_ON(ret < 0);
		if (!old) {
			ret = __bpf_arch_text_poke(poke->tailcall_bypass,
						   BPF_MOD_JUMP,
						   poke->bypass_addr,
						   NULL);
			BUG_ON(ret < 0);
		}
	} else {
		ret = __bpf_arch_text_poke(poke->tailcall_bypass,
					   BPF_MOD_JUMP,
					   old_bypass_addr,
					   poke->bypass_addr);
		BUG_ON(ret < 0);
		/* let other CPUs finish the execution of program
		 * so that it will not be possible to expose them
		 * to invalid nop, stack unwind, nop state
		 */
		if (!ret)
			synchronize_rcu();
		ret = __bpf_arch_text_poke(poke->tailcall_target,
					   BPF_MOD_JUMP,
					   old_addr, NULL);
		BUG_ON(ret < 0);
	}
}