/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd <[email protected]>
 */

/* included by aes-ce.S and aes-neon.S */

	.text
	.align	4

#ifndef MAX_STRIDE
#define MAX_STRIDE	4
#endif

#if MAX_STRIDE == 4
#define ST4(x...) x
#define ST5(x...)
#else
#define ST4(x...)
#define ST5(x...) x
#endif

SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block4x)

SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block4x)

#if MAX_STRIDE == 5
SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
	encrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block5x)

SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
	decrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block5x)
#endif

	/*
	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 */
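	/*
	 * Per the AAPCS64 calling convention these arguments arrive as
	 * x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks; the other
	 * mode wrappers below follow the same argument-to-register mapping.
	 */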
AES_FUNC_START(aes_ecb_encrypt)
	stp	x29, x30, [sp, #-16]!
	mov	x29, sp

	enc_prepare	w3, x2, x5

.LecbencloopNx:
	subs	w4, w4, #MAX_STRIDE
	bmi	.Lecbenc1x
	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
ST4(	bl	aes_encrypt_block4x	)
ST5(	ld1	{v4.16b}, [x1], #16	)
ST5(	bl	aes_encrypt_block5x	)
	st1	{v0.16b-v3.16b}, [x0], #64
ST5(	st1	{v4.16b}, [x0], #16	)
	b	.LecbencloopNx
.Lecbenc1x:
	adds	w4, w4, #MAX_STRIDE
	beq	.Lecbencout
.Lecbencloop:
	ld1	{v0.16b}, [x1], #16		/* get next pt block */
	encrypt_block	v0, w3, x2, x5, w6
	st1	{v0.16b}, [x0], #16
	subs	w4, w4, #1
	bne	.Lecbencloop
.Lecbencout:
	ldp	x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_ecb_encrypt)

AES_FUNC_START(aes_ecb_decrypt)
	stp	x29, x30, [sp, #-16]!
	mov	x29, sp

	dec_prepare	w3, x2, x5

.LecbdecloopNx:
	subs	w4, w4, #MAX_STRIDE
	bmi	.Lecbdec1x
	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
ST4(	bl	aes_decrypt_block4x	)
ST5(	ld1	{v4.16b}, [x1], #16	)
ST5(	bl	aes_decrypt_block5x	)
	st1	{v0.16b-v3.16b}, [x0], #64
ST5(	st1	{v4.16b}, [x0], #16	)
	b	.LecbdecloopNx
.Lecbdec1x:
	adds	w4, w4, #MAX_STRIDE
	beq	.Lecbdecout
.Lecbdecloop:
	ld1	{v0.16b}, [x1], #16		/* get next ct block */
	decrypt_block	v0, w3, x2, x5, w6
	st1	{v0.16b}, [x0], #16
	subs	w4, w4, #1
	bne	.Lecbdecloop
.Lecbdecout:
	ldp	x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_ecb_decrypt)

	/*
	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
	 *			 int rounds, int blocks, u8 iv[],
	 *			 u32 const rk2[]);
	 * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
	 *			 int rounds, int blocks, u8 iv[],
	 *			 u32 const rk2[]);
	 */
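	/*
	 * In the ESSIV variants the IV is first turned into the per-request
	 * tweak by encrypting it with AES-256 under rk2 (hence the hard-coded
	 * 14 rounds below), roughly iv' = AES-256-Encrypt(rk2, iv); the result
	 * then seeds the ordinary CBC chain keyed with rk1.
	 */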
AES_FUNC_START(aes_essiv_cbc_encrypt)
	ld1	{v4.16b}, [x5]			/* get iv */

	mov	w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	v4, w8, x6, x7, w9
	enc_switch_key	w3, x2, x6
	b	.Lcbcencloop4x

AES_FUNC_START(aes_cbc_encrypt)
	ld1	{v4.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

.Lcbcencloop4x:
	subs	w4, w4, #4
	bmi	.Lcbcenc1x
	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	eor	v0.16b, v0.16b, v4.16b		/* ..and xor with iv */
	encrypt_block	v0, w3, x2, x6, w7
	eor	v1.16b, v1.16b, v0.16b
	encrypt_block	v1, w3, x2, x6, w7
	eor	v2.16b, v2.16b, v1.16b
	encrypt_block	v2, w3, x2, x6, w7
	eor	v3.16b, v3.16b, v2.16b
	encrypt_block	v3, w3, x2, x6, w7
	st1	{v0.16b-v3.16b}, [x0], #64
	mov	v4.16b, v3.16b
	b	.Lcbcencloop4x
.Lcbcenc1x:
	adds	w4, w4, #4
	beq	.Lcbcencout
.Lcbcencloop:
	ld1	{v0.16b}, [x1], #16		/* get next pt block */
	eor	v4.16b, v4.16b, v0.16b		/* ..and xor with iv */
	encrypt_block	v4, w3, x2, x6, w7
	st1	{v4.16b}, [x0], #16
	subs	w4, w4, #1
	bne	.Lcbcencloop
.Lcbcencout:
	st1	{v4.16b}, [x5]			/* return iv */
	ret
AES_FUNC_END(aes_cbc_encrypt)
AES_FUNC_END(aes_essiv_cbc_encrypt)

AES_FUNC_START(aes_essiv_cbc_decrypt)
	stp	x29, x30, [sp, #-16]!
	mov	x29, sp

	ld1	{cbciv.16b}, [x5]		/* get iv */

	mov	w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	cbciv, w8, x6, x7, w9
	b	.Lessivcbcdecstart

AES_FUNC_START(aes_cbc_decrypt)
	stp	x29, x30, [sp, #-16]!
	mov	x29, sp

	ld1	{cbciv.16b}, [x5]		/* get iv */
.Lessivcbcdecstart:
	dec_prepare	w3, x2, x6

.LcbcdecloopNx:
	subs	w4, w4, #MAX_STRIDE
	bmi	.Lcbcdec1x
	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
#if MAX_STRIDE == 5
	ld1	{v4.16b}, [x1], #16		/* get 1 ct block */
	mov	v5.16b, v0.16b
	mov	v6.16b, v1.16b
	mov	v7.16b, v2.16b
	bl	aes_decrypt_block5x
	sub	x1, x1, #32
	eor	v0.16b, v0.16b, cbciv.16b
	eor	v1.16b, v1.16b, v5.16b
	ld1	{v5.16b}, [x1], #16		/* reload 1 ct block */
	ld1	{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor	v2.16b, v2.16b, v6.16b
	eor	v3.16b, v3.16b, v7.16b
	eor	v4.16b, v4.16b, v5.16b
#else
	mov	v4.16b, v0.16b
	mov	v5.16b, v1.16b
	mov	v6.16b, v2.16b
	bl	aes_decrypt_block4x
	sub	x1, x1, #16
	eor	v0.16b, v0.16b, cbciv.16b
	eor	v1.16b, v1.16b, v4.16b
	ld1	{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor	v2.16b, v2.16b, v5.16b
	eor	v3.16b, v3.16b, v6.16b
#endif
	st1	{v0.16b-v3.16b}, [x0], #64
ST5(	st1	{v4.16b}, [x0], #16	)
	b	.LcbcdecloopNx
.Lcbcdec1x:
	adds	w4, w4, #MAX_STRIDE
	beq	.Lcbcdecout
.Lcbcdecloop:
	ld1	{v1.16b}, [x1], #16		/* get next ct block */
	mov	v0.16b, v1.16b			/* ...and copy to v0 */
	decrypt_block	v0, w3, x2, x6, w7
	eor	v0.16b, v0.16b, cbciv.16b	/* xor with iv => pt */
	mov	cbciv.16b, v1.16b		/* ct is next iv */
	st1	{v0.16b}, [x0], #16
	subs	w4, w4, #1
	bne	.Lcbcdecloop
.Lcbcdecout:
	st1	{cbciv.16b}, [x5]		/* return iv */
	ldp	x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_cbc_decrypt)
AES_FUNC_END(aes_essiv_cbc_decrypt)

	/*
	 * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		       int rounds, int bytes, u8 const iv[])
	 * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		       int rounds, int bytes, u8 const iv[])
	 */
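	/*
	 * These handle the ciphertext stealing step for the final pair of
	 * blocks: 'bytes' covers the last two (possibly partial) blocks, the
	 * input is fetched with overlapping loads, and the last two output
	 * blocks are emitted swapped, with the trailing block truncated via
	 * overlapping stores (the CS3 ciphertext stealing convention).
	 */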
AES_FUNC_START(aes_cbc_cts_encrypt)
	adr_l	x8, .Lcts_permute_table
	sub	x4, x4, #16
	add	x9, x8, #32
	add	x8, x8, x4
	sub	x9, x9, x4
	ld1	{v3.16b}, [x8]
	ld1	{v4.16b}, [x9]

	ld1	{v0.16b}, [x1], x4		/* overlapping loads */
	ld1	{v1.16b}, [x1]

	ld1	{v5.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

	eor	v0.16b, v0.16b, v5.16b		/* xor with iv */
	tbl	v1.16b, {v1.16b}, v4.16b
	encrypt_block	v0, w3, x2, x6, w7

	eor	v1.16b, v1.16b, v0.16b
	tbl	v0.16b, {v0.16b}, v3.16b
	encrypt_block	v1, w3, x2, x6, w7

	add	x4, x0, x4
	st1	{v0.16b}, [x4]			/* overlapping stores */
	st1	{v1.16b}, [x0]
	ret
AES_FUNC_END(aes_cbc_cts_encrypt)

AES_FUNC_START(aes_cbc_cts_decrypt)
	adr_l	x8, .Lcts_permute_table
	sub	x4, x4, #16
	add	x9, x8, #32
	add	x8, x8, x4
	sub	x9, x9, x4
	ld1	{v3.16b}, [x8]
	ld1	{v4.16b}, [x9]

	ld1	{v0.16b}, [x1], x4		/* overlapping loads */
	ld1	{v1.16b}, [x1]

	ld1	{v5.16b}, [x5]			/* get iv */
	dec_prepare	w3, x2, x6

	decrypt_block	v0, w3, x2, x6, w7
	tbl	v2.16b, {v0.16b}, v3.16b
	eor	v2.16b, v2.16b, v1.16b

	tbx	v0.16b, {v1.16b}, v4.16b
	decrypt_block	v0, w3, x2, x6, w7
	eor	v0.16b, v0.16b, v5.16b		/* xor with iv */

	add	x4, x0, x4
	st1	{v2.16b}, [x4]			/* overlapping stores */
	st1	{v0.16b}, [x0]
	ret
AES_FUNC_END(aes_cbc_cts_decrypt)

	.section	".rodata", "a"
	.align	6
.Lcts_permute_table:
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte	 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte	 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.previous
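	/*
	 * The permute table above encodes byte-granular shifts for tbl/tbx:
	 * loading 16 bytes at offset n (0 < n < 16) gives a mask that moves
	 * source bytes 0..n-1 up into lanes 16-n..15, while loading at offset
	 * 32 - n moves source bytes 16-n..15 down into lanes 0..n-1.  Lanes
	 * indexed by 0xff read as zero with tbl and are left untouched by tbx,
	 * which is what makes the overlapping loads and stores above work.
	 */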
	/*
	 * This macro generates the code for CTR and XCTR mode.
	 */
	.macro ctr_encrypt xctr
	// Arguments
	OUT		.req x0
	IN		.req x1
	KEY		.req x2
	ROUNDS_W	.req w3
	BYTES_W		.req w4
	IV		.req x5
	BYTE_CTR_W	.req w6		// XCTR only
	// Intermediate values
	CTR_W		.req w11	// XCTR only
	CTR		.req x11	// XCTR only
	IV_PART		.req x12
	BLOCKS		.req x13
	BLOCKS_W	.req w13

	stp	x29, x30, [sp, #-16]!
	mov	x29, sp

	enc_prepare	ROUNDS_W, KEY, IV_PART
	ld1	{vctr.16b}, [IV]

	/*
	 * Keep 64 bits of the IV in a register.  For CTR mode this lets us
	 * easily increment the IV.  For XCTR mode this lets us efficiently XOR
	 * the 64-bit counter with the IV.
	 */
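	/*
	 * Concretely, the keystream block for counter value i is roughly
	 * E_K(IV ^ le64(i)) in XCTR mode (the 64-bit block counter is XORed
	 * into the low half of the IV) and E_K(IV + i) with big-endian
	 * 128-bit arithmetic in CTR mode.
	 */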
	.if \xctr
		umov	IV_PART, vctr.d[0]
		lsr	CTR_W, BYTE_CTR_W, #4
	.else
		umov	IV_PART, vctr.d[1]
		rev	IV_PART, IV_PART
	.endif

.LctrloopNx\xctr:
	add	BLOCKS_W, BYTES_W, #15
	sub	BYTES_W, BYTES_W, #MAX_STRIDE << 4
	lsr	BLOCKS_W, BLOCKS_W, #4
	mov	w8, #MAX_STRIDE
	cmp	BLOCKS_W, w8
	csel	BLOCKS_W, BLOCKS_W, w8, lt

	/*
	 * Set up the counter values in v0-v{MAX_STRIDE-1}.
	 *
	 * If we are encrypting less than MAX_STRIDE blocks, the tail block
	 * handling code expects the last keystream block to be in
	 * v{MAX_STRIDE-1}.  For example: if encrypting two blocks with
	 * MAX_STRIDE=5, then v3 and v4 should have the next two counter blocks.
	 */
	.if \xctr
		add	CTR, CTR, BLOCKS
	.else
		adds	IV_PART, IV_PART, BLOCKS
	.endif

	mov	v0.16b, vctr.16b
	mov	v1.16b, vctr.16b
	mov	v2.16b, vctr.16b
	mov	v3.16b, vctr.16b
ST5(	mov	v4.16b, vctr.16b	)

	.if \xctr
		sub	x6, CTR, #MAX_STRIDE - 1
		sub	x7, CTR, #MAX_STRIDE - 2
		sub	x8, CTR, #MAX_STRIDE - 3
		sub	x9, CTR, #MAX_STRIDE - 4
ST5(		sub	x10, CTR, #MAX_STRIDE - 5	)

		eor	x6, x6, IV_PART
		eor	x7, x7, IV_PART
		eor	x8, x8, IV_PART
		eor	x9, x9, IV_PART
ST5(		eor	x10, x10, IV_PART	)

		mov	v0.d[0], x6
		mov	v1.d[0], x7
		mov	v2.d[0], x8
		mov	v3.d[0], x9
ST5(		mov	v4.d[0], x10	)
	.else
		bcs	0f
		.subsection	1

		/*
		 * This subsection handles carries.
		 *
		 * Conditional branching here is allowed with respect to time
		 * invariance since the branches are dependent on the IV instead
		 * of the plaintext or key.  This code is rarely executed in
		 * practice anyway.
		 */

		/* Apply carry to outgoing counter. */
0:		umov	x8, vctr.d[0]
		rev	x8, x8
		add	x8, x8, #1
		rev	x8, x8
		ins	vctr.d[0], x8

		/*
		 * Apply carry to counter blocks if needed.
		 *
		 * Since the carry flag was set, we know 0 <= IV_PART <
		 * MAX_STRIDE.  Using the value of IV_PART we can determine how
		 * many counter blocks need to be updated.
		 */
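		/*
		 * The computed branch below lands IV_PART entries before
		 * label 1: each bti/mov pair is 8 bytes, so only the last
		 * IV_PART movs execute and exactly the counter blocks that
		 * wrapped pick up the freshly incremented upper half from
		 * vctr.
		 */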
		cbz	IV_PART, 2f
		adr	x16, 1f
		sub	x16, x16, IV_PART, lsl #3
		br	x16
		bti	c
		mov	v0.d[0], vctr.d[0]
		bti	c
		mov	v1.d[0], vctr.d[0]
		bti	c
		mov	v2.d[0], vctr.d[0]
		bti	c
		mov	v3.d[0], vctr.d[0]
ST5(		bti	c	)
ST5(		mov	v4.d[0], vctr.d[0]	)
1:		b	2f
		.previous

2:		rev	x7, IV_PART
		ins	vctr.d[1], x7
		sub	x7, IV_PART, #MAX_STRIDE - 1
		sub	x8, IV_PART, #MAX_STRIDE - 2
		sub	x9, IV_PART, #MAX_STRIDE - 3
		rev	x7, x7
		rev	x8, x8
		mov	v1.d[1], x7
		rev	x9, x9
ST5(		sub	x10, IV_PART, #MAX_STRIDE - 4	)
		mov	v2.d[1], x8
ST5(		rev	x10, x10	)
		mov	v3.d[1], x9
ST5(		mov	v4.d[1], x10	)
	.endif

	/*
	 * If there are at least MAX_STRIDE blocks left, XOR the data with
	 * keystream and store.  Otherwise jump to tail handling.
	 */
	tbnz	BYTES_W, #31, .Lctrtail\xctr
	ld1	{v5.16b-v7.16b}, [IN], #48
ST4(	bl	aes_encrypt_block4x	)
ST5(	bl	aes_encrypt_block5x	)
	eor	v0.16b, v5.16b, v0.16b
ST4(	ld1	{v5.16b}, [IN], #16	)
	eor	v1.16b, v6.16b, v1.16b
ST5(	ld1	{v5.16b-v6.16b}, [IN], #32	)
	eor	v2.16b, v7.16b, v2.16b
	eor	v3.16b, v5.16b, v3.16b
ST5(	eor	v4.16b, v6.16b, v4.16b	)
	st1	{v0.16b-v3.16b}, [OUT], #64
ST5(	st1	{v4.16b}, [OUT], #16	)
	cbz	BYTES_W, .Lctrout\xctr
	b	.LctrloopNx\xctr

.Lctrout\xctr:
	.if !\xctr
		st1	{vctr.16b}, [IV]	/* return next CTR value */
	.endif
	ldp	x29, x30, [sp], #16
	ret

.Lctrtail\xctr:
	/*
	 * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext
	 *
	 * This code expects the last keystream block to be in v{MAX_STRIDE-1}.
	 * For example: if encrypting two blocks with MAX_STRIDE=5, then v3 and
	 * v4 should have the next two counter blocks.
	 *
	 * This allows us to store the ciphertext by writing to overlapping
	 * regions of memory.  Any invalid ciphertext blocks get overwritten by
	 * correctly computed blocks.  This approach greatly simplifies the
	 * logic for storing the ciphertext.
	 */
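	/*
	 * x13 below becomes the length of the final, possibly partial, block
	 * (16 when the tail is block aligned), while x14/x15/x16 each end up
	 * as 16 or 0 depending on whether the corresponding leading tail block
	 * exists, so the loads and stores only advance past blocks that are
	 * actually present.
	 */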
	mov	x16, #16
	ands	w7, BYTES_W, #0xf
	csel	x13, x7, x16, ne

ST5(	cmp	BYTES_W, #64 - (MAX_STRIDE << 4))
ST5(	csel	x14, x16, xzr, gt	)
	cmp	BYTES_W, #48 - (MAX_STRIDE << 4)
	csel	x15, x16, xzr, gt
	cmp	BYTES_W, #32 - (MAX_STRIDE << 4)
	csel	x16, x16, xzr, gt
	cmp	BYTES_W, #16 - (MAX_STRIDE << 4)

	adr_l	x9, .Lcts_permute_table
	add	x9, x9, x13
	ble	.Lctrtail1x\xctr

ST5(	ld1	{v5.16b}, [IN], x14	)
	ld1	{v6.16b}, [IN], x15
	ld1	{v7.16b}, [IN], x16

ST4(	bl	aes_encrypt_block4x	)
ST5(	bl	aes_encrypt_block5x	)

	ld1	{v8.16b}, [IN], x13
	ld1	{v9.16b}, [IN]
	ld1	{v10.16b}, [x9]

ST4(	eor	v6.16b, v6.16b, v0.16b	)
ST4(	eor	v7.16b, v7.16b, v1.16b	)
ST4(	tbl	v3.16b, {v3.16b}, v10.16b	)
ST4(	eor	v8.16b, v8.16b, v2.16b	)
ST4(	eor	v9.16b, v9.16b, v3.16b	)

ST5(	eor	v5.16b, v5.16b, v0.16b	)
ST5(	eor	v6.16b, v6.16b, v1.16b	)
ST5(	tbl	v4.16b, {v4.16b}, v10.16b	)
ST5(	eor	v7.16b, v7.16b, v2.16b	)
ST5(	eor	v8.16b, v8.16b, v3.16b	)
ST5(	eor	v9.16b, v9.16b, v4.16b	)

ST5(	st1	{v5.16b}, [OUT], x14	)
	st1	{v6.16b}, [OUT], x15
	st1	{v7.16b}, [OUT], x16
	add	x13, x13, OUT
	st1	{v9.16b}, [x13]			// overlapping stores
	st1	{v8.16b}, [OUT]
	b	.Lctrout\xctr

.Lctrtail1x\xctr:
	/*
	 * Handle <= 16 bytes of plaintext
	 *
	 * This code always reads and writes 16 bytes.  To avoid out of bounds
	 * accesses, XCTR and CTR modes must use a temporary buffer when
	 * encrypting/decrypting less than 16 bytes.
	 *
	 * This code is unusual in that it loads the input and stores the output
	 * relative to the end of the buffers rather than relative to the start.
	 * This causes unusual behaviour when encrypting/decrypting less than 16
	 * bytes; the end of the data is expected to be at the end of the
	 * temporary buffer rather than the start of the data being at the start
	 * of the temporary buffer.
	 */
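	/*
	 * x7 becomes the (non-positive) adjustment that moves IN and OUT back
	 * so the 16-byte window ends exactly at the end of the data; v11,
	 * taken from the permute table and sign-extended from its top bits,
	 * acts as a per-byte mask so that bif only replaces the bytes that
	 * belong to the message and leaves the rest of the window unchanged.
	 */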
	sub	x8, x7, #16
	csel	x7, x7, x8, eq
	add	IN, IN, x7
	add	OUT, OUT, x7
	ld1	{v5.16b}, [IN]
	ld1	{v6.16b}, [OUT]
ST5(	mov	v3.16b, v4.16b	)
	encrypt_block	v3, ROUNDS_W, KEY, x8, w7
	ld1	{v10.16b-v11.16b}, [x9]
	tbl	v3.16b, {v3.16b}, v10.16b
	sshr	v11.16b, v11.16b, #7
	eor	v5.16b, v5.16b, v3.16b
	bif	v5.16b, v6.16b, v11.16b
	st1	{v5.16b}, [OUT]
	b	.Lctrout\xctr

	// Arguments
	.unreq OUT
	.unreq IN
	.unreq KEY
	.unreq ROUNDS_W
	.unreq BYTES_W
	.unreq IV
	.unreq BYTE_CTR_W	// XCTR only
	// Intermediate values
	.unreq CTR_W		// XCTR only
	.unreq CTR		// XCTR only
	.unreq IV_PART
	.unreq BLOCKS
	.unreq BLOCKS_W
	.endm

	/*
	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int bytes, u8 ctr[])
	 *
	 * The input and output buffers must always be at least 16 bytes even if
	 * encrypting/decrypting less than 16 bytes.  Otherwise out of bounds
	 * accesses will occur.  The data to be encrypted/decrypted is expected
	 * to be at the end of this 16-byte temporary buffer rather than the
	 * start.
	 */
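	/*
	 * For example, to encrypt 5 trailing bytes a caller would place them
	 * in the last 5 bytes of a 16-byte buffer and pass a pointer to those
	 * 5 bytes; the tail code backs the pointer up and reads/writes the
	 * full 16-byte window, so the 11 preceding bytes must be valid memory.
	 */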
AES_FUNC_START(aes_ctr_encrypt)
	ctr_encrypt 0
AES_FUNC_END(aes_ctr_encrypt)

	/*
	 * aes_xctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		    int bytes, u8 const iv[], int byte_ctr)
	 *
	 * The input and output buffers must always be at least 16 bytes even if
	 * encrypting/decrypting less than 16 bytes.  Otherwise out of bounds
	 * accesses will occur.  The data to be encrypted/decrypted is expected
	 * to be at the end of this 16-byte temporary buffer rather than the
	 * start.
	 */
AES_FUNC_START(aes_xctr_encrypt)
	ctr_encrypt 1
AES_FUNC_END(aes_xctr_encrypt)

	/*
	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
	 */
	.macro	next_tweak, out, in, tmp
	sshr	\tmp\().2d,  \in\().2d,   #63
	and	\tmp\().16b, \tmp\().16b, xtsmask.16b
	add	\out\().2d,  \in\().2d,   \in\().2d
	ext	\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor	\out\().16b, \out\().16b, \tmp\().16b
	.endm

	.macro	xts_load_mask, tmp
	movi	xtsmask.2s, #0x1
	movi	\tmp\().2s, #0x87
	uzp1	xtsmask.4s, xtsmask.4s, \tmp\().4s
	.endm
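	/*
	 * next_tweak multiplies the 128-bit tweak by x in GF(2^128) using the
	 * XTS polynomial x^128 + x^7 + x^2 + x + 1: both 64-bit halves are
	 * doubled, the bit shifted out of the low half carries into bit 0 of
	 * the high half, and a bit shifted out of the high half wraps around
	 * as an XOR of 0x87 into the low half.  xts_load_mask builds the
	 * { 0x1, 0x87 } constant pair used for those two carries.
	 */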
AES_FUNC_START(aes_xts_encrypt)
	stp	x29, x30, [sp, #-16]!
	mov	x29, sp

	ld1	{v4.16b}, [x6]
	xts_load_mask	v8
	cbz	w7, .Lxtsencnotfirst

	enc_prepare	w3, x5, x8
	xts_cts_skip_tw	w7, .LxtsencNx
	encrypt_block	v4, w3, x5, x8, w7	/* first tweak */
	enc_switch_key	w3, x2, x8
	b	.LxtsencNx

.Lxtsencnotfirst:
	enc_prepare	w3, x2, x8
.LxtsencloopNx:
	next_tweak	v4, v4, v8
.LxtsencNx:
	subs	w4, w4, #64
	bmi	.Lxtsenc1x
	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	next_tweak	v5, v4, v8
	eor	v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor	v1.16b, v1.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor	v3.16b, v3.16b, v7.16b
	bl	aes_encrypt_block4x
	eor	v3.16b, v3.16b, v7.16b
	eor	v0.16b, v0.16b, v4.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	st1	{v0.16b-v3.16b}, [x0], #64
	mov	v4.16b, v7.16b
	cbz	w4, .Lxtsencret
	xts_reload_mask	v8
	b	.LxtsencloopNx
.Lxtsenc1x:
	adds	w4, w4, #64
	beq	.Lxtsencout
	subs	w4, w4, #16
	bmi	.LxtsencctsNx
.Lxtsencloop:
	ld1	{v0.16b}, [x1], #16
.Lxtsencctsout:
	eor	v0.16b, v0.16b, v4.16b
	encrypt_block	v0, w3, x2, x8, w7
	eor	v0.16b, v0.16b, v4.16b
	cbz	w4, .Lxtsencout
	subs	w4, w4, #16
	next_tweak	v4, v4, v8
	bmi	.Lxtsenccts
	st1	{v0.16b}, [x0], #16
	b	.Lxtsencloop
.Lxtsencout:
	st1	{v0.16b}, [x0]
.Lxtsencret:
	st1	{v4.16b}, [x6]
	ldp	x29, x30, [sp], #16
	ret

.LxtsencctsNx:
	mov	v0.16b, v3.16b
	sub	x0, x0, #16
.Lxtsenccts:
	adr_l	x8, .Lcts_permute_table

	add	x1, x1, w4, sxtw	/* rewind input pointer */
	add	w4, w4, #16		/* # bytes in final block */
	add	x9, x8, #32
	add	x8, x8, x4
	sub	x9, x9, x4
	add	x4, x0, x4		/* output address of final block */

	ld1	{v1.16b}, [x1]		/* load final block */
	ld1	{v2.16b}, [x8]
	ld1	{v3.16b}, [x9]

	tbl	v2.16b, {v0.16b}, v2.16b
	tbx	v0.16b, {v1.16b}, v3.16b
	st1	{v2.16b}, [x4]		/* overlapping stores */
	mov	w4, wzr
	b	.Lxtsencctsout
AES_FUNC_END(aes_xts_encrypt)

AES_FUNC_START(aes_xts_decrypt)
	stp	x29, x30, [sp, #-16]!
	mov	x29, sp

	/* subtract 16 bytes if we are doing CTS */
	sub	w8, w4, #0x10
	tst	w4, #0xf
	csel	w4, w4, w8, eq

	ld1	{v4.16b}, [x6]
	xts_load_mask	v8
	xts_cts_skip_tw	w7, .Lxtsdecskiptw
	cbz	w7, .Lxtsdecnotfirst

	enc_prepare	w3, x5, x8
	encrypt_block	v4, w3, x5, x8, w7	/* first tweak */
.Lxtsdecskiptw:
	dec_prepare	w3, x2, x8
	b	.LxtsdecNx

.Lxtsdecnotfirst:
	dec_prepare	w3, x2, x8
.LxtsdecloopNx:
	next_tweak	v4, v4, v8
.LxtsdecNx:
	subs	w4, w4, #64
	bmi	.Lxtsdec1x
	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	next_tweak	v5, v4, v8
	eor	v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor	v1.16b, v1.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor	v3.16b, v3.16b, v7.16b
	bl	aes_decrypt_block4x
	eor	v3.16b, v3.16b, v7.16b
	eor	v0.16b, v0.16b, v4.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	st1	{v0.16b-v3.16b}, [x0], #64
	mov	v4.16b, v7.16b
	cbz	w4, .Lxtsdecout
	xts_reload_mask	v8
	b	.LxtsdecloopNx
.Lxtsdec1x:
	adds	w4, w4, #64
	beq	.Lxtsdecout
	subs	w4, w4, #16
.Lxtsdecloop:
	ld1	{v0.16b}, [x1], #16
	bmi	.Lxtsdeccts
.Lxtsdecctsout:
	eor	v0.16b, v0.16b, v4.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor	v0.16b, v0.16b, v4.16b
	st1	{v0.16b}, [x0], #16
	cbz	w4, .Lxtsdecout
	subs	w4, w4, #16
	next_tweak	v4, v4, v8
	b	.Lxtsdecloop
.Lxtsdecout:
	st1	{v4.16b}, [x6]
	ldp	x29, x30, [sp], #16
	ret

.Lxtsdeccts:
	adr_l	x8, .Lcts_permute_table

	add	x1, x1, w4, sxtw	/* rewind input pointer */
	add	w4, w4, #16		/* # bytes in final block */
	add	x9, x8, #32
	add	x8, x8, x4
	sub	x9, x9, x4
	add	x4, x0, x4		/* output address of final block */

	next_tweak	v5, v4, v8

	ld1	{v1.16b}, [x1]		/* load final block */
	ld1	{v2.16b}, [x8]
	ld1	{v3.16b}, [x9]

	eor	v0.16b, v0.16b, v5.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor	v0.16b, v0.16b, v5.16b

	tbl	v2.16b, {v0.16b}, v2.16b
	tbx	v0.16b, {v1.16b}, v3.16b

	st1	{v2.16b}, [x4]		/* overlapping stores */
	mov	w4, wzr
	b	.Lxtsdecctsout
AES_FUNC_END(aes_xts_decrypt)

	/*
	 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
	 *		  int blocks, u8 dg[], int enc_before, int enc_after)
	 */
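	/*
	 * dg holds the running MAC state: each block is XORed in and the
	 * state encrypted, except that dg is pre-encrypted once on entry only
	 * when enc_before is nonzero and the encryption after the final block
	 * is skipped when enc_after is zero.  The return value is the number
	 * of blocks left unprocessed (nonzero only when the code yields
	 * early).
	 */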
AES_FUNC_START(aes_mac_update)
	ld1	{v0.16b}, [x4]			/* get dg */
	enc_prepare	w2, x1, x7
	cbz	w5, .Lmacloop4x

	encrypt_block	v0, w2, x1, x7, w8

.Lmacloop4x:
	subs	w3, w3, #4
	bmi	.Lmac1x
	ld1	{v1.16b-v4.16b}, [x0], #64	/* get next pt block */
	eor	v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
	encrypt_block	v0, w2, x1, x7, w8
	eor	v0.16b, v0.16b, v2.16b
	encrypt_block	v0, w2, x1, x7, w8
	eor	v0.16b, v0.16b, v3.16b
	encrypt_block	v0, w2, x1, x7, w8
	eor	v0.16b, v0.16b, v4.16b
	cmp	w3, wzr
	csinv	x5, x6, xzr, eq
	cbz	w5, .Lmacout
	encrypt_block	v0, w2, x1, x7, w8
	st1	{v0.16b}, [x4]			/* return dg */
	cond_yield	.Lmacout, x7, x8
	b	.Lmacloop4x
.Lmac1x:
	add	w3, w3, #4
.Lmacloop:
	cbz	w3, .Lmacout
	ld1	{v1.16b}, [x0], #16		/* get next pt block */
	eor	v0.16b, v0.16b, v1.16b		/* ..and xor with dg */

	subs	w3, w3, #1
	csinv	x5, x6, xzr, eq
	cbz	w5, .Lmacout
.Lmacenc:
	encrypt_block	v0, w2, x1, x7, w8
	b	.Lmacloop
.Lmacout:
	st1	{v0.16b}, [x4]			/* return dg */
	mov	w0, w3
	ret
AES_FUNC_END(aes_mac_update)