/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

#define AES_FUNC_START(func)	SYM_FUNC_START(neon_ ## func)
#define AES_FUNC_END(func)	SYM_FUNC_END(neon_ ## func)
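
/*
 * aes-modes.S (included at the end of this file) defines the actual mode
 * routines using AES_FUNC_START/AES_FUNC_END, so the functions emitted from
 * this file all carry a neon_ prefix (e.g. neon_aes_ecb_encrypt),
 * distinguishing them from the Crypto Extensions based implementation.
 */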

	xtsmask		.req	v7
	cbciv		.req	v7
	vctr		.req	v4
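
	/*
	 * The aliases above are the register names the shared code in
	 * aes-modes.S expects: xtsmask for the XTS tweak mask, cbciv for the
	 * CBC IV / chaining value and vctr for the CTR counter block.
	 */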

	.macro		xts_reload_mask, tmp
	xts_load_mask	\tmp
	.endm

	/* special case for the neon-bs driver calling into this one for CTS */
	.macro		xts_cts_skip_tw, reg, lbl
	tbnz		\reg, #1, \lbl
	.endm

	/* multiply by polynomial 'x' in GF(2^8) */
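	/*
	 * Multiplying by x means shifting each byte left by one bit and, if a
	 * carry came out of bit 7, reducing modulo the AES polynomial
	 * x^8+x^4+x^3+x+1. sshr #7 turns the top bit of each byte into an
	 * all-zeroes/all-ones mask, which selects the reduction constant 0x1b
	 * held in \const.
	 */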
	.macro		mul_by_x, out, in, temp, const
	sshr		\temp, \in, #7
	shl		\out, \in, #1
	and		\temp, \temp, \const
	eor		\out, \out, \temp
	.endm

	/* multiply by polynomial 'x^2' in GF(2^8) */
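	/*
	 * Same idea for x^2: shl #2 discards the two top bits of each byte,
	 * ushr #6 recovers them (a value 0..3), and a carry-less multiply
	 * (pmul) by 0x1b expands that into the matching reduction term
	 * 0x00, 0x1b, 0x36 or 0x2d.
	 */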
	.macro		mul_by_x2, out, in, temp, const
	ushr		\temp, \in, #6
	shl		\out, \in, #2
	pmul		\temp, \temp, \const
	eor		\out, \out, \temp
	.endm

	/* preload the entire Sbox */
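	/*
	 * The 256-byte (inverse) Sbox is loaded into v16-v31 so SubBytes can
	 * be performed with tbl/tbx lookups. v13 gets the ShiftRows
	 * permutation, v14 a rotate-each-word-by-8 permutation used by
	 * MixColumns, and v12 the GF(2^8) reduction constant 0x1b.
	 */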
	.macro		prepare, sbox, shiftrows, temp
	movi		v12.16b, #0x1b
	ldr_l		q13, \shiftrows, \temp
	ldr_l		q14, .Lror32by8, \temp
	adr_l		\temp, \sbox
	ld1		{v16.16b-v19.16b}, [\temp], #64
	ld1		{v20.16b-v23.16b}, [\temp], #64
	ld1		{v24.16b-v27.16b}, [\temp], #64
	ld1		{v28.16b-v31.16b}, [\temp]
	.endm

	/* do preload for encryption */
	.macro		enc_prepare, ignore0, ignore1, temp
	prepare		crypto_aes_sbox, .LForward_ShiftRows, \temp
	.endm

	.macro		enc_switch_key, ignore0, ignore1, temp
	/* do nothing */
	.endm

	/* do preload for decryption */
	.macro		dec_prepare, ignore0, ignore1, temp
	prepare		crypto_aes_inv_sbox, .LReverse_ShiftRows, \temp
	.endm

	/* apply SubBytes transformation using the preloaded Sbox */
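	/*
	 * tbl/tbx can index at most 64 table bytes at a time, so the lookup is
	 * split into four steps: tbl covers Sbox bytes 0-63, and each
	 * following tbx only updates lanes whose re-biased index is in range,
	 * leaving the other lanes untouched. Subtracting 0x40 (kept in v15 at
	 * this point) re-biases the index for the next 64-byte quarter of the
	 * table.
	 */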
	.macro		sub_bytes, in
	sub		v9.16b, \in\().16b, v15.16b
	tbl		\in\().16b, {v16.16b-v19.16b}, \in\().16b
	sub		v10.16b, v9.16b, v15.16b
	tbx		\in\().16b, {v20.16b-v23.16b}, v9.16b
	sub		v11.16b, v10.16b, v15.16b
	tbx		\in\().16b, {v24.16b-v27.16b}, v10.16b
	tbx		\in\().16b, {v28.16b-v31.16b}, v11.16b
	.endm

	/* apply MixColumns transformation */
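	/*
	 * Forward MixColumns of a column (a0, a1, a2, a3) is computed as
	 * out_i = 2.a_i ^ 3.a_(i+1) ^ a_(i+2) ^ a_(i+3), using one mul_by_x,
	 * a rotate by 16 bits (rev32 on .8h) and a rotate by 8 bits (tbl with
	 * .Lror32by8). For decryption, the inverse matrix (0e 0b 0d 09) equals
	 * the forward one multiplied by the circulant (05 00 04 00), so each
	 * column is first pre-multiplied by that factor and then run through
	 * the forward transform.
	 */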
	.macro		mix_columns, in, enc
	.if		\enc == 0
	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
	mul_by_x2	v8.16b, \in\().16b, v9.16b, v12.16b
	eor		\in\().16b, \in\().16b, v8.16b
	rev32		v8.8h, v8.8h
	eor		\in\().16b, \in\().16b, v8.16b
	.endif

	mul_by_x	v9.16b, \in\().16b, v8.16b, v12.16b
	rev32		v8.8h, \in\().8h
	eor		v8.16b, v8.16b, v9.16b
	eor		\in\().16b, \in\().16b, v8.16b
	tbl		\in\().16b, {\in\().16b}, v14.16b
	eor		\in\().16b, \in\().16b, v8.16b
	.endm
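
	/*
	 * One complete block (en|de)cryption: AddRoundKey, ShiftRows (tbl with
	 * v13), SubBytes and, on every round but the last, MixColumns,
	 * repeated \rounds times and followed by a final AddRoundKey. v15 is
	 * reused: it holds the current round key, then the 0x40 constant that
	 * sub_bytes needs, then the next round key. ShiftRows is applied
	 * before SubBytes; both operate on individual bytes, so their order
	 * can be swapped.
	 */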
	.macro		do_block, enc, in, rounds, rk, rkp, i
	ld1		{v15.4s}, [\rk]
	add		\rkp, \rk, #16
	mov		\i, \rounds
1111:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
	movi		v15.16b, #0x40
	tbl		\in\().16b, {\in\().16b}, v13.16b	/* ShiftRows */
	sub_bytes	\in
	subs		\i, \i, #1
	ld1		{v15.4s}, [\rkp], #16
	beq		2222f
	mix_columns	\in, \enc
	b		1111b
2222:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
	.endm

	.macro		encrypt_block, in, rounds, rk, rkp, i
	do_block	1, \in, \rounds, \rk, \rkp, \i
	.endm

	.macro		decrypt_block, in, rounds, rk, rkp, i
	do_block	0, \in, \rounds, \rk, \rkp, \i
	.endm

	/*
	 * Interleaved versions: functionally equivalent to the
	 * ones above, but applied to AES states in parallel.
	 */
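	/*
	 * Four independent AES states are processed at once, presumably so
	 * that the long tbl/tbx and GF(2^8) dependency chains of one block can
	 * overlap with work on the other three.
	 */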

	.macro		sub_bytes_4x, in0, in1, in2, in3
	sub		v8.16b, \in0\().16b, v15.16b
	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
	sub		v9.16b, \in1\().16b, v15.16b
	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
	sub		v10.16b, \in2\().16b, v15.16b
	tbl		\in2\().16b, {v16.16b-v19.16b}, \in2\().16b
	sub		v11.16b, \in3\().16b, v15.16b
	tbl		\in3\().16b, {v16.16b-v19.16b}, \in3\().16b
	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
	sub		v8.16b, v8.16b, v15.16b
	tbx		\in2\().16b, {v20.16b-v23.16b}, v10.16b
	sub		v9.16b, v9.16b, v15.16b
	tbx		\in3\().16b, {v20.16b-v23.16b}, v11.16b
	sub		v10.16b, v10.16b, v15.16b
	tbx		\in0\().16b, {v24.16b-v27.16b}, v8.16b
	sub		v11.16b, v11.16b, v15.16b
	tbx		\in1\().16b, {v24.16b-v27.16b}, v9.16b
	sub		v8.16b, v8.16b, v15.16b
	tbx		\in2\().16b, {v24.16b-v27.16b}, v10.16b
	sub		v9.16b, v9.16b, v15.16b
	tbx		\in3\().16b, {v24.16b-v27.16b}, v11.16b
	sub		v10.16b, v10.16b, v15.16b
	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
	sub		v11.16b, v11.16b, v15.16b
	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
	tbx		\in2\().16b, {v28.16b-v31.16b}, v10.16b
	tbx		\in3\().16b, {v28.16b-v31.16b}, v11.16b
	.endm

	.macro		mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
	sshr		\tmp0\().16b, \in0\().16b, #7
	shl		\out0\().16b, \in0\().16b, #1
	sshr		\tmp1\().16b, \in1\().16b, #7
	and		\tmp0\().16b, \tmp0\().16b, \const\().16b
	shl		\out1\().16b, \in1\().16b, #1
	and		\tmp1\().16b, \tmp1\().16b, \const\().16b
	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
	.endm

	.macro		mul_by_x2_2x, out0, out1, in0, in1, tmp0, tmp1, const
	ushr		\tmp0\().16b, \in0\().16b, #6
	shl		\out0\().16b, \in0\().16b, #2
	ushr		\tmp1\().16b, \in1\().16b, #6
	pmul		\tmp0\().16b, \tmp0\().16b, \const\().16b
	shl		\out1\().16b, \in1\().16b, #2
	pmul		\tmp1\().16b, \tmp1\().16b, \const\().16b
	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
	.endm

	.macro		mix_columns_2x, in0, in1, enc
	.if		\enc == 0
	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
	mul_by_x2_2x	v8, v9, \in0, \in1, v10, v11, v12
	eor		\in0\().16b, \in0\().16b, v8.16b
	rev32		v8.8h, v8.8h
	eor		\in1\().16b, \in1\().16b, v9.16b
	rev32		v9.8h, v9.8h
	eor		\in0\().16b, \in0\().16b, v8.16b
	eor		\in1\().16b, \in1\().16b, v9.16b
	.endif

	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v12
	rev32		v10.8h, \in0\().8h
	rev32		v11.8h, \in1\().8h
	eor		v10.16b, v10.16b, v8.16b
	eor		v11.16b, v11.16b, v9.16b
	eor		\in0\().16b, \in0\().16b, v10.16b
	eor		\in1\().16b, \in1\().16b, v11.16b
	tbl		\in0\().16b, {\in0\().16b}, v14.16b
	tbl		\in1\().16b, {\in1\().16b}, v14.16b
	eor		\in0\().16b, \in0\().16b, v10.16b
	eor		\in1\().16b, \in1\().16b, v11.16b
	.endm
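
	/*
	 * Four-way counterpart of do_block above: same round structure, but
	 * with each step interleaved across the four states and MixColumns
	 * applied two states at a time via mix_columns_2x.
	 */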
	.macro		do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
	ld1		{v15.4s}, [\rk]
	add		\rkp, \rk, #16
	mov		\i, \rounds
1111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
	movi		v15.16b, #0x40
	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
	tbl		\in2\().16b, {\in2\().16b}, v13.16b	/* ShiftRows */
	tbl		\in3\().16b, {\in3\().16b}, v13.16b	/* ShiftRows */
	sub_bytes_4x	\in0, \in1, \in2, \in3
	subs		\i, \i, #1
	ld1		{v15.4s}, [\rkp], #16
	beq		2222f
	mix_columns_2x	\in0, \in1, \enc
	mix_columns_2x	\in2, \in3, \enc
	b		1111b
2222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
	.endm

	.macro		encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
	do_block_4x	1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
	.endm

	.macro		decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
	do_block_4x	0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
	.endm

#include "aes-modes.S"
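
/*
 * The mode-level code (ECB, CBC, CTR, XTS, ...) lives in aes-modes.S and is
 * built on top of the block-level macros defined above.
 */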

	.section	".rodata", "a"
	.align		4
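	/*
	 * tbl index vectors for the forward and inverse ShiftRows byte
	 * permutations, and for rotating each 32-bit word right by 8 bits
	 * (used by mix_columns).
	 */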
.LForward_ShiftRows:
	.octa		0x0b06010c07020d08030e09040f0a0500

.LReverse_ShiftRows:
	.octa		0x0306090c0f0205080b0e0104070a0d00

.Lror32by8:
	.octa		0x0c0f0e0d080b0a090407060500030201