/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm, AES-NI/AVX2 optimized.
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2018 Markku-Juhani O. Saarinen <[email protected]>
 * Copyright (C) 2020 Jussi Kivilinna <[email protected]>
 * Copyright (c) 2021 Tianjia Zhang <[email protected]>
 */

/* Based on SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at:
 *  https://github.com/mjosaarinen/sm4ni
 */
#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/frame.h>

#define rRIP	(%rip)

/* vector registers */
#define RX0	%ymm0
#define RX1	%ymm1
#define MASK_4BIT	%ymm2
#define RTMP0	%ymm3
#define RTMP1	%ymm4
#define RTMP2	%ymm5
#define RTMP3	%ymm6
#define RTMP4	%ymm7

#define RA0	%ymm8
#define RA1	%ymm9
#define RA2	%ymm10
#define RA3	%ymm11
#define RB0	%ymm12
#define RB1	%ymm13
#define RB2	%ymm14
#define RB3	%ymm15

#define RNOT	%ymm0
#define RBSWAP	%ymm1

#define RX0x	%xmm0
#define RX1x	%xmm1
#define MASK_4BITx	%xmm2
#define RNOTx	%xmm0
#define RBSWAPx	%xmm1

#define RTMP0x	%xmm3
#define RTMP1x	%xmm4
#define RTMP2x	%xmm5
#define RTMP3x	%xmm6
#define RTMP4x	%xmm7
/* helper macros */

/* Transpose four 32-bit words between 128-bit vector lanes. */
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;
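/*
 * transpose_4x4 example for one 128-bit lane (both lanes of the ymm
 * registers are transposed independently, the same way):
 *   in:  x0 = {a0 a1 a2 a3}  x1 = {b0 b1 b2 b3}
 *        x2 = {c0 c1 c2 c3}  x3 = {d0 d1 d2 d3}
 *   out: x0 = {a0 b0 c0 d0}  x1 = {a1 b1 c1 d1}
 *        x2 = {a2 b2 c2 d2}  x3 = {a3 b3 c3 d3}
 */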
/* pre-SubByte transform. */
#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;
/* post-SubByte transform. Note: x has been XOR'ed with mask4bit by
 * the 'vaesenclast' instruction.
 */
#define transform_post(x, lo_t, hi_t, mask4bit, tmp0) \
	vpandn mask4bit, x, tmp0; \
	vpsrld $4, x, x; \
	vpand x, mask4bit, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;
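/*
 * transform_pre/transform_post both evaluate an 8-bit affine map A(x) ^ c
 * with two 16-entry vpshufb lookups: since A is linear over GF(2),
 *   A(hi:lo) ^ c = A(hi:0) ^ (A(0:lo) ^ c),
 * so one table is indexed by the low nibble and one by the high nibble
 * (the constant is folded into one of the tables) and the results are
 * XOR'ed together.
 */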
.section	.rodata.cst16, "aM", @progbits, 16
.align 16

/*
 * Following four affine transform look-up tables are from work by
 * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni
 *
 * These allow exposing SM4 S-Box from AES SubByte.
 */
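/*
 * Both the AES and SM4 S-boxes are built around inversion in GF(2^8)
 * (in different representations of the field), so the SM4 S-box can be
 * computed as
 *   SM4Sbox(x) = A_post(AESSubBytes(A_pre(x)))
 * where A_pre and A_post are the affine maps stored below, split into
 * low-/high-nibble halves for transform_pre/transform_post.
 */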
/* pre-SubByte affine transform, from SM4 field to AES field. */
.Lpre_tf_lo_s:
	.quad 0x9197E2E474720701, 0xC7C1B4B222245157
.Lpre_tf_hi_s:
	.quad 0xE240AB09EB49A200, 0xF052B91BF95BB012

/* post-SubByte affine transform, from AES field to SM4 field. */
.Lpost_tf_lo_s:
	.quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82
.Lpost_tf_hi_s:
	.quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_8:
	.byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e
	.byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06

/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_16:
	.byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01
	.byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09

/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_24:
	.byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04
	.byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For input word byte-swap */
.Lbswap32_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

.align 4
/* 4-bit mask */
.L0f0f0f0f:
	.long 0x0f0f0f0f

/* 12 bytes, only for padding */
.Lpadding_deadbeef:
	.long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef
.text
.align 16

.align 8
SYM_FUNC_START_LOCAL(__sm4_crypt_blk16)
	/* input:
	 *	%rdi: round key array, CTX
	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
	 *						plaintext blocks
	 * output:
	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
	 *						ciphertext blocks
	 */
	FRAME_BEGIN

	vbroadcasti128 .Lbswap32_mask rRIP, RTMP2;
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;
	vpshufb RTMP2, RB0, RB0;
	vpshufb RTMP2, RB1, RB1;
	vpshufb RTMP2, RB2, RB2;
	vpshufb RTMP2, RB3, RB3;

	vpbroadcastd .L0f0f0f0f rRIP, MASK_4BIT;
	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
	transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
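/*
 * One SM4 round for the eight blocks in the A registers and the eight
 * blocks in the B registers:
 *   s0 = s0 ^ L(tau(s1 ^ s2 ^ s3 ^ rk))
 * where the non-linear layer tau applies the SM4 S-box per byte (via
 * AESENCLAST plus the affine pre/post transforms) and the linear layer is
 *   L(x) = x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24).
 */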
#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3) \
	vpbroadcastd (4*(round))(%rdi), RX0; \
	vbroadcasti128 .Lpre_tf_lo_s rRIP, RTMP4; \
	vbroadcasti128 .Lpre_tf_hi_s rRIP, RTMP1; \
	vmovdqa RX0, RX1; \
	vpxor s1, RX0, RX0; \
	vpxor s2, RX0, RX0; \
	vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \
	vbroadcasti128 .Lpost_tf_lo_s rRIP, RTMP2; \
	vbroadcasti128 .Lpost_tf_hi_s rRIP, RTMP3; \
	vpxor r1, RX1, RX1; \
	vpxor r2, RX1, RX1; \
	vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */ \
	\
	/* sbox, non-linear part */ \
	transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0); \
	transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0); \
	vextracti128 $1, RX0, RTMP4x; \
	vextracti128 $1, RX1, RTMP0x; \
	vaesenclast MASK_4BITx, RX0x, RX0x; \
	vaesenclast MASK_4BITx, RTMP4x, RTMP4x; \
	vaesenclast MASK_4BITx, RX1x, RX1x; \
	vaesenclast MASK_4BITx, RTMP0x, RTMP0x; \
	vinserti128 $1, RTMP4x, RX0, RX0; \
	vbroadcasti128 .Linv_shift_row rRIP, RTMP4; \
	vinserti128 $1, RTMP0x, RX1, RX1; \
	transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0); \
	transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0); \
	\
	/* linear part */ \
	vpshufb RTMP4, RX0, RTMP0; \
	vpxor RTMP0, s0, s0; /* s0 ^ x */ \
	vpshufb RTMP4, RX1, RTMP2; \
	vbroadcasti128 .Linv_shift_row_rol_8 rRIP, RTMP4; \
	vpxor RTMP2, r0, r0; /* r0 ^ x */ \
	vpshufb RTMP4, RX0, RTMP1; \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \
	vpshufb RTMP4, RX1, RTMP3; \
	vbroadcasti128 .Linv_shift_row_rol_16 rRIP, RTMP4; \
	vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */ \
	vpshufb RTMP4, RX0, RTMP1; \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
	vpshufb RTMP4, RX1, RTMP3; \
	vbroadcasti128 .Linv_shift_row_rol_24 rRIP, RTMP4; \
	vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */ \
	vpshufb RTMP4, RX0, RTMP1; \
	vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \
	vpslld $2, RTMP0, RTMP1; \
	vpsrld $30, RTMP0, RTMP0; \
	vpxor RTMP0, s0, s0; \
	/* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
	vpxor RTMP1, s0, s0; \
	vpshufb RTMP4, RX1, RTMP3; \
	vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */ \
	vpslld $2, RTMP2, RTMP3; \
	vpsrld $30, RTMP2, RTMP2; \
	vpxor RTMP2, r0, r0; \
	/* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
	vpxor RTMP3, r0, r0;
	leaq (32*4)(%rdi), %rax;
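	/*
	 * 32 rounds: %rax marks the end of the 32-entry round-key array;
	 * each pass through .Lroundloop_blk8 does four rounds and advances
	 * %rdi by four 32-bit round keys, so the loop runs eight times.
	 */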
.align 16
.Lroundloop_blk8:
	ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3);
	ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0);
	ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1);
	ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2);
	leaq (4*4)(%rdi), %rdi;
	cmpq %rax, %rdi;
	jne .Lroundloop_blk8;

#undef ROUND

	vbroadcasti128 .Lbswap128_mask rRIP, RTMP2;

	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
	transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;
	vpshufb RTMP2, RB0, RB0;
	vpshufb RTMP2, RB1, RB1;
	vpshufb RTMP2, RB2, RB2;
	vpshufb RTMP2, RB3, RB3;

	FRAME_END
	RET;
SYM_FUNC_END(__sm4_crypt_blk16)
#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;
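/*
 * inc_le128 increments the 128-bit little-endian value in x by one:
 * 'minus_one' holds -1 in its low qword and 0 in its high qword, so the
 * first vpsubq adds 1 to the low qword only; the vpcmpeqq/vpslldq pair
 * detects a low-qword wrap (old value 0xffffffffffffffff) and the final
 * vpsubq then carries +1 into the high qword.
 */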
/*
 * void sm4_aesni_avx2_ctr_enc_blk16(const u32 *rk, u8 *dst,
 *                                   const u8 *src, u8 *iv)
 */
.align 8
SYM_TYPED_FUNC_START(sm4_aesni_avx2_ctr_enc_blk16)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	movq 8(%rcx), %rax;
	bswapq %rax;

	vzeroupper;

	vbroadcasti128 .Lbswap128_mask rRIP, RTMP3;
	vpcmpeqd RNOT, RNOT, RNOT;
	vpsrldq $8, RNOT, RNOT;   /* ab: -1:0 ; cd: -1:0 */
	vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */

	/* load IV and byteswap */
	vmovdqu (%rcx), RTMP4x;
	vpshufb RTMP3x, RTMP4x, RTMP4x;
	vmovdqa RTMP4x, RTMP0x;
	inc_le128(RTMP4x, RNOTx, RTMP1x);
	vinserti128 $1, RTMP4x, RTMP0, RTMP0;
	vpshufb RTMP3, RTMP0, RA0; /* +1 ; +0 */

	/* check need for handling 64-bit overflow and carry */
	cmpq $(0xffffffffffffffff - 16), %rax;
	ja .Lhandle_ctr_carry;
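	/*
	 * Fast path: %rax holds the byte-swapped low 64 bits of the IV, so
	 * the next 16 increments cannot carry into the high 64 bits.  Each
	 * ymm register holds two little-endian counter blocks (one per
	 * 128-bit lane) and RTMP2 is {-2:0} in each lane, so a single vpsubq
	 * advances both blocks by two; vpshufb converts back to big endian.
	 */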
	/* construct IVs */
	vpsubq RTMP2, RTMP0, RTMP0; /* +3 ; +2 */
	vpshufb RTMP3, RTMP0, RA1;
	vpsubq RTMP2, RTMP0, RTMP0; /* +5 ; +4 */
	vpshufb RTMP3, RTMP0, RA2;
	vpsubq RTMP2, RTMP0, RTMP0; /* +7 ; +6 */
	vpshufb RTMP3, RTMP0, RA3;
	vpsubq RTMP2, RTMP0, RTMP0; /* +9 ; +8 */
	vpshufb RTMP3, RTMP0, RB0;
	vpsubq RTMP2, RTMP0, RTMP0; /* +11 ; +10 */
	vpshufb RTMP3, RTMP0, RB1;
	vpsubq RTMP2, RTMP0, RTMP0; /* +13 ; +12 */
	vpshufb RTMP3, RTMP0, RB2;
	vpsubq RTMP2, RTMP0, RTMP0; /* +15 ; +14 */
	vpshufb RTMP3, RTMP0, RB3;
	vpsubq RTMP2, RTMP0, RTMP0; /* +16 */
	vpshufb RTMP3x, RTMP0x, RTMP0x;
	jmp .Lctr_carry_done;

.Lhandle_ctr_carry:
	/* construct IVs */
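	/*
	 * Slow path: the low 64 bits may wrap within the next 16 counters,
	 * so advance with inc_le128 for full 128-bit carry propagation;
	 * each inc_le128 bumps both 128-bit lanes by one, so two invocations
	 * move the two blocks in a ymm register forward by two.
	 */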
	inc_le128(RTMP0, RNOT, RTMP1);
	inc_le128(RTMP0, RNOT, RTMP1);
	vpshufb RTMP3, RTMP0, RA1; /* +3 ; +2 */
	inc_le128(RTMP0, RNOT, RTMP1);
	inc_le128(RTMP0, RNOT, RTMP1);
	vpshufb RTMP3, RTMP0, RA2; /* +5 ; +4 */
	inc_le128(RTMP0, RNOT, RTMP1);
	inc_le128(RTMP0, RNOT, RTMP1);
	vpshufb RTMP3, RTMP0, RA3; /* +7 ; +6 */
	inc_le128(RTMP0, RNOT, RTMP1);
	inc_le128(RTMP0, RNOT, RTMP1);
	vpshufb RTMP3, RTMP0, RB0; /* +9 ; +8 */
	inc_le128(RTMP0, RNOT, RTMP1);
	inc_le128(RTMP0, RNOT, RTMP1);
	vpshufb RTMP3, RTMP0, RB1; /* +11 ; +10 */
	inc_le128(RTMP0, RNOT, RTMP1);
	inc_le128(RTMP0, RNOT, RTMP1);
	vpshufb RTMP3, RTMP0, RB2; /* +13 ; +12 */
	inc_le128(RTMP0, RNOT, RTMP1);
	inc_le128(RTMP0, RNOT, RTMP1);
	vpshufb RTMP3, RTMP0, RB3; /* +15 ; +14 */
	inc_le128(RTMP0, RNOT, RTMP1);
	vextracti128 $1, RTMP0, RTMP0x;
	vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */

.align 4
.Lctr_carry_done:
	/* store new IV */
	vmovdqu RTMP0x, (%rcx);

	call __sm4_crypt_blk16;

	vpxor (0 * 32)(%rdx), RA0, RA0;
	vpxor (1 * 32)(%rdx), RA1, RA1;
	vpxor (2 * 32)(%rdx), RA2, RA2;
	vpxor (3 * 32)(%rdx), RA3, RA3;
	vpxor (4 * 32)(%rdx), RB0, RB0;
	vpxor (5 * 32)(%rdx), RB1, RB1;
	vpxor (6 * 32)(%rdx), RB2, RB2;
	vpxor (7 * 32)(%rdx), RB3, RB3;

	vmovdqu RA0, (0 * 32)(%rsi);
	vmovdqu RA1, (1 * 32)(%rsi);
	vmovdqu RA2, (2 * 32)(%rsi);
	vmovdqu RA3, (3 * 32)(%rsi);
	vmovdqu RB0, (4 * 32)(%rsi);
	vmovdqu RB1, (5 * 32)(%rsi);
	vmovdqu RB2, (6 * 32)(%rsi);
	vmovdqu RB3, (7 * 32)(%rsi);

	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx2_ctr_enc_blk16)
/*
 * void sm4_aesni_avx2_cbc_dec_blk16(const u32 *rk, u8 *dst,
 *                                   const u8 *src, u8 *iv)
 */
.align 8
SYM_TYPED_FUNC_START(sm4_aesni_avx2_cbc_dec_blk16)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv
	 */
	FRAME_BEGIN

	vzeroupper;

	vmovdqu (0 * 32)(%rdx), RA0;
	vmovdqu (1 * 32)(%rdx), RA1;
	vmovdqu (2 * 32)(%rdx), RA2;
	vmovdqu (3 * 32)(%rdx), RA3;
	vmovdqu (4 * 32)(%rdx), RB0;
	vmovdqu (5 * 32)(%rdx), RB1;
	vmovdqu (6 * 32)(%rdx), RB2;
	vmovdqu (7 * 32)(%rdx), RB3;

	call __sm4_crypt_blk16;
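	/*
	 * CBC decryption XORs each decrypted block with the previous
	 * ciphertext block (the IV for block 0): {IV, C[0]} is built in
	 * RNOT for the first register pair, the remaining XOR operands are
	 * the ciphertext stream shifted by one block (+16 byte offset), and
	 * C[15] is stored back as the next IV.
	 */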
	vmovdqu (%rcx), RNOTx;
	vinserti128 $1, (%rdx), RNOT, RNOT;
	vpxor RNOT, RA0, RA0;
	vpxor (0 * 32 + 16)(%rdx), RA1, RA1;
	vpxor (1 * 32 + 16)(%rdx), RA2, RA2;
	vpxor (2 * 32 + 16)(%rdx), RA3, RA3;
	vpxor (3 * 32 + 16)(%rdx), RB0, RB0;
	vpxor (4 * 32 + 16)(%rdx), RB1, RB1;
	vpxor (5 * 32 + 16)(%rdx), RB2, RB2;
	vpxor (6 * 32 + 16)(%rdx), RB3, RB3;
	vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
	vmovdqu RNOTx, (%rcx); /* store new IV */

	vmovdqu RA0, (0 * 32)(%rsi);
	vmovdqu RA1, (1 * 32)(%rsi);
	vmovdqu RA2, (2 * 32)(%rsi);
	vmovdqu RA3, (3 * 32)(%rsi);
	vmovdqu RB0, (4 * 32)(%rsi);
	vmovdqu RB1, (5 * 32)(%rsi);
	vmovdqu RB2, (6 * 32)(%rsi);
	vmovdqu RB3, (7 * 32)(%rsi);

	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx2_cbc_dec_blk16)
/*
 * void sm4_aesni_avx2_cfb_dec_blk16(const u32 *rk, u8 *dst,
 *                                   const u8 *src, u8 *iv)
 */
.align 8
SYM_TYPED_FUNC_START(sm4_aesni_avx2_cfb_dec_blk16)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv
	 */
	FRAME_BEGIN

	vzeroupper;
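	/*
	 * CFB decryption runs the block cipher over the previous ciphertext
	 * block (the IV for block 0) and XORs with the current one, so the
	 * cipher input is {IV, C[0] .. C[14]} (the ciphertext shifted by one
	 * block), the XOR with C[0..15] happens after __sm4_crypt_blk16, and
	 * C[15] is saved as the next IV.
	 */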
	/* Load input */
	vmovdqu (%rcx), RNOTx;
	vinserti128 $1, (%rdx), RNOT, RA0;
	vmovdqu (0 * 32 + 16)(%rdx), RA1;
	vmovdqu (1 * 32 + 16)(%rdx), RA2;
	vmovdqu (2 * 32 + 16)(%rdx), RA3;
	vmovdqu (3 * 32 + 16)(%rdx), RB0;
	vmovdqu (4 * 32 + 16)(%rdx), RB1;
	vmovdqu (5 * 32 + 16)(%rdx), RB2;
	vmovdqu (6 * 32 + 16)(%rdx), RB3;

	/* Update IV */
	vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
	vmovdqu RNOTx, (%rcx);

	call __sm4_crypt_blk16;

	vpxor (0 * 32)(%rdx), RA0, RA0;
	vpxor (1 * 32)(%rdx), RA1, RA1;
	vpxor (2 * 32)(%rdx), RA2, RA2;
	vpxor (3 * 32)(%rdx), RA3, RA3;
	vpxor (4 * 32)(%rdx), RB0, RB0;
	vpxor (5 * 32)(%rdx), RB1, RB1;
	vpxor (6 * 32)(%rdx), RB2, RB2;
	vpxor (7 * 32)(%rdx), RB3, RB3;

	vmovdqu RA0, (0 * 32)(%rsi);
	vmovdqu RA1, (1 * 32)(%rsi);
	vmovdqu RA2, (2 * 32)(%rsi);
	vmovdqu RA3, (3 * 32)(%rsi);
	vmovdqu RB0, (4 * 32)(%rsi);
	vmovdqu RB1, (5 * 32)(%rsi);
	vmovdqu RB2, (6 * 32)(%rsi);
	vmovdqu RB3, (7 * 32)(%rsi);

	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx2_cfb_dec_blk16)