twofish-avx-x86_64-asm_64.S 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376
  1. /* SPDX-License-Identifier: GPL-2.0-or-later */
  2. /*
  3. * Twofish Cipher 8-way parallel algorithm (AVX/x86_64)
  4. *
  5. * Copyright (C) 2012 Johannes Goetzfried
  6. * <[email protected]>
  7. *
  8. * Copyright © 2012-2013 Jussi Kivilinna <[email protected]>
  9. */
  10. #include <linux/linkage.h>
  11. #include <asm/frame.h>
  12. #include "glue_helper-asm-avx.S"
  .file   "twofish-avx-x86_64-asm_64.S"

  /*
   * 16-byte constant holding the byte values 15 down to 0, placed in its own
   * mergeable read-only section (flags "aM", entity size 16) and aligned to
   * 16 bytes.
   * NOTE(review): presumably a byte-reversal shuffle mask; no reference to it
   * is visible in this part of the file -- confirm before removing.
   */
  .section        .rodata.cst16.bswap128_mask, "aM", @progbits, 16
  .align  16
  .Lbswap128_mask:
          .byte   15, 14, 13, 12, 11, 10,  9,  8
          .byte    7,  6,  5,  4,  3,  2,  1,  0

  /* Everything below is code. */
  .text
  /*
   * Layout of the crypto context: byte offsets relative to CTX.
   *
   * s0..s3  bases of the four lookup tables used by lookup_32bit(); entries
   *         are addressed as base + 4 * byte_index and the bases are 1024
   *         bytes apart.
   * w       read as two 16-byte vectors, at w and at w + 4*4.
   * k       read as 32-bit words at k + 4 * index (see encrypt_round /
   *         decrypt_round).
   *
   * Note that these are plain preprocessor names: any bare s0..s3, w or k
   * token in the rest of the file is replaced by the number.
   */
  #define s0      0
  #define s1      1024
  #define s2      2048
  #define s3      3072
  #define w       4096
  #define k       4128
  /**********************************************************************
    8-way AVX twofish
   **********************************************************************/

  /* Base pointer of the crypto context; all table/key accesses use it. */
  #define CTX     %rdi

  /*
   * Block state. The suffix 1 / 2 selects the first / second set of
   * registers; the round macros build these names by pasting 1 or 2 onto
   * RA, RB, RC, RD.
   */
  #define RA1     %xmm0
  #define RB1     %xmm1
  #define RC1     %xmm2
  #define RD1     %xmm3

  #define RA2     %xmm4
  #define RB2     %xmm5
  #define RC2     %xmm6
  #define RD2     %xmm7

  /* Vector results of round_head_2 for set 1 (RX0/RY0) and set 2 (RX1/RY1). */
  #define RX0     %xmm8
  #define RY0     %xmm9

  #define RX1     %xmm10
  #define RY1     %xmm11

  /*
   * RK1/RK2 hold the broadcast round subkeys; RK1 also carries the whitening
   * key and RK2 doubles as a transpose temporary in the block functions.
   */
  #define RK1     %xmm12
  #define RK2     %xmm13

  /* Scratch: RT is used by the round tails, RR by rotate_1l. */
  #define RT      %xmm14
  #define RR      %xmm15

  /* Table indices inside lookup_32bit (64-bit name and 32-bit view). */
  #define RID1    %r13
  #define RID1d   %r13d
  #define RID2    %rsi
  #define RID2d   %esi

  /*
   * Integer inputs of G. The bl / bh names are the lowest and second-lowest
   * byte of the same register; lookup_32bit forms them by token pasting.
   */
  #define RGI1    %rdx
  #define RGI1bl  %dl
  #define RGI1bh  %dh
  #define RGI2    %rcx
  #define RGI2bl  %cl
  #define RGI2bh  %ch

  #define RGI3    %rax
  #define RGI3bl  %al
  #define RGI3bh  %ah
  #define RGI4    %rbx
  #define RGI4bl  %bl
  #define RGI4bh  %bh

  /* Integer results / partial results of G (64-bit name and 32-bit view). */
  #define RGS1    %r8
  #define RGS1d   %r8d
  #define RGS2    %r9
  #define RGS2d   %r9d
  #define RGS3    %r10
  #define RGS3d   %r10d
  /*
   * lookup_32bit - XOR together four table entries selected by the four low
   * bytes of a 64-bit register.
   *
   *   tab0..tab3  byte offsets of the four tables inside the context
   *   gi          register alias prefix: gi is the 64-bit register and
   *               gi##bl / gi##bh are its two lowest byte registers
   *   res         register alias prefix: res##d is the 32-bit destination
   *   il_op       macro expanded as il_op(il_reg) in the middle of the
   *               sequence (dummy or shr_next in this file)
   *   il_reg      argument handed to il_op
   *
   * Result:  res##d = tab0[b0] ^ tab1[b1] ^ tab2[b2] ^ tab3[b3], where b0 is
   *          the lowest byte of gi and each table entry is 4 bytes wide.
   *          The 32-bit write clears the upper half of the 64-bit register.
   * Effects: gi is shifted right by 16 (plus whatever il_op does); RID1,
   *          RID2 and the flags are overwritten.
   */
  #define lookup_32bit(tab0, tab1, tab2, tab3, gi, res, il_op, il_reg) \
          movzbl          gi ## bl, RID1d;                /* b0 */ \
          movzbl          gi ## bh, RID2d;                /* b1 */ \
          shrq $16,       gi; \
          movl            tab0(CTX, RID1, 4), res ## d; \
          movl            tab1(CTX, RID2, 4), RID2d; \
          movzbl          gi ## bl, RID1d;                /* b2 */ \
          xorl            RID2d, res ## d; \
          movzbl          gi ## bh, RID2d;                /* b3 */ \
          il_op(il_reg); \
          xorl            tab2(CTX, RID1, 4), res ## d; \
          xorl            tab3(CTX, RID2, 4), res ## d;
  /* dummy - placeholder for an optional macro argument; expands to nothing. */
  #define dummy(unused) /* do nothing */

  /* shr_next - shift reg right by 16 bits, exposing its next two bytes. */
  #define shr_next(reg) \
          shrq $16,       reg;
  /*
   * G - run lookup_32bit over both 32-bit halves of two 64-bit registers.
   *
   *   gi1, gi2    register alias names (e.g. RGI1, RGI2) of the two inputs
   *   x           not referenced by the body
   *   tab0..tab3  table offsets forwarded to lookup_32bit
   *
   * Result:  RGS2 = (lookup(high half of gi1) << 32) | lookup(low half of gi1)
   *          RGS3 = (lookup(high half of gi2) << 32) | lookup(low half of gi2)
   * Effects: gi1 and gi2 are shifted away; RGS1, RID1, RID2 and the flags
   *          are overwritten.
   *
   * The "##" in front of gi1 / gi2 must stay: an operand of ## is substituted
   * without being macro-expanded first, so the alias name itself reaches
   * lookup_32bit and can be pasted into its bl / bh byte-register names.
   * NOTE(review): this relies on the preprocessor tolerating the paste of ","
   * with an identifier in assembler mode -- confirm with the build in use.
   */
  #define G(gi1, gi2, x, tab0, tab1, tab2, tab3) \
          /* low halves; shr_next moves each high half down to bits 31:0 */ \
          lookup_32bit(tab0, tab1, tab2, tab3, ##gi1, RGS1, shr_next, ##gi1); \
          lookup_32bit(tab0, tab1, tab2, tab3, ##gi2, RGS3, shr_next, ##gi2); \
          \
          /* high half of gi1, merged above the low result held in RGS1 */ \
          lookup_32bit(tab0, tab1, tab2, tab3, ##gi1, RGS2, dummy, none); \
          shlq $32,       RGS2; \
          orq             RGS1, RGS2; \
          /* high half of gi2, merged above the low result held in RGS3 */ \
          lookup_32bit(tab0, tab1, tab2, tab3, ##gi2, RGS1, dummy, none); \
          shlq $32,       RGS1; \
          orq             RGS1, RGS3;
  /*
   * round_head_2 - compute the four G results of one round.
   *
   *   a, b            register name prefixes; a##1, b##1, a##2, b##2 are used
   *   x1, y1, x2, y2  destination vector registers
   *
   * Precondition: RGI1 / RGI2 already hold the low / high 64 bits of a##1
   * (see preload_rgi).
   *
   * Result: x1 = G(a##1) with tables s0, s1, s2, s3
   *         y1 = G(b##1) with tables s1, s2, s3, s0
   *         x2 = G(a##2) with tables s0, s1, s2, s3
   *         y2 = G(b##2) with tables s1, s2, s3, s0
   *
   * The moves that fetch the next G input are issued before the previous
   * result is copied out of RGS2 / RGS3, so the order below matters.
   * Overwrites RGI1..RGI4, RGS1..RGS3, RID1, RID2 and the flags.
   */
  #define round_head_2(a, b, x1, y1, x2, y2) \
          vmovq           b ## 1, RGI3; \
          vpextrq $1,     b ## 1, RGI4; \
          \
          /* x1 <- G(a##1) */ \
          G(RGI1, RGI2, x1, s0, s1, s2, s3); \
          vmovq           a ## 2, RGI1; \
          vpextrq $1,     a ## 2, RGI2; \
          vmovq           RGS2, x1; \
          vpinsrq $1,     RGS3, x1, x1; \
          \
          /* y1 <- G(b##1) */ \
          G(RGI3, RGI4, y1, s1, s2, s3, s0); \
          vmovq           b ## 2, RGI3; \
          vpextrq $1,     b ## 2, RGI4; \
          vmovq           RGS2, y1; \
          vpinsrq $1,     RGS3, y1, y1; \
          \
          /* x2 <- G(a##2) */ \
          G(RGI1, RGI2, x2, s0, s1, s2, s3); \
          vmovq           RGS2, x2; \
          vpinsrq $1,     RGS3, x2, x2; \
          \
          /* y2 <- G(b##2) */ \
          G(RGI3, RGI4, y2, s1, s2, s3, s0); \
          vmovq           RGS2, y2; \
          vpinsrq $1,     RGS3, y2, y2;
  /*
   * encround_tail - finish one encryption round for one set of registers.
   *
   *   a          not referenced by the body
   *   b          only handed to prerotate()
   *   c, d       state registers updated by this round
   *   x, y       G results from round_head_2 (both are overwritten)
   *   prerotate  macro applied to b (rotate_1l or dummy in this file)
   *
   * Per 32-bit lane, with RK1 / RK2 holding the round subkeys:
   *   x  = x + y
   *   c  = ror32(c ^ (x + RK1), 1)
   *   y  = y + x + RK2
   *   d ^= y
   * Overwrites RT, plus whatever prerotate overwrites.
   *
   * The final instruction line deliberately carries no line continuation:
   * a trailing backslash there would splice whatever line follows the macro
   * (for example the next #define) into this definition.
   */
  #define encround_tail(a, b, c, d, x, y, prerotate) \
          vpaddd                  x, y,   x; \
          vpaddd                  x, RK1, RT; \
          prerotate(b); \
          vpxor                   RT, c,  c; \
          vpaddd                  y, x,   y; \
          vpaddd                  y, RK2, y; \
          vpsrld $1,              c, RT; \
          vpslld $(32 - 1),       c, c; \
          vpor                    c, RT,  c; \
          vpxor                   d, y,   d;
  /*
   * decround_tail - finish one decryption round for one set of registers.
   *
   *   a          only handed to prerotate()
   *   b          not referenced by the body
   *   c, d       state registers updated by this round
   *   x, y       G results from round_head_2 (both are overwritten)
   *   prerotate  macro applied to a (rotate_1l or dummy in this file)
   *
   * Per 32-bit lane, with RK1 / RK2 holding the round subkeys:
   *   x  = x + y
   *   c ^= x + RK1
   *   y  = y + x + RK2
   *   d  = ror32(d ^ y, 1)
   * Overwrites RT, plus whatever prerotate overwrites.
   *
   * The final instruction line deliberately carries no line continuation:
   * a trailing backslash there would splice whatever line follows the macro
   * (for example the next #define) into this definition.
   */
  #define decround_tail(a, b, c, d, x, y, prerotate) \
          vpaddd                  x, y,   x; \
          vpaddd                  x, RK1, RT; \
          prerotate(a); \
          vpxor                   RT, c,  c; \
          vpaddd                  y, x,   y; \
          vpaddd                  y, RK2, y; \
          vpxor                   d, y,   d; \
          vpsrld $1,              d, y; \
          vpslld $(32 - 1),       d, d; \
          vpor                    d, y,   d;
  /*
   * rotate_1l - rotate every 32-bit lane of vec left by one bit.
   * Overwrites RR, which holds the shifted-left half until the final OR.
   */
  #define rotate_1l(vec) \
          vpslld $1,              vec, RR; \
          vpsrld $(32 - 1),       vec, vec; \
          vpor                    vec, RR,  vec;
  /*
   * preload_rgi - copy a vector register into the first pair of G inputs:
   * RGI1 receives the low 64 bits of vec, RGI2 the high 64 bits.
   * round_head_2 expects this to have been done for its first G call.
   */
  #define preload_rgi(vec) \
          vmovq                   vec, RGI1; \
          vpextrq $1,             vec, RGI2;
  /*
   * encrypt_round - one encryption round over both register sets.
   *
   *   rnd         round number; selects the 32-bit subkeys at index 2*rnd
   *               and 2*rnd + 1 of the k array
   *   a, b, c, d  register name prefixes (RA, RB, RC, RD in some order);
   *               1 / 2 is pasted on to pick the register set
   *   preload     macro applied to c##1 between the two tails (preload_rgi
   *               or dummy); c##1 is final at that point
   *   prerotate   forwarded to encround_tail
   *
   * Both subkeys are broadcast to all lanes of RK1 / RK2 before use.
   */
  #define encrypt_round(rnd, a, b, c, d, preload, prerotate) \
          vbroadcastss (k+4*(2*(rnd)))(CTX),   RK1; \
          vbroadcastss (k+4*(2*(rnd)+1))(CTX), RK2; \
          round_head_2(a, b, RX0, RY0, RX1, RY1); \
          encround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate); \
          preload(c ## 1); \
          encround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate);
  /*
   * decrypt_round - one decryption round over both register sets.
   *
   *   rnd         round number; selects the 32-bit subkeys at index 2*rnd
   *               and 2*rnd + 1 of the k array
   *   a, b, c, d  register name prefixes (RA, RB, RC, RD in some order);
   *               1 / 2 is pasted on to pick the register set
   *   preload     macro applied to c##1 between the two tails (preload_rgi
   *               or dummy)
   *   prerotate   forwarded to decround_tail
   *
   * Both subkeys are broadcast to all lanes of RK1 / RK2 before use.
   */
  #define decrypt_round(rnd, a, b, c, d, preload, prerotate) \
          vbroadcastss (k+4*(2*(rnd)))(CTX),   RK1; \
          vbroadcastss (k+4*(2*(rnd)+1))(CTX), RK2; \
          round_head_2(a, b, RX0, RY0, RX1, RY1); \
          decround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate); \
          preload(c ## 1); \
          decround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate);
  /*
   * encrypt_cycle - two consecutive encryption rounds, numbers 2*cycle and
   * 2*cycle + 1. The second round swaps the roles of (RA, RB) and (RC, RD).
   * Both rounds preload the next G input and pre-rotate for the next round.
   */
  #define encrypt_cycle(cycle) \
          encrypt_round((2*cycle),       RA, RB, RC, RD, preload_rgi, rotate_1l); \
          encrypt_round(((2*cycle) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l);
  /*
   * encrypt_cycle_last - like encrypt_cycle, except that the second round
   * passes dummy for both preload and prerotate: no further round follows,
   * so nothing is preloaded or pre-rotated.
   */
  #define encrypt_cycle_last(cycle) \
          encrypt_round((2*cycle),       RA, RB, RC, RD, preload_rgi, rotate_1l); \
          encrypt_round(((2*cycle) + 1), RC, RD, RA, RB, dummy, dummy);
  /*
   * decrypt_cycle - two consecutive decryption rounds in descending order:
   * round 2*cycle + 1 first (on RC, RD, RA, RB), then round 2*cycle (on
   * RA, RB, RC, RD). Both rounds preload the next G input and pre-rotate.
   */
  #define decrypt_cycle(cycle) \
          decrypt_round(((2*cycle) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \
          decrypt_round((2*cycle),       RA, RB, RC, RD, preload_rgi, rotate_1l);
  /*
   * decrypt_cycle_last - like decrypt_cycle, except that the second round
   * passes dummy for both preload and prerotate: no further round follows,
   * so nothing is preloaded or pre-rotated.
   */
  #define decrypt_cycle_last(cycle) \
          decrypt_round(((2*cycle) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \
          decrypt_round((2*cycle),       RA, RB, RC, RD, dummy, dummy);
  /*
   * transpose_4x4 - transpose, in place, the 4x4 matrix of 32-bit words held
   * in x0..x3 (one row per register).
   *
   *   x0..x3        rows in, columns out
   *   tmp0..tmp2    scratch vector registers (overwritten)
   *
   * The first step interleaves 32-bit words of row pairs; x3 itself serves
   * as the fourth temporary. The second step interleaves 64-bit halves.
   */
  #define transpose_4x4(x0, x1, x2, x3, tmp0, tmp1, tmp2) \
          vpunpckldq              x1, x0, tmp0; \
          vpunpckhdq              x1, x0, tmp2; \
          vpunpckldq              x3, x2, tmp1; \
          vpunpckhdq              x3, x2, x3; \
          \
          vpunpcklqdq             tmp1, tmp0, x0; \
          vpunpckhqdq             tmp1, tmp0, x1; \
          vpunpcklqdq             x3, tmp2, x2; \
          vpunpckhqdq             x3, tmp2, x3;
  /*
   * inpack_blocks - XOR the 16-byte key vector wkey into each of x0..x3,
   * then transpose the four registers with transpose_4x4.
   *
   *   x0..x3        registers to process (updated in place)
   *   wkey          vector XORed into every register
   *   tmp0..tmp2    scratch registers for the transpose
   *
   * No separator follows the transpose_4x4() call: that macro already ends
   * with one.
   */
  #define inpack_blocks(x0, x1, x2, x3, wkey, tmp0, tmp1, tmp2) \
          vpxor                   x0, wkey, x0; \
          vpxor                   x1, wkey, x1; \
          vpxor                   x2, wkey, x2; \
          vpxor                   x3, wkey, x3; \
          \
          transpose_4x4(x0, x1, x2, x3, tmp0, tmp1, tmp2)
  /*
   * outunpack_blocks - counterpart of inpack_blocks: transpose x0..x3 with
   * transpose_4x4 first, then XOR the 16-byte key vector wkey into each.
   *
   *   x0..x3        registers to process (updated in place)
   *   wkey          vector XORed into every register
   *   tmp0..tmp2    scratch registers for the transpose
   */
  #define outunpack_blocks(x0, x1, x2, x3, wkey, tmp0, tmp1, tmp2) \
          transpose_4x4(x0, x1, x2, x3, tmp0, tmp1, tmp2) \
          \
          vpxor                   x0, wkey, x0; \
          vpxor                   x1, wkey, x1; \
          vpxor                   x2, wkey, x2; \
          vpxor                   x3, wkey, x3;
  .align 8
  SYM_FUNC_START_LOCAL(__twofish_enc_blk8)
          /*
           * Encrypt the blocks held in the eight state registers.
           *
           * input:
           *      %rdi: ctx, CTX
           *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
           * output:
           *      RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
           *
           * %r13, %rbx and %rcx are saved and restored here. %rax, %rdx,
           * %rsi, %r8-%r10, %xmm8-%xmm15 and the flags are overwritten.
           *
           * NOTE(review): %rcx is not saved by __twofish_dec_blk8 and is not
           * used by the caller visible in this file; the push/pop may be a
           * leftover -- confirm before removing it.
           */

          /* key vector at w, XORed into the input by inpack_blocks */
          vmovdqu         w(CTX), RK1;

          pushq           %r13;
          pushq           %rbx;
          pushq           %rcx;

          /*
           * Set 1: key XOR + transpose, preload the first G input and
           * pre-rotate RD1 for round 0. Then the same for set 2.
           */
          inpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
          preload_rgi(RA1);
          rotate_1l(RD1);
          inpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
          rotate_1l(RD2);

          /* 8 cycles of 2 rounds each, rounds 0..15 */
          encrypt_cycle(0);
          encrypt_cycle(1);
          encrypt_cycle(2);
          encrypt_cycle(3);
          encrypt_cycle(4);
          encrypt_cycle(5);
          encrypt_cycle(6);
          encrypt_cycle_last(7);

          /* key vector at w + 16, XORed into the output */
          vmovdqu         (w+4*4)(CTX), RK1;

          popq            %rcx;
          popq            %rbx;
          popq            %r13;

          /* the state halves come out swapped: C, D, A, B */
          outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
          outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);

          RET;
  SYM_FUNC_END(__twofish_enc_blk8)
  .align 8
  SYM_FUNC_START_LOCAL(__twofish_dec_blk8)
          /*
           * Decrypt the blocks held in the eight state registers.
           *
           * input:
           *      %rdi: ctx, CTX
           *      RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
           * output:
           *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
           *
           * %r13 and %rbx are saved and restored here. %rax, %rcx, %rdx,
           * %rsi, %r8-%r10, %xmm8-%xmm15 and the flags are overwritten.
           */

          /* key vector at w + 16, XORed into the input by inpack_blocks */
          vmovdqu         (w+4*4)(CTX), RK1;

          pushq           %r13;
          pushq           %rbx;

          /*
           * Set 1: key XOR + transpose, preload the first G input and
           * pre-rotate RA1 for round 15. Then the same for set 2.
           */
          inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
          preload_rgi(RC1);
          rotate_1l(RA1);
          inpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
          rotate_1l(RA2);

          /* 8 cycles of 2 rounds each, rounds 15..0 */
          decrypt_cycle(7);
          decrypt_cycle(6);
          decrypt_cycle(5);
          decrypt_cycle(4);
          decrypt_cycle(3);
          decrypt_cycle(2);
          decrypt_cycle(1);
          decrypt_cycle_last(0);

          /* key vector at w, XORed into the output */
          vmovdqu         (w)(CTX), RK1;

          popq            %rbx;
          popq            %r13;

          outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
          outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);

          RET;
  SYM_FUNC_END(__twofish_dec_blk8)
  SYM_FUNC_START(twofish_ecb_enc_8way)
          /*
           * ECB encryption of one 8-way batch.
           *
           * input:
           *      %rdi: ctx, CTX
           *      %rsi: dst
           *      %rdx: src
           */
          FRAME_BEGIN

          /* %rsi is RID2 and gets overwritten by the cipher; keep dst in %r11 */
          movq            %rsi, %r11;

          /* load_8way / store_8way come from the included helper file */
          load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

          call            __twofish_enc_blk8;

          /* results are returned in C, D, A, B register order */
          store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

          FRAME_END
          RET;
  SYM_FUNC_END(twofish_ecb_enc_8way)
  SYM_FUNC_START(twofish_ecb_dec_8way)
          /*
           * ECB decryption of one 8-way batch.
           *
           * input:
           *      %rdi: ctx, CTX
           *      %rsi: dst
           *      %rdx: src
           */
          FRAME_BEGIN

          /* %rsi is RID2 and gets overwritten by the cipher; keep dst in %r11 */
          movq            %rsi, %r11;

          /* the decrypt core expects its input in C, D, A, B register order */
          load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

          call            __twofish_dec_blk8;

          store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

          FRAME_END
          RET;
  SYM_FUNC_END(twofish_ecb_dec_8way)
  SYM_FUNC_START(twofish_cbc_dec_8way)
          /*
           * CBC decryption of one 8-way batch.
           *
           * input:
           *      %rdi: ctx, CTX
           *      %rsi: dst
           *      %rdx: src
           */
          FRAME_BEGIN
          pushq           %r12;

          /*
           * The cipher core overwrites %rsi (RID2) and %rdx (RGI1), and both
           * pointers are needed afterwards: dst is kept in %r11, src in the
           * callee-saved %r12.
           */
          movq            %rsi, %r11;
          movq            %rdx, %r12;

          /* the decrypt core expects its input in C, D, A, B register order */
          load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

          call            __twofish_dec_blk8;

          /*
           * store_cbc_8way comes from the included helper file and receives
           * both src and dst.
           * NOTE(review): presumably it XORs in the preceding ciphertext
           * blocks read from src before storing -- confirm in the helper.
           */
          store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

          popq            %r12;
          FRAME_END
          RET;
  SYM_FUNC_END(twofish_cbc_dec_8way)
  303. SYM_FUNC_END(twofish_cbc_dec_8way)