/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Cast5 Cipher 16-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <[email protected]>
 *
 * Copyright © 2012 Jussi Kivilinna <[email protected]>
 */

#include <linux/linkage.h>
#include <asm/frame.h>

.file "cast5-avx-x86_64-asm_64.S"

.extern cast_s1
.extern cast_s2
.extern cast_s3
.extern cast_s4

/* structure of crypto context */
#define km	0
#define kr	(16*4)
#define rr	((16*4)+16)
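
/*
 * Note: these offsets are assumed to match the layout of the generic
 * CAST5 context: 16 x u32 masking keys Km at offset 0, 16 x u8 rotation
 * keys Kr at offset 16*4, and the reduced-rounds flag rr right after them.
 */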

/* s-boxes */
#define s1	cast_s1
#define s2	cast_s2
#define s3	cast_s3
#define s4	cast_s4

/**********************************************************************
  16-way AVX cast5
 **********************************************************************/

#define CTX	%r15

#define RL1	%xmm0
#define RR1	%xmm1
#define RL2	%xmm2
#define RR2	%xmm3
#define RL3	%xmm4
#define RR3	%xmm5
#define RL4	%xmm6
#define RR4	%xmm7

#define RX	%xmm8

#define RKM	%xmm9
#define RKR	%xmm10
#define RKRF	%xmm11
#define RKRR	%xmm12

#define R32	%xmm13
#define R1ST	%xmm14

#define RTMP	%xmm15

#define RID1	%rdi
#define RID1d	%edi
#define RID2	%rsi
#define RID2d	%esi

#define RGI1	%rdx
#define RGI1bl	%dl
#define RGI1bh	%dh
#define RGI2	%rcx
#define RGI2bl	%cl
#define RGI2bh	%ch

#define RGI3	%rax
#define RGI3bl	%al
#define RGI3bh	%ah
#define RGI4	%rbx
#define RGI4bl	%bl
#define RGI4bh	%bh

#define RFS1	%r8
#define RFS1d	%r8d
#define RFS2	%r9
#define RFS2d	%r9d
#define RFS3	%r10
#define RFS3d	%r10d
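
/*
 * lookup_32bit: the four s-box lookups for one 32-bit intermediate held in
 * a general-purpose register.  The low two bytes (read via bl/bh before the
 * shrq) index s1/s2 and the remaining two bytes index s3/s4; this works out
 * because the rotation applied in F_head carries an extra 16-bit bias (see
 * enc_preload_rkr/dec_preload_rkr below).  op1/op2/op3 combine the table
 * words with xor/sub/add in the order required by the round type.
 */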

#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl		src ## bh,     RID1d;    \
	movzbl		src ## bl,     RID2d;    \
	shrq $16,	src;                     \
	movl		s1(, RID1, 4), dst ## d; \
	op1		s2(, RID2, 4), dst ## d; \
	movzbl		src ## bh,     RID1d;    \
	movzbl		src ## bl,     RID2d;    \
	interleave_op(il_reg);                   \
	op2		s3(, RID1, 4), dst ## d; \
	op3		s4(, RID2, 4), dst ## d;

#define dummy(d) /* do nothing */

#define shr_next(reg) \
	shrq $16,	reg;
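
/*
 * F_head: combine the block halves with the masking key (op0 is add, xor or
 * sub depending on the round type), rotate each 32-bit lane left using the
 * shift counts in RKRF/RKRR, then move the two 64-bit halves of the result
 * into general-purpose registers for the s-box phase.
 * F_tail: run lookup_32bit on all four 32-bit lanes and reassemble the
 * results into a 128-bit register.
 */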

#define F_head(a, x, gi1, gi2, op0) \
	op0	a, RKM, x; \
	vpslld	RKRF, x, RTMP; \
	vpsrld	RKRR, x, x; \
	vpor	RTMP, x, x; \
	\
	vmovq		x, gi1; \
	vpextrq $1,	x, gi2;

#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
	\
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \
	shlq $32,	RFS2; \
	orq		RFS1, RFS2; \
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \
	shlq $32,	RFS1; \
	orq		RFS1, RFS3; \
	\
	vmovq		RFS2, x; \
	vpinsrq $1,	RFS3, x, x;

#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0); \
	F_head(b2, RX, RGI3, RGI4, op0); \
	\
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \
	\
	vpxor	a1, RX,   a1; \
	vpxor	a2, RTMP, a2;
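
/*
 * The three CAST5 round function types (RFC 2144):
 *   F1: I = (Km + D) <<< Kr,  f = ((S1 ^ S2) - S3) + S4
 *   F2: I = (Km ^ D) <<< Kr,  f = ((S1 - S2) + S3) ^ S4
 *   F3: I = (Km - D) <<< Kr,  f = ((S1 + S2) ^ S3) - S4
 */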
#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)

#define subround(a1, b1, a2, b2, f) \
	F ## f ## _2(a1, b1, a2, b2);
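
/*
 * round: broadcast this round's masking key Km[n], take the low byte of RKR
 * (masked to 5 bits via R1ST) as the rotation count and compute its
 * 32-complement, then shift RKR down one byte so the next round sees the
 * next rotation key.  The round function is applied to all four register
 * pairs.
 */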
#define round(l, r, n, f) \
	vbroadcastss	(km+(4*n))(CTX), RKM; \
	vpand		R1ST, RKR, RKRF; \
	vpsubq		RKRF, R32, RKRR; \
	vpsrldq $1,	RKR, RKR; \
	subround(l ## 1, r ## 1, l ## 2, r ## 2, f); \
	subround(l ## 3, r ## 3, l ## 4, r ## 4, f);
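
/*
 * The rotation keys are preloaded with 16 XORed into every byte, i.e. each
 * rotation becomes (Kr + 16) mod 32.  The 16-bit bias means the s-box index
 * bytes that CAST5 takes from the top half of I end up in the low half,
 * where lookup_32bit can read them through the bl/bh sub-registers.
 * dec_preload_rkr additionally byte-reverses RKR so that the per-round
 * rotation keys are consumed in reverse order during decryption.
 */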
#define enc_preload_rkr() \
	vbroadcastss	.L16_mask, RKR; \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor		kr(CTX), RKR, RKR;

#define dec_preload_rkr() \
	vbroadcastss	.L16_mask, RKR; \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor		kr(CTX), RKR, RKR; \
	vpshufb		.Lbswap128_mask, RKR, RKR;
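
/*
 * inpack_blocks: byte-swap each 32-bit word to host order, then transpose
 * the register pair so that x0 holds the four left halves and x1 the four
 * right halves of four consecutive blocks.  outunpack_blocks performs the
 * inverse transform before the results are stored.
 */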
#define transpose_2x4(x0, x1, t0, t1) \
	vpunpckldq		x1, x0, t0; \
	vpunpckhdq		x1, x0, t1; \
	\
	vpunpcklqdq		t1, t0, x0; \
	vpunpckhqdq		t1, t0, x1;

#define inpack_blocks(x0, x1, t0, t1, rmask) \
	vpshufb rmask,	x0, x0; \
	vpshufb rmask,	x1, x1; \
	\
	transpose_2x4(x0, x1, t0, t1)

#define outunpack_blocks(x0, x1, t0, t1, rmask) \
	transpose_2x4(x0, x1, t0, t1) \
	\
	vpshufb rmask,	x0, x0; \
	vpshufb rmask,	x1, x1;

.section	.rodata.cst16.bswap_mask, "aM", @progbits, 16
.align 16
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

.section	.rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

.section	.rodata.cst16.bswap_iv_mask, "aM", @progbits, 16
.align 16
.Lbswap_iv_mask:
	.byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0

.section	.rodata.cst4.16_mask, "aM", @progbits, 4
.align 4
.L16_mask:
	.byte 16, 16, 16, 16

.section	.rodata.cst4.32_mask, "aM", @progbits, 4
.align 4
.L32_mask:
	.byte 32, 0, 0, 0

.section	.rodata.cst4.first_mask, "aM", @progbits, 4
.align 4
.Lfirst_mask:
	.byte 0x1f, 0, 0, 0

.text

.align 16
SYM_FUNC_START_LOCAL(__cast5_enc_blk16)
	/* input:
	 *	%rdi: ctx
	 *	RL1: blocks 1 and 2
	 *	RR1: blocks 3 and 4
	 *	RL2: blocks 5 and 6
	 *	RR2: blocks 7 and 8
	 *	RL3: blocks 9 and 10
	 *	RR3: blocks 11 and 12
	 *	RL4: blocks 13 and 14
	 *	RR4: blocks 15 and 16
	 * output:
	 *	RL1: encrypted blocks 1 and 2
	 *	RR1: encrypted blocks 3 and 4
	 *	RL2: encrypted blocks 5 and 6
	 *	RR2: encrypted blocks 7 and 8
	 *	RL3: encrypted blocks 9 and 10
	 *	RR3: encrypted blocks 11 and 12
	 *	RL4: encrypted blocks 13 and 14
	 *	RR4: encrypted blocks 15 and 16
	 */
	pushq %r15;
	pushq %rbx;

	movq %rdi, CTX;

	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;
	enc_preload_rkr();

	inpack_blocks(RL1, RR1, RTMP, RX, RKM);
	inpack_blocks(RL2, RR2, RTMP, RX, RKM);
	inpack_blocks(RL3, RR3, RTMP, RX, RKM);
	inpack_blocks(RL4, RR4, RTMP, RX, RKM);
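
	/* The 16 rounds cycle through round function types F1, F2, F3;
	 * swapping the l/r arguments each round implements the Feistel
	 * structure.
	 */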
	round(RL, RR, 0, 1);
	round(RR, RL, 1, 2);
	round(RL, RR, 2, 3);
	round(RR, RL, 3, 1);
	round(RL, RR, 4, 2);
	round(RR, RL, 5, 3);
	round(RL, RR, 6, 1);
	round(RR, RL, 7, 2);
	round(RL, RR, 8, 3);
	round(RR, RL, 9, 1);
	round(RL, RR, 10, 2);
	round(RR, RL, 11, 3);
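
	/* ctx->rr is nonzero for keys of 80 bits or less, in which case
	 * CAST5 uses only 12 rounds and the last four rounds (n = 12..15)
	 * are skipped.
	 */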
	movzbl rr(CTX), %eax;
	testl %eax, %eax;
	jnz .L__skip_enc;

	round(RL, RR, 12, 1);
	round(RR, RL, 13, 2);
	round(RL, RR, 14, 3);
	round(RR, RL, 15, 1);

.L__skip_enc:
	popq %rbx;
	popq %r15;

	vmovdqa .Lbswap_mask, RKM;

	outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
	outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
	outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
	outunpack_blocks(RR4, RL4, RTMP, RX, RKM);

	RET;
SYM_FUNC_END(__cast5_enc_blk16)

.align 16
SYM_FUNC_START_LOCAL(__cast5_dec_blk16)
	/* input:
	 *	%rdi: ctx
	 *	RL1: encrypted blocks 1 and 2
	 *	RR1: encrypted blocks 3 and 4
	 *	RL2: encrypted blocks 5 and 6
	 *	RR2: encrypted blocks 7 and 8
	 *	RL3: encrypted blocks 9 and 10
	 *	RR3: encrypted blocks 11 and 12
	 *	RL4: encrypted blocks 13 and 14
	 *	RR4: encrypted blocks 15 and 16
	 * output:
	 *	RL1: decrypted blocks 1 and 2
	 *	RR1: decrypted blocks 3 and 4
	 *	RL2: decrypted blocks 5 and 6
	 *	RR2: decrypted blocks 7 and 8
	 *	RL3: decrypted blocks 9 and 10
	 *	RR3: decrypted blocks 11 and 12
	 *	RL4: decrypted blocks 13 and 14
	 *	RR4: decrypted blocks 15 and 16
	 */
	pushq %r15;
	pushq %rbx;

	movq %rdi, CTX;

	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;
	dec_preload_rkr();

	inpack_blocks(RL1, RR1, RTMP, RX, RKM);
	inpack_blocks(RL2, RR2, RTMP, RX, RKM);
	inpack_blocks(RL3, RR3, RTMP, RX, RKM);
	inpack_blocks(RL4, RR4, RTMP, RX, RKM);

	movzbl rr(CTX), %eax;
	testl %eax, %eax;
	jnz .L__skip_dec;

	round(RL, RR, 15, 1);
	round(RR, RL, 14, 3);
	round(RL, RR, 13, 2);
	round(RR, RL, 12, 1);

.L__dec_tail:
	round(RL, RR, 11, 3);
	round(RR, RL, 10, 2);
	round(RL, RR, 9, 1);
	round(RR, RL, 8, 3);
	round(RL, RR, 7, 2);
	round(RR, RL, 6, 1);
	round(RL, RR, 5, 3);
	round(RR, RL, 4, 2);
	round(RL, RR, 3, 1);
	round(RR, RL, 2, 3);
	round(RL, RR, 1, 2);
	round(RR, RL, 0, 1);

	vmovdqa .Lbswap_mask, RKM;
	popq %rbx;
	popq %r15;

	outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
	outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
	outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
	outunpack_blocks(RR4, RL4, RTMP, RX, RKM);

	RET;

.L__skip_dec:
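	/* Only 12 rounds: skip the four rotation-key bytes for rounds
	 * 13..16 in RKR (byte-reversed by dec_preload_rkr) and rejoin the
	 * common 12-round tail.
	 */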
	vpsrldq $4, RKR, RKR;
	jmp .L__dec_tail;
SYM_FUNC_END(__cast5_dec_blk16)

SYM_FUNC_START(cast5_ecb_enc_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;

	vmovdqu (0*4*4)(%rdx), RL1;
	vmovdqu (1*4*4)(%rdx), RR1;
	vmovdqu (2*4*4)(%rdx), RL2;
	vmovdqu (3*4*4)(%rdx), RR2;
	vmovdqu (4*4*4)(%rdx), RL3;
	vmovdqu (5*4*4)(%rdx), RR3;
	vmovdqu (6*4*4)(%rdx), RL4;
	vmovdqu (7*4*4)(%rdx), RR4;

	call __cast5_enc_blk16;

	vmovdqu RR1, (0*4*4)(%r11);
	vmovdqu RL1, (1*4*4)(%r11);
	vmovdqu RR2, (2*4*4)(%r11);
	vmovdqu RL2, (3*4*4)(%r11);
	vmovdqu RR3, (4*4*4)(%r11);
	vmovdqu RL3, (5*4*4)(%r11);
	vmovdqu RR4, (6*4*4)(%r11);
	vmovdqu RL4, (7*4*4)(%r11);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast5_ecb_enc_16way)

SYM_FUNC_START(cast5_ecb_dec_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;

	vmovdqu (0*4*4)(%rdx), RL1;
	vmovdqu (1*4*4)(%rdx), RR1;
	vmovdqu (2*4*4)(%rdx), RL2;
	vmovdqu (3*4*4)(%rdx), RR2;
	vmovdqu (4*4*4)(%rdx), RL3;
	vmovdqu (5*4*4)(%rdx), RR3;
	vmovdqu (6*4*4)(%rdx), RL4;
	vmovdqu (7*4*4)(%rdx), RR4;

	call __cast5_dec_blk16;

	vmovdqu RR1, (0*4*4)(%r11);
	vmovdqu RL1, (1*4*4)(%r11);
	vmovdqu RR2, (2*4*4)(%r11);
	vmovdqu RL2, (3*4*4)(%r11);
	vmovdqu RR3, (4*4*4)(%r11);
	vmovdqu RL3, (5*4*4)(%r11);
	vmovdqu RR4, (6*4*4)(%r11);
	vmovdqu RL4, (7*4*4)(%r11);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast5_ecb_dec_16way)

SYM_FUNC_START(cast5_cbc_dec_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r12;
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;
	movq %rdx, %r12;

	vmovdqu (0*16)(%rdx), RL1;
	vmovdqu (1*16)(%rdx), RR1;
	vmovdqu (2*16)(%rdx), RL2;
	vmovdqu (3*16)(%rdx), RR2;
	vmovdqu (4*16)(%rdx), RL3;
	vmovdqu (5*16)(%rdx), RR3;
	vmovdqu (6*16)(%rdx), RL4;
	vmovdqu (7*16)(%rdx), RR4;

	call __cast5_dec_blk16;

	/* xor with src */
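	/* Each decrypted block is XORed with the preceding ciphertext block.
	 * The vmovq/vpshufd pair places ciphertext block 1 in the upper
	 * 64-bit lane (lower lane zero), so block 2 gets C1 while block 1 is
	 * left untouched; presumably the caller XORs block 1 with the IV,
	 * which is not passed to this function.  The remaining XORs read the
	 * preceding ciphertext blocks at an 8-byte offset into src.
	 */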
	vmovq (%r12), RX;
	vpshufd $0x4f, RX, RX;
	vpxor RX, RR1, RR1;
	vpxor 0*16+8(%r12), RL1, RL1;
	vpxor 1*16+8(%r12), RR2, RR2;
	vpxor 2*16+8(%r12), RL2, RL2;
	vpxor 3*16+8(%r12), RR3, RR3;
	vpxor 4*16+8(%r12), RL3, RL3;
	vpxor 5*16+8(%r12), RR4, RR4;
	vpxor 6*16+8(%r12), RL4, RL4;

	vmovdqu RR1, (0*16)(%r11);
	vmovdqu RL1, (1*16)(%r11);
	vmovdqu RR2, (2*16)(%r11);
	vmovdqu RL2, (3*16)(%r11);
	vmovdqu RR3, (4*16)(%r11);
	vmovdqu RL3, (5*16)(%r11);
	vmovdqu RR4, (6*16)(%r11);
	vmovdqu RL4, (7*16)(%r11);

	popq %r15;
	popq %r12;
	FRAME_END
	RET;
SYM_FUNC_END(cast5_cbc_dec_16way)

SYM_FUNC_START(cast5_ctr_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (big endian, 64bit)
	 */
	FRAME_BEGIN
	pushq %r12;
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;
	movq %rdx, %r12;

	vpcmpeqd RTMP, RTMP, RTMP;
	vpsrldq $8, RTMP, RTMP; /* low: -1, high: 0 */
	vpcmpeqd RKR, RKR, RKR;
	vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */
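
	/* RTMP and RKR are negative increments for vpsubq: subtracting
	 * {-1, 0} advances only the low 64-bit counter by 1, subtracting
	 * {-2, -2} advances both counters by 2.
	 */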
	vmovdqa .Lbswap_iv_mask, R1ST;
	vmovdqa .Lbswap128_mask, RKM;

	/* load IV and byteswap */
	vmovq (%rcx), RX;
	vpshufb R1ST, RX, RX;

	/* construct IVs */
	vpsubq RTMP, RX, RX; /* le: IV1, IV0 */
	vpshufb RKM, RX, RL1; /* be: IV0, IV1 */
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RR1; /* be: IV2, IV3 */
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RL2; /* be: IV4, IV5 */
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RR2; /* be: IV6, IV7 */
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RL3; /* be: IV8, IV9 */
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RR3; /* be: IV10, IV11 */
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RL4; /* be: IV12, IV13 */
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RR4; /* be: IV14, IV15 */

	/* store last IV */
	vpsubq RTMP, RX, RX; /* le: IV16, IV14 */
	vpshufb R1ST, RX, RX; /* be: IV16, IV16 */
	vmovq RX, (%rcx);

	call __cast5_enc_blk16;

	/* dst = src ^ iv */
	vpxor (0*16)(%r12), RR1, RR1;
	vpxor (1*16)(%r12), RL1, RL1;
	vpxor (2*16)(%r12), RR2, RR2;
	vpxor (3*16)(%r12), RL2, RL2;
	vpxor (4*16)(%r12), RR3, RR3;
	vpxor (5*16)(%r12), RL3, RL3;
	vpxor (6*16)(%r12), RR4, RR4;
	vpxor (7*16)(%r12), RL4, RL4;

	vmovdqu RR1, (0*16)(%r11);
	vmovdqu RL1, (1*16)(%r11);
	vmovdqu RR2, (2*16)(%r11);
	vmovdqu RL2, (3*16)(%r11);
	vmovdqu RR3, (4*16)(%r11);
	vmovdqu RL3, (5*16)(%r11);
	vmovdqu RR4, (6*16)(%r11);
	vmovdqu RL4, (7*16)(%r11);

	popq %r15;
	popq %r12;
	FRAME_END
	RET;
SYM_FUNC_END(cast5_ctr_16way)