twofish-x86_64-asm_64-3way.S 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305
  1. /* SPDX-License-Identifier: GPL-2.0-or-later */
  2. /*
  3. * Twofish Cipher 3-way parallel algorithm (x86_64)
  4. *
  5. * Copyright (C) 2011 Jussi Kivilinna <[email protected]>
  6. */
  7. #include <linux/linkage.h>
  /*
   * NOTE(review): the name passed to .file below has no "_64" after "asm",
   * unlike the file name shown in this listing's header
   * ("twofish-x86_64-asm_64-3way.S") - confirm which spelling is intended.
   */
  8. .file "twofish-x86_64-asm-3way.S"
  9. .text
  10. /* structure of crypto context */
  /*
   * Byte offsets from CTX.
   *  s0..s3 - bases of four lookup tables. The round macros index them with a
   *           single byte scaled by 4, which fits the 1024-byte spacing
   *           (256 entries of 4 bytes each).
   *  w      - base of the words that are XORed into the data when blocks are
   *           loaded and stored; addressed in 4-byte units, read 8 bytes at a
   *           time.
   *  k      - base of the 32-bit words that are added inside every round, two
   *           per round.
   */
  11. #define s0 0
  12. #define s1 1024
  13. #define s2 2048
  14. #define s3 3072
  15. #define w 4096
  16. #define k 4128
  17. /**********************************************************************
  18. 3-way twofish
  19. **********************************************************************/
  /*
   * Register roles. A trailing 0/1/2 selects the first/second/third block.
   * The d, bh and bl spellings are the 32-bit, bits 8-15 and bits 0-7 views of
   * the same register; they are listed explicitly because the macros below
   * build these names by token pasting.
   */
  /* CTX: context pointer; not written by any code in this file */
  20. #define CTX %rdi
  /* RIO: src pointer while loading, dst pointer while storing; same register as RT0 */
  21. #define RIO %rdx
  /* RAB0..RAB2: one 64-bit working register per block */
  22. #define RAB0 %rax
  23. #define RAB1 %rbx
  24. #define RAB2 %rcx
  25. #define RAB0d %eax
  26. #define RAB1d %ebx
  27. #define RAB2d %ecx
  28. #define RAB0bh %ah
  29. #define RAB1bh %bh
  30. #define RAB2bh %ch
  31. #define RAB0bl %al
  32. #define RAB1bl %bl
  33. #define RAB2bl %cl
  /* CD0..CD2: stack slots filled by push_cd(); used as the cd operands during the rounds */
  34. #define CD0 0x0(%rsp)
  35. #define CD1 0x8(%rsp)
  36. #define CD2 0x10(%rsp)
  37. # used only before/after all rounds
  38. #define RCD0 %r8
  39. #define RCD1 %r9
  40. #define RCD2 %r10
  41. # used only during rounds
  /* RX0..RX2 reuse the RCD registers, which is why RCD is parked on the stack during the rounds */
  42. #define RX0 %r8
  43. #define RX1 %r9
  44. #define RX2 %r10
  45. #define RX0d %r8d
  46. #define RX1d %r9d
  47. #define RX2d %r10d
  /* RY1 and RY2 live in %r12/%r13, which both entry points save and restore */
  48. #define RY0 %r11
  49. #define RY1 %r12
  50. #define RY2 %r13
  51. #define RY0d %r11d
  52. #define RY1d %r12d
  53. #define RY2d %r13d
  /* RT0, RT1: scratch registers; they overlap the src (%rdx) and dst (%rsi) arguments */
  54. #define RT0 %rdx
  55. #define RT1 %rsi
  56. #define RT0d %edx
  57. #define RT1d %esi
  58. #define RT1bl %sil
  /*
   * do16bit_ror: two table lookups driven by the two lowest bytes of ab,
   * followed (in program order: preceded) by a rotate that exposes the next
   * bytes for the following invocation.
   *  rot      - number of bits ab is rotated right
   *  op1, op2 - instruction stems (mov or xor at the call sites); an "l"
   *             suffix is pasted on, so both operate on 32 bits
   *  T0, T1   - table offsets inside CTX, indexed by bits 0-7 and bits 8-15
   *             of ab respectively
   *  tmp1     - scratch register name that receives bits 8-15 of ab
   *  tmp2     - scratch register name that receives bits 0-7 of ab
   *  ab       - 64-bit source register name; left rotated by rot
   *  dst      - 64-bit register name; only its 32-bit "d" form is written
   * tmp2 may name the same register as dst when op1 is mov (g1g2_3 does this):
   * tmp2 is read only by the first lookup, which is also the first write to
   * dst.
   */
  59. #define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \
  60. movzbl ab ## bl, tmp2 ## d; /* tmp2 = bits 0-7 of ab, zero-extended */ \
  61. movzbl ab ## bh, tmp1 ## d; /* tmp1 = bits 8-15 of ab, zero-extended */ \
  62. rorq $(rot), ab; /* both bytes are already extracted, so ab can move on */ \
  63. op1##l T0(CTX, tmp2, 4), dst ## d; /* dst (op1)= 32-bit entry tmp2 of table T0 */ \
  64. op2##l T1(CTX, tmp1, 4), dst ## d; /* dst (op2)= 32-bit entry tmp1 of table T1 */
  /*
   * swap_ab_with_cd: exchange the 64-bit contents of ab and cd.
   *  ab, cd - operands to exchange; at the call sites in g1g2_3 ab is a
   *           register and cd is one of the CD stack slots
   *  tmp    - scratch register, overwritten with the old value of cd
   */
  65. #define swap_ab_with_cd(ab, cd, tmp) \
  66. movq cd, tmp; /* tmp = old cd */ \
  67. movq ab, cd; /* cd = old ab */ \
  68. movq tmp, ab; /* ab = old cd */
  69. /*
  70. * Combined G1 & G2 function. Reordered with help of rotates to have moves
  71. * at beginning.
  72. */
  /*
   * Parameters:
   *  ab, cd   - name prefixes; 0/1/2 are pasted on to select the operands of
   *             each of the three blocks
   *  Tx0..Tx3 - table offsets feeding the x result; applied to bytes 0, 1, 2
   *             and 3 of ab in that order
   *  Ty0..Ty3 - table offsets feeding the y result; bytes 4, 5, 6 and 7 of ab
   *             use Ty1, Ty2, Ty3 and Ty0 in that order
   *  x, y     - name prefixes of the result registers (0/1/2 pasted on)
   * The first group starts each result with a mov lookup, the second group
   * XORs the remaining two lookups in. The rotate counts applied to each ab
   * add up to 128 (32 + 48 + 32 + 16), two full turns of a 64-bit register, so
   * ab holds its original value again at the point where it is swapped with
   * cd. On exit ab ## i holds the previous cd ## i and cd ## i the previous
   * ab ## i. RT0 and RT1 are clobbered.
   */
  73. #define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \
  74. /* G1,1 && G2,1 */ \
  75. do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 0, ab ## 0, x ## 0); \
  76. do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 0, ab ## 0, y ## 0); \
  77. \
  78. do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 1, ab ## 1, x ## 1); \
  79. do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 1, ab ## 1, y ## 1); \
  80. \
  81. do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 2, ab ## 2, x ## 2); \
  82. do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 2, ab ## 2, y ## 2); \
  83. \
  84. /* G1,2 && G2,2 */ \
  85. do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \
  86. do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \
  87. swap_ab_with_cd(ab ## 0, cd ## 0, RT0); \
  88. \
  89. do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \
  90. do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \
  91. swap_ab_with_cd(ab ## 1, cd ## 1, RT0); \
  92. \
  93. do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \
  94. do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \
  95. swap_ab_with_cd(ab ## 2, cd ## 2, RT0);
  /*
   * enc_round_end: combine the two 32-bit lookup results with two round key
   * words and fold them into a 64-bit register.
   *  ab   - 64-bit register name, read and rewritten
   *  x, y - 64-bit register names holding the 32-bit results; both clobbered
   *  n    - round number; selects the words k[2n] and k[2n+1]
   * With lo/hi being the 32-bit halves of ab on entry and all sums taken
   * modulo 2^32:
   *   new lo = ror32(lo ^ (x + y + k[2n]), 1)
   *   new hi = rol32(hi, 1) ^ (x + 2*y + k[2n+1])
   */
  96. #define enc_round_end(ab, x, y, n) \
  97. addl y ## d, x ## d; /* x = x + y */ \
  98. addl x ## d, y ## d; /* y = x + 2*y in terms of the incoming values */ \
  99. addl k+4*(2*(n))(CTX), x ## d; /* x += k[2n] */ \
  100. xorl ab ## d, x ## d; /* x ^= lo */ \
  101. addl k+4*(2*(n)+1)(CTX), y ## d; /* y += k[2n+1] */ \
  102. shrq $32, ab; /* bring hi down into the 32-bit view */ \
  103. roll $1, ab ## d; \
  104. xorl y ## d, ab ## d; \
  105. shlq $32, ab; /* move the result back up; low half becomes zero */ \
  106. rorl $1, x ## d; \
  107. orq x, ab; /* the 32-bit writes left the upper half of x zero, so this only fills lo */
  /*
   * dec_round_end: decryption counterpart of enc_round_end; the roles of x
   * and y in the final XORs are exchanged.
   *  ba   - 64-bit register name, read and rewritten
   *  x, y - 64-bit register names holding the 32-bit results; both clobbered
   *  n    - round number; selects the words k[2n] and k[2n+1]
   * With lo/hi being the 32-bit halves of ba on entry and all sums taken
   * modulo 2^32:
   *   new lo = ror32(lo ^ (x + 2*y + k[2n+1]), 1)
   *   new hi = rol32(hi, 1) ^ (x + y + k[2n])
   */
  108. #define dec_round_end(ba, x, y, n) \
  109. addl y ## d, x ## d; /* x = x + y */ \
  110. addl x ## d, y ## d; /* y = x + 2*y in terms of the incoming values */ \
  111. addl k+4*(2*(n))(CTX), x ## d; /* x += k[2n] */ \
  112. addl k+4*(2*(n)+1)(CTX), y ## d; /* y += k[2n+1] */ \
  113. xorl ba ## d, y ## d; /* y ^= lo */ \
  114. shrq $32, ba; /* bring hi down into the 32-bit view */ \
  115. roll $1, ba ## d; \
  116. xorl x ## d, ba ## d; \
  117. shlq $32, ba; /* move the result back up; low half becomes zero */ \
  118. rorl $1, y ## d; \
  119. orq y, ba; /* the 32-bit writes left the upper half of y zero, so this only fills lo */
  /*
   * encrypt_round3: one encryption round applied to three blocks.
   *  ab, cd - name prefixes handed to g1g2_3 (0/1/2 are pasted on)
   *  n      - round number, forwarded to enc_round_end
   * Both results use the tables s0..s3 in order; the x result lands in RX and
   * the y result in RY. g1g2_3 finishes by swapping ab with cd, so the
   * enc_round_end calls update the value that was in cd before the round.
   */
  120. #define encrypt_round3(ab, cd, n) \
  121. g1g2_3(ab, cd, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \
  122. \
  123. enc_round_end(ab ## 0, RX0, RY0, n); \
  124. enc_round_end(ab ## 1, RX1, RY1, n); \
  125. enc_round_end(ab ## 2, RX2, RY2, n);
  /*
   * decrypt_round3: one decryption round applied to three blocks.
   *  ba, dc - name prefixes handed to g1g2_3 (0/1/2 are pasted on)
   *  n      - round number, forwarded to dec_round_end
   * Compared with encrypt_round3 the result prefixes are exchanged and the
   * table order is rotated: bytes 0-3 of ba go through s1, s2, s3, s0 into RY,
   * and bytes 4-7 go through s0, s1, s2, s3 into RX. g1g2_3 finishes by
   * swapping ba with dc, so the dec_round_end calls update the value that was
   * in dc before the round.
   */
  126. #define decrypt_round3(ba, dc, n) \
  127. g1g2_3(ba, dc, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \
  128. \
  129. dec_round_end(ba ## 0, RX0, RY0, n); \
  130. dec_round_end(ba ## 1, RX1, RY1, n); \
  131. dec_round_end(ba ## 2, RX2, RY2, n);
  /*
   * encrypt_cycle3: two consecutive encryption rounds on three blocks,
   * round 2*n followed by round 2*n+1.
   *  ab, cd - name prefixes forwarded to encrypt_round3
   *  n      - cycle number
   * n is parenthesized before it is multiplied so that an expression argument
   * (for example a+1) cannot regroup; for the literal arguments used in this
   * file the expansion evaluates to the same round numbers as before.
   */
  132. #define encrypt_cycle3(ab, cd, n) \
  133. encrypt_round3(ab, cd, (n)*2); \
  134. encrypt_round3(ab, cd, ((n)*2)+1);
  /*
   * decrypt_cycle3: two consecutive decryption rounds on three blocks, in
   * descending order: round 2*n+1 followed by round 2*n.
   *  ba, dc - name prefixes forwarded to decrypt_round3
   *  n      - cycle number
   * n is parenthesized before it is multiplied so that an expression argument
   * (for example a+1) cannot regroup; for the literal arguments used in this
   * file the expansion evaluates to the same round numbers as before.
   */
  135. #define decrypt_cycle3(ba, dc, n) \
  136. decrypt_round3(ba, dc, ((n)*2)+1); \
  137. decrypt_round3(ba, dc, ((n)*2));
  /*
   * push_cd: park RCD0..RCD2 on the stack. RCD2 is pushed first and RCD0 last,
   * so afterwards RCD0 sits at 0(%rsp), RCD1 at 8(%rsp) and RCD2 at 16(%rsp),
   * which are exactly the CD0, CD1 and CD2 operands. Lowers %rsp by 24 bytes.
   */
  138. #define push_cd() \
  139. pushq RCD2; \
  140. pushq RCD1; \
  141. pushq RCD0;
  /*
   * pop_cd: reload RCD0..RCD2 from the CD0..CD2 stack slots, in the reverse
   * order of push_cd, and release the 24 bytes that push_cd reserved.
   */
  142. #define pop_cd() \
  143. popq RCD0; \
  144. popq RCD1; \
  145. popq RCD2;
  /*
   * inpack3: load one 8-byte half of each of the three blocks and XOR it with
   * 8 bytes taken from the w area of CTX.
   *  in - register holding the input pointer
   *  n  - position of the half inside a block, in 4-byte words
   *  xy - destination register name prefix; 0/1/2 are pasted on
   *  m  - position inside the w area, in 4-byte words
   * The three blocks are read at byte offsets 0, 16 and 32 from in (plus 4*n).
   * m is parenthesized in the same way as n so that an expression argument
   * cannot regroup under the multiplication; for the literal arguments used
   * in this file the expansion evaluates to the same addresses as before.
   */
  146. #define inpack3(in, n, xy, m) \
  147. movq 4*(n)(in), xy ## 0; \
  148. xorq w+4*(m)(CTX), xy ## 0; \
  149. \
  150. movq 4*(4+(n))(in), xy ## 1; \
  151. xorq w+4*(m)(CTX), xy ## 1; \
  152. \
  153. movq 4*(8+(n))(in), xy ## 2; \
  154. xorq w+4*(m)(CTX), xy ## 2;
  /*
   * outunpack3: XOR one 8-byte half of each of the three blocks with 8 bytes
   * taken from the w area of CTX, then write it to the output buffer.
   *  op  - instruction stem for the write; a "q" suffix is pasted on. The
   *        callers pass mov (store) or xor (XOR into the existing memory)
   *  out - register holding the output pointer
   *  n   - position of the half inside a block, in 4-byte words
   *  xy  - source register name prefix; 0/1/2 are pasted on. The registers
   *        are modified by the XOR
   *  m   - position inside the w area, in 4-byte words
   * The three blocks are written at byte offsets 0, 16 and 32 from out (plus
   * 4*n). m is parenthesized in the same way as n so that an expression
   * argument cannot regroup under the multiplication; for the literal
   * arguments used in this file the expansion evaluates to the same addresses
   * as before.
   */
  155. #define outunpack3(op, out, n, xy, m) \
  156. xorq w+4*(m)(CTX), xy ## 0; \
  157. op ## q xy ## 0, 4*(n)(out); \
  158. \
  159. xorq w+4*(m)(CTX), xy ## 1; \
  160. op ## q xy ## 1, 4*(4+(n))(out); \
  161. \
  162. xorq w+4*(m)(CTX), xy ## 2; \
  163. op ## q xy ## 2, 4*(8+(n))(out);
  /*
   * inpack_enc3: load three blocks from RIO for encryption. The first 8 bytes
   * of each block go to RAB0..RAB2, XORed with w words 0-1; the second 8 bytes
   * go to RCD0..RCD2, XORed with w words 2-3.
   */
  164. #define inpack_enc3() \
  165. inpack3(RIO, 0, RAB, 0); \
  166. inpack3(RIO, 2, RCD, 2);
  /*
   * outunpack_enc3: write three encrypted blocks through RIO.
   *  op - mov to store the result, xor to XOR it into the destination
   * The two halves change places on the way out: RAB0..RAB2, XORed with
   * w words 6-7, become the second 8 bytes of each block, and RCD0..RCD2,
   * XORed with w words 4-5, become the first 8 bytes.
   */
  167. #define outunpack_enc3(op) \
  168. outunpack3(op, RIO, 2, RAB, 6); \
  169. outunpack3(op, RIO, 0, RCD, 4);
  /*
   * inpack_dec3: load three blocks from RIO for decryption. The first 8 bytes
   * of each block go to RAB0..RAB2, XORed with w words 4-5; the second 8 bytes
   * go to RCD0..RCD2, XORed with w words 6-7. Every register is then rotated
   * by 32 bits, which exchanges its two 32-bit halves.
   */
  170. #define inpack_dec3() \
  171. inpack3(RIO, 0, RAB, 4); \
  172. rorq $32, RAB0; \
  173. rorq $32, RAB1; \
  174. rorq $32, RAB2; \
  175. inpack3(RIO, 2, RCD, 6); \
  176. rorq $32, RCD0; \
  177. rorq $32, RCD1; \
  178. rorq $32, RCD2;
  /*
   * outunpack_dec3: store three decrypted blocks through RIO (always with
   * mov). Every register is first rotated by 32 bits, exchanging its two
   * 32-bit halves. RCD0..RCD2, XORed with w words 0-1, become the first
   * 8 bytes of each block; RAB0..RAB2, XORed with w words 2-3, become the
   * second 8 bytes.
   */
  179. #define outunpack_dec3() \
  180. rorq $32, RCD0; \
  181. rorq $32, RCD1; \
  182. rorq $32, RCD2; \
  183. outunpack3(mov, RIO, 0, RCD, 0); \
  184. rorq $32, RAB0; \
  185. rorq $32, RAB1; \
  186. rorq $32, RAB2; \
  187. outunpack3(mov, RIO, 2, RAB, 2);
  /*
   * __twofish_enc_blk_3way: run 16 encryption rounds (8 x encrypt_cycle3) over
   * three 16-byte blocks read from src at byte offsets 0, 16 and 32, then
   * either store them to dst at the same offsets or, when the low byte of the
   * bool argument is non-zero, XOR them into dst.
   * %rbx, %r12 and %r13 are saved and restored. %rax, %rcx, %rdx, %rsi,
   * %r8-%r11 and the flags are overwritten.
   */
  188. SYM_FUNC_START(__twofish_enc_blk_3way)
  189. /* input:
  190. * %rdi: ctx, CTX
  191. * %rsi: dst
  192. * %rdx: src, RIO
  193. * %rcx: bool, if true: xor output
  194. */
  /* callee-saved registers that this code uses as RY2, RY1 and RAB1 */
  195. pushq %r13;
  196. pushq %r12;
  197. pushq %rbx;
  /* %rcx doubles as RAB2 and %rsi as RT1, so both arguments are parked on the stack */
  198. pushq %rcx; /* bool xor */
  199. pushq %rsi; /* dst */
  /* RIO still holds src here */
  200. inpack_enc3();
  /* RCD moves to the CD0..CD2 stack slots; %r8-%r10 become RX0..RX2 for the rounds */
  201. push_cd();
  202. encrypt_cycle3(RAB, CD, 0);
  203. encrypt_cycle3(RAB, CD, 1);
  204. encrypt_cycle3(RAB, CD, 2);
  205. encrypt_cycle3(RAB, CD, 3);
  206. encrypt_cycle3(RAB, CD, 4);
  207. encrypt_cycle3(RAB, CD, 5);
  208. encrypt_cycle3(RAB, CD, 6);
  209. encrypt_cycle3(RAB, CD, 7);
  210. pop_cd();
  /* the rounds used %rdx as RT0; from here on RIO means dst */
  211. popq RIO; /* dst */
  212. popq RT1; /* bool xor */
  /* only the low 8 bits of the bool are examined */
  213. testb RT1bl, RT1bl;
  214. jnz .L__enc_xor3;
  /* bool == 0: plain store */
  215. outunpack_enc3(mov);
  216. popq %rbx;
  217. popq %r12;
  218. popq %r13;
  219. RET;
  220. .L__enc_xor3:
  /* bool != 0: XOR the result into what dst already contains */
  221. outunpack_enc3(xor);
  222. popq %rbx;
  223. popq %r12;
  224. popq %r13;
  225. RET;
  226. SYM_FUNC_END(__twofish_enc_blk_3way)
  /*
   * twofish_dec_blk_3way: run 16 decryption rounds (8 x decrypt_cycle3, cycle
   * 7 down to cycle 0) over three 16-byte blocks read from src at byte offsets
   * 0, 16 and 32, and store the result to dst at the same offsets.
   * %rbx, %r12 and %r13 are saved and restored. %rax, %rcx, %rdx, %rsi,
   * %r8-%r11 and the flags are overwritten.
   */
  227. SYM_FUNC_START(twofish_dec_blk_3way)
  228. /* input:
  229. * %rdi: ctx, CTX
  230. * %rsi: dst
  231. * %rdx: src, RIO
  232. */
  /* callee-saved registers that this code uses as RY2, RY1 and RAB1 */
  233. pushq %r13;
  234. pushq %r12;
  235. pushq %rbx;
  /* %rsi doubles as RT1 during the rounds, so dst is parked on the stack */
  236. pushq %rsi; /* dst */
  /* RIO still holds src here */
  237. inpack_dec3();
  /* RCD moves to the CD0..CD2 stack slots; %r8-%r10 become RX0..RX2 for the rounds */
  238. push_cd();
  239. decrypt_cycle3(RAB, CD, 7);
  240. decrypt_cycle3(RAB, CD, 6);
  241. decrypt_cycle3(RAB, CD, 5);
  242. decrypt_cycle3(RAB, CD, 4);
  243. decrypt_cycle3(RAB, CD, 3);
  244. decrypt_cycle3(RAB, CD, 2);
  245. decrypt_cycle3(RAB, CD, 1);
  246. decrypt_cycle3(RAB, CD, 0);
  247. pop_cd();
  /* the rounds used %rdx as RT0; from here on RIO means dst */
  248. popq RIO; /* dst */
  249. outunpack_dec3();
  250. popq %rbx;
  251. popq %r12;
  252. popq %r13;
  253. RET;
  254. SYM_FUNC_END(twofish_dec_blk_3way)