twofish-x86_64-asm_64.S

/* SPDX-License-Identifier: GPL-2.0-or-later */
/***************************************************************************
 * Copyright (C) 2006 by Joachim Fritschi, <[email protected]>             *
 *                                                                         *
 ***************************************************************************/

.file "twofish-x86_64-asm.S"
.text

#include <linux/linkage.h>
#include <asm/asm-offsets.h>

#define a_offset	0
#define b_offset	4
#define c_offset	8
#define d_offset	12

/* Structure of the crypto context struct */

#define s0	0	/* S0 Array, 256 words each */
#define s1	1024	/* S1 Array */
#define s2	2048	/* S2 Array */
#define s3	3072	/* S3 Array */
#define w	4096	/* 8 whitening keys (word) */
#define k	4128	/* key 1-32 ( word ) */
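
/*
 * Note: these offsets assume the layout of struct twofish_ctx from
 * <crypto/twofish.h> (u32 s[4][256], w[8], k[32]): four 1024-byte
 * S-box tables, then the 8 whitening words at byte 4096 and the 32
 * round-key words at byte 4128.
 */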

/* define a few register aliases to allow macro substitution */

#define R0	%rax
#define R0D	%eax
#define R0B	%al
#define R0H	%ah

#define R1	%rbx
#define R1D	%ebx
#define R1B	%bl
#define R1H	%bh

#define R2	%rcx
#define R2D	%ecx
#define R2B	%cl
#define R2H	%ch

#define R3	%rdx
#define R3D	%edx
#define R3B	%dl
#define R3H	%dh

/* performs input whitening */
#define input_whitening(src,context,offset)\
	xor	w+offset(context),	src;

/* performs output whitening */
#define output_whitening(src,context,offset)\
	xor	w+16+offset(context),	src;
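
/*
 * Both macros expand to a single 64-bit xor; for example
 *	input_whitening(R1,%r11,a_offset)  ->  xor w+0(%r11), R1
 * which applies two 32-bit whitening words to both halves of the
 * register at once.
 */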

/*
 * a  input register containing a (rotated 16)
 * b  input register containing b
 * c  input register containing c
 * d  input register containing d (already rol $1)
 * operations on a and b are interleaved to increase performance
 */
#define encrypt_round(a,b,c,d,round)\
	movzx	b ## B, %edi;\
	mov	s1(%r11,%rdi,4), %r8d;\
	movzx	a ## B, %edi;\
	mov	s2(%r11,%rdi,4), %r9d;\
	movzx	b ## H, %edi;\
	ror	$16, b ## D;\
	xor	s2(%r11,%rdi,4), %r8d;\
	movzx	a ## H, %edi;\
	ror	$16, a ## D;\
	xor	s3(%r11,%rdi,4), %r9d;\
	movzx	b ## B, %edi;\
	xor	s3(%r11,%rdi,4), %r8d;\
	movzx	a ## B, %edi;\
	xor	(%r11,%rdi,4), %r9d;\
	movzx	b ## H, %edi;\
	ror	$15, b ## D;\
	xor	(%r11,%rdi,4), %r8d;\
	movzx	a ## H, %edi;\
	xor	s1(%r11,%rdi,4), %r9d;\
	add	%r8d, %r9d;\
	add	%r9d, %r8d;\
	add	k+round(%r11), %r9d;\
	xor	%r9d, c ## D;\
	rol	$15, c ## D;\
	add	k+4+round(%r11), %r8d;\
	xor	%r8d, d ## D;
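
/*
 * Implementation note: the eight table lookups above are the two g()
 * computations of the Twofish round function. The tables at s0..s3
 * hold the key-dependent S-boxes already multiplied through the MDS
 * matrix (built by the key schedule in crypto/twofish_common.c), so
 * one lookup per input byte plus three xors yields each g() result.
 * The two adds are the pseudo-Hadamard transform, followed by the two
 * 32-bit round subkeys; the 15-bit rotates fold the round's own 1-bit
 * rotate together with the 16-bit pre-rotation the next round expects.
 */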

/*
 * a  input register containing a (rotated 16)
 * b  input register containing b
 * c  input register containing c
 * d  input register containing d (already rol $1)
 * operations on a and b are interleaved to increase performance
 * during the round a and b are prepared for the output whitening
 */
#define encrypt_last_round(a,b,c,d,round)\
	mov	b ## D, %r10d;\
	shl	$32, %r10;\
	movzx	b ## B, %edi;\
	mov	s1(%r11,%rdi,4), %r8d;\
	movzx	a ## B, %edi;\
	mov	s2(%r11,%rdi,4), %r9d;\
	movzx	b ## H, %edi;\
	ror	$16, b ## D;\
	xor	s2(%r11,%rdi,4), %r8d;\
	movzx	a ## H, %edi;\
	ror	$16, a ## D;\
	xor	s3(%r11,%rdi,4), %r9d;\
	movzx	b ## B, %edi;\
	xor	s3(%r11,%rdi,4), %r8d;\
	movzx	a ## B, %edi;\
	xor	(%r11,%rdi,4), %r9d;\
	xor	a, %r10;\
	movzx	b ## H, %edi;\
	xor	(%r11,%rdi,4), %r8d;\
	movzx	a ## H, %edi;\
	xor	s1(%r11,%rdi,4), %r9d;\
	add	%r8d, %r9d;\
	add	%r9d, %r8d;\
	add	k+round(%r11), %r9d;\
	xor	%r9d, c ## D;\
	ror	$1, c ## D;\
	add	k+4+round(%r11), %r8d;\
	xor	%r8d, d ## D
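
/*
 * Unlike encrypt_round, the last round uses a plain ror $1 on c (there
 * is no next round to pre-rotate for) and packs b into the upper and a
 * into the lower half of %r10, ready for the 64-bit output whitening
 * and store in twofish_enc_blk below.
 */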

/*
 * a  input register containing a
 * b  input register containing b (rotated 16)
 * c  input register containing c (already rol $1)
 * d  input register containing d
 * operations on a and b are interleaved to increase performance
 */
#define decrypt_round(a,b,c,d,round)\
	movzx	a ## B, %edi;\
	mov	(%r11,%rdi,4), %r9d;\
	movzx	b ## B, %edi;\
	mov	s3(%r11,%rdi,4), %r8d;\
	movzx	a ## H, %edi;\
	ror	$16, a ## D;\
	xor	s1(%r11,%rdi,4), %r9d;\
	movzx	b ## H, %edi;\
	ror	$16, b ## D;\
	xor	(%r11,%rdi,4), %r8d;\
	movzx	a ## B, %edi;\
	xor	s2(%r11,%rdi,4), %r9d;\
	movzx	b ## B, %edi;\
	xor	s1(%r11,%rdi,4), %r8d;\
	movzx	a ## H, %edi;\
	ror	$15, a ## D;\
	xor	s3(%r11,%rdi,4), %r9d;\
	movzx	b ## H, %edi;\
	xor	s2(%r11,%rdi,4), %r8d;\
	add	%r8d, %r9d;\
	add	%r9d, %r8d;\
	add	k+round(%r11), %r9d;\
	xor	%r9d, c ## D;\
	add	k+4+round(%r11), %r8d;\
	xor	%r8d, d ## D;\
	rol	$15, d ## D;
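
/*
 * Decryption reuses the same tables and pseudo-Hadamard transform;
 * only the order of the byte lookups and the placement of the 1-bit
 * rotates differ, inverting the encrypt round.
 */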

/*
 * a  input register containing a
 * b  input register containing b
 * c  input register containing c (already rol $1)
 * d  input register containing d
 * operations on a and b are interleaved to increase performance
 * during the round a and b are prepared for the output whitening
 */
#define decrypt_last_round(a,b,c,d,round)\
	movzx	a ## B, %edi;\
	mov	(%r11,%rdi,4), %r9d;\
	movzx	b ## B, %edi;\
	mov	s3(%r11,%rdi,4), %r8d;\
	movzx	b ## H, %edi;\
	ror	$16, b ## D;\
	xor	(%r11,%rdi,4), %r8d;\
	movzx	a ## H, %edi;\
	mov	b ## D, %r10d;\
	shl	$32, %r10;\
	xor	a, %r10;\
	ror	$16, a ## D;\
	xor	s1(%r11,%rdi,4), %r9d;\
	movzx	b ## B, %edi;\
	xor	s1(%r11,%rdi,4), %r8d;\
	movzx	a ## B, %edi;\
	xor	s2(%r11,%rdi,4), %r9d;\
	movzx	b ## H, %edi;\
	xor	s2(%r11,%rdi,4), %r8d;\
	movzx	a ## H, %edi;\
	xor	s3(%r11,%rdi,4), %r9d;\
	add	%r8d, %r9d;\
	add	%r9d, %r8d;\
	add	k+round(%r11), %r9d;\
	xor	%r9d, c ## D;\
	add	k+4+round(%r11), %r8d;\
	xor	%r8d, d ## D;\
	ror	$1, d ## D;
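
/*
 * Calling convention (as declared by the C glue code in
 * arch/x86/crypto/twofish_glue.c):
 *	twofish_enc_blk/twofish_dec_blk(struct twofish_ctx *ctx,
 *					u8 *dst, const u8 *src)
 * so on entry %rdi = ctx, %rsi = dst and %rdx = src per the x86-64
 * calling convention.
 */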

SYM_FUNC_START(twofish_enc_blk)
	pushq	R1

	/* %rdi contains the ctx address */
	/* %rsi contains the output address */
	/* %rdx contains the input address */
	/* the ctx address is moved to %r11 so that %edi stays free: the
	   high-byte registers read by movzx cannot be encoded in an
	   instruction carrying a REX prefix, so the scratch register
	   must be a non-REX one */
	mov	%rdi, %r11

	movq	(R3), R1
	movq	8(R3), R3
	input_whitening(R1,%r11,a_offset)
	input_whitening(R3,%r11,c_offset)
	mov	R1D, R0D
	rol	$16, R0D
	shr	$32, R1
	mov	R3D, R2D
	shr	$32, R3
	rol	$1, R3D
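
	/* 16 rounds; each consumes two 32-bit subkeys, so the round-key
	   byte offset advances by 8 per round */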
	encrypt_round(R0,R1,R2,R3,0);
	encrypt_round(R2,R3,R0,R1,8);
	encrypt_round(R0,R1,R2,R3,2*8);
	encrypt_round(R2,R3,R0,R1,3*8);
	encrypt_round(R0,R1,R2,R3,4*8);
	encrypt_round(R2,R3,R0,R1,5*8);
	encrypt_round(R0,R1,R2,R3,6*8);
	encrypt_round(R2,R3,R0,R1,7*8);
	encrypt_round(R0,R1,R2,R3,8*8);
	encrypt_round(R2,R3,R0,R1,9*8);
	encrypt_round(R0,R1,R2,R3,10*8);
	encrypt_round(R2,R3,R0,R1,11*8);
	encrypt_round(R0,R1,R2,R3,12*8);
	encrypt_round(R2,R3,R0,R1,13*8);
	encrypt_round(R0,R1,R2,R3,14*8);
	encrypt_last_round(R2,R3,R0,R1,15*8);

	output_whitening(%r10,%r11,a_offset)
	movq	%r10, (%rsi)

	shl	$32, R1
	xor	R0, R1
	output_whitening(R1,%r11,c_offset)
	movq	R1, 8(%rsi)

	popq	R1
	movl	$1, %eax
	RET
SYM_FUNC_END(twofish_enc_blk)

SYM_FUNC_START(twofish_dec_blk)
	pushq	R1

	/* %rdi contains the ctx address */
	/* %rsi contains the output address */
	/* %rdx contains the input address */
	/* as above, the ctx address is moved to %r11 to keep %edi free
	   for the REX-free high-byte operations */
	mov	%rdi, %r11

	movq	(R3), R1
	movq	8(R3), R3
	output_whitening(R1,%r11,a_offset)
	output_whitening(R3,%r11,c_offset)
	mov	R1D, R0D
	shr	$32, R1
	rol	$16, R1D
	mov	R3D, R2D
	shr	$32, R3
	rol	$1, R2D
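
	/* the same 16 rounds applied in reverse order; the subkey byte
	   offset counts down from 15*8 to 0 */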
	decrypt_round(R0,R1,R2,R3,15*8);
	decrypt_round(R2,R3,R0,R1,14*8);
	decrypt_round(R0,R1,R2,R3,13*8);
	decrypt_round(R2,R3,R0,R1,12*8);
	decrypt_round(R0,R1,R2,R3,11*8);
	decrypt_round(R2,R3,R0,R1,10*8);
	decrypt_round(R0,R1,R2,R3,9*8);
	decrypt_round(R2,R3,R0,R1,8*8);
	decrypt_round(R0,R1,R2,R3,7*8);
	decrypt_round(R2,R3,R0,R1,6*8);
	decrypt_round(R0,R1,R2,R3,5*8);
	decrypt_round(R2,R3,R0,R1,4*8);
	decrypt_round(R0,R1,R2,R3,3*8);
	decrypt_round(R2,R3,R0,R1,2*8);
	decrypt_round(R0,R1,R2,R3,1*8);
	decrypt_last_round(R2,R3,R0,R1,0);

	input_whitening(%r10,%r11,a_offset)
	movq	%r10, (%rsi)

	shl	$32, R1
	xor	R0, R1
	input_whitening(R1,%r11,c_offset)
	movq	R1, 8(%rsi)

	popq	R1
	movl	$1, %eax
	RET
SYM_FUNC_END(twofish_dec_blk)