twofish-i586-asm_32.S

/* SPDX-License-Identifier: GPL-2.0-or-later */
/***************************************************************************
 *   Copyright (C) 2006 by Joachim Fritschi, <[email protected]>            *
 *                                                                         *
 ***************************************************************************/

.file "twofish-i586-asm.S"
.text

#include <linux/linkage.h>
#include <asm/asm-offsets.h>

/* return address at 0 */

#define in_blk	12	/* input byte array address parameter */
#define out_blk	8	/* output byte array address parameter */
#define ctx	4	/* Twofish context structure */
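
/*
 * Stack layout note (a sketch of the assumed C-side view, cdecl/32-bit):
 * the likely prototypes are
 *	asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst,
 *					const u8 *src);
 *	asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst,
 *					const u8 *src);
 * so on entry ctx is at 4(%esp), dst at 8(%esp) and src at 12(%esp).
 * The prologues below push four registers, which is why every argument
 * access adds 16 to these offsets (ctx + 16(%esp), in_blk+16(%esp), ...).
 */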
#define a_offset	0
#define b_offset	4
#define c_offset	8
#define d_offset	12

/* Structure of the crypto context struct */

#define s0	0	/* S0 Array 256 Words each */
#define s1	1024	/* S1 Array */
#define s2	2048	/* S2 Array */
#define s3	3072	/* S3 Array */
#define w	4096	/* 8 whitening keys (word) */
#define k	4128	/* key 1-32 ( word ) */
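
/*
 * The s0..k byte offsets above match a context laid out roughly like the
 * following C struct (a sketch, assumed to mirror the kernel's
 * struct twofish_ctx):
 *
 *	struct twofish_ctx {
 *		u32 s[4][256];	- four key-dependent S-box tables, 4096 bytes
 *		u32 w[8];	- whitening subkeys, starting at byte 4096
 *		u32 k[32];	- round subkeys, starting at byte 4128
 *	};
 */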
/* define a few register aliases to allow macro substitution */

#define R0D %eax
#define R0B %al
#define R0H %ah

#define R1D %ebx
#define R1B %bl
#define R1H %bh

#define R2D %ecx
#define R2B %cl
#define R2H %ch

#define R3D %edx
#define R3B %dl
#define R3H %dh
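
/*
 * Example of how these aliases combine with the ## token pasting in the
 * round macros below: with encrypt_round(R0,R1,R2,R3,0), "a ## D" becomes
 * R0D (%eax), "b ## B" becomes R1B (%bl) and "b ## H" becomes R1H (%bh),
 * so one macro body can address the full word, the low byte and the
 * second byte of any of the four block registers.
 */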
/* performs input whitening */
#define input_whitening(src,context,offset)\
	xor	w+offset(context),	src;

/* performs output whitening */
#define output_whitening(src,context,offset)\
	xor	w+16+offset(context),	src;
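
/*
 * Expansion example: input_whitening(%eax,%ebp,a_offset) expands to
 *	xor	w+0(%ebp),	%eax
 * i.e. it xors the first input whitening word (ctx byte offset 4096) into
 * the block word, while output_whitening() uses the second group of four
 * whitening words starting at w+16.
 */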
/*
 * a input register containing a (rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 * operations on a and b are interleaved to increase performance
 */
#define encrypt_round(a,b,c,d,round)\
	push	d ## D;\
	movzx	b ## B,		%edi;\
	mov	s1(%ebp,%edi,4),d ## D;\
	movzx	a ## B,		%edi;\
	mov	s2(%ebp,%edi,4),%esi;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%ebp,%edi,4),d ## D;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%ebp,%edi,4),%esi;\
	movzx	b ## B,		%edi;\
	xor	s3(%ebp,%edi,4),d ## D;\
	movzx	a ## B,		%edi;\
	xor	(%ebp,%edi,4),	%esi;\
	movzx	b ## H,		%edi;\
	ror	$15,		b ## D;\
	xor	(%ebp,%edi,4),	d ## D;\
	movzx	a ## H,		%edi;\
	xor	s1(%ebp,%edi,4),%esi;\
	pop	%edi;\
	add	d ## D,		%esi;\
	add	%esi,		d ## D;\
	add	k+round(%ebp),	%esi;\
	xor	%esi,		c ## D;\
	rol	$15,		c ## D;\
	add	k+4+round(%ebp),d ## D;\
	xor	%edi,		d ## D;
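
/*
 * Rough C-level equivalent of one encrypt_round() (a sketch that ignores
 * the pre-rotated register convention described above):
 *
 *	t0 = g0(a);
 *	t1 = g1(b);			(g1 indexes by the bytes of rol(b, 8))
 *	c  = ror32(c ^ (t0 + t1 + K0), 1);
 *	d  = rol32(d, 1) ^ (t0 + 2*t1 + K1);
 *
 * where K0/K1 are the two round subkeys read from k+round and k+4+round.
 * The trailing ror $15/rol $15 leave a, b, c and d in exactly the rotated
 * form the next encrypt_round() invocation expects.
 */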
/*
 * a input register containing a (rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 * operations on a and b are interleaved to increase performance
 * last round has different rotations for the output preparation
 */
#define encrypt_last_round(a,b,c,d,round)\
	push	d ## D;\
	movzx	b ## B,		%edi;\
	mov	s1(%ebp,%edi,4),d ## D;\
	movzx	a ## B,		%edi;\
	mov	s2(%ebp,%edi,4),%esi;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%ebp,%edi,4),d ## D;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%ebp,%edi,4),%esi;\
	movzx	b ## B,		%edi;\
	xor	s3(%ebp,%edi,4),d ## D;\
	movzx	a ## B,		%edi;\
	xor	(%ebp,%edi,4),	%esi;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%ebp,%edi,4),	d ## D;\
	movzx	a ## H,		%edi;\
	xor	s1(%ebp,%edi,4),%esi;\
	pop	%edi;\
	add	d ## D,		%esi;\
	add	%esi,		d ## D;\
	add	k+round(%ebp),	%esi;\
	xor	%esi,		c ## D;\
	ror	$1,		c ## D;\
	add	k+4+round(%ebp),d ## D;\
	xor	%edi,		d ## D;
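
/*
 * encrypt_last_round() differs from encrypt_round() only in its final
 * rotations: b gets ror $16 instead of ror $15 and c ends with ror $1
 * instead of rol $15, so the last round leaves all words un-pre-rotated,
 * ready for output whitening and the final store.
 */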
/*
 * a input register containing a
 * b input register containing b (rotated 16)
 * c input register containing c (already rol $1)
 * d input register containing d
 * operations on a and b are interleaved to increase performance
 */
#define decrypt_round(a,b,c,d,round)\
	push	c ## D;\
	movzx	a ## B,		%edi;\
	mov	(%ebp,%edi,4),	c ## D;\
	movzx	b ## B,		%edi;\
	mov	s3(%ebp,%edi,4),%esi;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s1(%ebp,%edi,4),c ## D;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%ebp,%edi,4),	%esi;\
	movzx	a ## B,		%edi;\
	xor	s2(%ebp,%edi,4),c ## D;\
	movzx	b ## B,		%edi;\
	xor	s1(%ebp,%edi,4),%esi;\
	movzx	a ## H,		%edi;\
	ror	$15,		a ## D;\
	xor	s3(%ebp,%edi,4),c ## D;\
	movzx	b ## H,		%edi;\
	xor	s2(%ebp,%edi,4),%esi;\
	pop	%edi;\
	add	%esi,		c ## D;\
	add	c ## D,		%esi;\
	add	k+round(%ebp),	c ## D;\
	xor	%edi,		c ## D;\
	add	k+4+round(%ebp),%esi;\
	xor	%esi,		d ## D;\
	rol	$15,		d ## D;
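
/*
 * Rough C-level equivalent of one decrypt_round() (a sketch, again
 * ignoring the pre-rotated register convention):
 *
 *	t0 = g0(a);
 *	t1 = g1(b);
 *	c  = rol32(c, 1) ^ (t0 + t1 + K0);
 *	d  = ror32(d ^ (t0 + 2*t1 + K1), 1);
 *
 * where K0/K1 are the subkeys read from k+round and k+4+round, i.e. the
 * exact inverse of the encryption round; twofish_dec_blk below therefore
 * walks the subkey offsets from 15*8 down to 0.
 */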
/*
 * a input register containing a
 * b input register containing b (rotated 16)
 * c input register containing c (already rol $1)
 * d input register containing d
 * operations on a and b are interleaved to increase performance
 * last round has different rotations for the output preparation
 */
#define decrypt_last_round(a,b,c,d,round)\
	push	c ## D;\
	movzx	a ## B,		%edi;\
	mov	(%ebp,%edi,4),	c ## D;\
	movzx	b ## B,		%edi;\
	mov	s3(%ebp,%edi,4),%esi;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s1(%ebp,%edi,4),c ## D;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%ebp,%edi,4),	%esi;\
	movzx	a ## B,		%edi;\
	xor	s2(%ebp,%edi,4),c ## D;\
	movzx	b ## B,		%edi;\
	xor	s1(%ebp,%edi,4),%esi;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%ebp,%edi,4),c ## D;\
	movzx	b ## H,		%edi;\
	xor	s2(%ebp,%edi,4),%esi;\
	pop	%edi;\
	add	%esi,		c ## D;\
	add	c ## D,		%esi;\
	add	k+round(%ebp),	c ## D;\
	xor	%edi,		c ## D;\
	add	k+4+round(%ebp),%esi;\
	xor	%esi,		d ## D;\
	ror	$1,		d ## D;
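
/*
 * As on the encryption side, decrypt_last_round() changes only the final
 * rotations: a gets ror $16 instead of ror $15 and d ends with ror $1
 * instead of rol $15, leaving all four words in normal form for the final
 * whitening and store.
 */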
SYM_FUNC_START(twofish_enc_blk)
	push	%ebp			/* save registers according to calling convention */
	push	%ebx
	push	%esi
	push	%edi

	mov	ctx + 16(%esp),	%ebp	/* abuse the base pointer: set new base
					 * pointer to the ctx address */
	mov	in_blk+16(%esp),%edi	/* input address in edi */

	mov	(%edi),		%eax
	mov	b_offset(%edi),	%ebx
	mov	c_offset(%edi),	%ecx
	mov	d_offset(%edi),	%edx
	input_whitening(%eax,%ebp,a_offset)
	ror	$16,	%eax
	input_whitening(%ebx,%ebp,b_offset)
	input_whitening(%ecx,%ebp,c_offset)
	input_whitening(%edx,%ebp,d_offset)
	rol	$1,	%edx

	encrypt_round(R0,R1,R2,R3,0);
	encrypt_round(R2,R3,R0,R1,8);
	encrypt_round(R0,R1,R2,R3,2*8);
	encrypt_round(R2,R3,R0,R1,3*8);
	encrypt_round(R0,R1,R2,R3,4*8);
	encrypt_round(R2,R3,R0,R1,5*8);
	encrypt_round(R0,R1,R2,R3,6*8);
	encrypt_round(R2,R3,R0,R1,7*8);
	encrypt_round(R0,R1,R2,R3,8*8);
	encrypt_round(R2,R3,R0,R1,9*8);
	encrypt_round(R0,R1,R2,R3,10*8);
	encrypt_round(R2,R3,R0,R1,11*8);
	encrypt_round(R0,R1,R2,R3,12*8);
	encrypt_round(R2,R3,R0,R1,13*8);
	encrypt_round(R0,R1,R2,R3,14*8);
	encrypt_last_round(R2,R3,R0,R1,15*8);
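
	/* Undo the final swap of the cipher: a/b are whitened with and
	 * stored to the c/d output slots below, while c/d go to the a/b
	 * slots. */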
	output_whitening(%eax,%ebp,c_offset)
	output_whitening(%ebx,%ebp,d_offset)
	output_whitening(%ecx,%ebp,a_offset)
	output_whitening(%edx,%ebp,b_offset)
	mov	out_blk+16(%esp),%edi;
	mov	%eax,	c_offset(%edi)
	mov	%ebx,	d_offset(%edi)
	mov	%ecx,	(%edi)
	mov	%edx,	b_offset(%edi)

	pop	%edi
	pop	%esi
	pop	%ebx
	pop	%ebp
	mov	$1,	%eax
	RET
SYM_FUNC_END(twofish_enc_blk)
SYM_FUNC_START(twofish_dec_blk)
	push	%ebp			/* save registers according to calling convention */
	push	%ebx
	push	%esi
	push	%edi

	mov	ctx + 16(%esp),	%ebp	/* abuse the base pointer: set new base
					 * pointer to the ctx address */
	mov	in_blk+16(%esp),%edi	/* input address in edi */

	mov	(%edi),		%eax
	mov	b_offset(%edi),	%ebx
	mov	c_offset(%edi),	%ecx
	mov	d_offset(%edi),	%edx
	output_whitening(%eax,%ebp,a_offset)
	output_whitening(%ebx,%ebp,b_offset)
	ror	$16,	%ebx
	output_whitening(%ecx,%ebp,c_offset)
	output_whitening(%edx,%ebp,d_offset)
	rol	$1,	%ecx

	decrypt_round(R0,R1,R2,R3,15*8);
	decrypt_round(R2,R3,R0,R1,14*8);
	decrypt_round(R0,R1,R2,R3,13*8);
	decrypt_round(R2,R3,R0,R1,12*8);
	decrypt_round(R0,R1,R2,R3,11*8);
	decrypt_round(R2,R3,R0,R1,10*8);
	decrypt_round(R0,R1,R2,R3,9*8);
	decrypt_round(R2,R3,R0,R1,8*8);
	decrypt_round(R0,R1,R2,R3,7*8);
	decrypt_round(R2,R3,R0,R1,6*8);
	decrypt_round(R0,R1,R2,R3,5*8);
	decrypt_round(R2,R3,R0,R1,4*8);
	decrypt_round(R0,R1,R2,R3,3*8);
	decrypt_round(R2,R3,R0,R1,2*8);
	decrypt_round(R0,R1,R2,R3,1*8);
	decrypt_last_round(R2,R3,R0,R1,0);
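
	/* Decryption mirrors the encryption epilogue: the output whitening
	 * keys were applied on input above, the input whitening keys are
	 * applied here, and the stores use the same swapped slot layout. */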
	input_whitening(%eax,%ebp,c_offset)
	input_whitening(%ebx,%ebp,d_offset)
	input_whitening(%ecx,%ebp,a_offset)
	input_whitening(%edx,%ebp,b_offset)
	mov	out_blk+16(%esp),%edi;
	mov	%eax,	c_offset(%edi)
	mov	%ebx,	d_offset(%edi)
	mov	%ecx,	(%edi)
	mov	%edx,	b_offset(%edi)

	pop	%edi
	pop	%esi
	pop	%ebx
	pop	%ebp
	mov	$1,	%eax
	RET
SYM_FUNC_END(twofish_dec_blk)