/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Blowfish Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2011 Jussi Kivilinna <[email protected]>
 */
  /* Kernel headers, then everything below is emitted into .text. */
#include <linux/linkage.h>
#include <linux/cfi_types.h>

.file "blowfish-x86_64-asm.S"
.text
  /*
   * Structure of the crypto context, as byte offsets from CTX:
   *   p        array of (16 + 2) 32-bit words
   *   s0..s3   four tables of 256 32-bit words each, laid out back to
   *            back directly after p
   * Each table offset is derived from the previous one.
   */
#define p	0
#define s0	((16 + 2) * 4)
#define s1	(s0 + (256 * 4))
#define s2	(s1 + (256 * 4))
#define s3	(s2 + (256 * 4))
  /*
   * Register macros.
   *
   * Note the aliasing: RIO and RT1 are both %rsi, so any code that uses
   * RT1 as a temporary destroys the I/O pointer and has to reload RIO
   * before the next memory access through it.  RX2/RX3 are %rcx/%rdx,
   * i.e. the registers in which the 4th and 3rd arguments arrive.
   */

/* context pointer, I/O pointer, preloaded round key */
#define CTX	%r12
#define RIO	%rsi
#define RKEY	%r10

/* data block registers: 64-bit, 32-bit, low byte, second byte */
#define RX0	%rax
#define RX0d	%eax
#define RX0bl	%al
#define RX0bh	%ah

#define RX1	%rbx
#define RX1d	%ebx
#define RX1bl	%bl
#define RX1bh	%bh

#define RX2	%rcx
#define RX2d	%ecx
#define RX2bl	%cl
#define RX2bh	%ch

#define RX3	%rdx
#define RX3d	%edx
#define RX3bl	%dl
#define RX3bh	%dh

/* temporaries: 64-bit and 32-bit views */
#define RT0	%rdi
#define RT0d	%edi

#define RT1	%rsi
#define RT1d	%esi

#define RT2	%r8
#define RT2d	%r8d

#define RT3	%r9
#define RT3d	%r9d
  /***********************************************************************
   * 1-way blowfish
   ***********************************************************************/

/*
 * F(): one round step on the single block held in RX0.
 *
 * With a, b, c, d being the bytes of the low 32 bits of RX0 (a = most
 * significant), this computes
 *	f = ((s0[a] + s1[b]) ^ s2[c]) + s3[d]
 * swaps the two 32-bit halves of RX0 and XORs f into the new low half
 * (RT0d is written by a 32-bit move, so the upper half is left alone).
 *
 * Clobbers RT0, RT1, RT2 and the flags.  RT1 is %rsi, the same register
 * as RIO, so RIO is not valid after F().
 */
#define F() \
	rorq $16,		RX0;	/* expose bytes a and b */ \
	movzbl RX0bh,		RT0d; \
	movzbl RX0bl,		RT1d; \
	rolq $16,		RX0;	/* back: expose bytes c and d */ \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT1,4),	RT0d; \
	movzbl RX0bh,		RT1d; \
	movzbl RX0bl,		RT2d; \
	rolq $32,		RX0;	/* swap the 32-bit halves */ \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT2,4),	RT0d; \
	xorq RT0,		RX0;
  /*
   * add_roundkey_enc(n): XOR the 64 bits stored at p + 4*n (entries n and
   * n + 1 of the p array) into RX0.
   */
#define add_roundkey_enc(n) \
	xorq p+4*(n)(CTX),	RX0;

/*
 * round_enc(n): key addition with p entries n and n + 1, followed by two
 * F() steps.
 */
#define round_enc(n) \
	add_roundkey_enc(n); \
	F(); \
	F();
  /*
   * add_roundkey_dec(n): load the 64 bits at p + 4*(n-1) (entries n - 1
   * and n of the p array), swap the two 32-bit halves so that entry n ends
   * up in the low half, and XOR the result into RX0.  Clobbers RT0.
   */
#define add_roundkey_dec(n) \
	movq p+4*(n-1)(CTX),	RT0; \
	rorq $32,		RT0; \
	xorq RT0,		RX0;
  /*
   * round_dec(n): key addition with p entries n and n - 1 (see
   * add_roundkey_dec), followed by two F() steps.
   *
   * The last line must not end in a line continuation: a trailing
   * backslash splices whatever line follows into this macro, which would
   * swallow the next #define.
   */
#define round_dec(n) \
	add_roundkey_dec(n); \
	\
	F(); \
	F();
  /*
   * read_block(): load the 8 bytes at (RIO) into RX0 so that the low half
   * holds the first four bytes read as a big-endian 32-bit word and the
   * high half holds the following four bytes, likewise big-endian.
   */
#define read_block() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0;

/*
 * write_block(): byte-reverse RX0 and store it at (RIO).  The high half
 * of RX0 therefore lands in the first four bytes, most significant byte
 * first, followed by the low half.
 */
#define write_block() \
	bswapq			RX0; \
	movq RX0,		(RIO);

/*
 * xor_block(): same byte order as write_block(), but the value is XORed
 * into the 8 bytes already at (RIO) instead of overwriting them.
 */
#define xor_block() \
	bswapq			RX0; \
	xorq RX0,		(RIO);
  /*
   * __blowfish_enc_blk: encrypt one 8-byte block.
   *
   * input:
   *	%rdi: ctx
   *	%rsi: dst
   *	%rdx: src
   *	%rcx: bool, if true: xor output
   *
   * The result is stored to dst, or XORed into the 8 bytes at dst when
   * %cl is non-zero.  %r12 is used as CTX; the incoming value is kept in
   * %r11 and put back before returning.
   */
SYM_FUNC_START(__blowfish_enc_blk)
	/* F() uses %rsi as a temporary: park dst in %r10, read via src */
	movq %rsi, %r10;
	movq %rdx, RIO;

	/* free up %r12 for the context pointer */
	movq %r12, %r11;
	movq %rdi, CTX;

	read_block();

	round_enc(0);
	round_enc(2);
	round_enc(4);
	round_enc(6);
	round_enc(8);
	round_enc(10);
	round_enc(12);
	round_enc(14);
	add_roundkey_enc(16);

	/* rounds are done: point RIO at dst and hand %r12 back */
	movq %r10, RIO;
	movq %r11, %r12;

	testb %cl, %cl;
	jnz .L__enc_xor;

	write_block();
	RET;

.L__enc_xor:
	xor_block();
	RET;
SYM_FUNC_END(__blowfish_enc_blk)
  /*
   * blowfish_dec_blk: decrypt one 8-byte block and store it to dst.
   *
   * input:
   *	%rdi: ctx
   *	%rsi: dst
   *	%rdx: src
   *
   * %r12 is used as CTX; the incoming value is kept in %r11 and put back
   * before returning.
   */
SYM_TYPED_FUNC_START(blowfish_dec_blk)
	/* F() uses %rsi as a temporary: park dst in %r10, read via src */
	movq %rsi, %r10;
	movq %rdx, RIO;

	/* free up %r12 for the context pointer */
	movq %r12, %r11;
	movq %rdi, CTX;

	read_block();

	/* the p array is walked from its end towards its start */
	round_dec(17);
	round_dec(15);
	round_dec(13);
	round_dec(11);
	round_dec(9);
	round_dec(7);
	round_dec(5);
	round_dec(3);
	add_roundkey_dec(1);

	/* CTX is no longer needed: hand %r12 back, then store to dst */
	movq %r11, %r12;
	movq %r10, RIO;
	write_block();

	RET;
SYM_FUNC_END(blowfish_dec_blk)
  /**********************************************************************
    4-way blowfish, four blocks parallel
   **********************************************************************/

/*
 * F4(x): the round step for block register x (one of RX0..RX3; the
 * byte views are formed by pasting "bl"/"bh" onto the macro name).
 *
 * F() for 4-way. Slower when used alone/1-way, but faster when used
 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
 *
 * With a, b, c, d being the bytes of the low 32 bits of x (a = most
 * significant), this computes ((s0[a] + s1[b]) ^ s2[c]) + s3[d]; the two
 * 16-bit rotates add up to a swap of the 32-bit halves of x, and the
 * result is XORed into the new low half.
 *
 * Clobbers RT0..RT3 and the flags (RT1 is %rsi, i.e. RIO).
 */
#define F4(x) \
	movzbl x ## bh,		RT1d;	/* byte c */ \
	movzbl x ## bl,		RT3d;	/* byte d */ \
	rorq $16,		x; \
	movzbl x ## bh,		RT0d;	/* byte a */ \
	movzbl x ## bl,		RT2d;	/* byte b */ \
	rorq $16,		x; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT2,4),	RT0d; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT3,4),	RT0d; \
	xorq RT0,		x;
  /* add_preloaded_roundkey4(): XOR the key held in RKEY into all four blocks. */
#define add_preloaded_roundkey4() \
	xorq RKEY,		RX0; \
	xorq RKEY,		RX1; \
	xorq RKEY,		RX2; \
	xorq RKEY,		RX3;

/*
 * preload_roundkey_enc(n): fetch the 64 bits at p + 4*n (entries n and
 * n + 1 of the p array) into RKEY.
 */
#define preload_roundkey_enc(n) \
	movq p+4*(n)(CTX),	RKEY;

/*
 * add_roundkey_enc4(n): apply the key that is already in RKEY, then fetch
 * the one for n + 2 so it is ready for the next key addition.
 */
#define add_roundkey_enc4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_enc(n + 2);

/*
 * round_enc4(n): key addition on all four blocks followed by two passes
 * of F4() over RX0..RX3.
 */
#define round_enc4(n) \
	add_roundkey_enc4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);
  /*
   * preload_roundkey_dec(n): fetch the 64 bits at p + 4*(n-1) (entries
   * n - 1 and n of the p array) into RKEY and swap the two 32-bit halves,
   * leaving entry n in the low half.
   */
#define preload_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RKEY; \
	rorq $32,		RKEY;

/*
 * add_roundkey_dec4(n): apply the key that is already in RKEY to all four
 * blocks, then fetch the one for n - 2 for the next key addition.
 */
#define add_roundkey_dec4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_dec(n - 2);

/*
 * round_dec4(n): key addition on all four blocks followed by two passes
 * of F4() over RX0..RX3.
 */
#define round_dec4(n) \
	add_roundkey_dec4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);
  /*
   * read_block4(): load four consecutive 8-byte blocks from (RIO) at
   * offsets 0, 8, 16 and 24 into RX0..RX3.  Each block is loaded, has its
   * 32-bit halves swapped and is then byte-reversed, so every 32-bit word
   * ends up as the big-endian reading of its four input bytes, with the
   * first word of a block in the low half.
   *
   * RX3 is %rdx and RX2 is %rcx, so both are overwritten here.
   */
#define read_block4() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0; \
	\
	movq 8(RIO),		RX1; \
	rorq $32,		RX1; \
	bswapq			RX1; \
	\
	movq 16(RIO),		RX2; \
	rorq $32,		RX2; \
	bswapq			RX2; \
	\
	movq 24(RIO),		RX3; \
	rorq $32,		RX3; \
	bswapq			RX3;

/*
 * write_block4(): byte-reverse RX0..RX3 and store them to (RIO) at
 * offsets 0, 8, 16 and 24.
 */
#define write_block4() \
	bswapq			RX0; \
	movq RX0,		(RIO); \
	\
	bswapq			RX1; \
	movq RX1,		8(RIO); \
	\
	bswapq			RX2; \
	movq RX2,		16(RIO); \
	\
	bswapq			RX3; \
	movq RX3,		24(RIO);

/*
 * xor_block4(): like write_block4(), but each value is XORed into the
 * 8 bytes already present at its destination.
 */
#define xor_block4() \
	bswapq			RX0; \
	xorq RX0,		(RIO); \
	\
	bswapq			RX1; \
	xorq RX1,		8(RIO); \
	\
	bswapq			RX2; \
	xorq RX2,		16(RIO); \
	\
	bswapq			RX3; \
	xorq RX3,		24(RIO);
  /*
   * __blowfish_enc_blk_4way: encrypt four consecutive 8-byte blocks.
   *
   * input:
   *	%rdi: ctx
   *	%rsi: dst
   *	%rdx: src
   *	%rcx: bool, if true: xor output
   *
   * The results are stored to dst, or XORed into the 32 bytes at dst when
   * the bool is non-zero.  %r12 (CTX) and %rbx (RX1) are saved on the
   * stack.  %rcx doubles as RX2, so the xor flag is pushed as well and
   * popped into %r12 once CTX is no longer needed.
   */
SYM_FUNC_START(__blowfish_enc_blk_4way)
	pushq %r12;
	pushq %rbx;
	pushq %rcx;

	/* %rsi is a temporary of F4() and %r10 is RKEY: park dst in %r11 */
	movq %rsi, %r11;
	movq %rdx, RIO;
	movq %rdi, CTX;

	preload_roundkey_enc(0);

	read_block4();

	round_enc4(0);
	round_enc4(2);
	round_enc4(4);
	round_enc4(6);
	round_enc4(8);
	round_enc4(10);
	round_enc4(12);
	round_enc4(14);
	add_preloaded_roundkey4();

	/* point RIO at dst and fetch the saved xor flag into %r12 */
	movq %r11, RIO;
	popq %r12;

	testb %r12b, %r12b;
	jnz .L__enc_xor4;

	write_block4();

	popq %rbx;
	popq %r12;
	RET;

.L__enc_xor4:
	xor_block4();

	popq %rbx;
	popq %r12;
	RET;
SYM_FUNC_END(__blowfish_enc_blk_4way)
  /*
   * blowfish_dec_blk_4way: decrypt four consecutive 8-byte blocks and
   * store them to dst.
   *
   * input:
   *	%rdi: ctx
   *	%rsi: dst
   *	%rdx: src
   *
   * %r12 (CTX) and %rbx (RX1) are saved on the stack and restored before
   * returning.
   */
SYM_TYPED_FUNC_START(blowfish_dec_blk_4way)
	pushq %r12;
	pushq %rbx;

	/* %rsi is a temporary of F4() and %r10 is RKEY: park dst in %r11 */
	movq %rsi, %r11;
	movq %rdx, RIO;
	movq %rdi, CTX;

	preload_roundkey_dec(17);
	read_block4();

	/* the p array is walked from its end towards its start */
	round_dec4(17);
	round_dec4(15);
	round_dec4(13);
	round_dec4(11);
	round_dec4(9);
	round_dec4(7);
	round_dec4(5);
	round_dec4(3);
	add_preloaded_roundkey4();

	movq %r11, RIO;
	write_block4();

	popq %rbx;
	popq %r12;

	RET;
SYM_FUNC_END(blowfish_dec_blk_4way)
  306. SYM_FUNC_END(blowfish_dec_blk_4way)