  /* SPDX-License-Identifier: GPL-2.0-or-later */
  /*
   * Camellia Cipher Algorithm (x86_64)
   *
   * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
   */
  #include <linux/linkage.h>

  .file "camellia-x86_64-asm_64.S"
  .text

  /*
   * Lookup tables defined outside this file.  The round macros index
   * them with a byte value scaled by 8 and XOR the loaded quadword into
   * the cipher state (see xor2ror16).
   */
  .extern camellia_sp10011110;
  .extern camellia_sp22000222;
  .extern camellia_sp03303033;
  .extern camellia_sp00444404;
  .extern camellia_sp02220222;
  .extern camellia_sp30333033;
  .extern camellia_sp44044404;
  .extern camellia_sp11101110;

  /* Short aliases for the table symbols, used by the round macros. */
  #define sp10011110 camellia_sp10011110
  #define sp22000222 camellia_sp22000222
  #define sp03303033 camellia_sp03303033
  #define sp00444404 camellia_sp00444404
  #define sp02220222 camellia_sp02220222
  #define sp30333033 camellia_sp30333033
  #define sp44044404 camellia_sp44044404
  #define sp11101110 camellia_sp11101110
  /*
   * Byte offsets into the context that CTX points at:
   *   key_table  - starts at offset 0; the code below reads it as an
   *                array of 8-byte entries
   *   key_length - located right after the CAMELLIA_TABLE_BYTE_LEN bytes
   *                of key table
   */
  #define CAMELLIA_TABLE_BYTE_LEN 272

  /* struct camellia_ctx: */
  #define key_table 0
  #define key_length CAMELLIA_TABLE_BYTE_LEN
  /*
   * register macros
   *
   * Suffix convention relied on by the token-pasting macros below:
   *   <name>d  - 32-bit view     <name>bl - bits 0..7
   *                              <name>bh - bits 8..15
   *
   * CTX is the context pointer, RIO the current input/output pointer.
   */
  #define CTX %rdi
  #define RIO %rsi
  #define RIOd %esi

  /*
   * Cipher state.  The "0" registers hold the first block; the "1"
   * registers hold the second block and are only used by the 2-way code.
   * All four have bl/bh byte views, which xor2ror16 requires.
   * Note that RAB1 is %rbx, which the 2-way functions save and restore.
   */
  #define RAB0 %rax
  #define RCD0 %rcx
  #define RAB1 %rbx
  #define RCD1 %rdx

  #define RAB0d %eax
  #define RCD0d %ecx
  #define RAB1d %ebx
  #define RCD1d %edx

  #define RAB0bl %al
  #define RCD0bl %cl
  #define RAB1bl %bl
  #define RCD1bl %dl

  #define RAB0bh %ah
  #define RCD0bh %ch
  #define RAB1bh %bh
  #define RCD1bh %dh

  /*
   * Temporaries.  RT0 shares %rsi with RIO, so RIO is dead as soon as a
   * round or fls macro has run and must be reloaded (from RDST) before
   * the output is written.  RT1 is %r12; every function parks the
   * incoming %r12 in RR12 and restores it before returning.
   */
  #define RT0 %rsi
  #define RT1 %r12
  #define RT2 %r8

  #define RT0d %esi
  #define RT1d %r12d
  #define RT2d %r8d

  #define RT2bl %r8b

  /*
   * RXOR - copy of the "xor" argument in the encrypt functions; used as
   *        plain scratch / %rbx save slot by the decrypt functions
   * RR12 - saved %r12
   * RDST - saved destination pointer
   */
  #define RXOR %r9
  #define RR12 %r10
  #define RDST %r11

  #define RXORd %r9d
  #define RXORbl %r9b
  /*
   * xor2ror16(T0, T1, tmp1, tmp2, ab, dst)
   *
   * Use the two lowest bytes of `ab` as indices into two tables of
   * 8-byte entries, fold both entries into `dst`, and rotate `ab` so the
   * next two bytes become the lowest ones:
   *
   *	dst ^= T0[ab & 0xff] ^ T1[(ab >> 8) & 0xff];
   *	ab   = ror64(ab, 16);
   *
   * `ab` must name a register that has bl and bh aliases; `tmp1` and
   * `tmp2` must have d (32-bit) aliases.
   * Clobbers: tmp1, tmp2, flags.
   */
  #define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
          movzbl ab ## bl, tmp2 ## d; \
          movzbl ab ## bh, tmp1 ## d; \
          rorq $16, ab; \
          xorq T0(, tmp2, 8), dst; \
          xorq T1(, tmp1, 8), dst;
  /**********************************************************************
    1-way camellia
   **********************************************************************/

  /*
   * roundsm(ab, subkey, cd) - one round on block 0.
   *
   * Loads the 8-byte key_table entry `subkey` into RT2, then walks all
   * eight bytes of ab ## 0 through the lookup tables two at a time.  The
   * table values are accumulated alternately in cd ## 0 and RT2, and RT2
   * is folded into cd ## 0 at the end, so the net effect is
   *
   *	cd0 ^= key_table[subkey] ^ (XOR of the eight table lookups)
   *
   * Four 16-bit rotations add up to 64, so ab ## 0 holds its original
   * value again when the macro finishes.
   * Clobbers: RT0, RT1, RT2, flags.
   */
  #define roundsm(ab, subkey, cd) \
          movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
          \
          xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
          xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
          xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
          xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
          \
          xorq RT2, cd ## 0;
  /*
   * fls(l, r, kl, kr) - mixing step applied to block 0 between groups of
   * rounds (presumably Camellia's FL / FL^-1 layer; that cannot be
   * confirmed from this file alone).
   *
   * With K(n) the 8-byte key_table entry n and lo()/hi() the low/high
   * 32 bits of a quadword, the macro performs, in this order:
   *
   *	hi(l) ^= rol32(lo(l) & lo(K(kl)), 1);
   *	lo(r) ^= hi(r) | hi(K(kr));
   *	lo(l) ^= hi(l) | hi(K(kl));
   *	hi(r) ^= rol32(lo(r) & lo(K(kr)), 1);
   *
   * Every step sees the halves as already updated by the steps before it.
   * Clobbers: RT0, RT1, RT2, flags.
   */
  #define fls(l, r, kl, kr) \
          movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
          andl l ## 0d, RT0d; \
          roll $1, RT0d; \
          shlq $32, RT0; \
          xorq RT0, l ## 0; \
          movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
          orq r ## 0, RT1; \
          shrq $32, RT1; \
          xorq RT1, r ## 0; \
          \
          movq (key_table + ((kl) * 2) * 4)(CTX), RT2; \
          orq l ## 0, RT2; \
          shrq $32, RT2; \
          xorq RT2, l ## 0; \
          movl (key_table + ((kr) * 2) * 4)(CTX), RT0d; \
          andl r ## 0d, RT0d; \
          roll $1, RT0d; \
          shlq $32, RT0; \
          xorq RT0, r ## 0;
  /*
   * enc_rounds(i) - six consecutive rounds on block 0 in encryption
   * order, using key_table entries i+2 .. i+7.  RAB and RCD swap roles
   * on every round.
   */
  #define enc_rounds(i) \
          roundsm(RAB, i + 2, RCD); \
          roundsm(RCD, i + 3, RAB); \
          roundsm(RAB, i + 4, RCD); \
          roundsm(RCD, i + 5, RAB); \
          roundsm(RAB, i + 6, RCD); \
          roundsm(RCD, i + 7, RAB);
  /*
   * enc_fls(i) - fls step on block 0 for encryption: key_table entry i
   * is applied to RAB (kl) and entry i+1 to RCD (kr).
   */
  #define enc_fls(i) \
          fls(RAB, RCD, i + 0, i + 1);
  /*
   * enc_inpack() - load one 16-byte block from RIO into RAB0 / RCD0.
   *
   * Each 8-byte half is byte-swapped and then rotated by 32 bits, which
   * exchanges its two 32-bit words (rol and ror are equivalent for a
   * 32-bit rotation of a quadword).  RAB0 is finally XORed with
   * key_table entry 0.
   * Clobbers: RAB0, RCD0, flags.
   */
  #define enc_inpack() \
          movq (RIO), RAB0; \
          bswapq RAB0; \
          rolq $32, RAB0; \
          movq 4*2(RIO), RCD0; \
          bswapq RCD0; \
          rorq $32, RCD0; \
          xorq key_table(CTX), RAB0;
  /*
   * enc_outunpack(op, max) - finish encryption of block 0 and write it
   * to RIO.
   *
   *   op  - mnemonic stem pasted with "q" to form the store instruction;
   *         the callers in this file pass mov (overwrite the
   *         destination) or xor (XOR into what is already there)
   *   max - register holding the index of the key_table entry that is
   *         XORed into RCD0 first
   *
   * The input packing is reversed (rotate by 32, byte swap) and the
   * halves are written swapped: RCD0 to bytes 0..7, RAB0 to bytes 8..15.
   * Clobbers: RAB0, RCD0, flags.
   */
  #define enc_outunpack(op, max) \
          xorq key_table(CTX, max, 8), RCD0; \
          rorq $32, RCD0; \
          bswapq RCD0; \
          op ## q RCD0, (RIO); \
          rolq $32, RAB0; \
          bswapq RAB0; \
          op ## q RAB0, 4*2(RIO);
  /*
   * dec_rounds(i) - six consecutive rounds on block 0 in decryption
   * order: the same key_table entries as enc_rounds(i), consumed from
   * i+7 down to i+2.
   */
  #define dec_rounds(i) \
          roundsm(RAB, i + 7, RCD); \
          roundsm(RCD, i + 6, RAB); \
          roundsm(RAB, i + 5, RCD); \
          roundsm(RCD, i + 4, RAB); \
          roundsm(RAB, i + 3, RCD); \
          roundsm(RCD, i + 2, RAB);
  /*
   * dec_fls(i) - fls step on block 0 for decryption: the key pair is
   * swapped relative to enc_fls, i.e. entry i+1 is applied to RAB (kl)
   * and entry i to RCD (kr).
   */
  #define dec_fls(i) \
          fls(RAB, RCD, i + 1, i + 0);
  /*
   * dec_inpack(max) - load one 16-byte block from RIO into RAB0 / RCD0
   * for decryption.
   *
   * Same byte swap and 32-bit word exchange as enc_inpack, but RAB0 is
   * XORed with key_table entry `max` (a register holding the entry
   * index) instead of entry 0.  `max` itself is not modified.
   * Clobbers: RAB0, RCD0, flags.
   */
  #define dec_inpack(max) \
          movq (RIO), RAB0; \
          bswapq RAB0; \
          rolq $32, RAB0; \
          movq 4*2(RIO), RCD0; \
          bswapq RCD0; \
          rorq $32, RCD0; \
          xorq key_table(CTX, max, 8), RAB0;
  /*
   * dec_outunpack() - finish decryption of block 0 and store it at RIO.
   *
   * RCD0 is XORed with key_table entry 0, the input packing is reversed
   * (rotate by 32, byte swap) and the halves are stored swapped: RCD0 to
   * bytes 0..7, RAB0 to bytes 8..15.  The destination is always
   * overwritten; there is no xor variant.
   * Clobbers: RAB0, RCD0, flags.
   */
  #define dec_outunpack() \
          xorq key_table(CTX), RCD0; \
          rorq $32, RCD0; \
          bswapq RCD0; \
          movq RCD0, (RIO); \
          rolq $32, RAB0; \
          bswapq RAB0; \
          movq RAB0, 4*2(RIO);
  /*
   * __camellia_enc_blk - encrypt one 16-byte block.
   *
   * Runs 18 rounds when key_length is 16 and 24 rounds otherwise.  The
   * result is either stored to dst or XORed into dst, depending on the
   * "xor" argument.
   *
   * %r12 is used as a temporary (RT1); it is parked in RR12 on entry and
   * restored on both exit paths.
   * Clobbers: %rax, %rcx, %rsi, %r8, %r9, %r10, %r11, flags.
   */
  SYM_FUNC_START(__camellia_enc_blk)
          /* input:
           *	%rdi: ctx, CTX
           *	%rsi: dst
           *	%rdx: src
           *	%rcx: bool xor
           */
          movq %r12, RR12;

          /* %rcx doubles as RCD0 and %rsi as RIO/RT0: move both out of the way */
          movq %rcx, RXOR;
          movq %rsi, RDST;
          movq %rdx, RIO;

          enc_inpack();

          enc_rounds(0);
          enc_fls(8);
          enc_rounds(8);
          enc_fls(16);
          enc_rounds(16);
          movl $24, RT1d; /* max */

          /*
           * NOTE(review): only the low byte of key_length is compared here,
           * while camellia_dec_blk uses a 32-bit cmpl on the same field.
           * Equivalent as long as key_length fits in one byte - confirm.
           */
          cmpb $16, key_length(CTX);
          je .L__enc_done;

          enc_fls(24);
          enc_rounds(24);
          movl $32, RT1d; /* max */

  .L__enc_done:
          testb RXORbl, RXORbl;
          movq RDST, RIO; /* RIO was clobbered as RT0; mov leaves flags alone */

          jnz .L__enc_xor;

          enc_outunpack(mov, RT1);

          movq RR12, %r12;
          RET;

  .L__enc_xor:
          enc_outunpack(xor, RT1);

          movq RR12, %r12;
          RET;
  SYM_FUNC_END(__camellia_enc_blk)
  /*
   * camellia_dec_blk - decrypt one 16-byte block from src into dst.
   *
   * "max" (RT2) is the index of the last key_table entry: 24 when
   * key_length is 16, otherwise 32.  With max == 24 the six rounds and
   * the fls step that use entries 24..31 are skipped.
   *
   * %r12 is used as a temporary (RT1); it is parked in RR12 on entry and
   * restored before returning.
   * Clobbers: %rax, %rcx, %rsi, %r8, %r9, %r10, %r11, flags.
   */
  SYM_FUNC_START(camellia_dec_blk)
          /* input:
           *	%rdi: ctx, CTX
           *	%rsi: dst
           *	%rdx: src
           */
          cmpl $16, key_length(CTX);
          movl $32, RT2d;
          movl $24, RXORd; /* RXOR is only scratch here; mov keeps the cmpl flags */
          cmovel RXORd, RT2d; /* max */

          movq %r12, RR12;
          movq %rsi, RDST;
          movq %rdx, RIO;

          dec_inpack(RT2);

          /* RT2 is overwritten by the first round, so test max right away */
          cmpb $24, RT2bl;
          je .L__dec_rounds16;

          dec_rounds(24);
          dec_fls(24);

  .L__dec_rounds16:
          dec_rounds(16);
          dec_fls(16);
          dec_rounds(8);
          dec_fls(8);
          dec_rounds(0);

          movq RDST, RIO; /* RIO was clobbered as RT0 by the rounds */

          dec_outunpack();

          movq RR12, %r12;
          RET;
  SYM_FUNC_END(camellia_dec_blk)
  /**********************************************************************
    2-way camellia
   **********************************************************************/

  /*
   * roundsm2(ab, subkey, cd) - one round on two blocks, interleaved.
   *
   * For each block n (0 and 1) the net effect is
   *
   *	cd_n ^= key_table[subkey] ^ (XOR of eight table lookups on ab_n)
   *
   * Block 1 gets the subkey XORed into cd ## 1 up front and accumulates
   * all of its lookups directly in cd ## 1.  Block 0 accumulates half of
   * its lookups in RT2 (on top of the subkey); RT2 is folded into
   * cd ## 0 after the first block-1 lookup pair has been issued.
   *
   * ab ## 0 and ab ## 1 are rotated by 4 x 16 bits each and therefore
   * hold their original values again afterwards.
   * Clobbers: RT0, RT1, RT2, flags.
   */
  #define roundsm2(ab, subkey, cd) \
          movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
          xorq RT2, cd ## 1; \
          \
          xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
          xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
          xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
          xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
          \
          xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
          xorq RT2, cd ## 0; \
          xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
          xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
          xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);
  /*
   * fls2(l, r, kl, kr) - the fls mixing step applied to two blocks.
   *
   * With K(n) the 8-byte key_table entry n and lo()/hi() the low/high
   * 32 bits of a quadword, each block n (0 and 1) goes through
   *
   *	hi(l_n) ^= rol32(lo(l_n) & lo(K(kl)), 1);
   *	lo(r_n) ^= hi(r_n) | hi(K(kr));
   *	lo(l_n) ^= hi(l_n) | hi(K(kl));
   *	hi(r_n) ^= rol32(lo(r_n) & lo(K(kr)), 1);
   *
   * The first two steps are done for block 0, then for block 1, followed
   * by the last two steps for block 0 and then block 1.  The temporaries
   * RT0/RT1/RT2 are used in rotation.
   * Clobbers: RT0, RT1, RT2, flags.
   */
  #define fls2(l, r, kl, kr) \
          movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
          andl l ## 0d, RT0d; \
          roll $1, RT0d; \
          shlq $32, RT0; \
          xorq RT0, l ## 0; \
          movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
          orq r ## 0, RT1; \
          shrq $32, RT1; \
          xorq RT1, r ## 0; \
          \
          movl (key_table + ((kl) * 2) * 4)(CTX), RT2d; \
          andl l ## 1d, RT2d; \
          roll $1, RT2d; \
          shlq $32, RT2; \
          xorq RT2, l ## 1; \
          movq (key_table + ((kr) * 2) * 4)(CTX), RT0; \
          orq r ## 1, RT0; \
          shrq $32, RT0; \
          xorq RT0, r ## 1; \
          \
          movq (key_table + ((kl) * 2) * 4)(CTX), RT1; \
          orq l ## 0, RT1; \
          shrq $32, RT1; \
          xorq RT1, l ## 0; \
          movl (key_table + ((kr) * 2) * 4)(CTX), RT2d; \
          andl r ## 0d, RT2d; \
          roll $1, RT2d; \
          shlq $32, RT2; \
          xorq RT2, r ## 0; \
          \
          movq (key_table + ((kl) * 2) * 4)(CTX), RT0; \
          orq l ## 1, RT0; \
          shrq $32, RT0; \
          xorq RT0, l ## 1; \
          movl (key_table + ((kr) * 2) * 4)(CTX), RT1d; \
          andl r ## 1d, RT1d; \
          roll $1, RT1d; \
          shlq $32, RT1; \
          xorq RT1, r ## 1;
  /*
   * enc_rounds2(i) - six consecutive 2-way rounds in encryption order,
   * using key_table entries i+2 .. i+7.  RAB and RCD swap roles on every
   * round.
   */
  #define enc_rounds2(i) \
          roundsm2(RAB, i + 2, RCD); \
          roundsm2(RCD, i + 3, RAB); \
          roundsm2(RAB, i + 4, RCD); \
          roundsm2(RCD, i + 5, RAB); \
          roundsm2(RAB, i + 6, RCD); \
          roundsm2(RCD, i + 7, RAB);
  /*
   * enc_fls2(i) - 2-way fls step for encryption: key_table entry i is
   * applied to RAB (kl) and entry i+1 to RCD (kr).
   */
  #define enc_fls2(i) \
          fls2(RAB, RCD, i + 0, i + 1);
  /*
   * enc_inpack2() - load two consecutive 16-byte blocks from RIO.
   *
   * Block 0 (bytes 0..15) goes to RAB0 / RCD0, block 1 (bytes 16..31) to
   * RAB1 / RCD1.  Each 8-byte half is byte-swapped and rotated by 32
   * bits, and both RAB registers are XORed with key_table entry 0.
   * Clobbers: RAB0, RCD0, RAB1, RCD1, flags.
   */
  #define enc_inpack2() \
          movq (RIO), RAB0; \
          bswapq RAB0; \
          rorq $32, RAB0; \
          movq 4*2(RIO), RCD0; \
          bswapq RCD0; \
          rolq $32, RCD0; \
          xorq key_table(CTX), RAB0; \
          \
          movq 8*2(RIO), RAB1; \
          bswapq RAB1; \
          rorq $32, RAB1; \
          movq 12*2(RIO), RCD1; \
          bswapq RCD1; \
          rolq $32, RCD1; \
          xorq key_table(CTX), RAB1;
  /*
   * enc_outunpack2(op, max) - finish encryption of both blocks and write
   * them to RIO.
   *
   *   op  - mnemonic stem pasted with "q" to form the store instruction;
   *         the callers in this file pass mov (overwrite) or xor (XOR
   *         into the existing destination contents)
   *   max - register holding the index of the key_table entry that is
   *         XORed into RCD0 and RCD1 first
   *
   * Per block the halves are written swapped: RCD to the first 8 bytes,
   * RAB to the second.  Block 0 occupies bytes 0..15, block 1 bytes
   * 16..31.
   * Clobbers: RAB0, RCD0, RAB1, RCD1, flags.
   */
  #define enc_outunpack2(op, max) \
          xorq key_table(CTX, max, 8), RCD0; \
          rolq $32, RCD0; \
          bswapq RCD0; \
          op ## q RCD0, (RIO); \
          rorq $32, RAB0; \
          bswapq RAB0; \
          op ## q RAB0, 4*2(RIO); \
          \
          xorq key_table(CTX, max, 8), RCD1; \
          rolq $32, RCD1; \
          bswapq RCD1; \
          op ## q RCD1, 8*2(RIO); \
          rorq $32, RAB1; \
          bswapq RAB1; \
          op ## q RAB1, 12*2(RIO);
  /*
   * dec_rounds2(i) - six consecutive 2-way rounds in decryption order:
   * the same key_table entries as enc_rounds2(i), consumed from i+7 down
   * to i+2.
   */
  #define dec_rounds2(i) \
          roundsm2(RAB, i + 7, RCD); \
          roundsm2(RCD, i + 6, RAB); \
          roundsm2(RAB, i + 5, RCD); \
          roundsm2(RCD, i + 4, RAB); \
          roundsm2(RAB, i + 3, RCD); \
          roundsm2(RCD, i + 2, RAB);
  /*
   * dec_fls2(i) - 2-way fls step for decryption: the key pair is swapped
   * relative to enc_fls2, i.e. entry i+1 is applied to RAB (kl) and
   * entry i to RCD (kr).
   */
  #define dec_fls2(i) \
          fls2(RAB, RCD, i + 1, i + 0);
  /*
   * dec_inpack2(max) - load two consecutive 16-byte blocks from RIO for
   * decryption.
   *
   * Same layout and byte/word shuffling as enc_inpack2, but both RAB
   * registers are XORed with key_table entry `max` (a register holding
   * the entry index) instead of entry 0.  `max` itself is not modified.
   * Clobbers: RAB0, RCD0, RAB1, RCD1, flags.
   */
  #define dec_inpack2(max) \
          movq (RIO), RAB0; \
          bswapq RAB0; \
          rorq $32, RAB0; \
          movq 4*2(RIO), RCD0; \
          bswapq RCD0; \
          rolq $32, RCD0; \
          xorq key_table(CTX, max, 8), RAB0; \
          \
          movq 8*2(RIO), RAB1; \
          bswapq RAB1; \
          rorq $32, RAB1; \
          movq 12*2(RIO), RCD1; \
          bswapq RCD1; \
          rolq $32, RCD1; \
          xorq key_table(CTX, max, 8), RAB1;
  /*
   * dec_outunpack2() - finish decryption of both blocks and store them
   * at RIO.
   *
   * RCD0 and RCD1 are XORed with key_table entry 0, the input packing is
   * reversed, and per block the halves are stored swapped (RCD first,
   * then RAB).  Block 0 occupies bytes 0..15, block 1 bytes 16..31.  The
   * destination is always overwritten; there is no xor variant.
   * Clobbers: RAB0, RCD0, RAB1, RCD1, flags.
   */
  #define dec_outunpack2() \
          xorq key_table(CTX), RCD0; \
          rolq $32, RCD0; \
          bswapq RCD0; \
          movq RCD0, (RIO); \
          rorq $32, RAB0; \
          bswapq RAB0; \
          movq RAB0, 4*2(RIO); \
          \
          xorq key_table(CTX), RCD1; \
          rolq $32, RCD1; \
          bswapq RCD1; \
          movq RCD1, 8*2(RIO); \
          rorq $32, RAB1; \
          bswapq RAB1; \
          movq RAB1, 12*2(RIO);
  /*
   * __camellia_enc_blk_2way - encrypt two consecutive 16-byte blocks.
   *
   * Runs 18 rounds when key_length is 16 and 24 rounds otherwise.  The
   * 32 bytes of output are either stored to dst or XORed into dst,
   * depending on the "xor" argument.
   *
   * %rbx (RAB1) is saved on the stack and %r12 (RT1) is parked in RR12;
   * both are restored on both exit paths.
   * Clobbers: %rax, %rcx, %rdx, %rsi, %r8, %r9, %r10, %r11, flags.
   */
  SYM_FUNC_START(__camellia_enc_blk_2way)
          /* input:
           *	%rdi: ctx, CTX
           *	%rsi: dst
           *	%rdx: src
           *	%rcx: bool xor
           */
          pushq %rbx;

          movq %r12, RR12;
          /* %rcx doubles as RCD0 and %rsi as RIO/RT0: move both out of the way */
          movq %rcx, RXOR;
          movq %rsi, RDST;
          movq %rdx, RIO;

          enc_inpack2();

          enc_rounds2(0);
          enc_fls2(8);
          enc_rounds2(8);
          enc_fls2(16);
          enc_rounds2(16);
          movl $24, RT2d; /* max */

          /*
           * NOTE(review): only the low byte of key_length is compared here,
           * while camellia_dec_blk_2way uses a 32-bit cmpl on the same
           * field.  Equivalent as long as key_length fits in one byte -
           * confirm.
           */
          cmpb $16, key_length(CTX);
          je .L__enc2_done;

          enc_fls2(24);
          enc_rounds2(24);
          movl $32, RT2d; /* max */

  .L__enc2_done:
          /* explicit byte suffix, matching the 1-way function; same encoding */
          testb RXORbl, RXORbl;
          movq RDST, RIO; /* RIO was clobbered as RT0; mov leaves flags alone */
          jnz .L__enc2_xor;

          enc_outunpack2(mov, RT2);

          movq RR12, %r12;
          popq %rbx;
          RET;

  .L__enc2_xor:
          enc_outunpack2(xor, RT2);

          movq RR12, %r12;
          popq %rbx;
          RET;
  SYM_FUNC_END(__camellia_enc_blk_2way)
  /*
   * camellia_dec_blk_2way - decrypt two consecutive 16-byte blocks from
   * src into dst.
   *
   * "max" (RT2) is the index of the last key_table entry: 24 when
   * key_length is 16, otherwise 32.  With max == 24 the six rounds and
   * the fls step that use entries 24..31 are skipped.
   *
   * %rbx (RAB1) is parked in RXOR and %r12 (RT1) in RR12; both are
   * restored before returning.
   * Clobbers: %rax, %rcx, %rdx, %rsi, %r8, %r9, %r10, %r11, flags.
   */
  SYM_FUNC_START(camellia_dec_blk_2way)
          /* input:
           *	%rdi: ctx, CTX
           *	%rsi: dst
           *	%rdx: src
           */
          cmpl $16, key_length(CTX);
          movl $32, RT2d;
          movl $24, RXORd; /* scratch use of RXOR; mov keeps the cmpl flags */
          cmovel RXORd, RT2d; /* max */

          movq %rbx, RXOR; /* RXOR is free again: reuse it as the %rbx save slot */
          movq %r12, RR12;
          movq %rsi, RDST;
          movq %rdx, RIO;

          dec_inpack2(RT2);

          /* RT2 is overwritten by the first round, so test max right away */
          cmpb $24, RT2bl;
          je .L__dec2_rounds16;

          dec_rounds2(24);
          dec_fls2(24);

  .L__dec2_rounds16:
          dec_rounds2(16);
          dec_fls2(16);
          dec_rounds2(8);
          dec_fls2(8);
          dec_rounds2(0);

          movq RDST, RIO; /* RIO was clobbered as RT0 by the rounds */

          dec_outunpack2();

          movq RR12, %r12;
          movq RXOR, %rbx;
          RET;
  SYM_FUNC_END(camellia_dec_blk_2way)