sha1_ssse3_asm.S

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
 * boost.
 *
 * This work was inspired by the vectorized implementation of Dean Gaudet.
 * Additional information on it can be found at:
 *    http://www.arctic.org/~dean/crypto/sha1.html
 *
 * It was improved upon with more efficient vectorization of the message
 * scheduling. This implementation has also been optimized for all current and
 * several future generations of Intel CPUs.
 *
 * See this article for more information about the implementation details:
 *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Copyright (C) 2010, Intel Corp.
 *   Authors: Maxim Locktyukhin <[email protected]>
 *            Ronen Zohar <[email protected]>
 *
 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
 *   Author: Mathias Krause <[email protected]>
 */
#include <linux/linkage.h>
#include <linux/cfi_types.h>

#define CTX	%rdi	// arg1
#define BUF	%rsi	// arg2
#define CNT	%rdx	// arg3

#define REG_A	%ecx
#define REG_B	%esi
#define REG_C	%edi
#define REG_D	%r12d
#define REG_E	%edx

#define REG_T1	%eax
#define REG_T2	%ebx

#define K_BASE		%r8
#define HASH_PTR	%r9
#define BUFFER_PTR	%r10
#define BUFFER_END	%r11

#define W_TMP1	%xmm0
#define W_TMP2	%xmm9

#define W0	%xmm1
#define W4	%xmm2
#define W8	%xmm3
#define W12	%xmm4
#define W16	%xmm5
#define W20	%xmm6
#define W24	%xmm7
#define W28	%xmm8

#define XMM_SHUFB_BSWAP	%xmm10
/* we keep a 64-byte window (16 dwords) of pre-calculated w[i]+K values in a
 * circular buffer on the stack */
#define WK(t)	(((t) & 15) * 4)(%rsp)

#define W_PRECALC_AHEAD	16
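
/*
 * C-level sketch of the WK() addressing (illustrative only, not part of this
 * file): sixteen 32-bit w[i]+K slots live in the 64-byte stack workspace and
 * are reused modulo 16, so round t and round t+16 share a slot:
 *
 *	#include <stdint.h>
 *
 *	static uint32_t wk[16];			// the 64-byte workspace
 *
 *	static inline uint32_t *wk_slot(int t)	// WK(t) = (((t) & 15) * 4)(%rsp)
 *	{
 *		return &wk[t & 15];
 *	}
 */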
/*
 * This macro implements the body of the SHA-1 transform function; it
 * processes the input one 64-byte block at a time.
 * param: function's name
 */
.macro SHA1_VECTOR_ASM name
SYM_TYPED_FUNC_START(\name)

	push	%rbx
	push	%r12
	push	%rbp
	mov	%rsp, %rbp

	sub	$64, %rsp		# allocate workspace
	and	$~15, %rsp		# align stack

	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR

	shl	$6, CNT			# multiply by 64
	add	BUF, CNT
	mov	CNT, BUFFER_END

	lea	K_XMM_AR(%rip), K_BASE
	xmm_mov	BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	# cleanup workspace
	mov	$8, %ecx
	mov	%rsp, %rdi
	xor	%eax, %eax
	rep stosq

	mov	%rbp, %rsp		# deallocate workspace
	pop	%rbp
	pop	%r12
	pop	%rbx
	RET

SYM_FUNC_END(\name)
.endm
/*
 * This macro implements the 80 rounds of SHA-1 for each 64-byte block
 */
.macro SHA1_PIPELINED_MAIN_BODY
	INIT_REGALLOC

	mov	  (HASH_PTR), A
	mov	 4(HASH_PTR), B
	mov	 8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

	.set i, 0
	.rept W_PRECALC_AHEAD
		W_PRECALC i
		.set i, (i+1)
	.endr

	.align 4
1:
	RR F1,A,B,C,D,E,0
	RR F1,D,E,A,B,C,2
	RR F1,B,C,D,E,A,4
	RR F1,E,A,B,C,D,6
	RR F1,C,D,E,A,B,8

	RR F1,A,B,C,D,E,10
	RR F1,D,E,A,B,C,12
	RR F1,B,C,D,E,A,14
	RR F1,E,A,B,C,D,16
	RR F1,C,D,E,A,B,18

	RR F2,A,B,C,D,E,20
	RR F2,D,E,A,B,C,22
	RR F2,B,C,D,E,A,24
	RR F2,E,A,B,C,D,26
	RR F2,C,D,E,A,B,28

	RR F2,A,B,C,D,E,30
	RR F2,D,E,A,B,C,32
	RR F2,B,C,D,E,A,34
	RR F2,E,A,B,C,D,36
	RR F2,C,D,E,A,B,38

	RR F3,A,B,C,D,E,40
	RR F3,D,E,A,B,C,42
	RR F3,B,C,D,E,A,44
	RR F3,E,A,B,C,D,46
	RR F3,C,D,E,A,B,48

	RR F3,A,B,C,D,E,50
	RR F3,D,E,A,B,C,52
	RR F3,B,C,D,E,A,54
	RR F3,E,A,B,C,D,56
	RR F3,C,D,E,A,B,58

	add	$64, BUFFER_PTR		# move to the next 64-byte block
	cmp	BUFFER_END, BUFFER_PTR	# if the current is the last one use
	cmovae	K_BASE, BUFFER_PTR	# dummy source to avoid buffer overrun

	RR F4,A,B,C,D,E,60
	RR F4,D,E,A,B,C,62
	RR F4,B,C,D,E,A,64
	RR F4,E,A,B,C,D,66
	RR F4,C,D,E,A,B,68

	RR F4,A,B,C,D,E,70
	RR F4,D,E,A,B,C,72
	RR F4,B,C,D,E,A,74
	RR F4,E,A,B,C,D,76
	RR F4,C,D,E,A,B,78

	UPDATE_HASH   (HASH_PTR), A
	UPDATE_HASH  4(HASH_PTR), B
	UPDATE_HASH  8(HASH_PTR), C
	UPDATE_HASH 12(HASH_PTR), D
	UPDATE_HASH 16(HASH_PTR), E

	RESTORE_RENAMED_REGS
	cmp	K_BASE, BUFFER_PTR	# K_BASE means we reached the end
	jne	1b
.endm
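
/*
 * For reference, a plain C sketch of the 80-round compression this macro
 * implements, in the classic sequential form rather than the software
 * pipelined, register-renamed form used above. Everything here is
 * illustrative (sha1_block_ref() is not a real kernel function):
 *
 *	#include <stdint.h>
 *
 *	static uint32_t rol32(uint32_t x, int n)
 *	{
 *		return (x << n) | (x >> (32 - n));
 *	}
 *
 *	static void sha1_block_ref(uint32_t state[5], const uint8_t block[64])
 *	{
 *		uint32_t w[80], a, b, c, d, e, f, k, tmp;
 *		int i;
 *
 *		for (i = 0; i < 16; i++)	// big-endian load (pshufb bswap)
 *			w[i] = ((uint32_t)block[4 * i]     << 24) |
 *			       ((uint32_t)block[4 * i + 1] << 16) |
 *			       ((uint32_t)block[4 * i + 2] <<  8) |
 *			        (uint32_t)block[4 * i + 3];
 *		for (i = 16; i < 80; i++)	// message schedule (W_PRECALC)
 *			w[i] = rol32(w[i - 3] ^ w[i - 8] ^ w[i - 14] ^ w[i - 16], 1);
 *
 *		a = state[0]; b = state[1]; c = state[2]; d = state[3]; e = state[4];
 *
 *		for (i = 0; i < 80; i++) {
 *			if (i < 20) {		// F1, K1
 *				f = (b & c) | (~b & d);          k = 0x5a827999;
 *			} else if (i < 40) {	// F2, K2
 *				f = b ^ c ^ d;                   k = 0x6ed9eba1;
 *			} else if (i < 60) {	// F3, K3
 *				f = (b & c) | (b & d) | (c & d); k = 0x8f1bbcdc;
 *			} else {		// F4, K4
 *				f = b ^ c ^ d;                   k = 0xca62c1d6;
 *			}
 *			tmp = rol32(a, 5) + f + e + k + w[i];
 *			e = d; d = c; c = rol32(b, 30); b = a; a = tmp;
 *		}
 *
 *		// UPDATE_HASH: fold the working variables back into the state
 *		state[0] += a; state[1] += b; state[2] += c;
 *		state[3] += d; state[4] += e;
 *	}
 */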
.macro INIT_REGALLOC
	.set A, REG_A
	.set B, REG_B
	.set C, REG_C
	.set D, REG_D
	.set E, REG_E
	.set T1, REG_T1
	.set T2, REG_T2
.endm

.macro RESTORE_RENAMED_REGS
	# order is important (REG_C is where it should be)
	mov	B, REG_B
	mov	D, REG_D
	mov	A, REG_A
	mov	E, REG_E
.endm

.macro SWAP_REG_NAMES a, b
	.set _T, \a
	.set \a, \b
	.set \b, _T
.endm

.macro F1 b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	xor	\d, T1
	and	\b, T1
	xor	\d, T1
.endm

.macro F2 b, c, d
	mov	\d, T1
	SWAP_REG_NAMES \d, T1
	xor	\c, T1
	xor	\b, T1
.endm

.macro F3 b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	mov	\b, T2
	or	\b, T1
	and	\c, T2
	and	\d, T1
	or	T2, T1
.endm

.macro F4 b, c, d
	F2 \b, \c, \d
.endm

.macro UPDATE_HASH hash, val
	add	\hash, \val
	mov	\val, \hash
.endm
/*
 * RR does two rounds of SHA-1 back to back with W[] pre-calc
 *   t1 = F(b, c, d);   e += w(i)
 *   e += t1;           b <<= 30;   d  += w(i+1);
 *   t1 = F(a, b, c);
 *   d += t1;           a <<= 5;
 *   e += a;
 *   t1 = e;            a >>= 7;
 *   t1 <<= 5;
 *   d += t1;
 */
.macro RR F, a, b, c, d, e, round
	add	WK(\round), \e
	\F   \b, \c, \d		# t1 = F(b, c, d);
	W_PRECALC (\round + W_PRECALC_AHEAD)
	rol	$30, \b
	add	T1, \e
	add	WK(\round + 1), \d

	\F   \a, \b, \c
	W_PRECALC (\round + W_PRECALC_AHEAD + 1)
	rol	$5, \a
	add	\a, \e
	add	T1, \d
	ror	$7, \a		# (a <<r 5) >>r 7 => a <<r 30

	mov	\e, T1
	SWAP_REG_NAMES \e, T1

	rol	$5, T1
	add	T1, \d

	# write:  \a, \b
	# rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
.endm
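
/*
 * In plain C, the two fused rounds computed by one RR invocation look
 * roughly like this (before the register renaming); WK[i] holds the
 * pre-calculated w[i]+K value, and rol32()/f() are illustrative names for
 * the rotate and for the round function passed in as \F:
 *
 *	e += WK[i]     + f(b, c, d) + rol32(a, 5);	// round i
 *	b  = rol32(b, 30);
 *	d += WK[i + 1] + f(a, b, c) + rol32(e, 5);	// round i + 1
 *	a  = rol32(a, 30);				// rol 5 then ror 7
 *
 * After the pair the roles shift by two positions, which is why the RR
 * calls above cycle through (A,B,C,D,E), (D,E,A,B,C), (B,C,D,E,A), ...
 */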
.macro W_PRECALC r
	.set i, \r

	.if (i < 20)
		.set K_XMM, 0
	.elseif (i < 40)
		.set K_XMM, 16
	.elseif (i < 60)
		.set K_XMM, 32
	.elseif (i < 80)
		.set K_XMM, 48
	.endif

	.if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
		.set i, ((\r) % 80)	# pre-compute for the next iteration
		.if (i == 0)
			W_PRECALC_RESET
		.endif
		W_PRECALC_00_15
	.elseif (i < 32)
		W_PRECALC_16_31
	.elseif (i < 80)		// rounds 32-79
		W_PRECALC_32_79
	.endif
.endm

.macro W_PRECALC_RESET
	.set W,          W0
	.set W_minus_04, W4
	.set W_minus_08, W8
	.set W_minus_12, W12
	.set W_minus_16, W16
	.set W_minus_20, W20
	.set W_minus_24, W24
	.set W_minus_28, W28
	.set W_minus_32, W
.endm

.macro W_PRECALC_ROTATE
	.set W_minus_32, W_minus_28
	.set W_minus_28, W_minus_24
	.set W_minus_24, W_minus_20
	.set W_minus_20, W_minus_16
	.set W_minus_16, W_minus_12
	.set W_minus_12, W_minus_08
	.set W_minus_08, W_minus_04
	.set W_minus_04, W
	.set W,          W_minus_32
.endm
.macro W_PRECALC_SSSE3

.macro W_PRECALC_00_15
	W_PRECALC_00_15_SSSE3
.endm
.macro W_PRECALC_16_31
	W_PRECALC_16_31_SSSE3
.endm
.macro W_PRECALC_32_79
	W_PRECALC_32_79_SSSE3
.endm

/* message scheduling pre-compute for rounds 0-15 */
.macro W_PRECALC_00_15_SSSE3
	.if ((i & 3) == 0)
		movdqu	(i*4)(BUFFER_PTR), W_TMP1
	.elseif ((i & 3) == 1)
		pshufb	XMM_SHUFB_BSWAP, W_TMP1
		movdqa	W_TMP1, W
	.elseif ((i & 3) == 2)
		paddd	(K_BASE), W_TMP1
	.elseif ((i & 3) == 3)
		movdqa	W_TMP1, WK(i&~3)
		W_PRECALC_ROTATE
	.endif
.endm
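
/*
 * Roughly, one group of four rounds of this macro does the following, spread
 * over four scalar rounds so the vector work overlaps with the ALU rounds
 * (C sketch with illustrative names; base is the group's first round index,
 * a multiple of four):
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	void precalc_00_15(uint32_t *wk, const uint8_t *buffer, int base)
 *	{
 *		uint32_t w[4];
 *
 *		memcpy(w, buffer + base * 4, 16);		// movdqu
 *		for (int j = 0; j < 4; j++) {
 *			w[j] = __builtin_bswap32(w[j]);		// pshufb bswap
 *			wk[(base + j) & 15] = w[j] + 0x5a827999; // paddd (K_BASE)
 *		}
 *	}
 *
 * The byte-swapped words are also kept in an XMM register (W), so the
 * rounds 16-31 schedule below can use them directly.
 */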
/* message scheduling pre-compute for rounds 16-31
 *
 * - the last 32 w[i] values are kept in 8 XMM registers
 * - the w[i]+K values are pre-calculated and stored to memory, to be loaded
 *   later by scalar ALU add instructions
 *
 * the vectorization for rounds 16-31 needs some "heavy lifting" because of
 * the w[i] -> w[i-3] dependency, but it pays off from round 32 onwards
 */
.macro W_PRECALC_16_31_SSSE3
	# blended scheduling of vector and scalar instruction streams, one 4-wide
	# vector iteration / 4 scalar rounds
	.if ((i & 3) == 0)
		movdqa	W_minus_12, W
		palignr	$8, W_minus_16, W	# w[i-14]
		movdqa	W_minus_04, W_TMP1
		psrldq	$4, W_TMP1		# w[i-3]
		pxor	W_minus_08, W
	.elseif ((i & 3) == 1)
		pxor	W_minus_16, W_TMP1
		pxor	W_TMP1, W
		movdqa	W, W_TMP2
		movdqa	W, W_TMP1
		pslldq	$12, W_TMP2
	.elseif ((i & 3) == 2)
		psrld	$31, W
		pslld	$1, W_TMP1
		por	W, W_TMP1
		movdqa	W_TMP2, W
		psrld	$30, W_TMP2
		pslld	$2, W
	.elseif ((i & 3) == 3)
		pxor	W, W_TMP1
		pxor	W_TMP2, W_TMP1
		movdqa	W_TMP1, W
		paddd	K_XMM(K_BASE), W_TMP1
		movdqa	W_TMP1, WK(i&~3)
		W_PRECALC_ROTATE
	.endif
.endm
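
/*
 * The scalar recurrence being vectorized in this range is the standard one
 * (C sketch; rol32() as in the earlier reference sketch):
 *
 *	for (i = 16; i < 32; i++)
 *		w[i] = rol32(w[i - 3] ^ w[i - 8] ^ w[i - 14] ^ w[i - 16], 1);
 *
 * Computing four w[i] per vector iteration is awkward here because the last
 * lane's w[i-3] input is the first element of the very group being computed.
 * The psrldq above therefore zeroes that lane's w[i-3] term, and the
 * pslldq $12 together with the extra rotate-by-2 supplies the contribution
 * of the group's own first word to its last lane.
 */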
/* message scheduling pre-compute for rounds 32-79
 *
 * in the SHA-1 specification: w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
 * instead we compute the equivalent: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * which allows more efficient vectorization, since the w[i] => w[i-3] dependency is broken
 */
.macro W_PRECALC_32_79_SSSE3
	.if ((i & 3) == 0)
		movdqa	W_minus_04, W_TMP1
		pxor	W_minus_28, W		# W is W_minus_32 before xor
		palignr	$8, W_minus_08, W_TMP1
	.elseif ((i & 3) == 1)
		pxor	W_minus_16, W
		pxor	W_TMP1, W
		movdqa	W, W_TMP1
	.elseif ((i & 3) == 2)
		psrld	$30, W
		pslld	$2, W_TMP1
		por	W, W_TMP1
	.elseif ((i & 3) == 3)
		movdqa	W_TMP1, W
		paddd	K_XMM(K_BASE), W_TMP1
		movdqa	W_TMP1, WK(i&~3)
		W_PRECALC_ROTATE
	.endif
.endm
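
/*
 * The i >= 32 identity used above can be checked by expanding each term of
 * the standard recurrence once more: the duplicated terms cancel under xor
 * and only w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32] survives, picking up a
 * second rotate. A small self-contained C check (illustrative only):
 *
 *	#include <assert.h>
 *	#include <stdint.h>
 *	#include <stdlib.h>
 *
 *	static uint32_t rol32(uint32_t x, int n)
 *	{
 *		return (x << n) | (x >> (32 - n));
 *	}
 *
 *	int main(void)
 *	{
 *		uint32_t w[80];
 *		int i;
 *
 *		for (i = 0; i < 16; i++)	// arbitrary message block
 *			w[i] = ((uint32_t)rand() << 16) ^ (uint32_t)rand();
 *		for (i = 16; i < 80; i++)	// standard recurrence
 *			w[i] = rol32(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
 *		for (i = 32; i < 80; i++)	// shifted form used above
 *			assert(w[i] == rol32(w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32], 2));
 *		return 0;
 *	}
 */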
.endm		// W_PRECALC_SSSE3


#define K1	0x5a827999
#define K2	0x6ed9eba1
#define K3	0x8f1bbcdc
#define K4	0xca62c1d6

.section .rodata
.align 16

K_XMM_AR:
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4

BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
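
/*
 * BSWAP_SHUFB_CTL is the pshufb control mask that byte-reverses every 32-bit
 * lane, so the big-endian message words come out in native order. In C
 * terms, per dword lane (illustrative sketch):
 *
 *	#include <stdint.h>
 *
 *	// in memory the mask dword 0x00010203 is the byte sequence 3,2,1,0;
 *	// pshufb picks destination byte j from source byte mask[j]
 *	static const uint8_t mask[4] = { 3, 2, 1, 0 };
 *
 *	static void bswap_lane(uint8_t dst[4], const uint8_t src[4])
 *	{
 *		for (int j = 0; j < 4; j++)
 *			dst[j] = src[mask[j]];	// reverses the four bytes
 *	}
 */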
.section .text

W_PRECALC_SSSE3
.macro xmm_mov a, b
	movdqu	\a,\b
.endm

/*
 * SSSE3 optimized implementation:
 *
 * extern "C" void sha1_transform_ssse3(struct sha1_state *state,
 *					const u8 *data, int blocks);
 *
 * Note that struct sha1_state is assumed to begin with u32 state[5].
 */
SHA1_VECTOR_ASM sha1_transform_ssse3
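
/*
 * A rough sketch of how the C side might call this routine (the real glue
 * code lives elsewhere and differs in details; sha1_blocks() is an
 * illustrative name). kernel_fpu_begin()/kernel_fpu_end() are required
 * because the routine clobbers XMM state:
 *
 *	asmlinkage void sha1_transform_ssse3(struct sha1_state *state,
 *					     const u8 *data, int blocks);
 *
 *	static void sha1_blocks(struct sha1_state *state,
 *				const u8 *data, int blocks)
 *	{
 *		kernel_fpu_begin();
 *		sha1_transform_ssse3(state, data, blocks);
 *		kernel_fpu_end();
 *	}
 */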
.macro W_PRECALC_AVX

.purgem W_PRECALC_00_15
.macro W_PRECALC_00_15
	W_PRECALC_00_15_AVX
.endm
.purgem W_PRECALC_16_31
.macro W_PRECALC_16_31
	W_PRECALC_16_31_AVX
.endm
.purgem W_PRECALC_32_79
.macro W_PRECALC_32_79
	W_PRECALC_32_79_AVX
.endm

.macro W_PRECALC_00_15_AVX
	.if ((i & 3) == 0)
		vmovdqu	(i*4)(BUFFER_PTR), W_TMP1
	.elseif ((i & 3) == 1)
		vpshufb	XMM_SHUFB_BSWAP, W_TMP1, W
	.elseif ((i & 3) == 2)
		vpaddd	(K_BASE), W, W_TMP1
	.elseif ((i & 3) == 3)
		vmovdqa	W_TMP1, WK(i&~3)
		W_PRECALC_ROTATE
	.endif
.endm

.macro W_PRECALC_16_31_AVX
	.if ((i & 3) == 0)
		vpalignr $8, W_minus_16, W_minus_12, W	# w[i-14]
		vpsrldq	$4, W_minus_04, W_TMP1		# w[i-3]
		vpxor	W_minus_08, W, W
		vpxor	W_minus_16, W_TMP1, W_TMP1
	.elseif ((i & 3) == 1)
		vpxor	W_TMP1, W, W
		vpslldq	$12, W, W_TMP2
		vpslld	$1, W, W_TMP1
	.elseif ((i & 3) == 2)
		vpsrld	$31, W, W
		vpor	W, W_TMP1, W_TMP1
		vpslld	$2, W_TMP2, W
		vpsrld	$30, W_TMP2, W_TMP2
	.elseif ((i & 3) == 3)
		vpxor	W, W_TMP1, W_TMP1
		vpxor	W_TMP2, W_TMP1, W
		vpaddd	K_XMM(K_BASE), W, W_TMP1
		vmovdqu	W_TMP1, WK(i&~3)
		W_PRECALC_ROTATE
	.endif
.endm

.macro W_PRECALC_32_79_AVX
	.if ((i & 3) == 0)
		vpalignr $8, W_minus_08, W_minus_04, W_TMP1
		vpxor	W_minus_28, W, W	# W is W_minus_32 before xor
	.elseif ((i & 3) == 1)
		vpxor	W_minus_16, W_TMP1, W_TMP1
		vpxor	W_TMP1, W, W
	.elseif ((i & 3) == 2)
		vpslld	$2, W, W_TMP1
		vpsrld	$30, W, W
		vpor	W, W_TMP1, W
	.elseif ((i & 3) == 3)
		vpaddd	K_XMM(K_BASE), W, W_TMP1
		vmovdqu	W_TMP1, WK(i&~3)
		W_PRECALC_ROTATE
	.endif
.endm

.endm		// W_PRECALC_AVX

W_PRECALC_AVX
.purgem xmm_mov
.macro xmm_mov a, b
	vmovdqu	\a,\b
.endm


/* AVX optimized implementation:
 *  extern "C" void sha1_transform_avx(struct sha1_state *state,
 *				       const u8 *data, int blocks);
 */
SHA1_VECTOR_ASM sha1_transform_avx
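
/*
 * Both entry points have the same prototype, so a caller can pick one at
 * runtime. A minimal sketch only (sha1_select_transform() is an illustrative
 * name; the actual selection logic lives in the C glue code and also checks
 * OS support for saving the extended register state):
 *
 *	static void (*sha1_transform)(struct sha1_state *state,
 *				      const u8 *data, int blocks);
 *
 *	static void sha1_select_transform(void)
 *	{
 *		if (boot_cpu_has(X86_FEATURE_AVX))
 *			sha1_transform = sha1_transform_avx;
 *		else
 *			sha1_transform = sha1_transform_ssse3;
 *	}
 */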