/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <linux/linkage.h>
#include <asm/frame.h>
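
/*
 * ROT8 and ROT16 are pshufb masks that rotate each 32-bit lane left by 8 and
 * 16 bits respectively; CTRINC holds the per-lane block counter increments
 * {0, 1, 2, 3} used by chacha_4block_xor_ssse3.
 */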
.section	.rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003

.section	.rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302

.section	.rodata.cst16.CTRINC, "aM", @progbits, 16
.align 16
CTRINC:	.octa 0x00000003000000020000000100000000

.text

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This
 * function performs matrix operations on four words in parallel, but requires
 * shuffling to rearrange the words after each round. 8/16-bit word rotation is
 * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
 * rotation uses traditional shift+OR.
 *
 * The round count is given in %r8d.
 *
 * Clobbers: %r8d, %xmm4-%xmm7
 */
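/*
 * For reference, each half of the .Ldoubleround loop below is the standard
 * ChaCha quarter-round applied to all four columns (first half) or diagonals
 * (second half) at once, with (a, b, c, d) = (%xmm0, %xmm1, %xmm2, %xmm3):
 *
 *	a += b; d ^= a; d = rol32(d, 16);
 *	c += d; b ^= c; b = rol32(b, 12);
 *	a += b; d ^= a; d = rol32(d, 8);
 *	c += d; b ^= c; b = rol32(b, 7);
 */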
SYM_FUNC_START_LOCAL(chacha_permute)
	movdqa	ROT8(%rip),%xmm4
	movdqa	ROT16(%rip),%xmm5

.Ldoubleround:
	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
	pshufb	%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm6
	pslld	$12,%xmm6
	psrld	$20,%xmm1
	por	%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
	pshufb	%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm7
	pslld	$7,%xmm7
	psrld	$25,%xmm1
	por	%xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	pshufd	$0x39,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd	$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	pshufd	$0x93,%xmm3,%xmm3
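	# The three shuffles above rotate rows 1-3 of the state so that the
	# next quarter-round operates on the diagonals.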

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
	pshufb	%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm6
	pslld	$12,%xmm6
	psrld	$20,%xmm1
	por	%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
	pshufb	%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm7
	pslld	$7,%xmm7
	psrld	$25,%xmm1
	por	%xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	pshufd	$0x93,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd	$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	pshufd	$0x39,%xmm3,%xmm3
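	# The inverse shuffles restore the column layout before the next
	# iteration of the double round.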

	sub	$2,%r8d
	jnz	.Ldoubleround

	RET
SYM_FUNC_END(chacha_permute)

SYM_FUNC_START(chacha_block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: up to 1 data block output, o
	# %rdx: up to 1 data block input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds
	FRAME_BEGIN

	# x0..3 = s0..3
	movdqu	0x00(%rdi),%xmm0
	movdqu	0x10(%rdi),%xmm1
	movdqu	0x20(%rdi),%xmm2
	movdqu	0x30(%rdi),%xmm3
	movdqa	%xmm0,%xmm8
	movdqa	%xmm1,%xmm9
	movdqa	%xmm2,%xmm10
	movdqa	%xmm3,%xmm11

	mov	%rcx,%rax
	call	chacha_permute

	# o0 = i0 ^ (x0 + s0)
	paddd	%xmm8,%xmm0
	cmp	$0x10,%rax
	jl	.Lxorpart
	movdqu	0x00(%rdx),%xmm4
	pxor	%xmm4,%xmm0
	movdqu	%xmm0,0x00(%rsi)
	# o1 = i1 ^ (x1 + s1)
	paddd	%xmm9,%xmm1
	movdqa	%xmm1,%xmm0
	cmp	$0x20,%rax
	jl	.Lxorpart
	movdqu	0x10(%rdx),%xmm0
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0x10(%rsi)
	# o2 = i2 ^ (x2 + s2)
	paddd	%xmm10,%xmm2
	movdqa	%xmm2,%xmm0
	cmp	$0x30,%rax
	jl	.Lxorpart
	movdqu	0x20(%rdx),%xmm0
	pxor	%xmm2,%xmm0
	movdqu	%xmm0,0x20(%rsi)
	# o3 = i3 ^ (x3 + s3)
	paddd	%xmm11,%xmm3
	movdqa	%xmm3,%xmm0
	cmp	$0x40,%rax
	jl	.Lxorpart
	movdqu	0x30(%rdx),%xmm0
	pxor	%xmm3,%xmm0
	movdqu	%xmm0,0x30(%rsi)

.Ldone:
	FRAME_END
	RET

.Lxorpart:
	# xor remaining bytes from partial register into output
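	# %rax holds the requested length, so %r9 = length & 15 is the size
	# of the tail that does not fill a 16-byte register.  The tail is
	# bounced through an aligned scratch slot carved out below the stack
	# pointer (the old value is kept in %r10): copy it in, XOR it with
	# the keystream already in %xmm0, then copy the result to the output.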
	mov	%rax,%r9
	and	$0x0f,%r9
	jz	.Ldone
	and	$~0x0f,%rax
	mov	%rsi,%r11

	lea	8(%rsp),%r10
	sub	$0x10,%rsp
	and	$~31,%rsp

	lea	(%rdx,%rax),%rsi
	mov	%rsp,%rdi
	mov	%r9,%rcx
	rep movsb

	pxor	0x00(%rsp),%xmm0
	movdqa	%xmm0,0x00(%rsp)

	mov	%rsp,%rsi
	lea	(%r11,%rax),%rdi
	mov	%r9,%rcx
	rep movsb

	lea	-8(%r10),%rsp
	jmp	.Ldone
SYM_FUNC_END(chacha_block_xor_ssse3)

SYM_FUNC_START(hchacha_block_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: output (8 32-bit words)
	# %edx: nrounds
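	# HChaCha returns words 0..3 and 12..15 of the permuted state and,
	# unlike chacha_block_xor_ssse3, performs no feed-forward addition of
	# the input state.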
	FRAME_BEGIN

	movdqu	0x00(%rdi),%xmm0
	movdqu	0x10(%rdi),%xmm1
	movdqu	0x20(%rdi),%xmm2
	movdqu	0x30(%rdi),%xmm3

	mov	%edx,%r8d
	call	chacha_permute

	movdqu	%xmm0,0x00(%rsi)
	movdqu	%xmm3,0x10(%rsi)

	FRAME_END
	RET
SYM_FUNC_END(hchacha_block_ssse3)

SYM_FUNC_START(chacha_4block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts four consecutive ChaCha blocks by loading
	# the state matrix into SSE registers four times. As we need some
	# scratch registers, we save the first four registers on the stack.
	# The algorithm performs each operation on the corresponding word of
	# each state matrix, hence requires no word shuffling. For the final
	# XORing step we transpose the matrix by interleaving 32- and then
	# 64-bit words, which allows us to do XOR in SSE registers. 8/16-bit
	# word rotation is done with the slightly better performing SSSE3
	# byte shuffling, 7/12-bit word rotation uses traditional shift+OR.
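	#
	# Register/stack layout during the rounds: state words x0..x3 live in
	# the scratch area at 0x00-0x30(%rsp), x4..x15 stay in %xmm4-%xmm15,
	# and each 128-bit vector holds that state word for all four blocks,
	# one block per 32-bit lane.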

	lea	8(%rsp),%r10
	sub	$0x80,%rsp
	and	$~63,%rsp
	mov	%rcx,%rax

	# x0..15[0-3] = s0..3[0..3]
	movq	0x00(%rdi),%xmm1
	pshufd	$0x00,%xmm1,%xmm0
	pshufd	$0x55,%xmm1,%xmm1
	movq	0x08(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	movq	0x10(%rdi),%xmm5
	pshufd	$0x00,%xmm5,%xmm4
	pshufd	$0x55,%xmm5,%xmm5
	movq	0x18(%rdi),%xmm7
	pshufd	$0x00,%xmm7,%xmm6
	pshufd	$0x55,%xmm7,%xmm7
	movq	0x20(%rdi),%xmm9
	pshufd	$0x00,%xmm9,%xmm8
	pshufd	$0x55,%xmm9,%xmm9
	movq	0x28(%rdi),%xmm11
	pshufd	$0x00,%xmm11,%xmm10
	pshufd	$0x55,%xmm11,%xmm11
	movq	0x30(%rdi),%xmm13
	pshufd	$0x00,%xmm13,%xmm12
	pshufd	$0x55,%xmm13,%xmm13
	movq	0x38(%rdi),%xmm15
	pshufd	$0x00,%xmm15,%xmm14
	pshufd	$0x55,%xmm15,%xmm15

	# x0..3 on stack
	movdqa	%xmm0,0x00(%rsp)
	movdqa	%xmm1,0x10(%rsp)
	movdqa	%xmm2,0x20(%rsp)
	movdqa	%xmm3,0x30(%rsp)

	movdqa	CTRINC(%rip),%xmm1
	movdqa	ROT8(%rip),%xmm2
	movdqa	ROT16(%rip),%xmm3
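	# %xmm1-%xmm3 now hold CTRINC and the rotate masks for the entire
	# loop; %xmm0 serves as the scratch register, which is why x0..x3 are
	# kept on the stack.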

	# x12 += counter values 0-3
	paddd	%xmm1,%xmm12

.Ldoubleround4:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	movdqa	0x00(%rsp),%xmm0
	paddd	%xmm4,%xmm0
	movdqa	%xmm0,0x00(%rsp)
	pxor	%xmm0,%xmm12
	pshufb	%xmm3,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	movdqa	0x10(%rsp),%xmm0
	paddd	%xmm5,%xmm0
	movdqa	%xmm0,0x10(%rsp)
	pxor	%xmm0,%xmm13
	pshufb	%xmm3,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	movdqa	0x20(%rsp),%xmm0
	paddd	%xmm6,%xmm0
	movdqa	%xmm0,0x20(%rsp)
	pxor	%xmm0,%xmm14
	pshufb	%xmm3,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	movdqa	0x30(%rsp),%xmm0
	paddd	%xmm7,%xmm0
	movdqa	%xmm0,0x30(%rsp)
	pxor	%xmm0,%xmm15
	pshufb	%xmm3,%xmm15
	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	paddd	%xmm12,%xmm8
	pxor	%xmm8,%xmm4
	movdqa	%xmm4,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm4
	por	%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	paddd	%xmm13,%xmm9
	pxor	%xmm9,%xmm5
	movdqa	%xmm5,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm5
	por	%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	paddd	%xmm14,%xmm10
	pxor	%xmm10,%xmm6
	movdqa	%xmm6,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm6
	por	%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	paddd	%xmm15,%xmm11
	pxor	%xmm11,%xmm7
	movdqa	%xmm7,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm7
	por	%xmm0,%xmm7
	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	movdqa	0x00(%rsp),%xmm0
	paddd	%xmm4,%xmm0
	movdqa	%xmm0,0x00(%rsp)
	pxor	%xmm0,%xmm12
	pshufb	%xmm2,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	movdqa	0x10(%rsp),%xmm0
	paddd	%xmm5,%xmm0
	movdqa	%xmm0,0x10(%rsp)
	pxor	%xmm0,%xmm13
	pshufb	%xmm2,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	movdqa	0x20(%rsp),%xmm0
	paddd	%xmm6,%xmm0
	movdqa	%xmm0,0x20(%rsp)
	pxor	%xmm0,%xmm14
	pshufb	%xmm2,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	movdqa	0x30(%rsp),%xmm0
	paddd	%xmm7,%xmm0
	movdqa	%xmm0,0x30(%rsp)
	pxor	%xmm0,%xmm15
	pshufb	%xmm2,%xmm15
	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	paddd	%xmm12,%xmm8
	pxor	%xmm8,%xmm4
	movdqa	%xmm4,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm4
	por	%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	paddd	%xmm13,%xmm9
	pxor	%xmm9,%xmm5
	movdqa	%xmm5,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm5
	por	%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	paddd	%xmm14,%xmm10
	pxor	%xmm10,%xmm6
	movdqa	%xmm6,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm6
	por	%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	paddd	%xmm15,%xmm11
	pxor	%xmm11,%xmm7
	movdqa	%xmm7,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm7
	por	%xmm0,%xmm7
	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	movdqa	0x00(%rsp),%xmm0
	paddd	%xmm5,%xmm0
	movdqa	%xmm0,0x00(%rsp)
	pxor	%xmm0,%xmm15
	pshufb	%xmm3,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	movdqa	0x10(%rsp),%xmm0
	paddd	%xmm6,%xmm0
	movdqa	%xmm0,0x10(%rsp)
	pxor	%xmm0,%xmm12
	pshufb	%xmm3,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	movdqa	0x20(%rsp),%xmm0
	paddd	%xmm7,%xmm0
	movdqa	%xmm0,0x20(%rsp)
	pxor	%xmm0,%xmm13
	pshufb	%xmm3,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	movdqa	0x30(%rsp),%xmm0
	paddd	%xmm4,%xmm0
	movdqa	%xmm0,0x30(%rsp)
	pxor	%xmm0,%xmm14
	pshufb	%xmm3,%xmm14
	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	paddd	%xmm15,%xmm10
	pxor	%xmm10,%xmm5
	movdqa	%xmm5,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm5
	por	%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	paddd	%xmm12,%xmm11
	pxor	%xmm11,%xmm6
	movdqa	%xmm6,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm6
	por	%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	paddd	%xmm13,%xmm8
	pxor	%xmm8,%xmm7
	movdqa	%xmm7,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm7
	por	%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	paddd	%xmm14,%xmm9
	pxor	%xmm9,%xmm4
	movdqa	%xmm4,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm4
	por	%xmm0,%xmm4
	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	movdqa	0x00(%rsp),%xmm0
	paddd	%xmm5,%xmm0
	movdqa	%xmm0,0x00(%rsp)
	pxor	%xmm0,%xmm15
	pshufb	%xmm2,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	movdqa	0x10(%rsp),%xmm0
	paddd	%xmm6,%xmm0
	movdqa	%xmm0,0x10(%rsp)
	pxor	%xmm0,%xmm12
	pshufb	%xmm2,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	movdqa	0x20(%rsp),%xmm0
	paddd	%xmm7,%xmm0
	movdqa	%xmm0,0x20(%rsp)
	pxor	%xmm0,%xmm13
	pshufb	%xmm2,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	movdqa	0x30(%rsp),%xmm0
	paddd	%xmm4,%xmm0
	movdqa	%xmm0,0x30(%rsp)
	pxor	%xmm0,%xmm14
	pshufb	%xmm2,%xmm14
	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	paddd	%xmm15,%xmm10
	pxor	%xmm10,%xmm5
	movdqa	%xmm5,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm5
	por	%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	paddd	%xmm12,%xmm11
	pxor	%xmm11,%xmm6
	movdqa	%xmm6,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm6
	por	%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	paddd	%xmm13,%xmm8
	pxor	%xmm8,%xmm7
	movdqa	%xmm7,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm7
	por	%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	paddd	%xmm14,%xmm9
	pxor	%xmm9,%xmm4
	movdqa	%xmm4,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm4
	por	%xmm0,%xmm4

	sub	$2,%r8d
	jnz	.Ldoubleround4
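
	# Feed-forward: add the original input state s back into x0..x15.
	# The broadcast copies made above were clobbered during the rounds,
	# so each pair of state words is reloaded and re-broadcast from s.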
	# x0[0-3] += s0[0]
	# x1[0-3] += s0[1]
	movq	0x00(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	0x00(%rsp),%xmm2
	movdqa	%xmm2,0x00(%rsp)
	paddd	0x10(%rsp),%xmm3
	movdqa	%xmm3,0x10(%rsp)
	# x2[0-3] += s0[2]
	# x3[0-3] += s0[3]
	movq	0x08(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	0x20(%rsp),%xmm2
	movdqa	%xmm2,0x20(%rsp)
	paddd	0x30(%rsp),%xmm3
	movdqa	%xmm3,0x30(%rsp)
	# x4[0-3] += s1[0]
	# x5[0-3] += s1[1]
	movq	0x10(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	# x6[0-3] += s1[2]
	# x7[0-3] += s1[3]
	movq	0x18(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	%xmm2,%xmm6
	paddd	%xmm3,%xmm7
	# x8[0-3] += s2[0]
	# x9[0-3] += s2[1]
	movq	0x20(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	%xmm2,%xmm8
	paddd	%xmm3,%xmm9
	# x10[0-3] += s2[2]
	# x11[0-3] += s2[3]
	movq	0x28(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	%xmm2,%xmm10
	paddd	%xmm3,%xmm11
	# x12[0-3] += s3[0]
	# x13[0-3] += s3[1]
	movq	0x30(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	%xmm2,%xmm12
	paddd	%xmm3,%xmm13
	# x14[0-3] += s3[2]
	# x15[0-3] += s3[3]
	movq	0x38(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	%xmm2,%xmm14
	paddd	%xmm3,%xmm15

	# x12 += counter values 0-3
	paddd	%xmm1,%xmm12
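	# (The per-lane increments 0-3 must be re-added here because the
	# feed-forward above only added the base counter word s3[0].)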

	# interleave 32-bit words in state n, n+1
	movdqa	0x00(%rsp),%xmm0
	movdqa	0x10(%rsp),%xmm1
	movdqa	%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa	%xmm2,0x00(%rsp)
	movdqa	%xmm0,0x10(%rsp)
	movdqa	0x20(%rsp),%xmm0
	movdqa	0x30(%rsp),%xmm1
	movdqa	%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa	%xmm2,0x20(%rsp)
	movdqa	%xmm0,0x30(%rsp)
	movdqa	%xmm4,%xmm0
	punpckldq	%xmm5,%xmm4
	punpckhdq	%xmm5,%xmm0
	movdqa	%xmm0,%xmm5
	movdqa	%xmm6,%xmm0
	punpckldq	%xmm7,%xmm6
	punpckhdq	%xmm7,%xmm0
	movdqa	%xmm0,%xmm7
	movdqa	%xmm8,%xmm0
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm9,%xmm0
	movdqa	%xmm0,%xmm9
	movdqa	%xmm10,%xmm0
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm11,%xmm0
	movdqa	%xmm0,%xmm11
	movdqa	%xmm12,%xmm0
	punpckldq	%xmm13,%xmm12
	punpckhdq	%xmm13,%xmm0
	movdqa	%xmm0,%xmm13
	movdqa	%xmm14,%xmm0
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm15,%xmm0
	movdqa	%xmm0,%xmm15

	# interleave 64-bit words in state n, n+2
	movdqa	0x00(%rsp),%xmm0
	movdqa	0x20(%rsp),%xmm1
	movdqa	%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa	%xmm2,0x00(%rsp)
	movdqa	%xmm0,0x20(%rsp)
	movdqa	0x10(%rsp),%xmm0
	movdqa	0x30(%rsp),%xmm1
	movdqa	%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa	%xmm2,0x10(%rsp)
	movdqa	%xmm0,0x30(%rsp)
	movdqa	%xmm4,%xmm0
	punpcklqdq	%xmm6,%xmm4
	punpckhqdq	%xmm6,%xmm0
	movdqa	%xmm0,%xmm6
	movdqa	%xmm5,%xmm0
	punpcklqdq	%xmm7,%xmm5
	punpckhqdq	%xmm7,%xmm0
	movdqa	%xmm0,%xmm7
	movdqa	%xmm8,%xmm0
	punpcklqdq	%xmm10,%xmm8
	punpckhqdq	%xmm10,%xmm0
	movdqa	%xmm0,%xmm10
	movdqa	%xmm9,%xmm0
	punpcklqdq	%xmm11,%xmm9
	punpckhqdq	%xmm11,%xmm0
	movdqa	%xmm0,%xmm11
	movdqa	%xmm12,%xmm0
	punpcklqdq	%xmm14,%xmm12
	punpckhqdq	%xmm14,%xmm0
	movdqa	%xmm0,%xmm14
	movdqa	%xmm13,%xmm0
	punpcklqdq	%xmm15,%xmm13
	punpckhqdq	%xmm15,%xmm0
	movdqa	%xmm0,%xmm15
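	# After both interleave passes each vector (or stack slot) holds four
	# consecutive 32-bit words of a single output block; the XOR sequence
	# below consumes them in block order.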

	# xor with corresponding input, write to output
	movdqa	0x00(%rsp),%xmm0
	cmp	$0x10,%rax
	jl	.Lxorpart4
	movdqu	0x00(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0x00(%rsi)

	movdqu	%xmm4,%xmm0
	cmp	$0x20,%rax
	jl	.Lxorpart4
	movdqu	0x10(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0x10(%rsi)

	movdqu	%xmm8,%xmm0
	cmp	$0x30,%rax
	jl	.Lxorpart4
	movdqu	0x20(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0x20(%rsi)

	movdqu	%xmm12,%xmm0
	cmp	$0x40,%rax
	jl	.Lxorpart4
	movdqu	0x30(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0x30(%rsi)

	movdqa	0x20(%rsp),%xmm0
	cmp	$0x50,%rax
	jl	.Lxorpart4
	movdqu	0x40(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0x40(%rsi)

	movdqu	%xmm6,%xmm0
	cmp	$0x60,%rax
	jl	.Lxorpart4
	movdqu	0x50(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0x50(%rsi)

	movdqu	%xmm10,%xmm0
	cmp	$0x70,%rax
	jl	.Lxorpart4
	movdqu	0x60(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0x60(%rsi)

	movdqu	%xmm14,%xmm0
	cmp	$0x80,%rax
	jl	.Lxorpart4
	movdqu	0x70(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0x70(%rsi)

	movdqa	0x10(%rsp),%xmm0
	cmp	$0x90,%rax
	jl	.Lxorpart4
	movdqu	0x80(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0x80(%rsi)

	movdqu	%xmm5,%xmm0
	cmp	$0xa0,%rax
	jl	.Lxorpart4
	movdqu	0x90(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0x90(%rsi)

	movdqu	%xmm9,%xmm0
	cmp	$0xb0,%rax
	jl	.Lxorpart4
	movdqu	0xa0(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0xa0(%rsi)

	movdqu	%xmm13,%xmm0
	cmp	$0xc0,%rax
	jl	.Lxorpart4
	movdqu	0xb0(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0xb0(%rsi)

	movdqa	0x30(%rsp),%xmm0
	cmp	$0xd0,%rax
	jl	.Lxorpart4
	movdqu	0xc0(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0xc0(%rsi)

	movdqu	%xmm7,%xmm0
	cmp	$0xe0,%rax
	jl	.Lxorpart4
	movdqu	0xd0(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0xd0(%rsi)

	movdqu	%xmm11,%xmm0
	cmp	$0xf0,%rax
	jl	.Lxorpart4
	movdqu	0xe0(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0xe0(%rsi)

	movdqu	%xmm15,%xmm0
	cmp	$0x100,%rax
	jl	.Lxorpart4
	movdqu	0xf0(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0xf0(%rsi)

.Ldone4:
	lea	-8(%r10),%rsp
	RET

.Lxorpart4:
	# xor remaining bytes from partial register into output
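	# Each cmp/jl above branches here with the keystream chunk covering
	# the partial 16-byte piece already in %xmm0; the tail is bounced
	# through the stack scratch area, as in .Lxorpart above.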
	mov	%rax,%r9
	and	$0x0f,%r9
	jz	.Ldone4
	and	$~0x0f,%rax
	mov	%rsi,%r11

	lea	(%rdx,%rax),%rsi
	mov	%rsp,%rdi
	mov	%r9,%rcx
	rep movsb

	pxor	0x00(%rsp),%xmm0
	movdqa	%xmm0,0x00(%rsp)

	mov	%rsp,%rsi
	lea	(%r11,%rax),%rdi
	mov	%r9,%rcx
	rep movsb

	jmp	.Ldone4
SYM_FUNC_END(chacha_4block_xor_ssse3)