/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
 *
 * Copyright (C) 2018 Martin Willi
 */

#include <linux/linkage.h>
.section	.rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL:	.octa 0x00000000000000000000000000000000
	.octa 0x00000000000000000000000000000001

.section	.rodata.cst32.CTR4BL, "aM", @progbits, 32
.align 32
CTR4BL:	.octa 0x00000000000000000000000000000002
	.octa 0x00000000000000000000000000000003

.section	.rodata.cst32.CTR8BL, "aM", @progbits, 32
.align 32
CTR8BL:	.octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004
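
# CTR2BL/CTR4BL are dword-wise counter increments for the 2- and 4-block
# functions: each 128-bit half carries the increment for one block, so only
# the low counter word (x12) of blocks 2..4 changes when they are added to
# the replicated x12..x15 row. CTR8BL carries the increments 0..7 for the
# 8-block function, where %ymm12 holds one block counter per 32-bit lane.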
.text

SYM_FUNC_START(chacha_2block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 2 data blocks output, o
	# %rdx: up to 2 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts two ChaCha blocks by loading the state
	# matrix twice across four AVX registers. It performs matrix
	# operations on four words in each matrix in parallel, but requires
	# shuffling to rearrange the words after each round.
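
	# For reference, one ChaCha quarter-round on the words (a, b, c, d) is:
	#	a += b; d ^= a; d = rol32(d, 16);
	#	c += d; b ^= c; b = rol32(b, 12);
	#	a += b; d ^= a; d = rol32(d,  8);
	#	c += d; b ^= c; b = rol32(b,  7);
	# Each vpaddd/vpxord/vprold triple below applies one of these steps to
	# all four columns (or diagonals) of both blocks at once, with
	# %ymm0..%ymm3 holding rows x0..x3 of the two blocks.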
	vzeroupper

	# x0..3[0-2] = s0..3
	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	vpaddd	CTR2BL(%rip),%ymm3,%ymm3

	vmovdqa	%ymm0,%ymm8
	vmovdqa	%ymm1,%ymm9
	vmovdqa	%ymm2,%ymm10
	vmovdqa	%ymm3,%ymm11
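
	# Each pass through .Ldoubleround is one ChaCha double round: a round
	# on the columns followed by a round on the diagonals (the vpshufd
	# shuffles move x1..x3 into diagonal position and back), so nrounds
	# is consumed two at a time.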
.Ldoubleround:
	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$16,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$12,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$8,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd	$0x39,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd	$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd	$0x93,%ymm3,%ymm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$16,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$12,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$8,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd	$0x93,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd	$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd	$0x39,%ymm3,%ymm3

	sub	$2,%r8d
	jnz	.Ldoubleround

	# o0 = i0 ^ (x0 + s0)
	vpaddd	%ymm8,%ymm0,%ymm7
	cmp	$0x10,%rcx
	jl	.Lxorpart2
	vpxord	0x00(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x00(%rsi)
	vextracti128	$1,%ymm7,%xmm0
	# o1 = i1 ^ (x1 + s1)
	vpaddd	%ymm9,%ymm1,%ymm7
	cmp	$0x20,%rcx
	jl	.Lxorpart2
	vpxord	0x10(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x10(%rsi)
	vextracti128	$1,%ymm7,%xmm1
	# o2 = i2 ^ (x2 + s2)
	vpaddd	%ymm10,%ymm2,%ymm7
	cmp	$0x30,%rcx
	jl	.Lxorpart2
	vpxord	0x20(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x20(%rsi)
	vextracti128	$1,%ymm7,%xmm2
	# o3 = i3 ^ (x3 + s3)
	vpaddd	%ymm11,%ymm3,%ymm7
	cmp	$0x40,%rcx
	jl	.Lxorpart2
	vpxord	0x30(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x30(%rsi)
	vextracti128	$1,%ymm7,%xmm3

	# xor and write second block
	vmovdqa	%xmm0,%xmm7
	cmp	$0x50,%rcx
	jl	.Lxorpart2
	vpxord	0x40(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x40(%rsi)

	vmovdqa	%xmm1,%xmm7
	cmp	$0x60,%rcx
	jl	.Lxorpart2
	vpxord	0x50(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x50(%rsi)

	vmovdqa	%xmm2,%xmm7
	cmp	$0x70,%rcx
	jl	.Lxorpart2
	vpxord	0x60(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x60(%rsi)

	vmovdqa	%xmm3,%xmm7
	cmp	$0x80,%rcx
	jl	.Lxorpart2
	vpxord	0x70(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x70(%rsi)

.Ldone2:
	vzeroupper
	RET

.Lxorpart2:
	# xor remaining bytes from partial register into output
	mov	%rcx,%rax
	and	$0xf,%rcx
	jz	.Ldone2
	mov	%rax,%r9
	and	$~0xf,%r9
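
	# Build a byte mask with the low (length mod 16) bits set:
	# %rax = (1 << %cl) - 1, loaded into %k1 so the masked vmovdqu8
	# below only loads, xors and stores the remaining tail bytes of
	# the final 16-byte chunk at offset %r9.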
	mov	$1,%rax
	shld	%cl,%rax,%rax
	sub	$1,%rax
	kmovq	%rax,%k1
	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
	vpxord	%xmm7,%xmm1,%xmm1
	vmovdqu8	%xmm1,(%rsi,%r9){%k1}

	jmp	.Ldone2

SYM_FUNC_END(chacha_2block_xor_avx512vl)

SYM_FUNC_START(chacha_4block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts four ChaCha blocks by loading the state
	# matrix four times across eight AVX registers. It performs matrix
	# operations on four words in two matrices in parallel, sequentially
	# to the operations on the four words of the other two matrices.
	# Since the required word shuffling has a rather high latency, we can
	# do the arithmetic on two matrix pairs without much slowdown.
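
	# Register layout: %ymm0..%ymm3 hold the state rows of blocks 1-2,
	# %ymm4..%ymm7 those of blocks 3-4, and %ymm11..%ymm15 preserve the
	# initial state (including both counter rows) for the final add.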
	vzeroupper

	# x0..3[0-4] = s0..3
	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	vmovdqa	%ymm0,%ymm4
	vmovdqa	%ymm1,%ymm5
	vmovdqa	%ymm2,%ymm6
	vmovdqa	%ymm3,%ymm7

	vpaddd	CTR2BL(%rip),%ymm3,%ymm3
	vpaddd	CTR4BL(%rip),%ymm7,%ymm7

	vmovdqa	%ymm0,%ymm11
	vmovdqa	%ymm1,%ymm12
	vmovdqa	%ymm2,%ymm13
	vmovdqa	%ymm3,%ymm14
	vmovdqa	%ymm7,%ymm15

.Ldoubleround4:
	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$16,%ymm3,%ymm3

	vpaddd	%ymm5,%ymm4,%ymm4
	vpxord	%ymm4,%ymm7,%ymm7
	vprold	$16,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$12,%ymm1,%ymm1

	vpaddd	%ymm7,%ymm6,%ymm6
	vpxord	%ymm6,%ymm5,%ymm5
	vprold	$12,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$8,%ymm3,%ymm3

	vpaddd	%ymm5,%ymm4,%ymm4
	vpxord	%ymm4,%ymm7,%ymm7
	vprold	$8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$7,%ymm1,%ymm1

	vpaddd	%ymm7,%ymm6,%ymm6
	vpxord	%ymm6,%ymm5,%ymm5
	vprold	$7,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd	$0x39,%ymm1,%ymm1
	vpshufd	$0x39,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd	$0x4e,%ymm2,%ymm2
	vpshufd	$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd	$0x93,%ymm3,%ymm3
	vpshufd	$0x93,%ymm7,%ymm7

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$16,%ymm3,%ymm3

	vpaddd	%ymm5,%ymm4,%ymm4
	vpxord	%ymm4,%ymm7,%ymm7
	vprold	$16,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$12,%ymm1,%ymm1

	vpaddd	%ymm7,%ymm6,%ymm6
	vpxord	%ymm6,%ymm5,%ymm5
	vprold	$12,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$8,%ymm3,%ymm3

	vpaddd	%ymm5,%ymm4,%ymm4
	vpxord	%ymm4,%ymm7,%ymm7
	vprold	$8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$7,%ymm1,%ymm1

	vpaddd	%ymm7,%ymm6,%ymm6
	vpxord	%ymm6,%ymm5,%ymm5
	vprold	$7,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd	$0x93,%ymm1,%ymm1
	vpshufd	$0x93,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd	$0x4e,%ymm2,%ymm2
	vpshufd	$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd	$0x39,%ymm3,%ymm3
	vpshufd	$0x39,%ymm7,%ymm7

	sub	$2,%r8d
	jnz	.Ldoubleround4

	# o0 = i0 ^ (x0 + s0), first block
	vpaddd	%ymm11,%ymm0,%ymm10
	cmp	$0x10,%rcx
	jl	.Lxorpart4
	vpxord	0x00(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x00(%rsi)
	vextracti128	$1,%ymm10,%xmm0
	# o1 = i1 ^ (x1 + s1), first block
	vpaddd	%ymm12,%ymm1,%ymm10
	cmp	$0x20,%rcx
	jl	.Lxorpart4
	vpxord	0x10(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x10(%rsi)
	vextracti128	$1,%ymm10,%xmm1
	# o2 = i2 ^ (x2 + s2), first block
	vpaddd	%ymm13,%ymm2,%ymm10
	cmp	$0x30,%rcx
	jl	.Lxorpart4
	vpxord	0x20(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x20(%rsi)
	vextracti128	$1,%ymm10,%xmm2
	# o3 = i3 ^ (x3 + s3), first block
	vpaddd	%ymm14,%ymm3,%ymm10
	cmp	$0x40,%rcx
	jl	.Lxorpart4
	vpxord	0x30(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x30(%rsi)
	vextracti128	$1,%ymm10,%xmm3

	# xor and write second block
	vmovdqa	%xmm0,%xmm10
	cmp	$0x50,%rcx
	jl	.Lxorpart4
	vpxord	0x40(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x40(%rsi)

	vmovdqa	%xmm1,%xmm10
	cmp	$0x60,%rcx
	jl	.Lxorpart4
	vpxord	0x50(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x50(%rsi)

	vmovdqa	%xmm2,%xmm10
	cmp	$0x70,%rcx
	jl	.Lxorpart4
	vpxord	0x60(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x60(%rsi)

	vmovdqa	%xmm3,%xmm10
	cmp	$0x80,%rcx
	jl	.Lxorpart4
	vpxord	0x70(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x70(%rsi)

	# o0 = i0 ^ (x0 + s0), third block
	vpaddd	%ymm11,%ymm4,%ymm10
	cmp	$0x90,%rcx
	jl	.Lxorpart4
	vpxord	0x80(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x80(%rsi)
	vextracti128	$1,%ymm10,%xmm4
	# o1 = i1 ^ (x1 + s1), third block
	vpaddd	%ymm12,%ymm5,%ymm10
	cmp	$0xa0,%rcx
	jl	.Lxorpart4
	vpxord	0x90(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x90(%rsi)
	vextracti128	$1,%ymm10,%xmm5
	# o2 = i2 ^ (x2 + s2), third block
	vpaddd	%ymm13,%ymm6,%ymm10
	cmp	$0xb0,%rcx
	jl	.Lxorpart4
	vpxord	0xa0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xa0(%rsi)
	vextracti128	$1,%ymm10,%xmm6
	# o3 = i3 ^ (x3 + s3), third block
	vpaddd	%ymm15,%ymm7,%ymm10
	cmp	$0xc0,%rcx
	jl	.Lxorpart4
	vpxord	0xb0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xb0(%rsi)
	vextracti128	$1,%ymm10,%xmm7

	# xor and write fourth block
	vmovdqa	%xmm4,%xmm10
	cmp	$0xd0,%rcx
	jl	.Lxorpart4
	vpxord	0xc0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xc0(%rsi)

	vmovdqa	%xmm5,%xmm10
	cmp	$0xe0,%rcx
	jl	.Lxorpart4
	vpxord	0xd0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xd0(%rsi)

	vmovdqa	%xmm6,%xmm10
	cmp	$0xf0,%rcx
	jl	.Lxorpart4
	vpxord	0xe0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xe0(%rsi)

	vmovdqa	%xmm7,%xmm10
	cmp	$0x100,%rcx
	jl	.Lxorpart4
	vpxord	0xf0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xf0(%rsi)

.Ldone4:
	vzeroupper
	RET

.Lxorpart4:
	# xor remaining bytes from partial register into output
	mov	%rcx,%rax
	and	$0xf,%rcx
	jz	.Ldone4
	mov	%rax,%r9
	and	$~0xf,%r9
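
	# Same tail masking as in chacha_2block_xor_avx512vl: %k1 selects the
	# (length mod 16) remaining bytes of the final 16-byte chunk.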
	mov	$1,%rax
	shld	%cl,%rax,%rax
	sub	$1,%rax
	kmovq	%rax,%k1
	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
	vpxord	%xmm10,%xmm1,%xmm1
	vmovdqu8	%xmm1,(%rsi,%r9){%k1}

	jmp	.Ldone4

SYM_FUNC_END(chacha_4block_xor_avx512vl)

SYM_FUNC_START(chacha_8block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 8 data blocks output, o
	# %rdx: up to 8 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts eight consecutive ChaCha blocks by loading
	# the state matrix in AVX registers eight times. Compared to AVX2,
	# this mostly benefits from the new rotate instructions in VL and the
	# additional registers.
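
	# Register layout: after the broadcasts, %ymm0..%ymm15 each hold one
	# of the sixteen state words replicated across all eight blocks (only
	# the block counters in %ymm12 differ once CTR8BL is added), and
	# %ymm16..%ymm31 keep a copy of the initial state for the final add.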
	vzeroupper

	# x0..15[0-7] = s[0..15]
	vpbroadcastd	0x00(%rdi),%ymm0
	vpbroadcastd	0x04(%rdi),%ymm1
	vpbroadcastd	0x08(%rdi),%ymm2
	vpbroadcastd	0x0c(%rdi),%ymm3
	vpbroadcastd	0x10(%rdi),%ymm4
	vpbroadcastd	0x14(%rdi),%ymm5
	vpbroadcastd	0x18(%rdi),%ymm6
	vpbroadcastd	0x1c(%rdi),%ymm7
	vpbroadcastd	0x20(%rdi),%ymm8
	vpbroadcastd	0x24(%rdi),%ymm9
	vpbroadcastd	0x28(%rdi),%ymm10
	vpbroadcastd	0x2c(%rdi),%ymm11
	vpbroadcastd	0x30(%rdi),%ymm12
	vpbroadcastd	0x34(%rdi),%ymm13
	vpbroadcastd	0x38(%rdi),%ymm14
	vpbroadcastd	0x3c(%rdi),%ymm15

	# x12 += counter values 0-7
	vpaddd	CTR8BL(%rip),%ymm12,%ymm12

	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm1,%ymm17
	vmovdqa64	%ymm2,%ymm18
	vmovdqa64	%ymm3,%ymm19
	vmovdqa64	%ymm4,%ymm20
	vmovdqa64	%ymm5,%ymm21
	vmovdqa64	%ymm6,%ymm22
	vmovdqa64	%ymm7,%ymm23
	vmovdqa64	%ymm8,%ymm24
	vmovdqa64	%ymm9,%ymm25
	vmovdqa64	%ymm10,%ymm26
	vmovdqa64	%ymm11,%ymm27
	vmovdqa64	%ymm12,%ymm28
	vmovdqa64	%ymm13,%ymm29
	vmovdqa64	%ymm14,%ymm30
	vmovdqa64	%ymm15,%ymm31
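
	# With one state word per register there is no need to shuffle words
	# between the column and the diagonal half of the double round; the
	# diagonal round below simply uses a different register combination.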
.Ldoubleround8:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	vpaddd	%ymm0,%ymm4,%ymm0
	vpxord	%ymm0,%ymm12,%ymm12
	vprold	$16,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	vpaddd	%ymm1,%ymm5,%ymm1
	vpxord	%ymm1,%ymm13,%ymm13
	vprold	$16,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	vpaddd	%ymm2,%ymm6,%ymm2
	vpxord	%ymm2,%ymm14,%ymm14
	vprold	$16,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vpaddd	%ymm3,%ymm7,%ymm3
	vpxord	%ymm3,%ymm15,%ymm15
	vprold	$16,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	vpaddd	%ymm12,%ymm8,%ymm8
	vpxord	%ymm8,%ymm4,%ymm4
	vprold	$12,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	vpaddd	%ymm13,%ymm9,%ymm9
	vpxord	%ymm9,%ymm5,%ymm5
	vprold	$12,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	vpaddd	%ymm14,%ymm10,%ymm10
	vpxord	%ymm10,%ymm6,%ymm6
	vprold	$12,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vpaddd	%ymm15,%ymm11,%ymm11
	vpxord	%ymm11,%ymm7,%ymm7
	vprold	$12,%ymm7,%ymm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	vpaddd	%ymm0,%ymm4,%ymm0
	vpxord	%ymm0,%ymm12,%ymm12
	vprold	$8,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	vpaddd	%ymm1,%ymm5,%ymm1
	vpxord	%ymm1,%ymm13,%ymm13
	vprold	$8,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	vpaddd	%ymm2,%ymm6,%ymm2
	vpxord	%ymm2,%ymm14,%ymm14
	vprold	$8,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vpaddd	%ymm3,%ymm7,%ymm3
	vpxord	%ymm3,%ymm15,%ymm15
	vprold	$8,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	vpaddd	%ymm12,%ymm8,%ymm8
	vpxord	%ymm8,%ymm4,%ymm4
	vprold	$7,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	vpaddd	%ymm13,%ymm9,%ymm9
	vpxord	%ymm9,%ymm5,%ymm5
	vprold	$7,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	vpaddd	%ymm14,%ymm10,%ymm10
	vpxord	%ymm10,%ymm6,%ymm6
	vprold	$7,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vpaddd	%ymm15,%ymm11,%ymm11
	vpxord	%ymm11,%ymm7,%ymm7
	vprold	$7,%ymm7,%ymm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	vpaddd	%ymm0,%ymm5,%ymm0
	vpxord	%ymm0,%ymm15,%ymm15
	vprold	$16,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	vpaddd	%ymm1,%ymm6,%ymm1
	vpxord	%ymm1,%ymm12,%ymm12
	vprold	$16,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	vpaddd	%ymm2,%ymm7,%ymm2
	vpxord	%ymm2,%ymm13,%ymm13
	vprold	$16,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vpaddd	%ymm3,%ymm4,%ymm3
	vpxord	%ymm3,%ymm14,%ymm14
	vprold	$16,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	vpaddd	%ymm15,%ymm10,%ymm10
	vpxord	%ymm10,%ymm5,%ymm5
	vprold	$12,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	vpaddd	%ymm12,%ymm11,%ymm11
	vpxord	%ymm11,%ymm6,%ymm6
	vprold	$12,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	vpaddd	%ymm13,%ymm8,%ymm8
	vpxord	%ymm8,%ymm7,%ymm7
	vprold	$12,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vpaddd	%ymm14,%ymm9,%ymm9
	vpxord	%ymm9,%ymm4,%ymm4
	vprold	$12,%ymm4,%ymm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	vpaddd	%ymm0,%ymm5,%ymm0
	vpxord	%ymm0,%ymm15,%ymm15
	vprold	$8,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	vpaddd	%ymm1,%ymm6,%ymm1
	vpxord	%ymm1,%ymm12,%ymm12
	vprold	$8,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	vpaddd	%ymm2,%ymm7,%ymm2
	vpxord	%ymm2,%ymm13,%ymm13
	vprold	$8,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vpaddd	%ymm3,%ymm4,%ymm3
	vpxord	%ymm3,%ymm14,%ymm14
	vprold	$8,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	vpaddd	%ymm15,%ymm10,%ymm10
	vpxord	%ymm10,%ymm5,%ymm5
	vprold	$7,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	vpaddd	%ymm12,%ymm11,%ymm11
	vpxord	%ymm11,%ymm6,%ymm6
	vprold	$7,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	vpaddd	%ymm13,%ymm8,%ymm8
	vpxord	%ymm8,%ymm7,%ymm7
	vprold	$7,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vpaddd	%ymm14,%ymm9,%ymm9
	vpxord	%ymm9,%ymm4,%ymm4
	vprold	$7,%ymm4,%ymm4

	sub	$2,%r8d
	jnz	.Ldoubleround8

	# x0..15[0-7] += s[0..15]
	vpaddd	%ymm16,%ymm0,%ymm0
	vpaddd	%ymm17,%ymm1,%ymm1
	vpaddd	%ymm18,%ymm2,%ymm2
	vpaddd	%ymm19,%ymm3,%ymm3
	vpaddd	%ymm20,%ymm4,%ymm4
	vpaddd	%ymm21,%ymm5,%ymm5
	vpaddd	%ymm22,%ymm6,%ymm6
	vpaddd	%ymm23,%ymm7,%ymm7
	vpaddd	%ymm24,%ymm8,%ymm8
	vpaddd	%ymm25,%ymm9,%ymm9
	vpaddd	%ymm26,%ymm10,%ymm10
	vpaddd	%ymm27,%ymm11,%ymm11
	vpaddd	%ymm28,%ymm12,%ymm12
	vpaddd	%ymm29,%ymm13,%ymm13
	vpaddd	%ymm30,%ymm14,%ymm14
	vpaddd	%ymm31,%ymm15,%ymm15
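
	# The result is word-sliced: %ymm(n) holds word n of all eight blocks.
	# The interleave steps below transpose it back into block order so
	# that whole 32-byte chunks can be xored against the input stream.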
	# interleave 32-bit words in state n, n+1
	vpunpckldq	%ymm1,%ymm0,%ymm16
	vpunpckhdq	%ymm1,%ymm0,%ymm17
	vpunpckldq	%ymm3,%ymm2,%ymm18
	vpunpckhdq	%ymm3,%ymm2,%ymm19
	vpunpckldq	%ymm5,%ymm4,%ymm20
	vpunpckhdq	%ymm5,%ymm4,%ymm21
	vpunpckldq	%ymm7,%ymm6,%ymm22
	vpunpckhdq	%ymm7,%ymm6,%ymm23
	vpunpckldq	%ymm9,%ymm8,%ymm24
	vpunpckhdq	%ymm9,%ymm8,%ymm25
	vpunpckldq	%ymm11,%ymm10,%ymm26
	vpunpckhdq	%ymm11,%ymm10,%ymm27
	vpunpckldq	%ymm13,%ymm12,%ymm28
	vpunpckhdq	%ymm13,%ymm12,%ymm29
	vpunpckldq	%ymm15,%ymm14,%ymm30
	vpunpckhdq	%ymm15,%ymm14,%ymm31

	# interleave 64-bit words in state n, n+2
	vpunpcklqdq	%ymm18,%ymm16,%ymm0
	vpunpcklqdq	%ymm19,%ymm17,%ymm1
	vpunpckhqdq	%ymm18,%ymm16,%ymm2
	vpunpckhqdq	%ymm19,%ymm17,%ymm3
	vpunpcklqdq	%ymm22,%ymm20,%ymm4
	vpunpcklqdq	%ymm23,%ymm21,%ymm5
	vpunpckhqdq	%ymm22,%ymm20,%ymm6
	vpunpckhqdq	%ymm23,%ymm21,%ymm7
	vpunpcklqdq	%ymm26,%ymm24,%ymm8
	vpunpcklqdq	%ymm27,%ymm25,%ymm9
	vpunpckhqdq	%ymm26,%ymm24,%ymm10
	vpunpckhqdq	%ymm27,%ymm25,%ymm11
	vpunpcklqdq	%ymm30,%ymm28,%ymm12
	vpunpcklqdq	%ymm31,%ymm29,%ymm13
	vpunpckhqdq	%ymm30,%ymm28,%ymm14
	vpunpckhqdq	%ymm31,%ymm29,%ymm15

	# interleave 128-bit words in state n, n+4
	# xor/write first four blocks
	vmovdqa64	%ymm0,%ymm16
	vperm2i128	$0x20,%ymm4,%ymm0,%ymm0
	cmp	$0x0020,%rcx
	jl	.Lxorpart8
	vpxord	0x0000(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0000(%rsi)
	vmovdqa64	%ymm16,%ymm0
	vperm2i128	$0x31,%ymm4,%ymm0,%ymm4

	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
	cmp	$0x0040,%rcx
	jl	.Lxorpart8
	vpxord	0x0020(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0020(%rsi)
	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12

	vperm2i128	$0x20,%ymm6,%ymm2,%ymm0
	cmp	$0x0060,%rcx
	jl	.Lxorpart8
	vpxord	0x0040(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0040(%rsi)
	vperm2i128	$0x31,%ymm6,%ymm2,%ymm6

	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
	cmp	$0x0080,%rcx
	jl	.Lxorpart8
	vpxord	0x0060(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0060(%rsi)
	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14

	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
	cmp	$0x00a0,%rcx
	jl	.Lxorpart8
	vpxord	0x0080(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0080(%rsi)
	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5

	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
	cmp	$0x00c0,%rcx
	jl	.Lxorpart8
	vpxord	0x00a0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x00a0(%rsi)
	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13

	vperm2i128	$0x20,%ymm7,%ymm3,%ymm0
	cmp	$0x00e0,%rcx
	jl	.Lxorpart8
	vpxord	0x00c0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x00c0(%rsi)
	vperm2i128	$0x31,%ymm7,%ymm3,%ymm7

	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
	cmp	$0x0100,%rcx
	jl	.Lxorpart8
	vpxord	0x00e0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x00e0(%rsi)
	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15

	# xor remaining blocks, write to output
	vmovdqa64	%ymm4,%ymm0
	cmp	$0x0120,%rcx
	jl	.Lxorpart8
	vpxord	0x0100(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0100(%rsi)

	vmovdqa64	%ymm12,%ymm0
	cmp	$0x0140,%rcx
	jl	.Lxorpart8
	vpxord	0x0120(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0120(%rsi)

	vmovdqa64	%ymm6,%ymm0
	cmp	$0x0160,%rcx
	jl	.Lxorpart8
	vpxord	0x0140(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0140(%rsi)

	vmovdqa64	%ymm14,%ymm0
	cmp	$0x0180,%rcx
	jl	.Lxorpart8
	vpxord	0x0160(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0160(%rsi)

	vmovdqa64	%ymm5,%ymm0
	cmp	$0x01a0,%rcx
	jl	.Lxorpart8
	vpxord	0x0180(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0180(%rsi)

	vmovdqa64	%ymm13,%ymm0
	cmp	$0x01c0,%rcx
	jl	.Lxorpart8
	vpxord	0x01a0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x01a0(%rsi)

	vmovdqa64	%ymm7,%ymm0
	cmp	$0x01e0,%rcx
	jl	.Lxorpart8
	vpxord	0x01c0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x01c0(%rsi)

	vmovdqa64	%ymm15,%ymm0
	cmp	$0x0200,%rcx
	jl	.Lxorpart8
	vpxord	0x01e0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x01e0(%rsi)

.Ldone8:
	vzeroupper
	RET

.Lxorpart8:
	# xor remaining bytes from partial register into output
	mov	%rcx,%rax
	and	$0x1f,%rcx
	jz	.Ldone8
	mov	%rax,%r9
	and	$~0x1f,%r9
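
	# Same tail masking as above, but on 32-byte chunks: %k1 selects the
	# (length mod 32) remaining bytes of the final %ymm-sized chunk.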
	mov	$1,%rax
	shld	%cl,%rax,%rax
	sub	$1,%rax
	kmovq	%rax,%k1
	vmovdqu8	(%rdx,%r9),%ymm1{%k1}{z}
	vpxord	%ymm0,%ymm1,%ymm1
	vmovdqu8	%ymm1,(%rsi,%r9){%k1}

	jmp	.Ldone8

SYM_FUNC_END(chacha_8block_xor_avx512vl)