chacha-avx2-x86_64.S

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <linux/linkage.h>

.section .rodata.cst32.ROT8, "aM", @progbits, 32
.align 32
ROT8:   .octa 0x0e0d0c0f0a09080b0605040702010003
        .octa 0x0e0d0c0f0a09080b0605040702010003

.section .rodata.cst32.ROT16, "aM", @progbits, 32
.align 32
ROT16:  .octa 0x0d0c0f0e09080b0a0504070601000302
        .octa 0x0d0c0f0e09080b0a0504070601000302

.section .rodata.cst32.CTRINC, "aM", @progbits, 32
.align 32
CTRINC: .octa 0x00000003000000020000000100000000
        .octa 0x00000007000000060000000500000004

.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL: .octa 0x00000000000000000000000000000000
        .octa 0x00000000000000000000000000000001

.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
.align 32
CTR4BL: .octa 0x00000000000000000000000000000002
        .octa 0x00000000000000000000000000000003
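
# ROT8 and ROT16 are vpshufb byte-permutation masks that rotate each 32-bit
# lane left by 8 and 16 bits respectively; byte shuffling is cheaper here
# than the shift+OR sequence used for the 7- and 12-bit rotations.
# CTRINC holds the per-block counter increments 0..7 for the eight-block
# function, while CTR2BL and CTR4BL hold the increments for blocks 0..1 and
# 2..3 in the two- and four-block functions, where each 128-bit lane of a
# ymm register carries one block's state row.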

.text

SYM_FUNC_START(chacha_2block_xor_avx2)
        # %rdi: Input state matrix, s
        # %rsi: up to 2 data blocks output, o
        # %rdx: up to 2 data blocks input, i
        # %rcx: input/output length in bytes
        # %r8d: nrounds

        # This function encrypts two ChaCha blocks by loading the state
        # matrix twice across four AVX registers. It performs matrix operations
        # on four words in each matrix in parallel, but requires shuffling to
        # rearrange the words after each round.
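        #
        # For reference, the scalar ChaCha quarter-round on words (a, b, c, d)
        # is:
        #
        #     a += b; d ^= a; d = rol32(d, 16);
        #     c += d; b ^= c; b = rol32(b, 12);
        #     a += b; d ^= a; d = rol32(d, 8);
        #     c += d; b ^= c; b = rol32(b, 7);
        #
        # Here ymm0..ymm3 each hold one 16-byte row of the state for both
        # blocks (one block per 128-bit lane), so one pass over the code in
        # .Ldoubleround runs the quarter-round on all four columns of both
        # blocks at once, and the vpshufd lane rotations between the two
        # halves let the same code then process the diagonals.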

        vzeroupper

        # x0..3[0-1] = s0..3
        vbroadcasti128 0x00(%rdi),%ymm0
        vbroadcasti128 0x10(%rdi),%ymm1
        vbroadcasti128 0x20(%rdi),%ymm2
        vbroadcasti128 0x30(%rdi),%ymm3

        vpaddd CTR2BL(%rip),%ymm3,%ymm3
        vmovdqa %ymm0,%ymm8
        vmovdqa %ymm1,%ymm9
        vmovdqa %ymm2,%ymm10
        vmovdqa %ymm3,%ymm11
        vmovdqa ROT8(%rip),%ymm4
        vmovdqa ROT16(%rip),%ymm5

        mov %rcx,%rax

.Ldoubleround:
        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxor %ymm0,%ymm3,%ymm3
        vpshufb %ymm5,%ymm3,%ymm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxor %ymm2,%ymm1,%ymm1
        vmovdqa %ymm1,%ymm6
        vpslld $12,%ymm6,%ymm6
        vpsrld $20,%ymm1,%ymm1
        vpor %ymm6,%ymm1,%ymm1

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxor %ymm0,%ymm3,%ymm3
        vpshufb %ymm4,%ymm3,%ymm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxor %ymm2,%ymm1,%ymm1
        vmovdqa %ymm1,%ymm7
        vpslld $7,%ymm7,%ymm7
        vpsrld $25,%ymm1,%ymm1
        vpor %ymm7,%ymm1,%ymm1

        # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
        vpshufd $0x39,%ymm1,%ymm1
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        vpshufd $0x4e,%ymm2,%ymm2
        # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
        vpshufd $0x93,%ymm3,%ymm3

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxor %ymm0,%ymm3,%ymm3
        vpshufb %ymm5,%ymm3,%ymm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxor %ymm2,%ymm1,%ymm1
        vmovdqa %ymm1,%ymm6
        vpslld $12,%ymm6,%ymm6
        vpsrld $20,%ymm1,%ymm1
        vpor %ymm6,%ymm1,%ymm1

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxor %ymm0,%ymm3,%ymm3
        vpshufb %ymm4,%ymm3,%ymm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxor %ymm2,%ymm1,%ymm1
        vmovdqa %ymm1,%ymm7
        vpslld $7,%ymm7,%ymm7
        vpsrld $25,%ymm1,%ymm1
        vpor %ymm7,%ymm1,%ymm1

        # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
        vpshufd $0x93,%ymm1,%ymm1
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        vpshufd $0x4e,%ymm2,%ymm2
        # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
        vpshufd $0x39,%ymm3,%ymm3
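
        # One double round (a column round followed by a diagonal round) is
        # now complete; %r8d holds the round count, so the loop runs
        # nrounds/2 times.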
        sub $2,%r8d
        jnz .Ldoubleround

        # o0 = i0 ^ (x0 + s0)
        vpaddd %ymm8,%ymm0,%ymm7
        cmp $0x10,%rax
        jl .Lxorpart2
        vpxor 0x00(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x00(%rsi)
        vextracti128 $1,%ymm7,%xmm0
        # o1 = i1 ^ (x1 + s1)
        vpaddd %ymm9,%ymm1,%ymm7
        cmp $0x20,%rax
        jl .Lxorpart2
        vpxor 0x10(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x10(%rsi)
        vextracti128 $1,%ymm7,%xmm1
        # o2 = i2 ^ (x2 + s2)
        vpaddd %ymm10,%ymm2,%ymm7
        cmp $0x30,%rax
        jl .Lxorpart2
        vpxor 0x20(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x20(%rsi)
        vextracti128 $1,%ymm7,%xmm2
        # o3 = i3 ^ (x3 + s3)
        vpaddd %ymm11,%ymm3,%ymm7
        cmp $0x40,%rax
        jl .Lxorpart2
        vpxor 0x30(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x30(%rsi)
        vextracti128 $1,%ymm7,%xmm3

        # xor and write second block
        vmovdqa %xmm0,%xmm7
        cmp $0x50,%rax
        jl .Lxorpart2
        vpxor 0x40(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x40(%rsi)

        vmovdqa %xmm1,%xmm7
        cmp $0x60,%rax
        jl .Lxorpart2
        vpxor 0x50(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x50(%rsi)

        vmovdqa %xmm2,%xmm7
        cmp $0x70,%rax
        jl .Lxorpart2
        vpxor 0x60(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x60(%rsi)

        vmovdqa %xmm3,%xmm7
        cmp $0x80,%rax
        jl .Lxorpart2
        vpxor 0x70(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x70(%rsi)

.Ldone2:
        vzeroupper
        RET

.Lxorpart2:
        # xor remaining bytes from partial register into output
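        #
        # The tail is handled with a bounce buffer: %rax is rounded down to
        # the last full 16-byte chunk, the remaining (len & 0x0f) input bytes
        # are copied onto a 32-byte-aligned stack buffer with rep movsb,
        # XORed there with the keystream chunk still held in %xmm7, and the
        # result is copied back to the output before the stack is restored.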
        mov %rax,%r9
        and $0x0f,%r9
        jz .Ldone2
        and $~0x0f,%rax

        mov %rsi,%r11

        lea 8(%rsp),%r10
        sub $0x10,%rsp
        and $~31,%rsp

        lea (%rdx,%rax),%rsi
        mov %rsp,%rdi
        mov %r9,%rcx
        rep movsb

        vpxor 0x00(%rsp),%xmm7,%xmm7
        vmovdqa %xmm7,0x00(%rsp)

        mov %rsp,%rsi
        lea (%r11,%rax),%rdi
        mov %r9,%rcx
        rep movsb

        lea -8(%r10),%rsp
        jmp .Ldone2

SYM_FUNC_END(chacha_2block_xor_avx2)

SYM_FUNC_START(chacha_4block_xor_avx2)
        # %rdi: Input state matrix, s
        # %rsi: up to 4 data blocks output, o
        # %rdx: up to 4 data blocks input, i
        # %rcx: input/output length in bytes
        # %r8d: nrounds

        # This function encrypts four ChaCha blocks by loading the state
        # matrix four times across eight AVX registers. It performs matrix
        # operations on four words in two matrices in parallel, sequentially
        # to the operations on the four words of the other two matrices. Since
        # the required word shuffling has a rather high latency, we can do the
        # arithmetic on two matrix-pairs without much slowdown.
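        #
        # Register use below: ymm0..ymm3 hold the state rows for blocks 0-1
        # and ymm4..ymm7 the rows for blocks 2-3 (one block per 128-bit lane,
        # with CTR2BL/CTR4BL supplying the per-block counter offsets).
        # ymm11..ymm15 keep copies of the initial rows for the final state
        # addition, ymm8/ymm9 hold the ROT8/ROT16 masks and ymm10 is scratch
        # for the shift+OR rotations.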

        vzeroupper

        # x0..3[0-3] = s0..3
        vbroadcasti128 0x00(%rdi),%ymm0
        vbroadcasti128 0x10(%rdi),%ymm1
        vbroadcasti128 0x20(%rdi),%ymm2
        vbroadcasti128 0x30(%rdi),%ymm3

        vmovdqa %ymm0,%ymm4
        vmovdqa %ymm1,%ymm5
        vmovdqa %ymm2,%ymm6
        vmovdqa %ymm3,%ymm7

        vpaddd CTR2BL(%rip),%ymm3,%ymm3
        vpaddd CTR4BL(%rip),%ymm7,%ymm7

        vmovdqa %ymm0,%ymm11
        vmovdqa %ymm1,%ymm12
        vmovdqa %ymm2,%ymm13
        vmovdqa %ymm3,%ymm14
        vmovdqa %ymm7,%ymm15

        vmovdqa ROT8(%rip),%ymm8
        vmovdqa ROT16(%rip),%ymm9

        mov %rcx,%rax

.Ldoubleround4:
        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxor %ymm0,%ymm3,%ymm3
        vpshufb %ymm9,%ymm3,%ymm3

        vpaddd %ymm5,%ymm4,%ymm4
        vpxor %ymm4,%ymm7,%ymm7
        vpshufb %ymm9,%ymm7,%ymm7

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxor %ymm2,%ymm1,%ymm1
        vmovdqa %ymm1,%ymm10
        vpslld $12,%ymm10,%ymm10
        vpsrld $20,%ymm1,%ymm1
        vpor %ymm10,%ymm1,%ymm1

        vpaddd %ymm7,%ymm6,%ymm6
        vpxor %ymm6,%ymm5,%ymm5
        vmovdqa %ymm5,%ymm10
        vpslld $12,%ymm10,%ymm10
        vpsrld $20,%ymm5,%ymm5
        vpor %ymm10,%ymm5,%ymm5

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxor %ymm0,%ymm3,%ymm3
        vpshufb %ymm8,%ymm3,%ymm3

        vpaddd %ymm5,%ymm4,%ymm4
        vpxor %ymm4,%ymm7,%ymm7
        vpshufb %ymm8,%ymm7,%ymm7

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxor %ymm2,%ymm1,%ymm1
        vmovdqa %ymm1,%ymm10
        vpslld $7,%ymm10,%ymm10
        vpsrld $25,%ymm1,%ymm1
        vpor %ymm10,%ymm1,%ymm1

        vpaddd %ymm7,%ymm6,%ymm6
        vpxor %ymm6,%ymm5,%ymm5
        vmovdqa %ymm5,%ymm10
        vpslld $7,%ymm10,%ymm10
        vpsrld $25,%ymm5,%ymm5
        vpor %ymm10,%ymm5,%ymm5

        # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
        vpshufd $0x39,%ymm1,%ymm1
        vpshufd $0x39,%ymm5,%ymm5
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        vpshufd $0x4e,%ymm2,%ymm2
        vpshufd $0x4e,%ymm6,%ymm6
        # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
        vpshufd $0x93,%ymm3,%ymm3
        vpshufd $0x93,%ymm7,%ymm7

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxor %ymm0,%ymm3,%ymm3
        vpshufb %ymm9,%ymm3,%ymm3

        vpaddd %ymm5,%ymm4,%ymm4
        vpxor %ymm4,%ymm7,%ymm7
        vpshufb %ymm9,%ymm7,%ymm7

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxor %ymm2,%ymm1,%ymm1
        vmovdqa %ymm1,%ymm10
        vpslld $12,%ymm10,%ymm10
        vpsrld $20,%ymm1,%ymm1
        vpor %ymm10,%ymm1,%ymm1

        vpaddd %ymm7,%ymm6,%ymm6
        vpxor %ymm6,%ymm5,%ymm5
        vmovdqa %ymm5,%ymm10
        vpslld $12,%ymm10,%ymm10
        vpsrld $20,%ymm5,%ymm5
        vpor %ymm10,%ymm5,%ymm5

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxor %ymm0,%ymm3,%ymm3
        vpshufb %ymm8,%ymm3,%ymm3

        vpaddd %ymm5,%ymm4,%ymm4
        vpxor %ymm4,%ymm7,%ymm7
        vpshufb %ymm8,%ymm7,%ymm7

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxor %ymm2,%ymm1,%ymm1
        vmovdqa %ymm1,%ymm10
        vpslld $7,%ymm10,%ymm10
        vpsrld $25,%ymm1,%ymm1
        vpor %ymm10,%ymm1,%ymm1

        vpaddd %ymm7,%ymm6,%ymm6
        vpxor %ymm6,%ymm5,%ymm5
        vmovdqa %ymm5,%ymm10
        vpslld $7,%ymm10,%ymm10
        vpsrld $25,%ymm5,%ymm5
        vpor %ymm10,%ymm5,%ymm5

        # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
        vpshufd $0x93,%ymm1,%ymm1
        vpshufd $0x93,%ymm5,%ymm5
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        vpshufd $0x4e,%ymm2,%ymm2
        vpshufd $0x4e,%ymm6,%ymm6
        # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
        vpshufd $0x39,%ymm3,%ymm3
        vpshufd $0x39,%ymm7,%ymm7

        sub $2,%r8d
        jnz .Ldoubleround4

        # o0 = i0 ^ (x0 + s0), first block
        vpaddd %ymm11,%ymm0,%ymm10
        cmp $0x10,%rax
        jl .Lxorpart4
        vpxor 0x00(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x00(%rsi)
        vextracti128 $1,%ymm10,%xmm0
        # o1 = i1 ^ (x1 + s1), first block
        vpaddd %ymm12,%ymm1,%ymm10
        cmp $0x20,%rax
        jl .Lxorpart4
        vpxor 0x10(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x10(%rsi)
        vextracti128 $1,%ymm10,%xmm1
        # o2 = i2 ^ (x2 + s2), first block
        vpaddd %ymm13,%ymm2,%ymm10
        cmp $0x30,%rax
        jl .Lxorpart4
        vpxor 0x20(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x20(%rsi)
        vextracti128 $1,%ymm10,%xmm2
        # o3 = i3 ^ (x3 + s3), first block
        vpaddd %ymm14,%ymm3,%ymm10
        cmp $0x40,%rax
        jl .Lxorpart4
        vpxor 0x30(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x30(%rsi)
        vextracti128 $1,%ymm10,%xmm3

        # xor and write second block
        vmovdqa %xmm0,%xmm10
        cmp $0x50,%rax
        jl .Lxorpart4
        vpxor 0x40(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x40(%rsi)

        vmovdqa %xmm1,%xmm10
        cmp $0x60,%rax
        jl .Lxorpart4
        vpxor 0x50(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x50(%rsi)

        vmovdqa %xmm2,%xmm10
        cmp $0x70,%rax
        jl .Lxorpart4
        vpxor 0x60(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x60(%rsi)

        vmovdqa %xmm3,%xmm10
        cmp $0x80,%rax
        jl .Lxorpart4
        vpxor 0x70(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x70(%rsi)

        # o0 = i0 ^ (x0 + s0), third block
        vpaddd %ymm11,%ymm4,%ymm10
        cmp $0x90,%rax
        jl .Lxorpart4
        vpxor 0x80(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x80(%rsi)
        vextracti128 $1,%ymm10,%xmm4
        # o1 = i1 ^ (x1 + s1), third block
        vpaddd %ymm12,%ymm5,%ymm10
        cmp $0xa0,%rax
        jl .Lxorpart4
        vpxor 0x90(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x90(%rsi)
        vextracti128 $1,%ymm10,%xmm5
        # o2 = i2 ^ (x2 + s2), third block
        vpaddd %ymm13,%ymm6,%ymm10
        cmp $0xb0,%rax
        jl .Lxorpart4
        vpxor 0xa0(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0xa0(%rsi)
        vextracti128 $1,%ymm10,%xmm6
        # o3 = i3 ^ (x3 + s3), third block
        vpaddd %ymm15,%ymm7,%ymm10
        cmp $0xc0,%rax
        jl .Lxorpart4
        vpxor 0xb0(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0xb0(%rsi)
        vextracti128 $1,%ymm10,%xmm7

        # xor and write fourth block
        vmovdqa %xmm4,%xmm10
        cmp $0xd0,%rax
        jl .Lxorpart4
        vpxor 0xc0(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0xc0(%rsi)

        vmovdqa %xmm5,%xmm10
        cmp $0xe0,%rax
        jl .Lxorpart4
        vpxor 0xd0(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0xd0(%rsi)

        vmovdqa %xmm6,%xmm10
        cmp $0xf0,%rax
        jl .Lxorpart4
        vpxor 0xe0(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0xe0(%rsi)

        vmovdqa %xmm7,%xmm10
        cmp $0x100,%rax
        jl .Lxorpart4
        vpxor 0xf0(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0xf0(%rsi)

.Ldone4:
        vzeroupper
        RET

.Lxorpart4:
        # xor remaining bytes from partial register into output
        mov %rax,%r9
        and $0x0f,%r9
        jz .Ldone4
        and $~0x0f,%rax

        mov %rsi,%r11

        lea 8(%rsp),%r10
        sub $0x10,%rsp
        and $~31,%rsp

        lea (%rdx,%rax),%rsi
        mov %rsp,%rdi
        mov %r9,%rcx
        rep movsb

        vpxor 0x00(%rsp),%xmm10,%xmm10
        vmovdqa %xmm10,0x00(%rsp)

        mov %rsp,%rsi
        lea (%r11,%rax),%rdi
        mov %r9,%rcx
        rep movsb

        lea -8(%r10),%rsp
        jmp .Ldone4

SYM_FUNC_END(chacha_4block_xor_avx2)

SYM_FUNC_START(chacha_8block_xor_avx2)
        # %rdi: Input state matrix, s
        # %rsi: up to 8 data blocks output, o
        # %rdx: up to 8 data blocks input, i
        # %rcx: input/output length in bytes
        # %r8d: nrounds

        # This function encrypts eight consecutive ChaCha blocks by loading
        # the state matrix in AVX registers eight times. As we need some
        # scratch registers, we save the first four registers on the stack. The
        # algorithm performs each operation on the corresponding word of each
        # state matrix, hence requires no word shuffling. For the final XORing
        # step we transpose the matrix by interleaving 32-, 64- and then 128-bit
        # words, which allows us to do XOR in AVX registers. 8/16-bit word
        # rotation is done with the slightly better performing byte shuffling,
        # while 7/12-bit word rotation uses the traditional shift+OR.
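        #
        # In this layout each ymm register (or 32-byte stack slot for rows
        # x0..x3) holds the same state word for all eight blocks, one dword
        # per lane. After the rounds, the vpunpck{l,h}dq / vpunpck{l,h}qdq /
        # vperm2i128 stages transpose this word-sliced representation back so
        # that every 32-byte register holds a contiguous half of one block,
        # which can then be XORed directly against the input.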

        vzeroupper
        # 4 * 32 byte stack, 32-byte aligned
        lea 8(%rsp),%r10
        and $~31, %rsp
        sub $0x80, %rsp
        mov %rcx,%rax

        # x0..15[0-7] = s[0..15]
        vpbroadcastd 0x00(%rdi),%ymm0
        vpbroadcastd 0x04(%rdi),%ymm1
        vpbroadcastd 0x08(%rdi),%ymm2
        vpbroadcastd 0x0c(%rdi),%ymm3
        vpbroadcastd 0x10(%rdi),%ymm4
        vpbroadcastd 0x14(%rdi),%ymm5
        vpbroadcastd 0x18(%rdi),%ymm6
        vpbroadcastd 0x1c(%rdi),%ymm7
        vpbroadcastd 0x20(%rdi),%ymm8
        vpbroadcastd 0x24(%rdi),%ymm9
        vpbroadcastd 0x28(%rdi),%ymm10
        vpbroadcastd 0x2c(%rdi),%ymm11
        vpbroadcastd 0x30(%rdi),%ymm12
        vpbroadcastd 0x34(%rdi),%ymm13
        vpbroadcastd 0x38(%rdi),%ymm14
        vpbroadcastd 0x3c(%rdi),%ymm15
        # x0..3 on stack
        vmovdqa %ymm0,0x00(%rsp)
        vmovdqa %ymm1,0x20(%rsp)
        vmovdqa %ymm2,0x40(%rsp)
        vmovdqa %ymm3,0x60(%rsp)

        vmovdqa CTRINC(%rip),%ymm1
        vmovdqa ROT8(%rip),%ymm2
        vmovdqa ROT16(%rip),%ymm3

        # x12 += counter values 0-7
        vpaddd %ymm1,%ymm12,%ymm12

.Ldoubleround8:
        # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
        vpaddd 0x00(%rsp),%ymm4,%ymm0
        vmovdqa %ymm0,0x00(%rsp)
        vpxor %ymm0,%ymm12,%ymm12
        vpshufb %ymm3,%ymm12,%ymm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
        vpaddd 0x20(%rsp),%ymm5,%ymm0
        vmovdqa %ymm0,0x20(%rsp)
        vpxor %ymm0,%ymm13,%ymm13
        vpshufb %ymm3,%ymm13,%ymm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
        vpaddd 0x40(%rsp),%ymm6,%ymm0
        vmovdqa %ymm0,0x40(%rsp)
        vpxor %ymm0,%ymm14,%ymm14
        vpshufb %ymm3,%ymm14,%ymm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
        vpaddd 0x60(%rsp),%ymm7,%ymm0
        vmovdqa %ymm0,0x60(%rsp)
        vpxor %ymm0,%ymm15,%ymm15
        vpshufb %ymm3,%ymm15,%ymm15

        # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
        vpaddd %ymm12,%ymm8,%ymm8
        vpxor %ymm8,%ymm4,%ymm4
        vpslld $12,%ymm4,%ymm0
        vpsrld $20,%ymm4,%ymm4
        vpor %ymm0,%ymm4,%ymm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
        vpaddd %ymm13,%ymm9,%ymm9
        vpxor %ymm9,%ymm5,%ymm5
        vpslld $12,%ymm5,%ymm0
        vpsrld $20,%ymm5,%ymm5
        vpor %ymm0,%ymm5,%ymm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
        vpaddd %ymm14,%ymm10,%ymm10
        vpxor %ymm10,%ymm6,%ymm6
        vpslld $12,%ymm6,%ymm0
        vpsrld $20,%ymm6,%ymm6
        vpor %ymm0,%ymm6,%ymm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
        vpaddd %ymm15,%ymm11,%ymm11
        vpxor %ymm11,%ymm7,%ymm7
        vpslld $12,%ymm7,%ymm0
        vpsrld $20,%ymm7,%ymm7
        vpor %ymm0,%ymm7,%ymm7

        # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
        vpaddd 0x00(%rsp),%ymm4,%ymm0
        vmovdqa %ymm0,0x00(%rsp)
        vpxor %ymm0,%ymm12,%ymm12
        vpshufb %ymm2,%ymm12,%ymm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
        vpaddd 0x20(%rsp),%ymm5,%ymm0
        vmovdqa %ymm0,0x20(%rsp)
        vpxor %ymm0,%ymm13,%ymm13
        vpshufb %ymm2,%ymm13,%ymm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
        vpaddd 0x40(%rsp),%ymm6,%ymm0
        vmovdqa %ymm0,0x40(%rsp)
        vpxor %ymm0,%ymm14,%ymm14
        vpshufb %ymm2,%ymm14,%ymm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
        vpaddd 0x60(%rsp),%ymm7,%ymm0
        vmovdqa %ymm0,0x60(%rsp)
        vpxor %ymm0,%ymm15,%ymm15
        vpshufb %ymm2,%ymm15,%ymm15

        # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
        vpaddd %ymm12,%ymm8,%ymm8
        vpxor %ymm8,%ymm4,%ymm4
        vpslld $7,%ymm4,%ymm0
        vpsrld $25,%ymm4,%ymm4
        vpor %ymm0,%ymm4,%ymm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
        vpaddd %ymm13,%ymm9,%ymm9
        vpxor %ymm9,%ymm5,%ymm5
        vpslld $7,%ymm5,%ymm0
        vpsrld $25,%ymm5,%ymm5
        vpor %ymm0,%ymm5,%ymm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
        vpaddd %ymm14,%ymm10,%ymm10
        vpxor %ymm10,%ymm6,%ymm6
        vpslld $7,%ymm6,%ymm0
        vpsrld $25,%ymm6,%ymm6
        vpor %ymm0,%ymm6,%ymm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
        vpaddd %ymm15,%ymm11,%ymm11
        vpxor %ymm11,%ymm7,%ymm7
        vpslld $7,%ymm7,%ymm0
        vpsrld $25,%ymm7,%ymm7
        vpor %ymm0,%ymm7,%ymm7
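
        # The first half above is the column round; the second half below
        # applies the same quarter-round to the diagonals, i.e. to the word
        # groups (x0,x5,x10,x15), (x1,x6,x11,x12), (x2,x7,x8,x13) and
        # (x3,x4,x9,x14).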

        # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
        vpaddd 0x00(%rsp),%ymm5,%ymm0
        vmovdqa %ymm0,0x00(%rsp)
        vpxor %ymm0,%ymm15,%ymm15
        vpshufb %ymm3,%ymm15,%ymm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
        vpaddd 0x20(%rsp),%ymm6,%ymm0
        vmovdqa %ymm0,0x20(%rsp)
        vpxor %ymm0,%ymm12,%ymm12
        vpshufb %ymm3,%ymm12,%ymm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
        vpaddd 0x40(%rsp),%ymm7,%ymm0
        vmovdqa %ymm0,0x40(%rsp)
        vpxor %ymm0,%ymm13,%ymm13
        vpshufb %ymm3,%ymm13,%ymm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
        vpaddd 0x60(%rsp),%ymm4,%ymm0
        vmovdqa %ymm0,0x60(%rsp)
        vpxor %ymm0,%ymm14,%ymm14
        vpshufb %ymm3,%ymm14,%ymm14

        # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
        vpaddd %ymm15,%ymm10,%ymm10
        vpxor %ymm10,%ymm5,%ymm5
        vpslld $12,%ymm5,%ymm0
        vpsrld $20,%ymm5,%ymm5
        vpor %ymm0,%ymm5,%ymm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
        vpaddd %ymm12,%ymm11,%ymm11
        vpxor %ymm11,%ymm6,%ymm6
        vpslld $12,%ymm6,%ymm0
        vpsrld $20,%ymm6,%ymm6
        vpor %ymm0,%ymm6,%ymm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
        vpaddd %ymm13,%ymm8,%ymm8
        vpxor %ymm8,%ymm7,%ymm7
        vpslld $12,%ymm7,%ymm0
        vpsrld $20,%ymm7,%ymm7
        vpor %ymm0,%ymm7,%ymm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
        vpaddd %ymm14,%ymm9,%ymm9
        vpxor %ymm9,%ymm4,%ymm4
        vpslld $12,%ymm4,%ymm0
        vpsrld $20,%ymm4,%ymm4
        vpor %ymm0,%ymm4,%ymm4

        # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
        vpaddd 0x00(%rsp),%ymm5,%ymm0
        vmovdqa %ymm0,0x00(%rsp)
        vpxor %ymm0,%ymm15,%ymm15
        vpshufb %ymm2,%ymm15,%ymm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
        vpaddd 0x20(%rsp),%ymm6,%ymm0
        vmovdqa %ymm0,0x20(%rsp)
        vpxor %ymm0,%ymm12,%ymm12
        vpshufb %ymm2,%ymm12,%ymm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
        vpaddd 0x40(%rsp),%ymm7,%ymm0
        vmovdqa %ymm0,0x40(%rsp)
        vpxor %ymm0,%ymm13,%ymm13
        vpshufb %ymm2,%ymm13,%ymm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
        vpaddd 0x60(%rsp),%ymm4,%ymm0
        vmovdqa %ymm0,0x60(%rsp)
        vpxor %ymm0,%ymm14,%ymm14
        vpshufb %ymm2,%ymm14,%ymm14

        # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
        vpaddd %ymm15,%ymm10,%ymm10
        vpxor %ymm10,%ymm5,%ymm5
        vpslld $7,%ymm5,%ymm0
        vpsrld $25,%ymm5,%ymm5
        vpor %ymm0,%ymm5,%ymm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
        vpaddd %ymm12,%ymm11,%ymm11
        vpxor %ymm11,%ymm6,%ymm6
        vpslld $7,%ymm6,%ymm0
        vpsrld $25,%ymm6,%ymm6
        vpor %ymm0,%ymm6,%ymm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
        vpaddd %ymm13,%ymm8,%ymm8
        vpxor %ymm8,%ymm7,%ymm7
        vpslld $7,%ymm7,%ymm0
        vpsrld $25,%ymm7,%ymm7
        vpor %ymm0,%ymm7,%ymm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
        vpaddd %ymm14,%ymm9,%ymm9
        vpxor %ymm9,%ymm4,%ymm4
        vpslld $7,%ymm4,%ymm0
        vpsrld $25,%ymm4,%ymm4
        vpor %ymm0,%ymm4,%ymm4

        sub $2,%r8d
        jnz .Ldoubleround8

        # x0..15[0-7] += s[0..15]
        vpbroadcastd 0x00(%rdi),%ymm0
        vpaddd 0x00(%rsp),%ymm0,%ymm0
        vmovdqa %ymm0,0x00(%rsp)
        vpbroadcastd 0x04(%rdi),%ymm0
        vpaddd 0x20(%rsp),%ymm0,%ymm0
        vmovdqa %ymm0,0x20(%rsp)
        vpbroadcastd 0x08(%rdi),%ymm0
        vpaddd 0x40(%rsp),%ymm0,%ymm0
        vmovdqa %ymm0,0x40(%rsp)
        vpbroadcastd 0x0c(%rdi),%ymm0
        vpaddd 0x60(%rsp),%ymm0,%ymm0
        vmovdqa %ymm0,0x60(%rsp)
        vpbroadcastd 0x10(%rdi),%ymm0
        vpaddd %ymm0,%ymm4,%ymm4
        vpbroadcastd 0x14(%rdi),%ymm0
        vpaddd %ymm0,%ymm5,%ymm5
        vpbroadcastd 0x18(%rdi),%ymm0
        vpaddd %ymm0,%ymm6,%ymm6
        vpbroadcastd 0x1c(%rdi),%ymm0
        vpaddd %ymm0,%ymm7,%ymm7
        vpbroadcastd 0x20(%rdi),%ymm0
        vpaddd %ymm0,%ymm8,%ymm8
        vpbroadcastd 0x24(%rdi),%ymm0
        vpaddd %ymm0,%ymm9,%ymm9
        vpbroadcastd 0x28(%rdi),%ymm0
        vpaddd %ymm0,%ymm10,%ymm10
        vpbroadcastd 0x2c(%rdi),%ymm0
        vpaddd %ymm0,%ymm11,%ymm11
        vpbroadcastd 0x30(%rdi),%ymm0
        vpaddd %ymm0,%ymm12,%ymm12
        vpbroadcastd 0x34(%rdi),%ymm0
        vpaddd %ymm0,%ymm13,%ymm13
        vpbroadcastd 0x38(%rdi),%ymm0
        vpaddd %ymm0,%ymm14,%ymm14
        vpbroadcastd 0x3c(%rdi),%ymm0
        vpaddd %ymm0,%ymm15,%ymm15
        # x12 += counter values 0-7
        vpaddd %ymm1,%ymm12,%ymm12

        # interleave 32-bit words in state n, n+1
        vmovdqa 0x00(%rsp),%ymm0
        vmovdqa 0x20(%rsp),%ymm1
        vpunpckldq %ymm1,%ymm0,%ymm2
        vpunpckhdq %ymm1,%ymm0,%ymm1
        vmovdqa %ymm2,0x00(%rsp)
        vmovdqa %ymm1,0x20(%rsp)
        vmovdqa 0x40(%rsp),%ymm0
        vmovdqa 0x60(%rsp),%ymm1
        vpunpckldq %ymm1,%ymm0,%ymm2
        vpunpckhdq %ymm1,%ymm0,%ymm1
        vmovdqa %ymm2,0x40(%rsp)
        vmovdqa %ymm1,0x60(%rsp)
        vmovdqa %ymm4,%ymm0
        vpunpckldq %ymm5,%ymm0,%ymm4
        vpunpckhdq %ymm5,%ymm0,%ymm5
        vmovdqa %ymm6,%ymm0
        vpunpckldq %ymm7,%ymm0,%ymm6
        vpunpckhdq %ymm7,%ymm0,%ymm7
        vmovdqa %ymm8,%ymm0
        vpunpckldq %ymm9,%ymm0,%ymm8
        vpunpckhdq %ymm9,%ymm0,%ymm9
        vmovdqa %ymm10,%ymm0
        vpunpckldq %ymm11,%ymm0,%ymm10
        vpunpckhdq %ymm11,%ymm0,%ymm11
        vmovdqa %ymm12,%ymm0
        vpunpckldq %ymm13,%ymm0,%ymm12
        vpunpckhdq %ymm13,%ymm0,%ymm13
        vmovdqa %ymm14,%ymm0
        vpunpckldq %ymm15,%ymm0,%ymm14
        vpunpckhdq %ymm15,%ymm0,%ymm15

        # interleave 64-bit words in state n, n+2
        vmovdqa 0x00(%rsp),%ymm0
        vmovdqa 0x40(%rsp),%ymm2
        vpunpcklqdq %ymm2,%ymm0,%ymm1
        vpunpckhqdq %ymm2,%ymm0,%ymm2
        vmovdqa %ymm1,0x00(%rsp)
        vmovdqa %ymm2,0x40(%rsp)
        vmovdqa 0x20(%rsp),%ymm0
        vmovdqa 0x60(%rsp),%ymm2
        vpunpcklqdq %ymm2,%ymm0,%ymm1
        vpunpckhqdq %ymm2,%ymm0,%ymm2
        vmovdqa %ymm1,0x20(%rsp)
        vmovdqa %ymm2,0x60(%rsp)
        vmovdqa %ymm4,%ymm0
        vpunpcklqdq %ymm6,%ymm0,%ymm4
        vpunpckhqdq %ymm6,%ymm0,%ymm6
        vmovdqa %ymm5,%ymm0
        vpunpcklqdq %ymm7,%ymm0,%ymm5
        vpunpckhqdq %ymm7,%ymm0,%ymm7
        vmovdqa %ymm8,%ymm0
        vpunpcklqdq %ymm10,%ymm0,%ymm8
        vpunpckhqdq %ymm10,%ymm0,%ymm10
        vmovdqa %ymm9,%ymm0
        vpunpcklqdq %ymm11,%ymm0,%ymm9
        vpunpckhqdq %ymm11,%ymm0,%ymm11
        vmovdqa %ymm12,%ymm0
        vpunpcklqdq %ymm14,%ymm0,%ymm12
        vpunpckhqdq %ymm14,%ymm0,%ymm14
        vmovdqa %ymm13,%ymm0
        vpunpcklqdq %ymm15,%ymm0,%ymm13
        vpunpckhqdq %ymm15,%ymm0,%ymm15

        # interleave 128-bit words in state n, n+4
        # xor/write first four blocks
        vmovdqa 0x00(%rsp),%ymm1
        vperm2i128 $0x20,%ymm4,%ymm1,%ymm0
        cmp $0x0020,%rax
        jl .Lxorpart8
        vpxor 0x0000(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x0000(%rsi)
        vperm2i128 $0x31,%ymm4,%ymm1,%ymm4

        vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
        cmp $0x0040,%rax
        jl .Lxorpart8
        vpxor 0x0020(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x0020(%rsi)
        vperm2i128 $0x31,%ymm12,%ymm8,%ymm12

        vmovdqa 0x40(%rsp),%ymm1
        vperm2i128 $0x20,%ymm6,%ymm1,%ymm0
        cmp $0x0060,%rax
        jl .Lxorpart8
        vpxor 0x0040(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x0040(%rsi)
        vperm2i128 $0x31,%ymm6,%ymm1,%ymm6

        vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
        cmp $0x0080,%rax
        jl .Lxorpart8
        vpxor 0x0060(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x0060(%rsi)
        vperm2i128 $0x31,%ymm14,%ymm10,%ymm14

        vmovdqa 0x20(%rsp),%ymm1
        vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
        cmp $0x00a0,%rax
        jl .Lxorpart8
        vpxor 0x0080(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x0080(%rsi)
        vperm2i128 $0x31,%ymm5,%ymm1,%ymm5

        vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
        cmp $0x00c0,%rax
        jl .Lxorpart8
        vpxor 0x00a0(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x00a0(%rsi)
        vperm2i128 $0x31,%ymm13,%ymm9,%ymm13

        vmovdqa 0x60(%rsp),%ymm1
        vperm2i128 $0x20,%ymm7,%ymm1,%ymm0
        cmp $0x00e0,%rax
        jl .Lxorpart8
        vpxor 0x00c0(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x00c0(%rsi)
        vperm2i128 $0x31,%ymm7,%ymm1,%ymm7

        vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
        cmp $0x0100,%rax
        jl .Lxorpart8
        vpxor 0x00e0(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x00e0(%rsi)
        vperm2i128 $0x31,%ymm15,%ymm11,%ymm15

        # xor remaining blocks, write to output
        vmovdqa %ymm4,%ymm0
        cmp $0x0120,%rax
        jl .Lxorpart8
        vpxor 0x0100(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x0100(%rsi)

        vmovdqa %ymm12,%ymm0
        cmp $0x0140,%rax
        jl .Lxorpart8
        vpxor 0x0120(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x0120(%rsi)

        vmovdqa %ymm6,%ymm0
        cmp $0x0160,%rax
        jl .Lxorpart8
        vpxor 0x0140(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x0140(%rsi)

        vmovdqa %ymm14,%ymm0
        cmp $0x0180,%rax
        jl .Lxorpart8
        vpxor 0x0160(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x0160(%rsi)

        vmovdqa %ymm5,%ymm0
        cmp $0x01a0,%rax
        jl .Lxorpart8
        vpxor 0x0180(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x0180(%rsi)

        vmovdqa %ymm13,%ymm0
        cmp $0x01c0,%rax
        jl .Lxorpart8
        vpxor 0x01a0(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x01a0(%rsi)

        vmovdqa %ymm7,%ymm0
        cmp $0x01e0,%rax
        jl .Lxorpart8
        vpxor 0x01c0(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x01c0(%rsi)

        vmovdqa %ymm15,%ymm0
        cmp $0x0200,%rax
        jl .Lxorpart8
        vpxor 0x01e0(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x01e0(%rsi)

.Ldone8:
        vzeroupper
        lea -8(%r10),%rsp
        RET

.Lxorpart8:
        # xor remaining bytes from partial register into output
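        #
        # Same bounce-buffer scheme as the 2- and 4-block tails, but on
        # 32-byte granularity: the partial chunk is copied to the 32-byte
        # aligned scratch area already set up at function entry, XORed with
        # the full ymm keystream register, and copied out, so no extra stack
        # adjustment is needed here.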
        mov %rax,%r9
        and $0x1f,%r9
        jz .Ldone8
        and $~0x1f,%rax

        mov %rsi,%r11

        lea (%rdx,%rax),%rsi
        mov %rsp,%rdi
        mov %r9,%rcx
        rep movsb

        vpxor 0x00(%rsp),%ymm0,%ymm0
        vmovdqa %ymm0,0x00(%rsp)

        mov %rsp,%rsi
        lea (%r11,%rax),%rdi
        mov %r9,%rcx
        rep movsb

        jmp .Ldone8

SYM_FUNC_END(chacha_8block_xor_avx2)