aria-aesni-avx-asm_64.S

  1. /* SPDX-License-Identifier: GPL-2.0-or-later */
  2. /*
  3. * ARIA Cipher 16-way parallel algorithm (AVX)
  4. *
  5. * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
  6. *
  7. */
  8. #include <linux/linkage.h>
  9. #include <linux/cfi_types.h>
  10. #include <asm/frame.h>
  11. /* struct aria_ctx: */
  12. #define enc_key 0
  13. #define dec_key 272
  14. #define rounds 544
  15. /* register macros */
  16. #define CTX %rdi
  17. #define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
  18. ( (((a0) & 1) << 0) | \
  19. (((a1) & 1) << 1) | \
  20. (((a2) & 1) << 2) | \
  21. (((a3) & 1) << 3) | \
  22. (((a4) & 1) << 4) | \
  23. (((a5) & 1) << 5) | \
  24. (((a6) & 1) << 6) | \
  25. (((a7) & 1) << 7) )
  26. #define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
  27. ( ((l7) << (0 * 8)) | \
  28. ((l6) << (1 * 8)) | \
  29. ((l5) << (2 * 8)) | \
  30. ((l4) << (3 * 8)) | \
  31. ((l3) << (4 * 8)) | \
  32. ((l2) << (5 * 8)) | \
  33. ((l1) << (6 * 8)) | \
  34. ((l0) << (7 * 8)) )
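
BV8() packs eight 0/1 values into one byte (a0 into bit 0 through a7 into bit 7), and BM8X8() packs eight such row bytes into a 64-bit constant with l0 landing in the most significant byte, matching the layout the vgf2p8affineqb/vgf2p8affineinvqb matrix operand expects below. A minimal C sketch of the same packing (the helper names bv8/bm8x8 are illustrative, not from the kernel):

#include <stdint.h>
#include <stdio.h>

/* Pack eight 0/1 values into one byte, a0 -> bit 0 ... a7 -> bit 7
 * (mirrors the BV8 macro above). */
static uint8_t bv8(const int a[8])
{
        uint8_t b = 0;

        for (int i = 0; i < 8; i++)
                b |= (uint8_t)((a[i] & 1) << i);
        return b;
}

/* Pack eight row bytes into a 64-bit matrix constant, row l0 in the
 * most significant byte (mirrors BM8X8). */
static uint64_t bm8x8(const uint8_t l[8])
{
        uint64_t m = 0;

        for (int i = 0; i < 8; i++)
                m |= (uint64_t)l[i] << ((7 - i) * 8);
        return m;
}

int main(void)
{
        /* the eight rows of .Ltf_aff_bitmatrix from the data section below */
        int rows[8][8] = {
                { 1, 0, 0, 0, 1, 1, 1, 1 }, { 1, 1, 0, 0, 0, 1, 1, 1 },
                { 1, 1, 1, 0, 0, 0, 1, 1 }, { 1, 1, 1, 1, 0, 0, 0, 1 },
                { 1, 1, 1, 1, 1, 0, 0, 0 }, { 0, 1, 1, 1, 1, 1, 0, 0 },
                { 0, 0, 1, 1, 1, 1, 1, 0 }, { 0, 0, 0, 1, 1, 1, 1, 1 },
        };
        uint8_t packed[8];

        for (int i = 0; i < 8; i++)
                packed[i] = bv8(rows[i]);
        /* prints 0xf1e3c78f1f3e7cf8, the packed AES forward affine matrix */
        printf("0x%016llx\n", (unsigned long long)bm8x8(packed));
        return 0;
}
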
  35. #define inc_le128(x, minus_one, tmp) \
  36. vpcmpeqq minus_one, x, tmp; \
  37. vpsubq minus_one, x, x; \
  38. vpslldq $8, tmp, tmp; \
  39. vpsubq tmp, x, x;
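
inc_le128() adds 1 to a 128-bit little-endian counter held in an XMM register: the caller's minus_one constant is {-1, 0} (low lane all-ones, high lane zero), so the first vpsubq adds 1 to the low 64 bits, while vpcmpeqq plus vpslldq turn "the low lane was all-ones" into a value that the final vpsubq uses to carry into the high 64 bits. A scalar C sketch, using a hypothetical two-limb struct:

#include <stdint.h>

/* 128-bit little-endian counter as two 64-bit limbs, low limb first
 * (illustrative struct, not from the kernel). */
struct u128_le {
        uint64_t lo;
        uint64_t hi;
};

/* Scalar equivalent of inc_le128(): add 1 with carry from lo to hi. */
static void inc_le128(struct u128_le *x)
{
        uint64_t carry = (x->lo == UINT64_MAX); /* vpcmpeqq + vpslldq derive the carry */

        x->lo += 1;     /* vpsubq of the low-lane -1 adds 1 */
        x->hi += carry; /* final vpsubq applies the carry */
}
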
  40. #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
  41. vpand x, mask4bit, tmp0; \
  42. vpandn x, mask4bit, x; \
  43. vpsrld $4, x, x; \
  44. \
  45. vpshufb tmp0, lo_t, tmp0; \
  46. vpshufb x, hi_t, x; \
  47. vpxor tmp0, x, x;
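
filter_8bit() evaluates a byte-to-byte map by splitting each byte into nibbles: the low nibble indexes lo_t, the high nibble (isolated with mask4bit = 0x0f and shifted down) indexes hi_t, and the two vpshufb lookups are XORed. Any GF(2)-affine byte map splits this way, since it is linear in each nibble with the additive constant folded into one table, which is how the .Ltf_lo__*/.Ltf_hi__* constants below are used. A scalar C sketch:

#include <stdint.h>

/*
 * Scalar model of filter_8bit(): f(x) = lo_t[x & 0xf] ^ hi_t[x >> 4],
 * with lo_t/hi_t being 16-entry tables such as
 * .Ltf_lo__inv_aff__and__s2 / .Ltf_hi__inv_aff__and__s2.
 */
static uint8_t filter_8bit(uint8_t x, const uint8_t lo_t[16],
                           const uint8_t hi_t[16])
{
        return lo_t[x & 0x0f] ^ hi_t[x >> 4];
}
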
  48. #define transpose_4x4(x0, x1, x2, x3, t1, t2) \
  49. vpunpckhdq x1, x0, t2; \
  50. vpunpckldq x1, x0, x0; \
  51. \
  52. vpunpckldq x3, x2, t1; \
  53. vpunpckhdq x3, x2, x2; \
  54. \
  55. vpunpckhqdq t1, x0, x1; \
  56. vpunpcklqdq t1, x0, x0; \
  57. \
  58. vpunpckhqdq x2, t2, x3; \
  59. vpunpcklqdq x2, t2, x2;
  60. #define byteslice_16x16b(a0, b0, c0, d0, \
  61. a1, b1, c1, d1, \
  62. a2, b2, c2, d2, \
  63. a3, b3, c3, d3, \
  64. st0, st1) \
  65. vmovdqu d2, st0; \
  66. vmovdqu d3, st1; \
  67. transpose_4x4(a0, a1, a2, a3, d2, d3); \
  68. transpose_4x4(b0, b1, b2, b3, d2, d3); \
  69. vmovdqu st0, d2; \
  70. vmovdqu st1, d3; \
  71. \
  72. vmovdqu a0, st0; \
  73. vmovdqu a1, st1; \
  74. transpose_4x4(c0, c1, c2, c3, a0, a1); \
  75. transpose_4x4(d0, d1, d2, d3, a0, a1); \
  76. \
  77. vmovdqu .Lshufb_16x16b, a0; \
  78. vmovdqu st1, a1; \
  79. vpshufb a0, a2, a2; \
  80. vpshufb a0, a3, a3; \
  81. vpshufb a0, b0, b0; \
  82. vpshufb a0, b1, b1; \
  83. vpshufb a0, b2, b2; \
  84. vpshufb a0, b3, b3; \
  85. vpshufb a0, a1, a1; \
  86. vpshufb a0, c0, c0; \
  87. vpshufb a0, c1, c1; \
  88. vpshufb a0, c2, c2; \
  89. vpshufb a0, c3, c3; \
  90. vpshufb a0, d0, d0; \
  91. vpshufb a0, d1, d1; \
  92. vpshufb a0, d2, d2; \
  93. vpshufb a0, d3, d3; \
  94. vmovdqu d3, st1; \
  95. vmovdqu st0, d3; \
  96. vpshufb a0, d3, a0; \
  97. vmovdqu d2, st0; \
  98. \
  99. transpose_4x4(a0, b0, c0, d0, d2, d3); \
  100. transpose_4x4(a1, b1, c1, d1, d2, d3); \
  101. vmovdqu st0, d2; \
  102. vmovdqu st1, d3; \
  103. \
  104. vmovdqu b0, st0; \
  105. vmovdqu b1, st1; \
  106. transpose_4x4(a2, b2, c2, d2, b0, b1); \
  107. transpose_4x4(a3, b3, c3, d3, b0, b1); \
  108. vmovdqu st0, b0; \
  109. vmovdqu st1, b1; \
  110. /* does not adjust output bytes inside vectors */
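
byteslice_16x16b() converts the 16 loaded blocks (one 16-byte block per register) into byte-sliced form: afterwards each register holds one byte position gathered from all 16 blocks, so a single vector instruction processes the same byte of every block. Ignoring the in-register ordering details (.Lshufb_16x16b and the a/b/c/d register groups), the net effect is a 16x16 byte transpose; a scalar C sketch of that effect:

#include <stdint.h>

/*
 * Scalar model of byte-slicing: in[b][i] is byte i of block b,
 * out[i][b] gathers byte position i across all 16 blocks (roughly
 * what one state register holds after byteslice_16x16b).
 */
static void byteslice_16x16(uint8_t out[16][16], const uint8_t in[16][16])
{
        for (int block = 0; block < 16; block++)
                for (int byte = 0; byte < 16; byte++)
                        out[byte][block] = in[block][byte];
}
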
  111. #define debyteslice_16x16b(a0, b0, c0, d0, \
  112. a1, b1, c1, d1, \
  113. a2, b2, c2, d2, \
  114. a3, b3, c3, d3, \
  115. st0, st1) \
  116. vmovdqu d2, st0; \
  117. vmovdqu d3, st1; \
  118. transpose_4x4(a0, a1, a2, a3, d2, d3); \
  119. transpose_4x4(b0, b1, b2, b3, d2, d3); \
  120. vmovdqu st0, d2; \
  121. vmovdqu st1, d3; \
  122. \
  123. vmovdqu a0, st0; \
  124. vmovdqu a1, st1; \
  125. transpose_4x4(c0, c1, c2, c3, a0, a1); \
  126. transpose_4x4(d0, d1, d2, d3, a0, a1); \
  127. \
  128. vmovdqu .Lshufb_16x16b, a0; \
  129. vmovdqu st1, a1; \
  130. vpshufb a0, a2, a2; \
  131. vpshufb a0, a3, a3; \
  132. vpshufb a0, b0, b0; \
  133. vpshufb a0, b1, b1; \
  134. vpshufb a0, b2, b2; \
  135. vpshufb a0, b3, b3; \
  136. vpshufb a0, a1, a1; \
  137. vpshufb a0, c0, c0; \
  138. vpshufb a0, c1, c1; \
  139. vpshufb a0, c2, c2; \
  140. vpshufb a0, c3, c3; \
  141. vpshufb a0, d0, d0; \
  142. vpshufb a0, d1, d1; \
  143. vpshufb a0, d2, d2; \
  144. vpshufb a0, d3, d3; \
  145. vmovdqu d3, st1; \
  146. vmovdqu st0, d3; \
  147. vpshufb a0, d3, a0; \
  148. vmovdqu d2, st0; \
  149. \
  150. transpose_4x4(c0, d0, a0, b0, d2, d3); \
  151. transpose_4x4(c1, d1, a1, b1, d2, d3); \
  152. vmovdqu st0, d2; \
  153. vmovdqu st1, d3; \
  154. \
  155. vmovdqu b0, st0; \
  156. vmovdqu b1, st1; \
  157. transpose_4x4(c2, d2, a2, b2, b0, b1); \
  158. transpose_4x4(c3, d3, a3, b3, b0, b1); \
  159. vmovdqu st0, b0; \
  160. vmovdqu st1, b1; \
  161. /* does not adjust output bytes inside vectors */
  162. /* load blocks to registers and apply pre-whitening */
  163. #define inpack16_pre(x0, x1, x2, x3, \
  164. x4, x5, x6, x7, \
  165. y0, y1, y2, y3, \
  166. y4, y5, y6, y7, \
  167. rio) \
  168. vmovdqu (0 * 16)(rio), x0; \
  169. vmovdqu (1 * 16)(rio), x1; \
  170. vmovdqu (2 * 16)(rio), x2; \
  171. vmovdqu (3 * 16)(rio), x3; \
  172. vmovdqu (4 * 16)(rio), x4; \
  173. vmovdqu (5 * 16)(rio), x5; \
  174. vmovdqu (6 * 16)(rio), x6; \
  175. vmovdqu (7 * 16)(rio), x7; \
  176. vmovdqu (8 * 16)(rio), y0; \
  177. vmovdqu (9 * 16)(rio), y1; \
  178. vmovdqu (10 * 16)(rio), y2; \
  179. vmovdqu (11 * 16)(rio), y3; \
  180. vmovdqu (12 * 16)(rio), y4; \
  181. vmovdqu (13 * 16)(rio), y5; \
  182. vmovdqu (14 * 16)(rio), y6; \
  183. vmovdqu (15 * 16)(rio), y7;
  184. /* byteslice pre-whitened blocks and store to temporary memory */
  185. #define inpack16_post(x0, x1, x2, x3, \
  186. x4, x5, x6, x7, \
  187. y0, y1, y2, y3, \
  188. y4, y5, y6, y7, \
  189. mem_ab, mem_cd) \
  190. byteslice_16x16b(x0, x1, x2, x3, \
  191. x4, x5, x6, x7, \
  192. y0, y1, y2, y3, \
  193. y4, y5, y6, y7, \
  194. (mem_ab), (mem_cd)); \
  195. \
  196. vmovdqu x0, 0 * 16(mem_ab); \
  197. vmovdqu x1, 1 * 16(mem_ab); \
  198. vmovdqu x2, 2 * 16(mem_ab); \
  199. vmovdqu x3, 3 * 16(mem_ab); \
  200. vmovdqu x4, 4 * 16(mem_ab); \
  201. vmovdqu x5, 5 * 16(mem_ab); \
  202. vmovdqu x6, 6 * 16(mem_ab); \
  203. vmovdqu x7, 7 * 16(mem_ab); \
  204. vmovdqu y0, 0 * 16(mem_cd); \
  205. vmovdqu y1, 1 * 16(mem_cd); \
  206. vmovdqu y2, 2 * 16(mem_cd); \
  207. vmovdqu y3, 3 * 16(mem_cd); \
  208. vmovdqu y4, 4 * 16(mem_cd); \
  209. vmovdqu y5, 5 * 16(mem_cd); \
  210. vmovdqu y6, 6 * 16(mem_cd); \
  211. vmovdqu y7, 7 * 16(mem_cd);
  212. #define write_output(x0, x1, x2, x3, \
  213. x4, x5, x6, x7, \
  214. y0, y1, y2, y3, \
  215. y4, y5, y6, y7, \
  216. mem) \
  217. vmovdqu x0, 0 * 16(mem); \
  218. vmovdqu x1, 1 * 16(mem); \
  219. vmovdqu x2, 2 * 16(mem); \
  220. vmovdqu x3, 3 * 16(mem); \
  221. vmovdqu x4, 4 * 16(mem); \
  222. vmovdqu x5, 5 * 16(mem); \
  223. vmovdqu x6, 6 * 16(mem); \
  224. vmovdqu x7, 7 * 16(mem); \
  225. vmovdqu y0, 8 * 16(mem); \
  226. vmovdqu y1, 9 * 16(mem); \
  227. vmovdqu y2, 10 * 16(mem); \
  228. vmovdqu y3, 11 * 16(mem); \
  229. vmovdqu y4, 12 * 16(mem); \
  230. vmovdqu y5, 13 * 16(mem); \
  231. vmovdqu y6, 14 * 16(mem); \
  232. vmovdqu y7, 15 * 16(mem);
  233. #define aria_store_state_8way(x0, x1, x2, x3, \
  234. x4, x5, x6, x7, \
  235. mem_tmp, idx) \
  236. vmovdqu x0, ((idx + 0) * 16)(mem_tmp); \
  237. vmovdqu x1, ((idx + 1) * 16)(mem_tmp); \
  238. vmovdqu x2, ((idx + 2) * 16)(mem_tmp); \
  239. vmovdqu x3, ((idx + 3) * 16)(mem_tmp); \
  240. vmovdqu x4, ((idx + 4) * 16)(mem_tmp); \
  241. vmovdqu x5, ((idx + 5) * 16)(mem_tmp); \
  242. vmovdqu x6, ((idx + 6) * 16)(mem_tmp); \
  243. vmovdqu x7, ((idx + 7) * 16)(mem_tmp);
  244. #define aria_load_state_8way(x0, x1, x2, x3, \
  245. x4, x5, x6, x7, \
  246. mem_tmp, idx) \
  247. vmovdqu ((idx + 0) * 16)(mem_tmp), x0; \
  248. vmovdqu ((idx + 1) * 16)(mem_tmp), x1; \
  249. vmovdqu ((idx + 2) * 16)(mem_tmp), x2; \
  250. vmovdqu ((idx + 3) * 16)(mem_tmp), x3; \
  251. vmovdqu ((idx + 4) * 16)(mem_tmp), x4; \
  252. vmovdqu ((idx + 5) * 16)(mem_tmp), x5; \
  253. vmovdqu ((idx + 6) * 16)(mem_tmp), x6; \
  254. vmovdqu ((idx + 7) * 16)(mem_tmp), x7;
  255. #define aria_ark_8way(x0, x1, x2, x3, \
  256. x4, x5, x6, x7, \
  257. t0, rk, idx, round) \
  258. /* AddRoundKey */ \
  259. vpbroadcastb ((round * 16) + idx + 3)(rk), t0; \
  260. vpxor t0, x0, x0; \
  261. vpbroadcastb ((round * 16) + idx + 2)(rk), t0; \
  262. vpxor t0, x1, x1; \
  263. vpbroadcastb ((round * 16) + idx + 1)(rk), t0; \
  264. vpxor t0, x2, x2; \
  265. vpbroadcastb ((round * 16) + idx + 0)(rk), t0; \
  266. vpxor t0, x3, x3; \
  267. vpbroadcastb ((round * 16) + idx + 7)(rk), t0; \
  268. vpxor t0, x4, x4; \
  269. vpbroadcastb ((round * 16) + idx + 6)(rk), t0; \
  270. vpxor t0, x5, x5; \
  271. vpbroadcastb ((round * 16) + idx + 5)(rk), t0; \
  272. vpxor t0, x6, x6; \
  273. vpbroadcastb ((round * 16) + idx + 4)(rk), t0; \
  274. vpxor t0, x7, x7;
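
aria_ark_8way() is AddRoundKey in byte-sliced form: each state register holds one byte position of all 16 blocks, so the matching round-key byte is broadcast with vpbroadcastb and XORed into the whole register. The 3,2,1,0,7,6,5,4 offset pattern presumably reflects the byte order of the 32-bit round-key words relative to the register numbering. A scalar C sketch of the same operation:

#include <stdint.h>

/*
 * Scalar model of aria_ark_8way(): XOR one round-key byte into the
 * same byte position of every block.  state[i][b] is byte position
 * idx + i of block b; perm[] is the offset pattern used above.
 */
static void aria_ark_8way(uint8_t state[8][16], const uint8_t *rk,
                          int idx, int round)
{
        static const int perm[8] = { 3, 2, 1, 0, 7, 6, 5, 4 };

        for (int i = 0; i < 8; i++) {
                uint8_t k = rk[round * 16 + idx + perm[i]];     /* vpbroadcastb */

                for (int b = 0; b < 16; b++)
                        state[i][b] ^= k;                       /* vpxor */
        }
}
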
  275. #define aria_sbox_8way_gfni(x0, x1, x2, x3, \
  276. x4, x5, x6, x7, \
  277. t0, t1, t2, t3, \
  278. t4, t5, t6, t7) \
  279. vpbroadcastq .Ltf_s2_bitmatrix, t0; \
  280. vpbroadcastq .Ltf_inv_bitmatrix, t1; \
  281. vpbroadcastq .Ltf_id_bitmatrix, t2; \
  282. vpbroadcastq .Ltf_aff_bitmatrix, t3; \
  283. vpbroadcastq .Ltf_x2_bitmatrix, t4; \
  284. vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
  285. vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
  286. vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
  287. vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
  288. vgf2p8affineinvqb $0, t2, x2, x2; \
  289. vgf2p8affineinvqb $0, t2, x6, x6; \
  290. vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
  291. vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
  292. vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
  293. vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
  294. vgf2p8affineinvqb $0, t2, x3, x3; \
  295. vgf2p8affineinvqb $0, t2, x7, x7
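
With GFNI, every S-box layer becomes a single affine instruction over GF(2^8): vgf2p8affineqb computes A*x + c per byte and vgf2p8affineinvqb computes A*inverse(x) + c, where A is the broadcast 8x8 bit-matrix (.Ltf_*_bitmatrix) and c is the immediate (tf_*_const). A minimal intrinsics sketch of the S2 layer on one 128-bit register, assuming GFNI support (compile with e.g. gcc -mgfni); the function name is illustrative:

#include <immintrin.h>
#include <stdint.h>

/* tf_s2_const = BV8(0, 1, 0, 0, 0, 1, 1, 1) from the constants below */
#define TF_S2_CONST 0xe2

/*
 * One GFNI S-box layer as used in aria_sbox_8way_gfni():
 * every byte of x becomes tf_s2_matrix * inverse(x) + tf_s2_const.
 */
static __m128i aria_s2_gfni(__m128i x, uint64_t tf_s2_bitmatrix)
{
        __m128i m = _mm_set1_epi64x((long long)tf_s2_bitmatrix); /* vpbroadcastq */

        return _mm_gf2p8affineinv_epi64_epi8(x, m, TF_S2_CONST); /* vgf2p8affineinvqb */
}
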
  296. #define aria_sbox_8way(x0, x1, x2, x3, \
  297. x4, x5, x6, x7, \
  298. t0, t1, t2, t3, \
  299. t4, t5, t6, t7) \
  300. vpxor t7, t7, t7; \
  301. vmovdqa .Linv_shift_row, t0; \
  302. vmovdqa .Lshift_row, t1; \
  303. vpbroadcastd .L0f0f0f0f, t6; \
  304. vmovdqa .Ltf_lo__inv_aff__and__s2, t2; \
  305. vmovdqa .Ltf_hi__inv_aff__and__s2, t3; \
  306. vmovdqa .Ltf_lo__x2__and__fwd_aff, t4; \
  307. vmovdqa .Ltf_hi__x2__and__fwd_aff, t5; \
  308. \
  309. vaesenclast t7, x0, x0; \
  310. vaesenclast t7, x4, x4; \
  311. vaesenclast t7, x1, x1; \
  312. vaesenclast t7, x5, x5; \
  313. vaesdeclast t7, x2, x2; \
  314. vaesdeclast t7, x6, x6; \
  315. \
  316. /* AES inverse shift rows */ \
  317. vpshufb t0, x0, x0; \
  318. vpshufb t0, x4, x4; \
  319. vpshufb t0, x1, x1; \
  320. vpshufb t0, x5, x5; \
  321. vpshufb t1, x3, x3; \
  322. vpshufb t1, x7, x7; \
  323. vpshufb t1, x2, x2; \
  324. vpshufb t1, x6, x6; \
  325. \
  326. /* affine transformation for S2 */ \
  327. filter_8bit(x1, t2, t3, t6, t0); \
  328. /* affine transformation for S2 */ \
  329. filter_8bit(x5, t2, t3, t6, t0); \
  330. \
  331. /* affine transformation for X2 */ \
  332. filter_8bit(x3, t4, t5, t6, t0); \
  333. /* affine transformation for X2 */ \
  334. filter_8bit(x7, t4, t5, t6, t0); \
  335. vaesdeclast t7, x3, x3; \
  336. vaesdeclast t7, x7, x7;
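
Without GFNI, the AES-derived S-boxes are extracted from AES-NI: vaesenclast with an all-zero round key leaves ShiftRows + SubBytes, and since a byte permutation commutes with the byte-wise S-box, undoing ShiftRows afterwards with vpshufb (.Linv_shift_row) isolates pure SubBytes; vaesdeclast combined with .Lshift_row handles the inverse-S-box direction similarly, and the ARIA-specific S2/X2 maps are layered on top with filter_8bit(). An intrinsics sketch of the SubBytes isolation (compile with e.g. gcc -maes -mssse3; the function name is illustrative):

#include <immintrin.h>

/*
 * SubBytes isolated from AES-NI, as in aria_sbox_8way():
 * AESENCLAST(x, 0) = SubBytes(ShiftRows(x)), so applying the inverse
 * ShiftRows permutation afterwards yields SubBytes(x).
 */
static __m128i aes_subbytes_only(__m128i x)
{
        const __m128i zero = _mm_setzero_si128();
        const __m128i inv_shift_row = _mm_setr_epi8(    /* .Linv_shift_row */
                0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b,
                0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03);

        x = _mm_aesenclast_si128(x, zero);              /* vaesenclast, zero key */
        return _mm_shuffle_epi8(x, inv_shift_row);      /* vpshufb .Linv_shift_row */
}
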
  337. #define aria_diff_m(x0, x1, x2, x3, \
  338. t0, t1, t2, t3) \
  339. /* T = rotr32(X, 8); */ \
  340. /* X ^= T */ \
  341. vpxor x0, x3, t0; \
  342. vpxor x1, x0, t1; \
  343. vpxor x2, x1, t2; \
  344. vpxor x3, x2, t3; \
  345. /* X = T ^ rotr(X, 16); */ \
  346. vpxor t2, x0, x0; \
  347. vpxor x1, t3, t3; \
  348. vpxor t0, x2, x2; \
  349. vpxor t1, x3, x1; \
  350. vmovdqu t3, x3;
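
aria_diff_m() applies ARIA's diffusion inside each 32-bit word, four byte-sliced registers at a time: every output byte ends up as the XOR of the other three bytes of its word, which is exactly what the rotr32-based comments above compute. A scalar sketch on one word:

#include <stdint.h>

static inline uint32_t rol32(uint32_t x, int r)
{
        return (x << r) | (x >> (32 - r));
}

/*
 * Scalar equivalent of aria_diff_m() for one 32-bit word: each output
 * byte is the XOR of the other three input bytes, i.e. the
 * T = rotr32(X, 8); X ^= T; X = T ^ rotr32(X, 16) recipe above.
 */
static uint32_t aria_diff_m(uint32_t x)
{
        return rol32(x, 8) ^ rol32(x, 16) ^ rol32(x, 24);
}
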
  351. #define aria_diff_word(x0, x1, x2, x3, \
  352. x4, x5, x6, x7, \
  353. y0, y1, y2, y3, \
  354. y4, y5, y6, y7) \
  355. /* t1 ^= t2; */ \
  356. vpxor y0, x4, x4; \
  357. vpxor y1, x5, x5; \
  358. vpxor y2, x6, x6; \
  359. vpxor y3, x7, x7; \
  360. \
  361. /* t2 ^= t3; */ \
  362. vpxor y4, y0, y0; \
  363. vpxor y5, y1, y1; \
  364. vpxor y6, y2, y2; \
  365. vpxor y7, y3, y3; \
  366. \
  367. /* t0 ^= t1; */ \
  368. vpxor x4, x0, x0; \
  369. vpxor x5, x1, x1; \
  370. vpxor x6, x2, x2; \
  371. vpxor x7, x3, x3; \
  372. \
  373. /* t3 ^= t1; */ \
  374. vpxor x4, y4, y4; \
  375. vpxor x5, y5, y5; \
  376. vpxor x6, y6, y6; \
  377. vpxor x7, y7, y7; \
  378. \
  379. /* t2 ^= t0; */ \
  380. vpxor x0, y0, y0; \
  381. vpxor x1, y1, y1; \
  382. vpxor x2, y2, y2; \
  383. vpxor x3, y3, y3; \
  384. \
  385. /* t1 ^= t2; */ \
  386. vpxor y0, x4, x4; \
  387. vpxor y1, x5, x5; \
  388. vpxor y2, x6, x6; \
  389. vpxor y3, x7, x7;
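
aria_diff_word() is the word-level half of the diffusion layer: the six XOR steps spelled out in the comments (t1 ^= t2; t2 ^= t3; t0 ^= t1; t3 ^= t1; t2 ^= t0; t1 ^= t2), with t0 held in x0-x3, t1 in x4-x7, t2 in y0-y3 and t3 in y4-y7, one byte of each word per register. The same sequence on one block's four 32-bit words:

#include <stdint.h>

/* Scalar equivalent of aria_diff_word() for a single block. */
static void aria_diff_word(uint32_t *t0, uint32_t *t1,
                           uint32_t *t2, uint32_t *t3)
{
        *t1 ^= *t2;
        *t2 ^= *t3;
        *t0 ^= *t1;
        *t3 ^= *t1;
        *t2 ^= *t0;
        *t1 ^= *t2;
}
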
  390. #define aria_fe(x0, x1, x2, x3, \
  391. x4, x5, x6, x7, \
  392. y0, y1, y2, y3, \
  393. y4, y5, y6, y7, \
  394. mem_tmp, rk, round) \
  395. aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
  396. y0, rk, 8, round); \
  397. \
  398. aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
  399. y0, y1, y2, y3, y4, y5, y6, y7); \
  400. \
  401. aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
  402. aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
  403. aria_store_state_8way(x0, x1, x2, x3, \
  404. x4, x5, x6, x7, \
  405. mem_tmp, 8); \
  406. \
  407. aria_load_state_8way(x0, x1, x2, x3, \
  408. x4, x5, x6, x7, \
  409. mem_tmp, 0); \
  410. aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
  411. y0, rk, 0, round); \
  412. \
  413. aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
  414. y0, y1, y2, y3, y4, y5, y6, y7); \
  415. \
  416. aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
  417. aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
  418. aria_store_state_8way(x0, x1, x2, x3, \
  419. x4, x5, x6, x7, \
  420. mem_tmp, 0); \
  421. aria_load_state_8way(y0, y1, y2, y3, \
  422. y4, y5, y6, y7, \
  423. mem_tmp, 8); \
  424. aria_diff_word(x0, x1, x2, x3, \
  425. x4, x5, x6, x7, \
  426. y0, y1, y2, y3, \
  427. y4, y5, y6, y7); \
  428. /* aria_diff_byte() \
  429. * T3 = ABCD -> BADC \
  430. * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
  431. * T0 = ABCD -> CDAB \
  432. * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
  433. * T1 = ABCD -> DCBA \
  434. * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
  435. */ \
  436. aria_diff_word(x2, x3, x0, x1, \
  437. x7, x6, x5, x4, \
  438. y0, y1, y2, y3, \
  439. y5, y4, y7, y6); \
  440. aria_store_state_8way(x3, x2, x1, x0, \
  441. x6, x7, x4, x5, \
  442. mem_tmp, 0);
  443. #define aria_fo(x0, x1, x2, x3, \
  444. x4, x5, x6, x7, \
  445. y0, y1, y2, y3, \
  446. y4, y5, y6, y7, \
  447. mem_tmp, rk, round) \
  448. aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
  449. y0, rk, 8, round); \
  450. \
  451. aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
  452. y0, y1, y2, y3, y4, y5, y6, y7); \
  453. \
  454. aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
  455. aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
  456. aria_store_state_8way(x0, x1, x2, x3, \
  457. x4, x5, x6, x7, \
  458. mem_tmp, 8); \
  459. \
  460. aria_load_state_8way(x0, x1, x2, x3, \
  461. x4, x5, x6, x7, \
  462. mem_tmp, 0); \
  463. aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
  464. y0, rk, 0, round); \
  465. \
  466. aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
  467. y0, y1, y2, y3, y4, y5, y6, y7); \
  468. \
  469. aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
  470. aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
  471. aria_store_state_8way(x0, x1, x2, x3, \
  472. x4, x5, x6, x7, \
  473. mem_tmp, 0); \
  474. aria_load_state_8way(y0, y1, y2, y3, \
  475. y4, y5, y6, y7, \
  476. mem_tmp, 8); \
  477. aria_diff_word(x0, x1, x2, x3, \
  478. x4, x5, x6, x7, \
  479. y0, y1, y2, y3, \
  480. y4, y5, y6, y7); \
  481. /* aria_diff_byte() \
  482. * T1 = ABCD -> BADC \
  483. * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
  484. * T2 = ABCD -> CDAB \
  485. * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
  486. * T3 = ABCD -> DCBA \
  487. * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
  488. */ \
  489. aria_diff_word(x0, x1, x2, x3, \
  490. x5, x4, x7, x6, \
  491. y2, y3, y0, y1, \
  492. y7, y6, y5, y4); \
  493. aria_store_state_8way(x3, x2, x1, x0, \
  494. x6, x7, x4, x5, \
  495. mem_tmp, 0);
  496. #define aria_ff(x0, x1, x2, x3, \
  497. x4, x5, x6, x7, \
  498. y0, y1, y2, y3, \
  499. y4, y5, y6, y7, \
  500. mem_tmp, rk, round, last_round) \
  501. aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
  502. y0, rk, 8, round); \
  503. \
  504. aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
  505. y0, y1, y2, y3, y4, y5, y6, y7); \
  506. \
  507. aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
  508. y0, rk, 8, last_round); \
  509. \
  510. aria_store_state_8way(x0, x1, x2, x3, \
  511. x4, x5, x6, x7, \
  512. mem_tmp, 8); \
  513. \
  514. aria_load_state_8way(x0, x1, x2, x3, \
  515. x4, x5, x6, x7, \
  516. mem_tmp, 0); \
  517. aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
  518. y0, rk, 0, round); \
  519. \
  520. aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
  521. y0, y1, y2, y3, y4, y5, y6, y7); \
  522. \
  523. aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
  524. y0, rk, 0, last_round); \
  525. \
  526. aria_load_state_8way(y0, y1, y2, y3, \
  527. y4, y5, y6, y7, \
  528. mem_tmp, 8);
  529. #define aria_fe_gfni(x0, x1, x2, x3, \
  530. x4, x5, x6, x7, \
  531. y0, y1, y2, y3, \
  532. y4, y5, y6, y7, \
  533. mem_tmp, rk, round) \
  534. aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
  535. y0, rk, 8, round); \
  536. \
  537. aria_sbox_8way_gfni(x2, x3, x0, x1, \
  538. x6, x7, x4, x5, \
  539. y0, y1, y2, y3, \
  540. y4, y5, y6, y7); \
  541. \
  542. aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
  543. aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
  544. aria_store_state_8way(x0, x1, x2, x3, \
  545. x4, x5, x6, x7, \
  546. mem_tmp, 8); \
  547. \
  548. aria_load_state_8way(x0, x1, x2, x3, \
  549. x4, x5, x6, x7, \
  550. mem_tmp, 0); \
  551. aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
  552. y0, rk, 0, round); \
  553. \
  554. aria_sbox_8way_gfni(x2, x3, x0, x1, \
  555. x6, x7, x4, x5, \
  556. y0, y1, y2, y3, \
  557. y4, y5, y6, y7); \
  558. \
  559. aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
  560. aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
  561. aria_store_state_8way(x0, x1, x2, x3, \
  562. x4, x5, x6, x7, \
  563. mem_tmp, 0); \
  564. aria_load_state_8way(y0, y1, y2, y3, \
  565. y4, y5, y6, y7, \
  566. mem_tmp, 8); \
  567. aria_diff_word(x0, x1, x2, x3, \
  568. x4, x5, x6, x7, \
  569. y0, y1, y2, y3, \
  570. y4, y5, y6, y7); \
  571. /* aria_diff_byte() \
  572. * T3 = ABCD -> BADC \
  573. * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
  574. * T0 = ABCD -> CDAB \
  575. * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
  576. * T1 = ABCD -> DCBA \
  577. * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
  578. */ \
  579. aria_diff_word(x2, x3, x0, x1, \
  580. x7, x6, x5, x4, \
  581. y0, y1, y2, y3, \
  582. y5, y4, y7, y6); \
  583. aria_store_state_8way(x3, x2, x1, x0, \
  584. x6, x7, x4, x5, \
  585. mem_tmp, 0);
  586. #define aria_fo_gfni(x0, x1, x2, x3, \
  587. x4, x5, x6, x7, \
  588. y0, y1, y2, y3, \
  589. y4, y5, y6, y7, \
  590. mem_tmp, rk, round) \
  591. aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
  592. y0, rk, 8, round); \
  593. \
  594. aria_sbox_8way_gfni(x0, x1, x2, x3, \
  595. x4, x5, x6, x7, \
  596. y0, y1, y2, y3, \
  597. y4, y5, y6, y7); \
  598. \
  599. aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
  600. aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
  601. aria_store_state_8way(x0, x1, x2, x3, \
  602. x4, x5, x6, x7, \
  603. mem_tmp, 8); \
  604. \
  605. aria_load_state_8way(x0, x1, x2, x3, \
  606. x4, x5, x6, x7, \
  607. mem_tmp, 0); \
  608. aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
  609. y0, rk, 0, round); \
  610. \
  611. aria_sbox_8way_gfni(x0, x1, x2, x3, \
  612. x4, x5, x6, x7, \
  613. y0, y1, y2, y3, \
  614. y4, y5, y6, y7); \
  615. \
  616. aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
  617. aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
  618. aria_store_state_8way(x0, x1, x2, x3, \
  619. x4, x5, x6, x7, \
  620. mem_tmp, 0); \
  621. aria_load_state_8way(y0, y1, y2, y3, \
  622. y4, y5, y6, y7, \
  623. mem_tmp, 8); \
  624. aria_diff_word(x0, x1, x2, x3, \
  625. x4, x5, x6, x7, \
  626. y0, y1, y2, y3, \
  627. y4, y5, y6, y7); \
  628. /* aria_diff_byte() \
  629. * T1 = ABCD -> BADC \
  630. * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
  631. * T2 = ABCD -> CDAB \
  632. * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
  633. * T3 = ABCD -> DCBA \
  634. * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
  635. */ \
  636. aria_diff_word(x0, x1, x2, x3, \
  637. x5, x4, x7, x6, \
  638. y2, y3, y0, y1, \
  639. y7, y6, y5, y4); \
  640. aria_store_state_8way(x3, x2, x1, x0, \
  641. x6, x7, x4, x5, \
  642. mem_tmp, 0);
  643. #define aria_ff_gfni(x0, x1, x2, x3, \
  644. x4, x5, x6, x7, \
  645. y0, y1, y2, y3, \
  646. y4, y5, y6, y7, \
  647. mem_tmp, rk, round, last_round) \
  648. aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
  649. y0, rk, 8, round); \
  650. \
  651. aria_sbox_8way_gfni(x2, x3, x0, x1, \
  652. x6, x7, x4, x5, \
  653. y0, y1, y2, y3, \
  654. y4, y5, y6, y7); \
  655. \
  656. aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
  657. y0, rk, 8, last_round); \
  658. \
  659. aria_store_state_8way(x0, x1, x2, x3, \
  660. x4, x5, x6, x7, \
  661. mem_tmp, 8); \
  662. \
  663. aria_load_state_8way(x0, x1, x2, x3, \
  664. x4, x5, x6, x7, \
  665. mem_tmp, 0); \
  666. aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
  667. y0, rk, 0, round); \
  668. \
  669. aria_sbox_8way_gfni(x2, x3, x0, x1, \
  670. x6, x7, x4, x5, \
  671. y0, y1, y2, y3, \
  672. y4, y5, y6, y7); \
  673. \
  674. aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
  675. y0, rk, 0, last_round); \
  676. \
  677. aria_load_state_8way(y0, y1, y2, y3, \
  678. y4, y5, y6, y7, \
  679. mem_tmp, 8);
  680. /* NB: section is mergeable, all elements must be aligned 16-byte blocks */
  681. .section .rodata.cst16, "aM", @progbits, 16
  682. .align 16
  683. #define SHUFB_BYTES(idx) \
  684. 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
  685. .Lshufb_16x16b:
  686. .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
  687. /* For isolating SubBytes from AESENCLAST, inverse shift row */
  688. .Linv_shift_row:
  689. .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
  690. .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
  691. .Lshift_row:
  692. .byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
  693. .byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
  694. /* For CTR-mode IV byteswap */
  695. .Lbswap128_mask:
  696. .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
  697. .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
  698. /* AES inverse affine and S2 combined:
  699. * 1 1 0 0 0 0 0 1 x0 0
  700. * 0 1 0 0 1 0 0 0 x1 0
  701. * 1 1 0 0 1 1 1 1 x2 0
  702. * 0 1 1 0 1 0 0 1 x3 1
  703. * 0 1 0 0 1 1 0 0 * x4 + 0
  704. * 0 1 0 1 1 0 0 0 x5 0
  705. * 0 0 0 0 0 1 0 1 x6 0
  706. * 1 1 1 0 0 1 1 1 x7 1
  707. */
  708. .Ltf_lo__inv_aff__and__s2:
  709. .octa 0x92172DA81A9FA520B2370D883ABF8500
  710. .Ltf_hi__inv_aff__and__s2:
  711. .octa 0x2B15FFC1AF917B45E6D8320C625CB688
  712. /* X2 and AES forward affine combined:
  713. * 1 0 1 1 0 0 0 1 x0 0
  714. * 0 1 1 1 1 0 1 1 x1 0
  715. * 0 0 0 1 1 0 1 0 x2 1
  716. * 0 1 0 0 0 1 0 0 x3 0
  717. * 0 0 1 1 1 0 1 1 * x4 + 0
  718. * 0 1 0 0 1 0 0 0 x5 0
  719. * 1 1 0 1 0 0 1 1 x6 0
  720. * 0 1 0 0 1 0 1 0 x7 0
  721. */
  722. .Ltf_lo__x2__and__fwd_aff:
  723. .octa 0xEFAE0544FCBD1657B8F95213ABEA4100
  724. .Ltf_hi__x2__and__fwd_aff:
  725. .octa 0x3F893781E95FE1576CDA64D2BA0CB204
  726. .section .rodata.cst8, "aM", @progbits, 8
  727. .align 8
  728. /* AES affine: */
  729. #define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
  730. .Ltf_aff_bitmatrix:
  731. .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
  732. BV8(1, 1, 0, 0, 0, 1, 1, 1),
  733. BV8(1, 1, 1, 0, 0, 0, 1, 1),
  734. BV8(1, 1, 1, 1, 0, 0, 0, 1),
  735. BV8(1, 1, 1, 1, 1, 0, 0, 0),
  736. BV8(0, 1, 1, 1, 1, 1, 0, 0),
  737. BV8(0, 0, 1, 1, 1, 1, 1, 0),
  738. BV8(0, 0, 0, 1, 1, 1, 1, 1))
  739. /* AES inverse affine: */
  740. #define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
  741. .Ltf_inv_bitmatrix:
  742. .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
  743. BV8(1, 0, 0, 1, 0, 0, 1, 0),
  744. BV8(0, 1, 0, 0, 1, 0, 0, 1),
  745. BV8(1, 0, 1, 0, 0, 1, 0, 0),
  746. BV8(0, 1, 0, 1, 0, 0, 1, 0),
  747. BV8(0, 0, 1, 0, 1, 0, 0, 1),
  748. BV8(1, 0, 0, 1, 0, 1, 0, 0),
  749. BV8(0, 1, 0, 0, 1, 0, 1, 0))
  750. /* S2: */
  751. #define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
  752. .Ltf_s2_bitmatrix:
  753. .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
  754. BV8(0, 0, 1, 1, 1, 1, 1, 1),
  755. BV8(1, 1, 1, 0, 1, 1, 0, 1),
  756. BV8(1, 1, 0, 0, 0, 0, 1, 1),
  757. BV8(0, 1, 0, 0, 0, 0, 1, 1),
  758. BV8(1, 1, 0, 0, 1, 1, 1, 0),
  759. BV8(0, 1, 1, 0, 0, 0, 1, 1),
  760. BV8(1, 1, 1, 1, 0, 1, 1, 0))
  761. /* X2: */
  762. #define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
  763. .Ltf_x2_bitmatrix:
  764. .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
  765. BV8(0, 0, 1, 0, 0, 1, 1, 0),
  766. BV8(0, 0, 0, 0, 1, 0, 1, 0),
  767. BV8(1, 1, 1, 0, 0, 0, 1, 1),
  768. BV8(1, 1, 1, 0, 1, 1, 0, 0),
  769. BV8(0, 1, 1, 0, 1, 0, 1, 1),
  770. BV8(1, 0, 1, 1, 1, 1, 0, 1),
  771. BV8(1, 0, 0, 1, 0, 0, 1, 1))
  772. /* Identity matrix: */
  773. .Ltf_id_bitmatrix:
  774. .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
  775. BV8(0, 1, 0, 0, 0, 0, 0, 0),
  776. BV8(0, 0, 1, 0, 0, 0, 0, 0),
  777. BV8(0, 0, 0, 1, 0, 0, 0, 0),
  778. BV8(0, 0, 0, 0, 1, 0, 0, 0),
  779. BV8(0, 0, 0, 0, 0, 1, 0, 0),
  780. BV8(0, 0, 0, 0, 0, 0, 1, 0),
  781. BV8(0, 0, 0, 0, 0, 0, 0, 1))
  782. /* 4-bit mask */
  783. .section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
  784. .align 4
  785. .L0f0f0f0f:
  786. .long 0x0f0f0f0f
  787. .text
  788. SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
  789. /* input:
  790. * %r9: rk
  791. * %rsi: dst
  792. * %rdx: src
  793. * %xmm0..%xmm15: 16 byte-sliced blocks
  794. */
  795. FRAME_BEGIN
  796. movq %rsi, %rax;
  797. leaq 8 * 16(%rax), %r8;
  798. inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
  799. %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
  800. %xmm15, %rax, %r8);
  801. aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
  802. %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
  803. %rax, %r9, 0);
  804. aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
  805. %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
  806. %xmm15, %rax, %r9, 1);
  807. aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
  808. %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
  809. %rax, %r9, 2);
  810. aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
  811. %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
  812. %xmm15, %rax, %r9, 3);
  813. aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
  814. %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
  815. %rax, %r9, 4);
  816. aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
  817. %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
  818. %xmm15, %rax, %r9, 5);
  819. aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
  820. %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
  821. %rax, %r9, 6);
  822. aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
  823. %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
  824. %xmm15, %rax, %r9, 7);
  825. aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
  826. %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
  827. %rax, %r9, 8);
  828. aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
  829. %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
  830. %xmm15, %rax, %r9, 9);
  831. aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
  832. %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
  833. %rax, %r9, 10);
  834. cmpl $12, rounds(CTX);
  835. jne .Laria_192;
  836. aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
  837. %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
  838. %xmm15, %rax, %r9, 11, 12);
  839. jmp .Laria_end;
  840. .Laria_192:
  841. aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
  842. %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
  843. %xmm15, %rax, %r9, 11);
  844. aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
  845. %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
  846. %rax, %r9, 12);
  847. cmpl $14, rounds(CTX);
  848. jne .Laria_256;
  849. aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
  850. %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
  851. %xmm15, %rax, %r9, 13, 14);
  852. jmp .Laria_end;
  853. .Laria_256:
  854. aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
  855. %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
  856. %xmm15, %rax, %r9, 13);
  857. aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
  858. %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
  859. %rax, %r9, 14);
  860. aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
  861. %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
  862. %xmm15, %rax, %r9, 15, 16);
  863. .Laria_end:
  864. debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
  865. %xmm9, %xmm13, %xmm0, %xmm5,
  866. %xmm10, %xmm14, %xmm3, %xmm6,
  867. %xmm11, %xmm15, %xmm2, %xmm7,
  868. (%rax), (%r8));
  869. FRAME_END
  870. RET;
  871. SYM_FUNC_END(__aria_aesni_avx_crypt_16way)
  872. SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
  873. /* input:
  874. * %rdi: ctx, CTX
  875. * %rsi: dst
  876. * %rdx: src
  877. */
  878. FRAME_BEGIN
  879. leaq enc_key(CTX), %r9;
  880. inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
  881. %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
  882. %xmm15, %rdx);
  883. call __aria_aesni_avx_crypt_16way;
  884. write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
  885. %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
  886. %xmm15, %rax);
  887. FRAME_END
  888. RET;
  889. SYM_FUNC_END(aria_aesni_avx_encrypt_16way)
  890. SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
  891. /* input:
  892. * %rdi: ctx, CTX
  893. * %rsi: dst
  894. * %rdx: src
  895. */
  896. FRAME_BEGIN
  897. leaq dec_key(CTX), %r9;
  898. inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
  899. %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
  900. %xmm15, %rdx);
  901. call __aria_aesni_avx_crypt_16way;
  902. write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
  903. %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
  904. %xmm15, %rax);
  905. FRAME_END
  906. RET;
  907. SYM_FUNC_END(aria_aesni_avx_decrypt_16way)
  908. SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
  909. /* input:
  910. * %rdi: ctx
  911. * %rsi: dst
  912. * %rdx: src
  913. * %rcx: keystream
  914. * %r8: iv (big endian, 128bit)
  915. */
  916. FRAME_BEGIN
  917. /* load IV and byteswap */
  918. vmovdqu (%r8), %xmm8;
  919. vmovdqa .Lbswap128_mask (%rip), %xmm1;
  920. vpshufb %xmm1, %xmm8, %xmm3; /* be => le */
  921. vpcmpeqd %xmm0, %xmm0, %xmm0;
  922. vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */
  923. /* construct IVs */
  924. inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
  925. vpshufb %xmm1, %xmm3, %xmm9;
  926. inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
  927. vpshufb %xmm1, %xmm3, %xmm10;
  928. inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
  929. vpshufb %xmm1, %xmm3, %xmm11;
  930. inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
  931. vpshufb %xmm1, %xmm3, %xmm12;
  932. inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
  933. vpshufb %xmm1, %xmm3, %xmm13;
  934. inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
  935. vpshufb %xmm1, %xmm3, %xmm14;
  936. inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
  937. vpshufb %xmm1, %xmm3, %xmm15;
  938. vmovdqu %xmm8, (0 * 16)(%rcx);
  939. vmovdqu %xmm9, (1 * 16)(%rcx);
  940. vmovdqu %xmm10, (2 * 16)(%rcx);
  941. vmovdqu %xmm11, (3 * 16)(%rcx);
  942. vmovdqu %xmm12, (4 * 16)(%rcx);
  943. vmovdqu %xmm13, (5 * 16)(%rcx);
  944. vmovdqu %xmm14, (6 * 16)(%rcx);
  945. vmovdqu %xmm15, (7 * 16)(%rcx);
  946. inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
  947. vpshufb %xmm1, %xmm3, %xmm8;
  948. inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
  949. vpshufb %xmm1, %xmm3, %xmm9;
  950. inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
  951. vpshufb %xmm1, %xmm3, %xmm10;
  952. inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
  953. vpshufb %xmm1, %xmm3, %xmm11;
  954. inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
  955. vpshufb %xmm1, %xmm3, %xmm12;
  956. inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
  957. vpshufb %xmm1, %xmm3, %xmm13;
  958. inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
  959. vpshufb %xmm1, %xmm3, %xmm14;
  960. inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
  961. vpshufb %xmm1, %xmm3, %xmm15;
  962. inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
  963. vpshufb %xmm1, %xmm3, %xmm4;
  964. vmovdqu %xmm4, (%r8);
  965. vmovdqu (0 * 16)(%rcx), %xmm0;
  966. vmovdqu (1 * 16)(%rcx), %xmm1;
  967. vmovdqu (2 * 16)(%rcx), %xmm2;
  968. vmovdqu (3 * 16)(%rcx), %xmm3;
  969. vmovdqu (4 * 16)(%rcx), %xmm4;
  970. vmovdqu (5 * 16)(%rcx), %xmm5;
  971. vmovdqu (6 * 16)(%rcx), %xmm6;
  972. vmovdqu (7 * 16)(%rcx), %xmm7;
  973. FRAME_END
  974. RET;
  975. SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)
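
The keystream helper loads the big-endian 128-bit IV, byteswaps it to little-endian so inc_le128() can increment it cheaply, stores the IV and its next 15 increments (byteswapped back to big endian) as the 16 keystream input blocks, and writes IV+16 back to *iv for the next call. A scalar C sketch of the counter handling (array-based, names illustrative):

#include <stdint.h>
#include <string.h>

/* Increment a 128-bit big-endian counter in place (the effect of the
 * byteswap + inc_le128 + byteswap sequence per block). */
static void ctr128_be_inc(uint8_t ctr[16])
{
        for (int i = 15; i >= 0; i--)
                if (++ctr[i])
                        break;
}

/*
 * Scalar model of __aria_aesni_avx_ctr_gen_keystream_16way(): emit
 * IV, IV+1, ..., IV+15 as keystream blocks and leave IV+16 in iv[].
 */
static void ctr_gen_keystream_16way(uint8_t keystream[16][16], uint8_t iv[16])
{
        for (int b = 0; b < 16; b++) {
                memcpy(keystream[b], iv, 16);
                ctr128_be_inc(iv);
        }
}
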
  976. SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
  977. /* input:
  978. * %rdi: ctx
  979. * %rsi: dst
  980. * %rdx: src
  981. * %rcx: keystream
  982. * %r8: iv (big endian, 128bit)
  983. */
  984. FRAME_BEGIN
  985. call __aria_aesni_avx_ctr_gen_keystream_16way;
  986. leaq (%rsi), %r10;
  987. leaq (%rdx), %r11;
  988. leaq (%rcx), %rsi;
  989. leaq (%rcx), %rdx;
  990. leaq enc_key(CTX), %r9;
  991. call __aria_aesni_avx_crypt_16way;
  992. vpxor (0 * 16)(%r11), %xmm1, %xmm1;
  993. vpxor (1 * 16)(%r11), %xmm0, %xmm0;
  994. vpxor (2 * 16)(%r11), %xmm3, %xmm3;
  995. vpxor (3 * 16)(%r11), %xmm2, %xmm2;
  996. vpxor (4 * 16)(%r11), %xmm4, %xmm4;
  997. vpxor (5 * 16)(%r11), %xmm5, %xmm5;
  998. vpxor (6 * 16)(%r11), %xmm6, %xmm6;
  999. vpxor (7 * 16)(%r11), %xmm7, %xmm7;
  1000. vpxor (8 * 16)(%r11), %xmm8, %xmm8;
  1001. vpxor (9 * 16)(%r11), %xmm9, %xmm9;
  1002. vpxor (10 * 16)(%r11), %xmm10, %xmm10;
  1003. vpxor (11 * 16)(%r11), %xmm11, %xmm11;
  1004. vpxor (12 * 16)(%r11), %xmm12, %xmm12;
  1005. vpxor (13 * 16)(%r11), %xmm13, %xmm13;
  1006. vpxor (14 * 16)(%r11), %xmm14, %xmm14;
  1007. vpxor (15 * 16)(%r11), %xmm15, %xmm15;
  1008. write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
  1009. %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
  1010. %xmm15, %r10);
  1011. FRAME_END
  1012. RET;
  1013. SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)
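
CTR mode then reduces to ECB-encrypting the counter blocks and XORing them with the source: the function generates the keystream into the keystream buffer, repoints dst/src at that buffer so the shared 16-way core encrypts it in place (the buffer doubles as the byte-slicing scratch area), then XORs the original src blocks into the result before write_output() stores to the original dst. A scalar sketch of that composition, taking counter blocks already produced by the keystream sketch above; block_encrypt_fn is an illustrative stand-in for the 16-way core:

#include <stdint.h>

typedef void (*block_encrypt_fn)(uint8_t block[16], const void *key);

/*
 * Scalar model of aria_aesni_avx_ctr_crypt_16way(): encrypt the 16
 * counter blocks, then XOR them with src to produce dst.
 */
static void ctr_crypt_16way(uint8_t *dst, const uint8_t *src,
                            uint8_t keystream[16][16],
                            block_encrypt_fn encrypt, const void *enc_key)
{
        for (int b = 0; b < 16; b++) {
                encrypt(keystream[b], enc_key);         /* __aria_..._crypt_16way */
                for (int i = 0; i < 16; i++)
                        dst[b * 16 + i] = src[b * 16 + i] ^ keystream[b][i];
        }
}
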
  1014. SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
  1015. /* input:
  1016. * %r9: rk
  1017. * %rsi: dst
  1018. * %rdx: src
  1019. * %xmm0..%xmm15: 16 byte-sliced blocks
  1020. */
  1021. FRAME_BEGIN
  1022. movq %rsi, %rax;
  1023. leaq 8 * 16(%rax), %r8;
  1024. inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
  1025. %xmm4, %xmm5, %xmm6, %xmm7,
  1026. %xmm8, %xmm9, %xmm10, %xmm11,
  1027. %xmm12, %xmm13, %xmm14,
  1028. %xmm15, %rax, %r8);
  1029. aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
  1030. %xmm12, %xmm13, %xmm14, %xmm15,
  1031. %xmm0, %xmm1, %xmm2, %xmm3,
  1032. %xmm4, %xmm5, %xmm6, %xmm7,
  1033. %rax, %r9, 0);
  1034. aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
  1035. %xmm4, %xmm5, %xmm6, %xmm7,
  1036. %xmm8, %xmm9, %xmm10, %xmm11,
  1037. %xmm12, %xmm13, %xmm14,
  1038. %xmm15, %rax, %r9, 1);
  1039. aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
  1040. %xmm12, %xmm13, %xmm14, %xmm15,
  1041. %xmm0, %xmm1, %xmm2, %xmm3,
  1042. %xmm4, %xmm5, %xmm6, %xmm7,
  1043. %rax, %r9, 2);
  1044. aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
  1045. %xmm4, %xmm5, %xmm6, %xmm7,
  1046. %xmm8, %xmm9, %xmm10, %xmm11,
  1047. %xmm12, %xmm13, %xmm14,
  1048. %xmm15, %rax, %r9, 3);
  1049. aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
  1050. %xmm12, %xmm13, %xmm14, %xmm15,
  1051. %xmm0, %xmm1, %xmm2, %xmm3,
  1052. %xmm4, %xmm5, %xmm6, %xmm7,
  1053. %rax, %r9, 4);
  1054. aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
  1055. %xmm4, %xmm5, %xmm6, %xmm7,
  1056. %xmm8, %xmm9, %xmm10, %xmm11,
  1057. %xmm12, %xmm13, %xmm14,
  1058. %xmm15, %rax, %r9, 5);
  1059. aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
  1060. %xmm12, %xmm13, %xmm14, %xmm15,
  1061. %xmm0, %xmm1, %xmm2, %xmm3,
  1062. %xmm4, %xmm5, %xmm6, %xmm7,
  1063. %rax, %r9, 6);
  1064. aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
  1065. %xmm4, %xmm5, %xmm6, %xmm7,
  1066. %xmm8, %xmm9, %xmm10, %xmm11,
  1067. %xmm12, %xmm13, %xmm14,
  1068. %xmm15, %rax, %r9, 7);
  1069. aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
  1070. %xmm12, %xmm13, %xmm14, %xmm15,
  1071. %xmm0, %xmm1, %xmm2, %xmm3,
  1072. %xmm4, %xmm5, %xmm6, %xmm7,
  1073. %rax, %r9, 8);
  1074. aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
  1075. %xmm4, %xmm5, %xmm6, %xmm7,
  1076. %xmm8, %xmm9, %xmm10, %xmm11,
  1077. %xmm12, %xmm13, %xmm14,
  1078. %xmm15, %rax, %r9, 9);
  1079. aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
  1080. %xmm12, %xmm13, %xmm14, %xmm15,
  1081. %xmm0, %xmm1, %xmm2, %xmm3,
  1082. %xmm4, %xmm5, %xmm6, %xmm7,
  1083. %rax, %r9, 10);
  1084. cmpl $12, rounds(CTX);
  1085. jne .Laria_gfni_192;
  1086. aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
  1087. %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
  1088. %xmm15, %rax, %r9, 11, 12);
  1089. jmp .Laria_gfni_end;
  1090. .Laria_gfni_192:
  1091. aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
  1092. %xmm4, %xmm5, %xmm6, %xmm7,
  1093. %xmm8, %xmm9, %xmm10, %xmm11,
  1094. %xmm12, %xmm13, %xmm14,
  1095. %xmm15, %rax, %r9, 11);
  1096. aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
  1097. %xmm12, %xmm13, %xmm14, %xmm15,
  1098. %xmm0, %xmm1, %xmm2, %xmm3,
  1099. %xmm4, %xmm5, %xmm6, %xmm7,
  1100. %rax, %r9, 12);
  1101. cmpl $14, rounds(CTX);
  1102. jne .Laria_gfni_256;
  1103. aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
  1104. %xmm4, %xmm5, %xmm6, %xmm7,
  1105. %xmm8, %xmm9, %xmm10, %xmm11,
  1106. %xmm12, %xmm13, %xmm14,
  1107. %xmm15, %rax, %r9, 13, 14);
  1108. jmp .Laria_gfni_end;
  1109. .Laria_gfni_256:
  1110. aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
  1111. %xmm4, %xmm5, %xmm6, %xmm7,
  1112. %xmm8, %xmm9, %xmm10, %xmm11,
  1113. %xmm12, %xmm13, %xmm14,
  1114. %xmm15, %rax, %r9, 13);
  1115. aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
  1116. %xmm12, %xmm13, %xmm14, %xmm15,
  1117. %xmm0, %xmm1, %xmm2, %xmm3,
  1118. %xmm4, %xmm5, %xmm6, %xmm7,
  1119. %rax, %r9, 14);
  1120. aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
  1121. %xmm4, %xmm5, %xmm6, %xmm7,
  1122. %xmm8, %xmm9, %xmm10, %xmm11,
  1123. %xmm12, %xmm13, %xmm14,
  1124. %xmm15, %rax, %r9, 15, 16);
  1125. .Laria_gfni_end:
  1126. debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
  1127. %xmm9, %xmm13, %xmm0, %xmm5,
  1128. %xmm10, %xmm14, %xmm3, %xmm6,
  1129. %xmm11, %xmm15, %xmm2, %xmm7,
  1130. (%rax), (%r8));
  1131. FRAME_END
  1132. RET;
  1133. SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)
  1134. SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
  1135. /* input:
  1136. * %rdi: ctx, CTX
  1137. * %rsi: dst
  1138. * %rdx: src
  1139. */
  1140. FRAME_BEGIN
  1141. leaq enc_key(CTX), %r9;
  1142. inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
  1143. %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
  1144. %xmm15, %rdx);
  1145. call __aria_aesni_avx_gfni_crypt_16way;
  1146. write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
  1147. %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
  1148. %xmm15, %rax);
  1149. FRAME_END
  1150. RET;
  1151. SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)
  1152. SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
  1153. /* input:
  1154. * %rdi: ctx, CTX
  1155. * %rsi: dst
  1156. * %rdx: src
  1157. */
  1158. FRAME_BEGIN
  1159. leaq dec_key(CTX), %r9;
  1160. inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
  1161. %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
  1162. %xmm15, %rdx);
  1163. call __aria_aesni_avx_gfni_crypt_16way;
  1164. write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
  1165. %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
  1166. %xmm15, %rax);
  1167. FRAME_END
  1168. RET;
  1169. SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)
  1170. SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
  1171. /* input:
  1172. * %rdi: ctx
  1173. * %rsi: dst
  1174. * %rdx: src
  1175. * %rcx: keystream
  1176. * %r8: iv (big endian, 128bit)
  1177. */
  1178. FRAME_BEGIN
  1179. call __aria_aesni_avx_ctr_gen_keystream_16way;
  1180. leaq (%rsi), %r10;
  1181. leaq (%rdx), %r11;
  1182. leaq (%rcx), %rsi;
  1183. leaq (%rcx), %rdx;
  1184. leaq enc_key(CTX), %r9;
  1185. call __aria_aesni_avx_gfni_crypt_16way;
  1186. vpxor (0 * 16)(%r11), %xmm1, %xmm1;
  1187. vpxor (1 * 16)(%r11), %xmm0, %xmm0;
  1188. vpxor (2 * 16)(%r11), %xmm3, %xmm3;
  1189. vpxor (3 * 16)(%r11), %xmm2, %xmm2;
  1190. vpxor (4 * 16)(%r11), %xmm4, %xmm4;
  1191. vpxor (5 * 16)(%r11), %xmm5, %xmm5;
  1192. vpxor (6 * 16)(%r11), %xmm6, %xmm6;
  1193. vpxor (7 * 16)(%r11), %xmm7, %xmm7;
  1194. vpxor (8 * 16)(%r11), %xmm8, %xmm8;
  1195. vpxor (9 * 16)(%r11), %xmm9, %xmm9;
  1196. vpxor (10 * 16)(%r11), %xmm10, %xmm10;
  1197. vpxor (11 * 16)(%r11), %xmm11, %xmm11;
  1198. vpxor (12 * 16)(%r11), %xmm12, %xmm12;
  1199. vpxor (13 * 16)(%r11), %xmm13, %xmm13;
  1200. vpxor (14 * 16)(%r11), %xmm14, %xmm14;
  1201. vpxor (15 * 16)(%r11), %xmm15, %xmm15;
  1202. write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
  1203. %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
  1204. %xmm15, %r10);
  1205. FRAME_END
  1206. RET;
  1207. SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)