- /* SPDX-License-Identifier: GPL-2.0-or-later */
- /*
- * ARIA Cipher 16-way parallel algorithm (AVX)
- *
- * Copyright (c) 2022 Taehee Yoo <[email protected]>
- *
- */
- #include <linux/linkage.h>
- #include <linux/cfi_types.h>
- #include <asm/frame.h>
- /* struct aria_ctx: */
- #define enc_key 0
- #define dec_key 272
- #define rounds 544
- /* register macros */
- #define CTX %rdi
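- /* BV8() packs eight bits (a0 = least significant) into a byte, and
-  * BM8X8() packs eight such rows (l0 as the most significant byte) into a
-  * 64-bit GF(2) bit matrix for the GFNI constants below; e.g.
-  * BV8(1, 0, 0, 0, 1, 1, 1, 1) == 0xf1.
-  */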
- #define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
- ( (((a0) & 1) << 0) | \
- (((a1) & 1) << 1) | \
- (((a2) & 1) << 2) | \
- (((a3) & 1) << 3) | \
- (((a4) & 1) << 4) | \
- (((a5) & 1) << 5) | \
- (((a6) & 1) << 6) | \
- (((a7) & 1) << 7) )
- #define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
- ( ((l7) << (0 * 8)) | \
- ((l6) << (1 * 8)) | \
- ((l5) << (2 * 8)) | \
- ((l4) << (3 * 8)) | \
- ((l3) << (4 * 8)) | \
- ((l2) << (5 * 8)) | \
- ((l1) << (6 * 8)) | \
- ((l0) << (7 * 8)) )
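- /* Add 1 to the 128-bit little-endian value in x. minus_one must hold
-  * {low qword: -1, high qword: 0} (as set up in the CTR code below) and
-  * tmp is clobbered; the compare/shift pair carries the overflow of the
-  * low qword into the high qword.
-  */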
- #define inc_le128(x, minus_one, tmp) \
- vpcmpeqq minus_one, x, tmp; \
- vpsubq minus_one, x, x; \
- vpslldq $8, tmp, tmp; \
- vpsubq tmp, x, x;
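- /* 8-bit filter via two nibble-indexed vpshufb lookups: lo_t is indexed
-  * by the low nibble, hi_t by the high nibble, and the two results are
-  * XORed; mask4bit must hold 0x0f in every byte, tmp0 is clobbered.
-  */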
- #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
- vpand x, mask4bit, tmp0; \
- vpandn x, mask4bit, x; \
- vpsrld $4, x, x; \
- \
- vpshufb tmp0, lo_t, tmp0; \
- vpshufb x, hi_t, x; \
- vpxor tmp0, x, x;
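- /* Transpose a 4x4 matrix of 32-bit words held in x0..x3; t1 and t2 are
-  * temporaries.
-  */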
- #define transpose_4x4(x0, x1, x2, x3, t1, t2) \
- vpunpckhdq x1, x0, t2; \
- vpunpckldq x1, x0, x0; \
- \
- vpunpckldq x3, x2, t1; \
- vpunpckhdq x3, x2, x2; \
- \
- vpunpckhqdq t1, x0, x1; \
- vpunpcklqdq t1, x0, x0; \
- \
- vpunpckhqdq x2, t2, x3; \
- vpunpcklqdq x2, t2, x2;
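- /* Byte-slice sixteen 16-byte blocks: transpose the 16x16 byte matrix
-  * held in the 16 registers so that each register ends up with one byte
-  * position of every block; st0/st1 are 16-byte memory scratch slots.
-  */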
- #define byteslice_16x16b(a0, b0, c0, d0, \
- a1, b1, c1, d1, \
- a2, b2, c2, d2, \
- a3, b3, c3, d3, \
- st0, st1) \
- vmovdqu d2, st0; \
- vmovdqu d3, st1; \
- transpose_4x4(a0, a1, a2, a3, d2, d3); \
- transpose_4x4(b0, b1, b2, b3, d2, d3); \
- vmovdqu st0, d2; \
- vmovdqu st1, d3; \
- \
- vmovdqu a0, st0; \
- vmovdqu a1, st1; \
- transpose_4x4(c0, c1, c2, c3, a0, a1); \
- transpose_4x4(d0, d1, d2, d3, a0, a1); \
- \
- vmovdqu .Lshufb_16x16b, a0; \
- vmovdqu st1, a1; \
- vpshufb a0, a2, a2; \
- vpshufb a0, a3, a3; \
- vpshufb a0, b0, b0; \
- vpshufb a0, b1, b1; \
- vpshufb a0, b2, b2; \
- vpshufb a0, b3, b3; \
- vpshufb a0, a1, a1; \
- vpshufb a0, c0, c0; \
- vpshufb a0, c1, c1; \
- vpshufb a0, c2, c2; \
- vpshufb a0, c3, c3; \
- vpshufb a0, d0, d0; \
- vpshufb a0, d1, d1; \
- vpshufb a0, d2, d2; \
- vpshufb a0, d3, d3; \
- vmovdqu d3, st1; \
- vmovdqu st0, d3; \
- vpshufb a0, d3, a0; \
- vmovdqu d2, st0; \
- \
- transpose_4x4(a0, b0, c0, d0, d2, d3); \
- transpose_4x4(a1, b1, c1, d1, d2, d3); \
- vmovdqu st0, d2; \
- vmovdqu st1, d3; \
- \
- vmovdqu b0, st0; \
- vmovdqu b1, st1; \
- transpose_4x4(a2, b2, c2, d2, b0, b1); \
- transpose_4x4(a3, b3, c3, d3, b0, b1); \
- vmovdqu st0, b0; \
- vmovdqu st1, b1; \
- /* does not adjust output bytes inside vectors */
- #define debyteslice_16x16b(a0, b0, c0, d0, \
- a1, b1, c1, d1, \
- a2, b2, c2, d2, \
- a3, b3, c3, d3, \
- st0, st1) \
- vmovdqu d2, st0; \
- vmovdqu d3, st1; \
- transpose_4x4(a0, a1, a2, a3, d2, d3); \
- transpose_4x4(b0, b1, b2, b3, d2, d3); \
- vmovdqu st0, d2; \
- vmovdqu st1, d3; \
- \
- vmovdqu a0, st0; \
- vmovdqu a1, st1; \
- transpose_4x4(c0, c1, c2, c3, a0, a1); \
- transpose_4x4(d0, d1, d2, d3, a0, a1); \
- \
- vmovdqu .Lshufb_16x16b, a0; \
- vmovdqu st1, a1; \
- vpshufb a0, a2, a2; \
- vpshufb a0, a3, a3; \
- vpshufb a0, b0, b0; \
- vpshufb a0, b1, b1; \
- vpshufb a0, b2, b2; \
- vpshufb a0, b3, b3; \
- vpshufb a0, a1, a1; \
- vpshufb a0, c0, c0; \
- vpshufb a0, c1, c1; \
- vpshufb a0, c2, c2; \
- vpshufb a0, c3, c3; \
- vpshufb a0, d0, d0; \
- vpshufb a0, d1, d1; \
- vpshufb a0, d2, d2; \
- vpshufb a0, d3, d3; \
- vmovdqu d3, st1; \
- vmovdqu st0, d3; \
- vpshufb a0, d3, a0; \
- vmovdqu d2, st0; \
- \
- transpose_4x4(c0, d0, a0, b0, d2, d3); \
- transpose_4x4(c1, d1, a1, b1, d2, d3); \
- vmovdqu st0, d2; \
- vmovdqu st1, d3; \
- \
- vmovdqu b0, st0; \
- vmovdqu b1, st1; \
- transpose_4x4(c2, d2, a2, b2, b0, b1); \
- transpose_4x4(c3, d3, a3, b3, b0, b1); \
- vmovdqu st0, b0; \
- vmovdqu st1, b1; \
- /* does not adjust output bytes inside vectors */
- /* load blocks to registers and apply pre-whitening */
- #define inpack16_pre(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- y0, y1, y2, y3, \
- y4, y5, y6, y7, \
- rio) \
- vmovdqu (0 * 16)(rio), x0; \
- vmovdqu (1 * 16)(rio), x1; \
- vmovdqu (2 * 16)(rio), x2; \
- vmovdqu (3 * 16)(rio), x3; \
- vmovdqu (4 * 16)(rio), x4; \
- vmovdqu (5 * 16)(rio), x5; \
- vmovdqu (6 * 16)(rio), x6; \
- vmovdqu (7 * 16)(rio), x7; \
- vmovdqu (8 * 16)(rio), y0; \
- vmovdqu (9 * 16)(rio), y1; \
- vmovdqu (10 * 16)(rio), y2; \
- vmovdqu (11 * 16)(rio), y3; \
- vmovdqu (12 * 16)(rio), y4; \
- vmovdqu (13 * 16)(rio), y5; \
- vmovdqu (14 * 16)(rio), y6; \
- vmovdqu (15 * 16)(rio), y7;
- /* byteslice pre-whitened blocks and store to temporary memory */
- #define inpack16_post(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- y0, y1, y2, y3, \
- y4, y5, y6, y7, \
- mem_ab, mem_cd) \
- byteslice_16x16b(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- y0, y1, y2, y3, \
- y4, y5, y6, y7, \
- (mem_ab), (mem_cd)); \
- \
- vmovdqu x0, 0 * 16(mem_ab); \
- vmovdqu x1, 1 * 16(mem_ab); \
- vmovdqu x2, 2 * 16(mem_ab); \
- vmovdqu x3, 3 * 16(mem_ab); \
- vmovdqu x4, 4 * 16(mem_ab); \
- vmovdqu x5, 5 * 16(mem_ab); \
- vmovdqu x6, 6 * 16(mem_ab); \
- vmovdqu x7, 7 * 16(mem_ab); \
- vmovdqu y0, 0 * 16(mem_cd); \
- vmovdqu y1, 1 * 16(mem_cd); \
- vmovdqu y2, 2 * 16(mem_cd); \
- vmovdqu y3, 3 * 16(mem_cd); \
- vmovdqu y4, 4 * 16(mem_cd); \
- vmovdqu y5, 5 * 16(mem_cd); \
- vmovdqu y6, 6 * 16(mem_cd); \
- vmovdqu y7, 7 * 16(mem_cd);
- #define write_output(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- y0, y1, y2, y3, \
- y4, y5, y6, y7, \
- mem) \
- vmovdqu x0, 0 * 16(mem); \
- vmovdqu x1, 1 * 16(mem); \
- vmovdqu x2, 2 * 16(mem); \
- vmovdqu x3, 3 * 16(mem); \
- vmovdqu x4, 4 * 16(mem); \
- vmovdqu x5, 5 * 16(mem); \
- vmovdqu x6, 6 * 16(mem); \
- vmovdqu x7, 7 * 16(mem); \
- vmovdqu y0, 8 * 16(mem); \
- vmovdqu y1, 9 * 16(mem); \
- vmovdqu y2, 10 * 16(mem); \
- vmovdqu y3, 11 * 16(mem); \
- vmovdqu y4, 12 * 16(mem); \
- vmovdqu y5, 13 * 16(mem); \
- vmovdqu y6, 14 * 16(mem); \
- vmovdqu y7, 15 * 16(mem);
- #define aria_store_state_8way(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- mem_tmp, idx) \
- vmovdqu x0, ((idx + 0) * 16)(mem_tmp); \
- vmovdqu x1, ((idx + 1) * 16)(mem_tmp); \
- vmovdqu x2, ((idx + 2) * 16)(mem_tmp); \
- vmovdqu x3, ((idx + 3) * 16)(mem_tmp); \
- vmovdqu x4, ((idx + 4) * 16)(mem_tmp); \
- vmovdqu x5, ((idx + 5) * 16)(mem_tmp); \
- vmovdqu x6, ((idx + 6) * 16)(mem_tmp); \
- vmovdqu x7, ((idx + 7) * 16)(mem_tmp);
- #define aria_load_state_8way(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- mem_tmp, idx) \
- vmovdqu ((idx + 0) * 16)(mem_tmp), x0; \
- vmovdqu ((idx + 1) * 16)(mem_tmp), x1; \
- vmovdqu ((idx + 2) * 16)(mem_tmp), x2; \
- vmovdqu ((idx + 3) * 16)(mem_tmp), x3; \
- vmovdqu ((idx + 4) * 16)(mem_tmp), x4; \
- vmovdqu ((idx + 5) * 16)(mem_tmp), x5; \
- vmovdqu ((idx + 6) * 16)(mem_tmp), x6; \
- vmovdqu ((idx + 7) * 16)(mem_tmp), x7;
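- /* AddRoundKey for 8 byte-sliced registers: broadcast one round-key byte
-  * per register (vpbroadcastb) and XOR it in; idx selects the key half
-  * (0 or 8) and the byte order within each 32-bit word is reversed to
-  * match the byte-sliced state.
-  */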
- #define aria_ark_8way(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- t0, rk, idx, round) \
- /* AddRoundKey */ \
- vpbroadcastb ((round * 16) + idx + 3)(rk), t0; \
- vpxor t0, x0, x0; \
- vpbroadcastb ((round * 16) + idx + 2)(rk), t0; \
- vpxor t0, x1, x1; \
- vpbroadcastb ((round * 16) + idx + 1)(rk), t0; \
- vpxor t0, x2, x2; \
- vpbroadcastb ((round * 16) + idx + 0)(rk), t0; \
- vpxor t0, x3, x3; \
- vpbroadcastb ((round * 16) + idx + 7)(rk), t0; \
- vpxor t0, x4, x4; \
- vpbroadcastb ((round * 16) + idx + 6)(rk), t0; \
- vpxor t0, x5, x5; \
- vpbroadcastb ((round * 16) + idx + 5)(rk), t0; \
- vpxor t0, x6, x6; \
- vpbroadcastb ((round * 16) + idx + 4)(rk), t0; \
- vpxor t0, x7, x7;
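- /* ARIA substitution layer using GFNI: the S-boxes are evaluated with
-  * vgf2p8affineqb/vgf2p8affineinvqb and the bit matrices defined in
-  * .rodata below (x0/x4: S1, x1/x5: S2, x2/x6: X1, x3/x7: X2).
-  */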
- #define aria_sbox_8way_gfni(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- t0, t1, t2, t3, \
- t4, t5, t6, t7) \
- vpbroadcastq .Ltf_s2_bitmatrix, t0; \
- vpbroadcastq .Ltf_inv_bitmatrix, t1; \
- vpbroadcastq .Ltf_id_bitmatrix, t2; \
- vpbroadcastq .Ltf_aff_bitmatrix, t3; \
- vpbroadcastq .Ltf_x2_bitmatrix, t4; \
- vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
- vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
- vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
- vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
- vgf2p8affineinvqb $0, t2, x2, x2; \
- vgf2p8affineinvqb $0, t2, x6, x6; \
- vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
- vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
- vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
- vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
- vgf2p8affineinvqb $0, t2, x3, x3; \
- vgf2p8affineinvqb $0, t2, x7, x7
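- /* ARIA substitution layer using AES-NI: AESENCLAST/AESDECLAST with a
-  * zero round key isolate AES SubBytes/InvSubBytes, vpshufb with
-  * .Linv_shift_row/.Lshift_row undoes the ShiftRows step, and
-  * filter_8bit applies the extra affine maps for S2 and X2.
-  */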
- #define aria_sbox_8way(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- t0, t1, t2, t3, \
- t4, t5, t6, t7) \
- vpxor t7, t7, t7; \
- vmovdqa .Linv_shift_row, t0; \
- vmovdqa .Lshift_row, t1; \
- vpbroadcastd .L0f0f0f0f, t6; \
- vmovdqa .Ltf_lo__inv_aff__and__s2, t2; \
- vmovdqa .Ltf_hi__inv_aff__and__s2, t3; \
- vmovdqa .Ltf_lo__x2__and__fwd_aff, t4; \
- vmovdqa .Ltf_hi__x2__and__fwd_aff, t5; \
- \
- vaesenclast t7, x0, x0; \
- vaesenclast t7, x4, x4; \
- vaesenclast t7, x1, x1; \
- vaesenclast t7, x5, x5; \
- vaesdeclast t7, x2, x2; \
- vaesdeclast t7, x6, x6; \
- \
- /* AES inverse shift rows */ \
- vpshufb t0, x0, x0; \
- vpshufb t0, x4, x4; \
- vpshufb t0, x1, x1; \
- vpshufb t0, x5, x5; \
- vpshufb t1, x3, x3; \
- vpshufb t1, x7, x7; \
- vpshufb t1, x2, x2; \
- vpshufb t1, x6, x6; \
- \
- /* affine transformation for S2 */ \
- filter_8bit(x1, t2, t3, t6, t0); \
- /* affine transformation for S2 */ \
- filter_8bit(x5, t2, t3, t6, t0); \
- \
- /* affine transformation for X2 */ \
- filter_8bit(x3, t4, t5, t6, t0); \
- /* affine transformation for X2 */ \
- filter_8bit(x7, t4, t5, t6, t0); \
- vaesdeclast t7, x3, x3; \
- vaesdeclast t7, x7, x7;
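- /* Byte-sliced diffusion: aria_diff_m performs the rotate/XOR word
-  * mixing noted inline, and aria_diff_word below XORs the 32-bit words
-  * of the state against each other; together with the register
-  * permutations in the round macros this implements ARIA's diffusion
-  * layer.
-  */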
- #define aria_diff_m(x0, x1, x2, x3, \
- t0, t1, t2, t3) \
- /* T = rotr32(X, 8); */ \
- /* X ^= T */ \
- vpxor x0, x3, t0; \
- vpxor x1, x0, t1; \
- vpxor x2, x1, t2; \
- vpxor x3, x2, t3; \
- /* X = T ^ rotr(X, 16); */ \
- vpxor t2, x0, x0; \
- vpxor x1, t3, t3; \
- vpxor t0, x2, x2; \
- vpxor t1, x3, x1; \
- vmovdqu t3, x3;
- #define aria_diff_word(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- y0, y1, y2, y3, \
- y4, y5, y6, y7) \
- /* t1 ^= t2; */ \
- vpxor y0, x4, x4; \
- vpxor y1, x5, x5; \
- vpxor y2, x6, x6; \
- vpxor y3, x7, x7; \
- \
- /* t2 ^= t3; */ \
- vpxor y4, y0, y0; \
- vpxor y5, y1, y1; \
- vpxor y6, y2, y2; \
- vpxor y7, y3, y3; \
- \
- /* t0 ^= t1; */ \
- vpxor x4, x0, x0; \
- vpxor x5, x1, x1; \
- vpxor x6, x2, x2; \
- vpxor x7, x3, x3; \
- \
- /* t3 ^= t1; */ \
- vpxor x4, y4, y4; \
- vpxor x5, y5, y5; \
- vpxor x6, y6, y6; \
- vpxor x7, y7, y7; \
- \
- /* t2 ^= t0; */ \
- vpxor x0, y0, y0; \
- vpxor x1, y1, y1; \
- vpxor x2, y2, y2; \
- vpxor x3, y3, y3; \
- \
- /* t1 ^= t2; */ \
- vpxor y0, x4, x4; \
- vpxor y1, x5, x5; \
- vpxor y2, x6, x6; \
- vpxor y3, x7, x7;
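- /* aria_fe/aria_fo: even/odd round functions (AddRoundKey, substitution
-  * layer, diffusion layer); the 16 byte-sliced registers are processed
-  * as two 8-register halves spilled through mem_tmp.
-  */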
- #define aria_fe(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- y0, y1, y2, y3, \
- y4, y5, y6, y7, \
- mem_tmp, rk, round) \
- aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 8, round); \
- \
- aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
- y0, y1, y2, y3, y4, y5, y6, y7); \
- \
- aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
- aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
- aria_store_state_8way(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- mem_tmp, 8); \
- \
- aria_load_state_8way(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- mem_tmp, 0); \
- aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 0, round); \
- \
- aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
- y0, y1, y2, y3, y4, y5, y6, y7); \
- \
- aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
- aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
- aria_store_state_8way(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- mem_tmp, 0); \
- aria_load_state_8way(y0, y1, y2, y3, \
- y4, y5, y6, y7, \
- mem_tmp, 8); \
- aria_diff_word(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- y0, y1, y2, y3, \
- y4, y5, y6, y7); \
- /* aria_diff_byte() \
- * T3 = ABCD -> BADC \
- * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
- * T0 = ABCD -> CDAB \
- * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
- * T1 = ABCD -> DCBA \
- * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
- */ \
- aria_diff_word(x2, x3, x0, x1, \
- x7, x6, x5, x4, \
- y0, y1, y2, y3, \
- y5, y4, y7, y6); \
- aria_store_state_8way(x3, x2, x1, x0, \
- x6, x7, x4, x5, \
- mem_tmp, 0);
- #define aria_fo(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- y0, y1, y2, y3, \
- y4, y5, y6, y7, \
- mem_tmp, rk, round) \
- aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 8, round); \
- \
- aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, y1, y2, y3, y4, y5, y6, y7); \
- \
- aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
- aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
- aria_store_state_8way(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- mem_tmp, 8); \
- \
- aria_load_state_8way(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- mem_tmp, 0); \
- aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 0, round); \
- \
- aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, y1, y2, y3, y4, y5, y6, y7); \
- \
- aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
- aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
- aria_store_state_8way(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- mem_tmp, 0); \
- aria_load_state_8way(y0, y1, y2, y3, \
- y4, y5, y6, y7, \
- mem_tmp, 8); \
- aria_diff_word(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- y0, y1, y2, y3, \
- y4, y5, y6, y7); \
- /* aria_diff_byte() \
- * T1 = ABCD -> BADC \
- * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
- * T2 = ABCD -> CDAB \
- * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
- * T3 = ABCD -> DCBA \
- * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
- */ \
- aria_diff_word(x0, x1, x2, x3, \
- x5, x4, x7, x6, \
- y2, y3, y0, y1, \
- y7, y6, y5, y4); \
- aria_store_state_8way(x3, x2, x1, x0, \
- x6, x7, x4, x5, \
- mem_tmp, 0);
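- /* aria_ff: final round, i.e. AddRoundKey, substitution layer, then a
-  * second AddRoundKey with last_round's key; no diffusion layer.
-  */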
- #define aria_ff(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- y0, y1, y2, y3, \
- y4, y5, y6, y7, \
- mem_tmp, rk, round, last_round) \
- aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 8, round); \
- \
- aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
- y0, y1, y2, y3, y4, y5, y6, y7); \
- \
- aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 8, last_round); \
- \
- aria_store_state_8way(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- mem_tmp, 8); \
- \
- aria_load_state_8way(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- mem_tmp, 0); \
- aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 0, round); \
- \
- aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
- y0, y1, y2, y3, y4, y5, y6, y7); \
- \
- aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 0, last_round); \
- \
- aria_load_state_8way(y0, y1, y2, y3, \
- y4, y5, y6, y7, \
- mem_tmp, 8);
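- /* GFNI variants of the round functions: same structure as above, but
-  * using aria_sbox_8way_gfni for the substitution layer.
-  */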
- #define aria_fe_gfni(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- y0, y1, y2, y3, \
- y4, y5, y6, y7, \
- mem_tmp, rk, round) \
- aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 8, round); \
- \
- aria_sbox_8way_gfni(x2, x3, x0, x1, \
- x6, x7, x4, x5, \
- y0, y1, y2, y3, \
- y4, y5, y6, y7); \
- \
- aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
- aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
- aria_store_state_8way(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- mem_tmp, 8); \
- \
- aria_load_state_8way(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- mem_tmp, 0); \
- aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 0, round); \
- \
- aria_sbox_8way_gfni(x2, x3, x0, x1, \
- x6, x7, x4, x5, \
- y0, y1, y2, y3, \
- y4, y5, y6, y7); \
- \
- aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
- aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
- aria_store_state_8way(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- mem_tmp, 0); \
- aria_load_state_8way(y0, y1, y2, y3, \
- y4, y5, y6, y7, \
- mem_tmp, 8); \
- aria_diff_word(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- y0, y1, y2, y3, \
- y4, y5, y6, y7); \
- /* aria_diff_byte() \
- * T3 = ABCD -> BADC \
- * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
- * T0 = ABCD -> CDAB \
- * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
- * T1 = ABCD -> DCBA \
- * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
- */ \
- aria_diff_word(x2, x3, x0, x1, \
- x7, x6, x5, x4, \
- y0, y1, y2, y3, \
- y5, y4, y7, y6); \
- aria_store_state_8way(x3, x2, x1, x0, \
- x6, x7, x4, x5, \
- mem_tmp, 0);
- #define aria_fo_gfni(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- y0, y1, y2, y3, \
- y4, y5, y6, y7, \
- mem_tmp, rk, round) \
- aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 8, round); \
- \
- aria_sbox_8way_gfni(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- y0, y1, y2, y3, \
- y4, y5, y6, y7); \
- \
- aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
- aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
- aria_store_state_8way(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- mem_tmp, 8); \
- \
- aria_load_state_8way(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- mem_tmp, 0); \
- aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 0, round); \
- \
- aria_sbox_8way_gfni(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- y0, y1, y2, y3, \
- y4, y5, y6, y7); \
- \
- aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
- aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
- aria_store_state_8way(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- mem_tmp, 0); \
- aria_load_state_8way(y0, y1, y2, y3, \
- y4, y5, y6, y7, \
- mem_tmp, 8); \
- aria_diff_word(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- y0, y1, y2, y3, \
- y4, y5, y6, y7); \
- /* aria_diff_byte() \
- * T1 = ABCD -> BADC \
- * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
- * T2 = ABCD -> CDAB \
- * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
- * T3 = ABCD -> DCBA \
- * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
- */ \
- aria_diff_word(x0, x1, x2, x3, \
- x5, x4, x7, x6, \
- y2, y3, y0, y1, \
- y7, y6, y5, y4); \
- aria_store_state_8way(x3, x2, x1, x0, \
- x6, x7, x4, x5, \
- mem_tmp, 0);
- #define aria_ff_gfni(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- y0, y1, y2, y3, \
- y4, y5, y6, y7, \
- mem_tmp, rk, round, last_round) \
- aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 8, round); \
- \
- aria_sbox_8way_gfni(x2, x3, x0, x1, \
- x6, x7, x4, x5, \
- y0, y1, y2, y3, \
- y4, y5, y6, y7); \
- \
- aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 8, last_round); \
- \
- aria_store_state_8way(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- mem_tmp, 8); \
- \
- aria_load_state_8way(x0, x1, x2, x3, \
- x4, x5, x6, x7, \
- mem_tmp, 0); \
- aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 0, round); \
- \
- aria_sbox_8way_gfni(x2, x3, x0, x1, \
- x6, x7, x4, x5, \
- y0, y1, y2, y3, \
- y4, y5, y6, y7); \
- \
- aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 0, last_round); \
- \
- aria_load_state_8way(y0, y1, y2, y3, \
- y4, y5, y6, y7, \
- mem_tmp, 8);
- /* NB: section is mergeable, all elements must be aligned 16-byte blocks */
- .section .rodata.cst16, "aM", @progbits, 16
- .align 16
- #define SHUFB_BYTES(idx) \
- 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
- .Lshufb_16x16b:
- .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
- /* For isolating SubBytes from AESENCLAST, inverse shift row */
- .Linv_shift_row:
- .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
- .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
- .Lshift_row:
- .byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
- .byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
- /* For CTR-mode IV byteswap */
- .Lbswap128_mask:
- .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
- .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
- /* AES inverse affine and S2 combined:
- * 1 1 0 0 0 0 0 1 x0 0
- * 0 1 0 0 1 0 0 0 x1 0
- * 1 1 0 0 1 1 1 1 x2 0
- * 0 1 1 0 1 0 0 1 x3 1
- * 0 1 0 0 1 1 0 0 * x4 + 0
- * 0 1 0 1 1 0 0 0 x5 0
- * 0 0 0 0 0 1 0 1 x6 0
- * 1 1 1 0 0 1 1 1 x7 1
- */
- .Ltf_lo__inv_aff__and__s2:
- .octa 0x92172DA81A9FA520B2370D883ABF8500
- .Ltf_hi__inv_aff__and__s2:
- .octa 0x2B15FFC1AF917B45E6D8320C625CB688
- /* X2 and AES forward affine combined:
- * 1 0 1 1 0 0 0 1 x0 0
- * 0 1 1 1 1 0 1 1 x1 0
- * 0 0 0 1 1 0 1 0 x2 1
- * 0 1 0 0 0 1 0 0 x3 0
- * 0 0 1 1 1 0 1 1 * x4 + 0
- * 0 1 0 0 1 0 0 0 x5 0
- * 1 1 0 1 0 0 1 1 x6 0
- * 0 1 0 0 1 0 1 0 x7 0
- */
- .Ltf_lo__x2__and__fwd_aff:
- .octa 0xEFAE0544FCBD1657B8F95213ABEA4100
- .Ltf_hi__x2__and__fwd_aff:
- .octa 0x3F893781E95FE1576CDA64D2BA0CB204
- .section .rodata.cst8, "aM", @progbits, 8
- .align 8
- /* AES affine: */
- #define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
- .Ltf_aff_bitmatrix:
- .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
- BV8(1, 1, 0, 0, 0, 1, 1, 1),
- BV8(1, 1, 1, 0, 0, 0, 1, 1),
- BV8(1, 1, 1, 1, 0, 0, 0, 1),
- BV8(1, 1, 1, 1, 1, 0, 0, 0),
- BV8(0, 1, 1, 1, 1, 1, 0, 0),
- BV8(0, 0, 1, 1, 1, 1, 1, 0),
- BV8(0, 0, 0, 1, 1, 1, 1, 1))
- /* AES inverse affine: */
- #define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
- .Ltf_inv_bitmatrix:
- .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
- BV8(1, 0, 0, 1, 0, 0, 1, 0),
- BV8(0, 1, 0, 0, 1, 0, 0, 1),
- BV8(1, 0, 1, 0, 0, 1, 0, 0),
- BV8(0, 1, 0, 1, 0, 0, 1, 0),
- BV8(0, 0, 1, 0, 1, 0, 0, 1),
- BV8(1, 0, 0, 1, 0, 1, 0, 0),
- BV8(0, 1, 0, 0, 1, 0, 1, 0))
- /* S2: */
- #define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
- .Ltf_s2_bitmatrix:
- .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
- BV8(0, 0, 1, 1, 1, 1, 1, 1),
- BV8(1, 1, 1, 0, 1, 1, 0, 1),
- BV8(1, 1, 0, 0, 0, 0, 1, 1),
- BV8(0, 1, 0, 0, 0, 0, 1, 1),
- BV8(1, 1, 0, 0, 1, 1, 1, 0),
- BV8(0, 1, 1, 0, 0, 0, 1, 1),
- BV8(1, 1, 1, 1, 0, 1, 1, 0))
- /* X2: */
- #define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
- .Ltf_x2_bitmatrix:
- .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
- BV8(0, 0, 1, 0, 0, 1, 1, 0),
- BV8(0, 0, 0, 0, 1, 0, 1, 0),
- BV8(1, 1, 1, 0, 0, 0, 1, 1),
- BV8(1, 1, 1, 0, 1, 1, 0, 0),
- BV8(0, 1, 1, 0, 1, 0, 1, 1),
- BV8(1, 0, 1, 1, 1, 1, 0, 1),
- BV8(1, 0, 0, 1, 0, 0, 1, 1))
- /* Identity matrix: */
- .Ltf_id_bitmatrix:
- .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
- BV8(0, 1, 0, 0, 0, 0, 0, 0),
- BV8(0, 0, 1, 0, 0, 0, 0, 0),
- BV8(0, 0, 0, 1, 0, 0, 0, 0),
- BV8(0, 0, 0, 0, 1, 0, 0, 0),
- BV8(0, 0, 0, 0, 0, 1, 0, 0),
- BV8(0, 0, 0, 0, 0, 0, 1, 0),
- BV8(0, 0, 0, 0, 0, 0, 0, 1))
- /* 4-bit mask */
- .section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
- .align 4
- .L0f0f0f0f:
- .long 0x0f0f0f0f
- .text
- SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
- /* input:
- * %r9: rk
- * %rsi: dst
- * %rdx: src
- * %xmm0..%xmm15: 16 byte-sliced blocks
- */
- FRAME_BEGIN
- movq %rsi, %rax;
- leaq 8 * 16(%rax), %r8;
- inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
- %xmm15, %rax, %r8);
- aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
- %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
- %rax, %r9, 0);
- aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
- %xmm15, %rax, %r9, 1);
- aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
- %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
- %rax, %r9, 2);
- aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
- %xmm15, %rax, %r9, 3);
- aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
- %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
- %rax, %r9, 4);
- aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
- %xmm15, %rax, %r9, 5);
- aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
- %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
- %rax, %r9, 6);
- aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
- %xmm15, %rax, %r9, 7);
- aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
- %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
- %rax, %r9, 8);
- aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
- %xmm15, %rax, %r9, 9);
- aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
- %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
- %rax, %r9, 10);
- cmpl $12, rounds(CTX);
- jne .Laria_192;
- aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
- %xmm15, %rax, %r9, 11, 12);
- jmp .Laria_end;
- .Laria_192:
- aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
- %xmm15, %rax, %r9, 11);
- aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
- %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
- %rax, %r9, 12);
- cmpl $14, rounds(CTX);
- jne .Laria_256;
- aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
- %xmm15, %rax, %r9, 13, 14);
- jmp .Laria_end;
- .Laria_256:
- aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
- %xmm15, %rax, %r9, 13);
- aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
- %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
- %rax, %r9, 14);
- aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
- %xmm15, %rax, %r9, 15, 16);
- .Laria_end:
- debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
- %xmm9, %xmm13, %xmm0, %xmm5,
- %xmm10, %xmm14, %xmm3, %xmm6,
- %xmm11, %xmm15, %xmm2, %xmm7,
- (%rax), (%r8));
- FRAME_END
- RET;
- SYM_FUNC_END(__aria_aesni_avx_crypt_16way)
- SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
- */
- FRAME_BEGIN
- leaq enc_key(CTX), %r9;
- inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
- %xmm15, %rdx);
- call __aria_aesni_avx_crypt_16way;
- write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
- %xmm15, %rax);
- FRAME_END
- RET;
- SYM_FUNC_END(aria_aesni_avx_encrypt_16way)
- SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
- */
- FRAME_BEGIN
- leaq dec_key(CTX), %r9;
- inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
- %xmm15, %rdx);
- call __aria_aesni_avx_crypt_16way;
- write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
- %xmm15, %rax);
- FRAME_END
- RET;
- SYM_FUNC_END(aria_aesni_avx_decrypt_16way)
- SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
- /* input:
- * %rdi: ctx
- * %rsi: dst
- * %rdx: src
- * %rcx: keystream
- * %r8: iv (big endian, 128bit)
- */
- FRAME_BEGIN
- /* load IV and byteswap */
- vmovdqu (%r8), %xmm8;
- vmovdqa .Lbswap128_mask (%rip), %xmm1;
- vpshufb %xmm1, %xmm8, %xmm3; /* be => le */
- vpcmpeqd %xmm0, %xmm0, %xmm0;
- vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */
- /* construct IVs */
- inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
- vpshufb %xmm1, %xmm3, %xmm9;
- inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
- vpshufb %xmm1, %xmm3, %xmm10;
- inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
- vpshufb %xmm1, %xmm3, %xmm11;
- inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
- vpshufb %xmm1, %xmm3, %xmm12;
- inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
- vpshufb %xmm1, %xmm3, %xmm13;
- inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
- vpshufb %xmm1, %xmm3, %xmm14;
- inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
- vpshufb %xmm1, %xmm3, %xmm15;
- vmovdqu %xmm8, (0 * 16)(%rcx);
- vmovdqu %xmm9, (1 * 16)(%rcx);
- vmovdqu %xmm10, (2 * 16)(%rcx);
- vmovdqu %xmm11, (3 * 16)(%rcx);
- vmovdqu %xmm12, (4 * 16)(%rcx);
- vmovdqu %xmm13, (5 * 16)(%rcx);
- vmovdqu %xmm14, (6 * 16)(%rcx);
- vmovdqu %xmm15, (7 * 16)(%rcx);
- inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
- vpshufb %xmm1, %xmm3, %xmm8;
- inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
- vpshufb %xmm1, %xmm3, %xmm9;
- inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
- vpshufb %xmm1, %xmm3, %xmm10;
- inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
- vpshufb %xmm1, %xmm3, %xmm11;
- inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
- vpshufb %xmm1, %xmm3, %xmm12;
- inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
- vpshufb %xmm1, %xmm3, %xmm13;
- inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
- vpshufb %xmm1, %xmm3, %xmm14;
- inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
- vpshufb %xmm1, %xmm3, %xmm15;
- inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
- vpshufb %xmm1, %xmm3, %xmm4;
- vmovdqu %xmm4, (%r8);
- vmovdqu (0 * 16)(%rcx), %xmm0;
- vmovdqu (1 * 16)(%rcx), %xmm1;
- vmovdqu (2 * 16)(%rcx), %xmm2;
- vmovdqu (3 * 16)(%rcx), %xmm3;
- vmovdqu (4 * 16)(%rcx), %xmm4;
- vmovdqu (5 * 16)(%rcx), %xmm5;
- vmovdqu (6 * 16)(%rcx), %xmm6;
- vmovdqu (7 * 16)(%rcx), %xmm7;
- FRAME_END
- RET;
- SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)
- SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
- /* input:
- * %rdi: ctx
- * %rsi: dst
- * %rdx: src
- * %rcx: keystream
- * %r8: iv (big endian, 128bit)
- */
- FRAME_BEGIN
- call __aria_aesni_avx_ctr_gen_keystream_16way;
- leaq (%rsi), %r10;
- leaq (%rdx), %r11;
- leaq (%rcx), %rsi;
- leaq (%rcx), %rdx;
- leaq enc_key(CTX), %r9;
- call __aria_aesni_avx_crypt_16way;
- vpxor (0 * 16)(%r11), %xmm1, %xmm1;
- vpxor (1 * 16)(%r11), %xmm0, %xmm0;
- vpxor (2 * 16)(%r11), %xmm3, %xmm3;
- vpxor (3 * 16)(%r11), %xmm2, %xmm2;
- vpxor (4 * 16)(%r11), %xmm4, %xmm4;
- vpxor (5 * 16)(%r11), %xmm5, %xmm5;
- vpxor (6 * 16)(%r11), %xmm6, %xmm6;
- vpxor (7 * 16)(%r11), %xmm7, %xmm7;
- vpxor (8 * 16)(%r11), %xmm8, %xmm8;
- vpxor (9 * 16)(%r11), %xmm9, %xmm9;
- vpxor (10 * 16)(%r11), %xmm10, %xmm10;
- vpxor (11 * 16)(%r11), %xmm11, %xmm11;
- vpxor (12 * 16)(%r11), %xmm12, %xmm12;
- vpxor (13 * 16)(%r11), %xmm13, %xmm13;
- vpxor (14 * 16)(%r11), %xmm14, %xmm14;
- vpxor (15 * 16)(%r11), %xmm15, %xmm15;
- write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
- %xmm15, %r10);
- FRAME_END
- RET;
- SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)
- SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
- /* input:
- * %r9: rk
- * %rsi: dst
- * %rdx: src
- * %xmm0..%xmm15: 16 byte-sliced blocks
- */
- FRAME_BEGIN
- movq %rsi, %rax;
- leaq 8 * 16(%rax), %r8;
- inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
- %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11,
- %xmm12, %xmm13, %xmm14,
- %xmm15, %rax, %r8);
- aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
- %xmm12, %xmm13, %xmm14, %xmm15,
- %xmm0, %xmm1, %xmm2, %xmm3,
- %xmm4, %xmm5, %xmm6, %xmm7,
- %rax, %r9, 0);
- aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
- %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11,
- %xmm12, %xmm13, %xmm14,
- %xmm15, %rax, %r9, 1);
- aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
- %xmm12, %xmm13, %xmm14, %xmm15,
- %xmm0, %xmm1, %xmm2, %xmm3,
- %xmm4, %xmm5, %xmm6, %xmm7,
- %rax, %r9, 2);
- aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
- %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11,
- %xmm12, %xmm13, %xmm14,
- %xmm15, %rax, %r9, 3);
- aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
- %xmm12, %xmm13, %xmm14, %xmm15,
- %xmm0, %xmm1, %xmm2, %xmm3,
- %xmm4, %xmm5, %xmm6, %xmm7,
- %rax, %r9, 4);
- aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
- %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11,
- %xmm12, %xmm13, %xmm14,
- %xmm15, %rax, %r9, 5);
- aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
- %xmm12, %xmm13, %xmm14, %xmm15,
- %xmm0, %xmm1, %xmm2, %xmm3,
- %xmm4, %xmm5, %xmm6, %xmm7,
- %rax, %r9, 6);
- aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
- %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11,
- %xmm12, %xmm13, %xmm14,
- %xmm15, %rax, %r9, 7);
- aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
- %xmm12, %xmm13, %xmm14, %xmm15,
- %xmm0, %xmm1, %xmm2, %xmm3,
- %xmm4, %xmm5, %xmm6, %xmm7,
- %rax, %r9, 8);
- aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
- %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11,
- %xmm12, %xmm13, %xmm14,
- %xmm15, %rax, %r9, 9);
- aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
- %xmm12, %xmm13, %xmm14, %xmm15,
- %xmm0, %xmm1, %xmm2, %xmm3,
- %xmm4, %xmm5, %xmm6, %xmm7,
- %rax, %r9, 10);
- cmpl $12, rounds(CTX);
- jne .Laria_gfni_192;
- aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
- %xmm15, %rax, %r9, 11, 12);
- jmp .Laria_gfni_end;
- .Laria_gfni_192:
- aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
- %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11,
- %xmm12, %xmm13, %xmm14,
- %xmm15, %rax, %r9, 11);
- aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
- %xmm12, %xmm13, %xmm14, %xmm15,
- %xmm0, %xmm1, %xmm2, %xmm3,
- %xmm4, %xmm5, %xmm6, %xmm7,
- %rax, %r9, 12);
- cmpl $14, rounds(CTX);
- jne .Laria_gfni_256;
- aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
- %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11,
- %xmm12, %xmm13, %xmm14,
- %xmm15, %rax, %r9, 13, 14);
- jmp .Laria_gfni_end;
- .Laria_gfni_256:
- aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
- %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11,
- %xmm12, %xmm13, %xmm14,
- %xmm15, %rax, %r9, 13);
- aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
- %xmm12, %xmm13, %xmm14, %xmm15,
- %xmm0, %xmm1, %xmm2, %xmm3,
- %xmm4, %xmm5, %xmm6, %xmm7,
- %rax, %r9, 14);
- aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
- %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11,
- %xmm12, %xmm13, %xmm14,
- %xmm15, %rax, %r9, 15, 16);
- .Laria_gfni_end:
- debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
- %xmm9, %xmm13, %xmm0, %xmm5,
- %xmm10, %xmm14, %xmm3, %xmm6,
- %xmm11, %xmm15, %xmm2, %xmm7,
- (%rax), (%r8));
- FRAME_END
- RET;
- SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)
- SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
- */
- FRAME_BEGIN
- leaq enc_key(CTX), %r9;
- inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
- %xmm15, %rdx);
- call __aria_aesni_avx_gfni_crypt_16way;
- write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
- %xmm15, %rax);
- FRAME_END
- RET;
- SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)
- SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
- */
- FRAME_BEGIN
- leaq dec_key(CTX), %r9;
- inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
- %xmm15, %rdx);
- call __aria_aesni_avx_gfni_crypt_16way;
- write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
- %xmm15, %rax);
- FRAME_END
- RET;
- SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)
- SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
- /* input:
- * %rdi: ctx
- * %rsi: dst
- * %rdx: src
- * %rcx: keystream
- * %r8: iv (big endian, 128bit)
- */
- FRAME_BEGIN
- call __aria_aesni_avx_ctr_gen_keystream_16way
- leaq (%rsi), %r10;
- leaq (%rdx), %r11;
- leaq (%rcx), %rsi;
- leaq (%rcx), %rdx;
- leaq enc_key(CTX), %r9;
- call __aria_aesni_avx_gfni_crypt_16way;
- vpxor (0 * 16)(%r11), %xmm1, %xmm1;
- vpxor (1 * 16)(%r11), %xmm0, %xmm0;
- vpxor (2 * 16)(%r11), %xmm3, %xmm3;
- vpxor (3 * 16)(%r11), %xmm2, %xmm2;
- vpxor (4 * 16)(%r11), %xmm4, %xmm4;
- vpxor (5 * 16)(%r11), %xmm5, %xmm5;
- vpxor (6 * 16)(%r11), %xmm6, %xmm6;
- vpxor (7 * 16)(%r11), %xmm7, %xmm7;
- vpxor (8 * 16)(%r11), %xmm8, %xmm8;
- vpxor (9 * 16)(%r11), %xmm9, %xmm9;
- vpxor (10 * 16)(%r11), %xmm10, %xmm10;
- vpxor (11 * 16)(%r11), %xmm11, %xmm11;
- vpxor (12 * 16)(%r11), %xmm12, %xmm12;
- vpxor (13 * 16)(%r11), %xmm13, %xmm13;
- vpxor (14 * 16)(%r11), %xmm14, %xmm14;
- vpxor (15 * 16)(%r11), %xmm15, %xmm15;
- write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
- %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
- %xmm15, %r10);
- FRAME_END
- RET;
- SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)
|