aesni-intel_asm.S

  1. /* SPDX-License-Identifier: GPL-2.0-or-later */
  2. /*
  3. * Implement AES algorithm in Intel AES-NI instructions.
  4. *
  5. * The white paper of AES-NI instructions can be downloaded from:
  6. * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
  7. *
  8. * Copyright (C) 2008, Intel Corp.
  9. * Author: Huang Ying <[email protected]>
  10. * Vinodh Gopal <[email protected]>
  11. * Kahraman Akdemir
  12. *
  13. * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
  14. * interface for 64-bit kernels.
  15. * Authors: Erdinc Ozturk ([email protected])
  16. * Aidan O'Mahony ([email protected])
  17. * Adrian Hoban <[email protected]>
  18. * James Guilford ([email protected])
  19. * Gabriele Paoloni <[email protected]>
  20. * Tadeusz Struk ([email protected])
  21. * Wajdi Feghali ([email protected])
  22. * Copyright (c) 2010, Intel Corporation.
  23. *
  24. * Ported x86_64 version to x86:
  25. * Author: Mathias Krause <[email protected]>
  26. */
  27. #include <linux/linkage.h>
  28. #include <asm/frame.h>
  29. #include <asm/nospec-branch.h>
  30. /*
  31. * The following macros are used to move an (un)aligned 16 byte value to/from
  32. * an XMM register. This can be done for either FP or integer values: for FP use
  33. * movaps (move aligned packed single) and for integer use movdqa (move double
  34. * quadword aligned). Which instruction is used makes no performance difference
  35. * on Nehalem (the original Core i7) and later. However, movaps is one byte
  36. * shorter, so that is the one we use for now (same for the unaligned variants).
  37. */
  38. #define MOVADQ movaps
  39. #define MOVUDQ movups
  40. #ifdef __x86_64__
  41. # constants in mergeable sections, linker can reorder and merge
  42. .section .rodata.cst16.POLY, "aM", @progbits, 16
  43. .align 16
  44. POLY: .octa 0xC2000000000000000000000000000001
  45. .section .rodata.cst16.TWOONE, "aM", @progbits, 16
  46. .align 16
  47. TWOONE: .octa 0x00000001000000000000000000000001
  48. .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
  49. .align 16
  50. SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
  51. .section .rodata.cst16.MASK1, "aM", @progbits, 16
  52. .align 16
  53. MASK1: .octa 0x0000000000000000ffffffffffffffff
  54. .section .rodata.cst16.MASK2, "aM", @progbits, 16
  55. .align 16
  56. MASK2: .octa 0xffffffffffffffff0000000000000000
  57. .section .rodata.cst16.ONE, "aM", @progbits, 16
  58. .align 16
  59. ONE: .octa 0x00000000000000000000000000000001
  60. .section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
  61. .align 16
  62. F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
  63. .section .rodata.cst16.dec, "aM", @progbits, 16
  64. .align 16
  65. dec: .octa 0x1
  66. .section .rodata.cst16.enc, "aM", @progbits, 16
  67. .align 16
  68. enc: .octa 0x2
  69. # order of these constants should not change.
  70. # more specifically, ALL_F should follow SHIFT_MASK,
  71. # and zero should follow ALL_F
  72. .section .rodata, "a", @progbits
  73. .align 16
  74. SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
  75. ALL_F: .octa 0xffffffffffffffffffffffffffffffff
  76. .octa 0x00000000000000000000000000000000
  77. .text
  78. #define STACK_OFFSET 8*3
  79. #define AadHash 16*0
  80. #define AadLen 16*1
  81. #define InLen (16*1)+8
  82. #define PBlockEncKey 16*2
  83. #define OrigIV 16*3
  84. #define CurCount 16*4
  85. #define PBlockLen 16*5
  86. #define HashKey 16*6 // store HashKey <<1 mod poly here
  87. #define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here
  88. #define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here
  89. #define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here
  90. #define HashKey_k 16*10 // store XOR of High 64 bits and Low 64
  91. // bits of HashKey <<1 mod poly here
  92. //(for Karatsuba purposes)
  93. #define HashKey_2_k 16*11 // store XOR of High 64 bits and Low 64
  94. // bits of HashKey^2 <<1 mod poly here
  95. // (for Karatsuba purposes)
  96. #define HashKey_3_k 16*12 // store XOR of High 64 bits and Low 64
  97. // bits of HashKey^3 <<1 mod poly here
  98. // (for Karatsuba purposes)
  99. #define HashKey_4_k 16*13 // store XOR of High 64 bits and Low 64
  100. // bits of HashKey^4 <<1 mod poly here
  101. // (for Karatsuba purposes)
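#
# Byte-offset view of the per-request context pointed to by %arg2, as implied
# by the defines above (an informal sketch; the authoritative definition is the
# gcm_context_data structure on the C side):
#
#     0  AadHash        current GHASH accumulator
#    16  AadLen         AAD length in bytes
#    24  InLen          total plaintext/ciphertext bytes processed so far
#    32  PBlockEncKey   E(K, Yn) keystream block for a pending partial block
#    48  OrigIV         original IV (Y0)
#    64  CurCount       current (byte-reflected) counter block
#    80  PBlockLen      bytes already consumed of the pending partial block
#    96  HashKey ... HashKey_4_k   precomputed HashKey powers and Karatsuba terms
#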
  102. #define arg1 rdi
  103. #define arg2 rsi
  104. #define arg3 rdx
  105. #define arg4 rcx
  106. #define arg5 r8
  107. #define arg6 r9
  108. #define arg7 STACK_OFFSET+8(%rsp)
  109. #define arg8 STACK_OFFSET+16(%rsp)
  110. #define arg9 STACK_OFFSET+24(%rsp)
  111. #define arg10 STACK_OFFSET+32(%rsp)
  112. #define arg11 STACK_OFFSET+40(%rsp)
  113. #define keysize 2*15*16(%arg1)
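# Note: 2*15*16 = 480 bytes, i.e. just past two 15-entry round-key arrays of
# 16 bytes each (encryption keys followed by decryption keys). Under the usual
# expanded-key layout this is where the key length in bytes (16, 24 or 32) is
# stored, which is what the shr/add arithmetic in the macros below turns into
# a round count. (Layout assumed here, not spelled out in this file.)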
  114. #endif
  115. #define STATE1 %xmm0
  116. #define STATE2 %xmm4
  117. #define STATE3 %xmm5
  118. #define STATE4 %xmm6
  119. #define STATE STATE1
  120. #define IN1 %xmm1
  121. #define IN2 %xmm7
  122. #define IN3 %xmm8
  123. #define IN4 %xmm9
  124. #define IN IN1
  125. #define KEY %xmm2
  126. #define IV %xmm3
  127. #define BSWAP_MASK %xmm10
  128. #define CTR %xmm11
  129. #define INC %xmm12
  130. #define GF128MUL_MASK %xmm7
  131. #ifdef __x86_64__
  132. #define AREG %rax
  133. #define KEYP %rdi
  134. #define OUTP %rsi
  135. #define UKEYP OUTP
  136. #define INP %rdx
  137. #define LEN %rcx
  138. #define IVP %r8
  139. #define KLEN %r9d
  140. #define T1 %r10
  141. #define TKEYP T1
  142. #define T2 %r11
  143. #define TCTR_LOW T2
  144. #else
  145. #define AREG %eax
  146. #define KEYP %edi
  147. #define OUTP AREG
  148. #define UKEYP OUTP
  149. #define INP %edx
  150. #define LEN %esi
  151. #define IVP %ebp
  152. #define KLEN %ebx
  153. #define T1 %ecx
  154. #define TKEYP T1
  155. #endif
  156. .macro FUNC_SAVE
  157. push %r12
  158. push %r13
  159. push %r14
  160. #
  161. # states of %xmm registers %xmm6:%xmm15 not saved
  162. # all %xmm registers are clobbered
  163. #
  164. .endm
  165. .macro FUNC_RESTORE
  166. pop %r14
  167. pop %r13
  168. pop %r12
  169. .endm
  170. # Precompute hashkeys.
  171. # Input: Hash subkey.
  172. # Output: HashKeys stored in gcm_context_data. Only needs to be called
  173. # once per key.
  174. # clobbers r12, and tmp xmm registers.
  175. .macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
  176. mov \SUBKEY, %r12
  177. movdqu (%r12), \TMP3
  178. movdqa SHUF_MASK(%rip), \TMP2
  179. pshufb \TMP2, \TMP3
  180. # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
  181. movdqa \TMP3, \TMP2
  182. psllq $1, \TMP3
  183. psrlq $63, \TMP2
  184. movdqa \TMP2, \TMP1
  185. pslldq $8, \TMP2
  186. psrldq $8, \TMP1
  187. por \TMP2, \TMP3
  188. # reduce HashKey<<1
  189. pshufd $0x24, \TMP1, \TMP2
  190. pcmpeqd TWOONE(%rip), \TMP2
  191. pand POLY(%rip), \TMP2
  192. pxor \TMP2, \TMP3
  193. movdqu \TMP3, HashKey(%arg2)
  194. movdqa \TMP3, \TMP5
  195. pshufd $78, \TMP3, \TMP1
  196. pxor \TMP3, \TMP1
  197. movdqu \TMP1, HashKey_k(%arg2)
  198. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  199. # TMP5 = HashKey^2<<1 (mod poly)
  200. movdqu \TMP5, HashKey_2(%arg2)
  201. # HashKey_2 = HashKey^2<<1 (mod poly)
  202. pshufd $78, \TMP5, \TMP1
  203. pxor \TMP5, \TMP1
  204. movdqu \TMP1, HashKey_2_k(%arg2)
  205. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  206. # TMP5 = HashKey^3<<1 (mod poly)
  207. movdqu \TMP5, HashKey_3(%arg2)
  208. pshufd $78, \TMP5, \TMP1
  209. pxor \TMP5, \TMP1
  210. movdqu \TMP1, HashKey_3_k(%arg2)
  211. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  212. # TMP5 = HashKey^4<<1 (mod poly)
  213. movdqu \TMP5, HashKey_4(%arg2)
  214. pshufd $78, \TMP5, \TMP1
  215. pxor \TMP5, \TMP1
  216. movdqu \TMP1, HashKey_4_k(%arg2)
  217. .endm
  218. # GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
  219. # Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
  220. .macro GCM_INIT Iv SUBKEY AAD AADLEN
  221. mov \AADLEN, %r11
  222. mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
  223. xor %r11d, %r11d
  224. mov %r11, InLen(%arg2) # ctx_data.in_length = 0
  225. mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
  226. mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
  227. mov \Iv, %rax
  228. movdqu (%rax), %xmm0
  229. movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
  230. movdqa SHUF_MASK(%rip), %xmm2
  231. pshufb %xmm2, %xmm0
  232. movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
  233. PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
  234. movdqu HashKey(%arg2), %xmm13
  235. CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
  236. %xmm4, %xmm5, %xmm6
  237. .endm
  238. # GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
  239. # struct has been initialized by GCM_INIT.
  240. # Requires the input data to be at least 1 byte long because of READ_PARTIAL_BLOCK
  241. # Clobbers rax, r10-r13, and xmm0-xmm15
  242. .macro GCM_ENC_DEC operation
  243. movdqu AadHash(%arg2), %xmm8
  244. movdqu HashKey(%arg2), %xmm13
  245. add %arg5, InLen(%arg2)
  246. xor %r11d, %r11d # initialise the data pointer offset as zero
  247. PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
  248. sub %r11, %arg5 # sub partial block data used
  249. mov %arg5, %r13 # save the number of bytes
  250. and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
  251. mov %r13, %r12
  252. # Encrypt/Decrypt first few blocks
  253. and $(3<<4), %r12
  254. jz _initial_num_blocks_is_0_\@
  255. cmp $(2<<4), %r12
  256. jb _initial_num_blocks_is_1_\@
  257. je _initial_num_blocks_is_2_\@
  258. _initial_num_blocks_is_3_\@:
  259. INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  260. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
  261. sub $48, %r13
  262. jmp _initial_blocks_\@
  263. _initial_num_blocks_is_2_\@:
  264. INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  265. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
  266. sub $32, %r13
  267. jmp _initial_blocks_\@
  268. _initial_num_blocks_is_1_\@:
  269. INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  270. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
  271. sub $16, %r13
  272. jmp _initial_blocks_\@
  273. _initial_num_blocks_is_0_\@:
  274. INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  275. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
  276. _initial_blocks_\@:
  277. # Main loop - Encrypt/Decrypt remaining blocks
  278. test %r13, %r13
  279. je _zero_cipher_left_\@
  280. sub $64, %r13
  281. je _four_cipher_left_\@
  282. _crypt_by_4_\@:
  283. GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \
  284. %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
  285. %xmm7, %xmm8, enc
  286. add $64, %r11
  287. sub $64, %r13
  288. jne _crypt_by_4_\@
  289. _four_cipher_left_\@:
  290. GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
  291. %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
  292. _zero_cipher_left_\@:
  293. movdqu %xmm8, AadHash(%arg2)
  294. movdqu %xmm0, CurCount(%arg2)
  295. mov %arg5, %r13
  296. and $15, %r13 # %r13 = arg5 (mod 16)
  297. je _multiple_of_16_bytes_\@
  298. mov %r13, PBlockLen(%arg2)
  299. # Handle the last <16 Byte block separately
  300. paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
  301. movdqu %xmm0, CurCount(%arg2)
  302. movdqa SHUF_MASK(%rip), %xmm10
  303. pshufb %xmm10, %xmm0
  304. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
  305. movdqu %xmm0, PBlockEncKey(%arg2)
  306. cmp $16, %arg5
  307. jge _large_enough_update_\@
  308. lea (%arg4,%r11,1), %r10
  309. mov %r13, %r12
  310. READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
  311. jmp _data_read_\@
  312. _large_enough_update_\@:
  313. sub $16, %r11
  314. add %r13, %r11
  315. # receive the last <16 Byte block
  316. movdqu (%arg4, %r11, 1), %xmm1
  317. sub %r13, %r11
  318. add $16, %r11
  319. lea SHIFT_MASK+16(%rip), %r12
  320. # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
  321. # (r13 is the number of bytes in plaintext mod 16)
  322. sub %r13, %r12
  323. # get the appropriate shuffle mask
  324. movdqu (%r12), %xmm2
  325. # shift right 16-r13 bytes
  326. pshufb %xmm2, %xmm1
  327. _data_read_\@:
  328. lea ALL_F+16(%rip), %r12
  329. sub %r13, %r12
  330. .ifc \operation, dec
  331. movdqa %xmm1, %xmm2
  332. .endif
  333. pxor %xmm1, %xmm0 # XOR Encrypt(K, Yn)
  334. movdqu (%r12), %xmm1
  335. # get the appropriate mask to mask out top 16-r13 bytes of xmm0
  336. pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
  337. .ifc \operation, dec
  338. pand %xmm1, %xmm2
  339. movdqa SHUF_MASK(%rip), %xmm10
  340. pshufb %xmm10 ,%xmm2
  341. pxor %xmm2, %xmm8
  342. .else
  343. movdqa SHUF_MASK(%rip), %xmm10
  344. pshufb %xmm10,%xmm0
  345. pxor %xmm0, %xmm8
  346. .endif
  347. movdqu %xmm8, AadHash(%arg2)
  348. .ifc \operation, enc
  349. # (GHASH for this final block was already folded in above)
  350. movdqa SHUF_MASK(%rip), %xmm10
  351. # shuffle xmm0 back to output as ciphertext
  352. pshufb %xmm10, %xmm0
  353. .endif
  354. # Output %r13 bytes
  355. movq %xmm0, %rax
  356. cmp $8, %r13
  357. jle _less_than_8_bytes_left_\@
  358. mov %rax, (%arg3 , %r11, 1)
  359. add $8, %r11
  360. psrldq $8, %xmm0
  361. movq %xmm0, %rax
  362. sub $8, %r13
  363. _less_than_8_bytes_left_\@:
  364. mov %al, (%arg3, %r11, 1)
  365. add $1, %r11
  366. shr $8, %rax
  367. sub $1, %r13
  368. jne _less_than_8_bytes_left_\@
  369. _multiple_of_16_bytes_\@:
  370. .endm
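# Worked example of the tail handling above (illustrative numbers): with
# %arg5 = 70 bytes and no pending partial block, 64 bytes go through the block
# paths, %r13 = 70 & 15 = 6 becomes the new PBlockLen, the counter is
# incremented and encrypted into PBlockEncKey, a 16-byte load ending at the end
# of the input is shifted down so only the last 6 input bytes remain, the XORed
# result is masked so only its low 6 bytes survive, and those 6 output bytes
# are written out one at a time.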
  371. # GCM_COMPLETE Finishes the hash of the last partial block and computes the tag.
  372. # Output: Authentication Tag (AUTH_TAG)
  373. # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
  374. .macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
  375. movdqu AadHash(%arg2), %xmm8
  376. movdqu HashKey(%arg2), %xmm13
  377. mov PBlockLen(%arg2), %r12
  378. test %r12, %r12
  379. je _partial_done\@
  380. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  381. _partial_done\@:
  382. mov AadLen(%arg2), %r12 # %r12 = aadLen (number of bytes)
  383. shl $3, %r12 # convert into number of bits
  384. movd %r12d, %xmm15 # len(A) in %xmm15
  385. mov InLen(%arg2), %r12
  386. shl $3, %r12 # len(C) in bits (*8)
  387. movq %r12, %xmm1
  388. pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
  389. pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
  390. pxor %xmm15, %xmm8
  391. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  392. # final GHASH computation
  393. movdqa SHUF_MASK(%rip), %xmm10
  394. pshufb %xmm10, %xmm8
  395. movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0
  396. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
  397. pxor %xmm8, %xmm0
  398. _return_T_\@:
  399. mov \AUTHTAG, %r10 # %r10 = authTag
  400. mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len
  401. cmp $16, %r11
  402. je _T_16_\@
  403. cmp $8, %r11
  404. jl _T_4_\@
  405. _T_8_\@:
  406. movq %xmm0, %rax
  407. mov %rax, (%r10)
  408. add $8, %r10
  409. sub $8, %r11
  410. psrldq $8, %xmm0
  411. test %r11, %r11
  412. je _return_T_done_\@
  413. _T_4_\@:
  414. movd %xmm0, %eax
  415. mov %eax, (%r10)
  416. add $4, %r10
  417. sub $4, %r11
  418. psrldq $4, %xmm0
  419. test %r11, %r11
  420. je _return_T_done_\@
  421. _T_123_\@:
  422. movd %xmm0, %eax
  423. cmp $2, %r11
  424. jl _T_1_\@
  425. mov %ax, (%r10)
  426. cmp $2, %r11
  427. je _return_T_done_\@
  428. add $2, %r10
  429. sar $16, %eax
  430. _T_1_\@:
  431. mov %al, (%r10)
  432. jmp _return_T_done_\@
  433. _T_16_\@:
  434. movdqu %xmm0, (%r10)
  435. _return_T_done_\@:
  436. .endm
  437. #ifdef __x86_64__
  438. /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
  439. *
  440. *
  441. * Input: A and B (128-bits each, bit-reflected)
  442. * Output: C = A*B*x mod poly, (i.e. >>1 )
  443. * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
  444. * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
  445. *
  446. */
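# The Karatsuba identity the macro relies on, written out (64-bit halves
# A = a1:a0, B = b1:b0, all additions are XOR):
#
#   A*B = a1*b1 * x^128  +  [(a1+a0)*(b1+b0) + a1*b1 + a0*b0] * x^64  +  a0*b0
#
# TMP1 accumulates a1*b1, GH accumulates a0*b0, and TMP2 ends up with the
# bracketed middle term, which is then split across the high and low 128-bit
# halves before the two-phase reduction modulo x^128 + x^127 + x^126 + x^121 + 1
# (hence the 31/30/25 and 1/2/7 shift amounts below).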
  447. .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
  448. movdqa \GH, \TMP1
  449. pshufd $78, \GH, \TMP2
  450. pshufd $78, \HK, \TMP3
  451. pxor \GH, \TMP2 # TMP2 = a1+a0
  452. pxor \HK, \TMP3 # TMP3 = b1+b0
  453. pclmulqdq $0x11, \HK, \TMP1 # TMP1 = a1*b1
  454. pclmulqdq $0x00, \HK, \GH # GH = a0*b0
  455. pclmulqdq $0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
  456. pxor \GH, \TMP2
  457. pxor \TMP1, \TMP2 # TMP2 = a1*b0 + a0*b1 (middle Karatsuba term)
  458. movdqa \TMP2, \TMP3
  459. pslldq $8, \TMP3 # left shift TMP3 2 DWs
  460. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  461. pxor \TMP3, \GH
  462. pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
  463. # first phase of the reduction
  464. movdqa \GH, \TMP2
  465. movdqa \GH, \TMP3
  466. movdqa \GH, \TMP4 # copy GH into TMP2, TMP3 and TMP4
  467. # in order to perform
  468. # independent shifts
  469. pslld $31, \TMP2 # packed left shift <<31
  470. pslld $30, \TMP3 # packed left shift <<30
  471. pslld $25, \TMP4 # packed left shift <<25
  472. pxor \TMP3, \TMP2 # xor the shifted versions
  473. pxor \TMP4, \TMP2
  474. movdqa \TMP2, \TMP5
  475. psrldq $4, \TMP5 # right shift TMP5 1 DW
  476. pslldq $12, \TMP2 # left shift TMP2 3 DWs
  477. pxor \TMP2, \GH
  478. # second phase of the reduction
  479. movdqa \GH,\TMP2 # copy GH into TMP2, TMP3 and TMP4
  480. # in order to perform
  481. # independent shifts
  482. movdqa \GH,\TMP3
  483. movdqa \GH,\TMP4
  484. psrld $1,\TMP2 # packed right shift >>1
  485. psrld $2,\TMP3 # packed right shift >>2
  486. psrld $7,\TMP4 # packed right shift >>7
  487. pxor \TMP3,\TMP2 # xor the shifted versions
  488. pxor \TMP4,\TMP2
  489. pxor \TMP5, \TMP2
  490. pxor \TMP2, \GH
  491. pxor \TMP1, \GH # result is in GH
  492. .endm
  493. # Reads DLEN bytes starting at DPTR and stores in XMMDst
  494. # where 0 < DLEN < 16
  495. # Clobbers %rax, DLEN and XMM1
  496. .macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
  497. cmp $8, \DLEN
  498. jl _read_lt8_\@
  499. mov (\DPTR), %rax
  500. movq %rax, \XMMDst
  501. sub $8, \DLEN
  502. jz _done_read_partial_block_\@
  503. xor %eax, %eax
  504. _read_next_byte_\@:
  505. shl $8, %rax
  506. mov 7(\DPTR, \DLEN, 1), %al
  507. dec \DLEN
  508. jnz _read_next_byte_\@
  509. movq %rax, \XMM1
  510. pslldq $8, \XMM1
  511. por \XMM1, \XMMDst
  512. jmp _done_read_partial_block_\@
  513. _read_lt8_\@:
  514. xor %eax, %eax
  515. _read_next_byte_lt8_\@:
  516. shl $8, %rax
  517. mov -1(\DPTR, \DLEN, 1), %al
  518. dec \DLEN
  519. jnz _read_next_byte_lt8_\@
  520. movq %rax, \XMMDst
  521. _done_read_partial_block_\@:
  522. .endm
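# Example (illustrative): DLEN = 3 with bytes b0 b1 b2 at DPTR leaves
# XMMDst = 0x00...00b2b1b0, i.e. the bytes occupy the same lane positions a
# full 16-byte load would have given them, with the remaining lanes zeroed.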
  523. # CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
  524. # clobbers r10-11, xmm14
  525. .macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
  526. TMP6 TMP7
  527. MOVADQ SHUF_MASK(%rip), %xmm14
  528. mov \AAD, %r10 # %r10 = AAD
  529. mov \AADLEN, %r11 # %r11 = aadLen
  530. pxor \TMP7, \TMP7
  531. pxor \TMP6, \TMP6
  532. cmp $16, %r11
  533. jl _get_AAD_rest\@
  534. _get_AAD_blocks\@:
  535. movdqu (%r10), \TMP7
  536. pshufb %xmm14, \TMP7 # byte-reflect the AAD data
  537. pxor \TMP7, \TMP6
  538. GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
  539. add $16, %r10
  540. sub $16, %r11
  541. cmp $16, %r11
  542. jge _get_AAD_blocks\@
  543. movdqu \TMP6, \TMP7
  544. /* read the last <16B of AAD */
  545. _get_AAD_rest\@:
  546. test %r11, %r11
  547. je _get_AAD_done\@
  548. READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
  549. pshufb %xmm14, \TMP7 # byte-reflect the AAD data
  550. pxor \TMP6, \TMP7
  551. GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
  552. movdqu \TMP7, \TMP6
  553. _get_AAD_done\@:
  554. movdqu \TMP6, AadHash(%arg2)
  555. .endm
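# Example (illustrative): AADLEN = 20 hashes one full 16-byte block in
# _get_AAD_blocks, then READ_PARTIAL_BLOCK pulls in the remaining 4 bytes,
# which are byte-reflected and folded into the running hash before it is
# stored back to AadHash.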
  556. # PARTIAL_BLOCK: Handles the encryption/decryption and hashing of the partial
  557. # block left over between update calls.
  558. # Requires the input data to be at least 1 byte long due to READ_PARTIAL_BLOCK
  559. # Outputs encrypted bytes, and updates the hash and partial-block info in gcm_context_data
  560. # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
  561. .macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
  562. AAD_HASH operation
  563. mov PBlockLen(%arg2), %r13
  564. test %r13, %r13
  565. je _partial_block_done_\@ # Leave Macro if no partial blocks
  566. # Read in input data without over reading
  567. cmp $16, \PLAIN_CYPH_LEN
  568. jl _fewer_than_16_bytes_\@
  569. movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
  570. jmp _data_read_\@
  571. _fewer_than_16_bytes_\@:
  572. lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
  573. mov \PLAIN_CYPH_LEN, %r12
  574. READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
  575. mov PBlockLen(%arg2), %r13
  576. _data_read_\@: # Finished reading in data
  577. movdqu PBlockEncKey(%arg2), %xmm9
  578. movdqu HashKey(%arg2), %xmm13
  579. lea SHIFT_MASK(%rip), %r12
  580. # adjust the shuffle mask pointer to be able to shift r13 bytes
  581. # (16-r13 is the number of bytes in plaintext mod 16)
  582. add %r13, %r12
  583. movdqu (%r12), %xmm2 # get the appropriate shuffle mask
  584. pshufb %xmm2, %xmm9 # shift right r13 bytes
  585. .ifc \operation, dec
  586. movdqa %xmm1, %xmm3
  587. pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
  588. mov \PLAIN_CYPH_LEN, %r10
  589. add %r13, %r10
  590. # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
  591. sub $16, %r10
  592. # Determine if the partial block is not being filled and
  593. # shift the mask accordingly
  594. jge _no_extra_mask_1_\@
  595. sub %r10, %r12
  596. _no_extra_mask_1_\@:
  597. movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
  598. # get the appropriate mask to mask out bottom r13 bytes of xmm9
  599. pand %xmm1, %xmm9 # mask out bottom r13 bytes of xmm9
  600. pand %xmm1, %xmm3
  601. movdqa SHUF_MASK(%rip), %xmm10
  602. pshufb %xmm10, %xmm3
  603. pshufb %xmm2, %xmm3
  604. pxor %xmm3, \AAD_HASH
  605. test %r10, %r10
  606. jl _partial_incomplete_1_\@
  607. # GHASH computation for the last <16 Byte block
  608. GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
  609. xor %eax, %eax
  610. mov %rax, PBlockLen(%arg2)
  611. jmp _dec_done_\@
  612. _partial_incomplete_1_\@:
  613. add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
  614. _dec_done_\@:
  615. movdqu \AAD_HASH, AadHash(%arg2)
  616. .else
  617. pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn)
  618. mov \PLAIN_CYPH_LEN, %r10
  619. add %r13, %r10
  620. # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
  621. sub $16, %r10
  622. # Determine if the partial block is not being filled and
  623. # shift the mask accordingly
  624. jge _no_extra_mask_2_\@
  625. sub %r10, %r12
  626. _no_extra_mask_2_\@:
  627. movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
  628. # get the appropriate mask to mask out bottom r13 bytes of xmm9
  629. pand %xmm1, %xmm9
  630. movdqa SHUF_MASK(%rip), %xmm1
  631. pshufb %xmm1, %xmm9
  632. pshufb %xmm2, %xmm9
  633. pxor %xmm9, \AAD_HASH
  634. test %r10, %r10
  635. jl _partial_incomplete_2_\@
  636. # GHASH computation for the last <16 Byte block
  637. GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
  638. xor %eax, %eax
  639. mov %rax, PBlockLen(%arg2)
  640. jmp _encode_done_\@
  641. _partial_incomplete_2_\@:
  642. add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
  643. _encode_done_\@:
  644. movdqu \AAD_HASH, AadHash(%arg2)
  645. movdqa SHUF_MASK(%rip), %xmm10
  646. # shuffle xmm9 back to output as ciphertext
  647. pshufb %xmm10, %xmm9
  648. pshufb %xmm2, %xmm9
  649. .endif
  650. # output encrypted Bytes
  651. test %r10, %r10
  652. jl _partial_fill_\@
  653. mov %r13, %r12
  654. mov $16, %r13
  655. # Set r13 to be the number of bytes to write out
  656. sub %r12, %r13
  657. jmp _count_set_\@
  658. _partial_fill_\@:
  659. mov \PLAIN_CYPH_LEN, %r13
  660. _count_set_\@:
  661. movdqa %xmm9, %xmm0
  662. movq %xmm0, %rax
  663. cmp $8, %r13
  664. jle _less_than_8_bytes_left_\@
  665. mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
  666. add $8, \DATA_OFFSET
  667. psrldq $8, %xmm0
  668. movq %xmm0, %rax
  669. sub $8, %r13
  670. _less_than_8_bytes_left_\@:
  671. movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
  672. add $1, \DATA_OFFSET
  673. shr $8, %rax
  674. sub $1, %r13
  675. jne _less_than_8_bytes_left_\@
  676. _partial_block_done_\@:
  677. .endm # PARTIAL_BLOCK
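# Worked example of the bookkeeping above (illustrative numbers): a previous
# update left PBlockLen = 5 and this call supplies 20 bytes. The saved
# E(K, Yn) is shifted by 5 bytes, 16 - 5 = 11 input bytes complete the pending
# block, PBlockLen is reset to 0 after the GHASH fold, 11 bytes are written out
# and \DATA_OFFSET advances by 11, leaving 9 bytes for the caller's main path.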
  678. /*
  679. * if a = number of total plaintext bytes
  680. * b = floor(a/16)
  681. * num_initial_blocks = b mod 4
  682. * encrypt the initial num_initial_blocks blocks and apply ghash on
  683. * the ciphertext
  684. * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
  685. * are clobbered
  686. * %arg1, %arg2, %arg3 are used as pointers only, not modified
  687. */
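# Example (illustrative): a = 100 plaintext bytes gives b = 6 full blocks and
# num_initial_blocks = 6 mod 4 = 2, so two blocks are encrypted and hashed here
# before the 4-blocks-at-a-time path takes over.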
  688. .macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
  689. XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
  690. MOVADQ SHUF_MASK(%rip), %xmm14
  691. movdqu AadHash(%arg2), %xmm\i # xmm\i = AadHash (running GHASH value)
  692. # start AES for num_initial_blocks blocks
  693. movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0
  694. .if (\i == 5) || (\i == 6) || (\i == 7)
  695. MOVADQ ONE(%RIP),\TMP1
  696. MOVADQ 0(%arg1),\TMP2
  697. .irpc index, \i_seq
  698. paddd \TMP1, \XMM0 # INCR Y0
  699. .ifc \operation, dec
  700. movdqa \XMM0, %xmm\index
  701. .else
  702. MOVADQ \XMM0, %xmm\index
  703. .endif
  704. pshufb %xmm14, %xmm\index # perform a 16 byte swap
  705. pxor \TMP2, %xmm\index
  706. .endr
  707. lea 0x10(%arg1),%r10
  708. mov keysize,%eax
  709. shr $2,%eax # 128->4, 192->6, 256->8
  710. add $5,%eax # 128->9, 192->11, 256->13
  711. aes_loop_initial_\@:
  712. MOVADQ (%r10),\TMP1
  713. .irpc index, \i_seq
  714. aesenc \TMP1, %xmm\index
  715. .endr
  716. add $16,%r10
  717. sub $1,%eax
  718. jnz aes_loop_initial_\@
  719. MOVADQ (%r10), \TMP1
  720. .irpc index, \i_seq
  721. aesenclast \TMP1, %xmm\index # Last Round
  722. .endr
  723. .irpc index, \i_seq
  724. movdqu (%arg4 , %r11, 1), \TMP1
  725. pxor \TMP1, %xmm\index
  726. movdqu %xmm\index, (%arg3 , %r11, 1)
  727. # write back plaintext/ciphertext for num_initial_blocks
  728. add $16, %r11
  729. .ifc \operation, dec
  730. movdqa \TMP1, %xmm\index
  731. .endif
  732. pshufb %xmm14, %xmm\index
  733. # prepare plaintext/ciphertext for GHASH computation
  734. .endr
  735. .endif
  736. # apply GHASH on num_initial_blocks blocks
  737. .if \i == 5
  738. pxor %xmm5, %xmm6
  739. GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  740. pxor %xmm6, %xmm7
  741. GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  742. pxor %xmm7, %xmm8
  743. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  744. .elseif \i == 6
  745. pxor %xmm6, %xmm7
  746. GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  747. pxor %xmm7, %xmm8
  748. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  749. .elseif \i == 7
  750. pxor %xmm7, %xmm8
  751. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  752. .endif
  753. cmp $64, %r13
  754. jl _initial_blocks_done\@
  755. # no need for precomputed values
  756. /*
  757. *
  758. * Precomputations for HashKey parallel with encryption of first 4 blocks.
  759. * HashKey_i_k holds XORed values of the low and high parts of HashKey_i
  760. */
  761. MOVADQ ONE(%RIP),\TMP1
  762. paddd \TMP1, \XMM0 # INCR Y0
  763. MOVADQ \XMM0, \XMM1
  764. pshufb %xmm14, \XMM1 # perform a 16 byte swap
  765. paddd \TMP1, \XMM0 # INCR Y0
  766. MOVADQ \XMM0, \XMM2
  767. pshufb %xmm14, \XMM2 # perform a 16 byte swap
  768. paddd \TMP1, \XMM0 # INCR Y0
  769. MOVADQ \XMM0, \XMM3
  770. pshufb %xmm14, \XMM3 # perform a 16 byte swap
  771. paddd \TMP1, \XMM0 # INCR Y0
  772. MOVADQ \XMM0, \XMM4
  773. pshufb %xmm14, \XMM4 # perform a 16 byte swap
  774. MOVADQ 0(%arg1),\TMP1
  775. pxor \TMP1, \XMM1
  776. pxor \TMP1, \XMM2
  777. pxor \TMP1, \XMM3
  778. pxor \TMP1, \XMM4
  779. .irpc index, 1234 # do 4 rounds
  780. movaps 0x10*\index(%arg1), \TMP1
  781. aesenc \TMP1, \XMM1
  782. aesenc \TMP1, \XMM2
  783. aesenc \TMP1, \XMM3
  784. aesenc \TMP1, \XMM4
  785. .endr
  786. .irpc index, 56789 # do next 5 rounds
  787. movaps 0x10*\index(%arg1), \TMP1
  788. aesenc \TMP1, \XMM1
  789. aesenc \TMP1, \XMM2
  790. aesenc \TMP1, \XMM3
  791. aesenc \TMP1, \XMM4
  792. .endr
  793. lea 0xa0(%arg1),%r10
  794. mov keysize,%eax
  795. shr $2,%eax # 128->4, 192->6, 256->8
  796. sub $4,%eax # 128->0, 192->2, 256->4
  797. jz aes_loop_pre_done\@
  798. aes_loop_pre_\@:
  799. MOVADQ (%r10),\TMP2
  800. .irpc index, 1234
  801. aesenc \TMP2, %xmm\index
  802. .endr
  803. add $16,%r10
  804. sub $1,%eax
  805. jnz aes_loop_pre_\@
  806. aes_loop_pre_done\@:
  807. MOVADQ (%r10), \TMP2
  808. aesenclast \TMP2, \XMM1
  809. aesenclast \TMP2, \XMM2
  810. aesenclast \TMP2, \XMM3
  811. aesenclast \TMP2, \XMM4
  812. movdqu 16*0(%arg4 , %r11 , 1), \TMP1
  813. pxor \TMP1, \XMM1
  814. .ifc \operation, dec
  815. movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
  816. movdqa \TMP1, \XMM1
  817. .endif
  818. movdqu 16*1(%arg4 , %r11 , 1), \TMP1
  819. pxor \TMP1, \XMM2
  820. .ifc \operation, dec
  821. movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
  822. movdqa \TMP1, \XMM2
  823. .endif
  824. movdqu 16*2(%arg4 , %r11 , 1), \TMP1
  825. pxor \TMP1, \XMM3
  826. .ifc \operation, dec
  827. movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
  828. movdqa \TMP1, \XMM3
  829. .endif
  830. movdqu 16*3(%arg4 , %r11 , 1), \TMP1
  831. pxor \TMP1, \XMM4
  832. .ifc \operation, dec
  833. movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
  834. movdqa \TMP1, \XMM4
  835. .else
  836. movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
  837. movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
  838. movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
  839. movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
  840. .endif
  841. add $64, %r11
  842. pshufb %xmm14, \XMM1 # perform a 16 byte swap
  843. pxor \XMMDst, \XMM1
  844. # combine GHASHed value with the corresponding ciphertext
  845. pshufb %xmm14, \XMM2 # perform a 16 byte swap
  846. pshufb %xmm14, \XMM3 # perform a 16 byte swap
  847. pshufb %xmm14, \XMM4 # perform a 16 byte swap
  848. _initial_blocks_done\@:
  849. .endm
  850. /*
  851. * encrypt 4 blocks at a time
  852. * ghash the 4 previously encrypted ciphertext blocks
  853. * arg1, %arg3, %arg4 are used as pointers only, not modified
  854. * %r11 is the data offset value
  855. */
  856. .macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
  857. TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
  858. movdqa \XMM1, \XMM5
  859. movdqa \XMM2, \XMM6
  860. movdqa \XMM3, \XMM7
  861. movdqa \XMM4, \XMM8
  862. movdqa SHUF_MASK(%rip), %xmm15
  863. # multiply TMP5 * HashKey using karatsuba
  864. movdqa \XMM5, \TMP4
  865. pshufd $78, \XMM5, \TMP6
  866. pxor \XMM5, \TMP6
  867. paddd ONE(%rip), \XMM0 # INCR CNT
  868. movdqu HashKey_4(%arg2), \TMP5
  869. pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1
  870. movdqa \XMM0, \XMM1
  871. paddd ONE(%rip), \XMM0 # INCR CNT
  872. movdqa \XMM0, \XMM2
  873. paddd ONE(%rip), \XMM0 # INCR CNT
  874. movdqa \XMM0, \XMM3
  875. paddd ONE(%rip), \XMM0 # INCR CNT
  876. movdqa \XMM0, \XMM4
  877. pshufb %xmm15, \XMM1 # perform a 16 byte swap
  878. pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0
  879. pshufb %xmm15, \XMM2 # perform a 16 byte swap
  880. pshufb %xmm15, \XMM3 # perform a 16 byte swap
  881. pshufb %xmm15, \XMM4 # perform a 16 byte swap
  882. pxor (%arg1), \XMM1
  883. pxor (%arg1), \XMM2
  884. pxor (%arg1), \XMM3
  885. pxor (%arg1), \XMM4
  886. movdqu HashKey_4_k(%arg2), \TMP5
  887. pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
  888. movaps 0x10(%arg1), \TMP1
  889. aesenc \TMP1, \XMM1 # Round 1
  890. aesenc \TMP1, \XMM2
  891. aesenc \TMP1, \XMM3
  892. aesenc \TMP1, \XMM4
  893. movaps 0x20(%arg1), \TMP1
  894. aesenc \TMP1, \XMM1 # Round 2
  895. aesenc \TMP1, \XMM2
  896. aesenc \TMP1, \XMM3
  897. aesenc \TMP1, \XMM4
  898. movdqa \XMM6, \TMP1
  899. pshufd $78, \XMM6, \TMP2
  900. pxor \XMM6, \TMP2
  901. movdqu HashKey_3(%arg2), \TMP5
  902. pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
  903. movaps 0x30(%arg1), \TMP3
  904. aesenc \TMP3, \XMM1 # Round 3
  905. aesenc \TMP3, \XMM2
  906. aesenc \TMP3, \XMM3
  907. aesenc \TMP3, \XMM4
  908. pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0
  909. movaps 0x40(%arg1), \TMP3
  910. aesenc \TMP3, \XMM1 # Round 4
  911. aesenc \TMP3, \XMM2
  912. aesenc \TMP3, \XMM3
  913. aesenc \TMP3, \XMM4
  914. movdqu HashKey_3_k(%arg2), \TMP5
  915. pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  916. movaps 0x50(%arg1), \TMP3
  917. aesenc \TMP3, \XMM1 # Round 5
  918. aesenc \TMP3, \XMM2
  919. aesenc \TMP3, \XMM3
  920. aesenc \TMP3, \XMM4
  921. pxor \TMP1, \TMP4
  922. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  923. pxor \XMM6, \XMM5
  924. pxor \TMP2, \TMP6
  925. movdqa \XMM7, \TMP1
  926. pshufd $78, \XMM7, \TMP2
  927. pxor \XMM7, \TMP2
  928. movdqu HashKey_2(%arg2), \TMP5
  929. # Multiply TMP5 * HashKey using karatsuba
  930. pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  931. movaps 0x60(%arg1), \TMP3
  932. aesenc \TMP3, \XMM1 # Round 6
  933. aesenc \TMP3, \XMM2
  934. aesenc \TMP3, \XMM3
  935. aesenc \TMP3, \XMM4
  936. pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0
  937. movaps 0x70(%arg1), \TMP3
  938. aesenc \TMP3, \XMM1 # Round 7
  939. aesenc \TMP3, \XMM2
  940. aesenc \TMP3, \XMM3
  941. aesenc \TMP3, \XMM4
  942. movdqu HashKey_2_k(%arg2), \TMP5
  943. pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  944. movaps 0x80(%arg1), \TMP3
  945. aesenc \TMP3, \XMM1 # Round 8
  946. aesenc \TMP3, \XMM2
  947. aesenc \TMP3, \XMM3
  948. aesenc \TMP3, \XMM4
  949. pxor \TMP1, \TMP4
  950. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  951. pxor \XMM7, \XMM5
  952. pxor \TMP2, \TMP6
  953. # Multiply XMM8 * HashKey
  954. # XMM8 and TMP5 hold the values for the two operands
  955. movdqa \XMM8, \TMP1
  956. pshufd $78, \XMM8, \TMP2
  957. pxor \XMM8, \TMP2
  958. movdqu HashKey(%arg2), \TMP5
  959. pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  960. movaps 0x90(%arg1), \TMP3
  961. aesenc \TMP3, \XMM1 # Round 9
  962. aesenc \TMP3, \XMM2
  963. aesenc \TMP3, \XMM3
  964. aesenc \TMP3, \XMM4
  965. pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0
  966. lea 0xa0(%arg1),%r10
  967. mov keysize,%eax
  968. shr $2,%eax # 128->4, 192->6, 256->8
  969. sub $4,%eax # 128->0, 192->2, 256->4
  970. jz aes_loop_par_enc_done\@
  971. aes_loop_par_enc\@:
  972. MOVADQ (%r10),\TMP3
  973. .irpc index, 1234
  974. aesenc \TMP3, %xmm\index
  975. .endr
  976. add $16,%r10
  977. sub $1,%eax
  978. jnz aes_loop_par_enc\@
  979. aes_loop_par_enc_done\@:
  980. MOVADQ (%r10), \TMP3
  981. aesenclast \TMP3, \XMM1 # Round 10
  982. aesenclast \TMP3, \XMM2
  983. aesenclast \TMP3, \XMM3
  984. aesenclast \TMP3, \XMM4
  985. movdqu HashKey_k(%arg2), \TMP5
  986. pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  987. movdqu (%arg4,%r11,1), \TMP3
  988. pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
  989. movdqu 16(%arg4,%r11,1), \TMP3
  990. pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
  991. movdqu 32(%arg4,%r11,1), \TMP3
  992. pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
  993. movdqu 48(%arg4,%r11,1), \TMP3
  994. pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
  995. movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer
  996. movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer
  997. movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer
  998. movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer
  999. pshufb %xmm15, \XMM1 # perform a 16 byte swap
  1000. pshufb %xmm15, \XMM2 # perform a 16 byte swap
  1001. pshufb %xmm15, \XMM3 # perform a 16 byte swap
  1002. pshufb %xmm15, \XMM4 # perform a 16 byte swap
  1003. pxor \TMP4, \TMP1
  1004. pxor \XMM8, \XMM5
  1005. pxor \TMP6, \TMP2
  1006. pxor \TMP1, \TMP2
  1007. pxor \XMM5, \TMP2
  1008. movdqa \TMP2, \TMP3
  1009. pslldq $8, \TMP3 # left shift TMP3 2 DWs
  1010. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  1011. pxor \TMP3, \XMM5
  1012. pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
  1013. # first phase of reduction
  1014. movdqa \XMM5, \TMP2
  1015. movdqa \XMM5, \TMP3
  1016. movdqa \XMM5, \TMP4
  1017. # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
  1018. pslld $31, \TMP2 # packed left shift << 31
  1019. pslld $30, \TMP3 # packed left shift << 30
  1020. pslld $25, \TMP4 # packed left shift << 25
  1021. pxor \TMP3, \TMP2 # xor the shifted versions
  1022. pxor \TMP4, \TMP2
  1023. movdqa \TMP2, \TMP5
  1024. psrldq $4, \TMP5 # right shift T5 1 DW
  1025. pslldq $12, \TMP2 # left shift T2 3 DWs
  1026. pxor \TMP2, \XMM5
  1027. # second phase of reduction
  1028. movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
  1029. movdqa \XMM5,\TMP3
  1030. movdqa \XMM5,\TMP4
  1031. psrld $1, \TMP2 # packed right shift >>1
  1032. psrld $2, \TMP3 # packed right shift >>2
  1033. psrld $7, \TMP4 # packed right shift >>7
  1034. pxor \TMP3,\TMP2 # xor the shifted versions
  1035. pxor \TMP4,\TMP2
  1036. pxor \TMP5, \TMP2
  1037. pxor \TMP2, \XMM5
  1038. pxor \TMP1, \XMM5 # result is in XMM5
  1039. pxor \XMM5, \XMM1
  1040. .endm
  1041. /*
  1042. * decrypt 4 blocks at a time
  1043. * ghash the 4 previously decrypted ciphertext blocks
  1044. * arg1, %arg3, %arg4 are used as pointers only, not modified
  1045. * %r11 is the data offset value
  1046. */
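# Note: even in this decrypt variant the AES rounds below are aesenc/aesenclast.
# GCM runs the block cipher in CTR mode, so decryption also encrypts the counter
# blocks and XORs the resulting keystream with the ciphertext; only the GHASH
# input (the original ciphertext, saved back into XMM1-XMM4) differs from the
# _enc variant above.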
  1047. .macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
  1048. TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
  1049. movdqa \XMM1, \XMM5
  1050. movdqa \XMM2, \XMM6
  1051. movdqa \XMM3, \XMM7
  1052. movdqa \XMM4, \XMM8
  1053. movdqa SHUF_MASK(%rip), %xmm15
  1054. # multiply TMP5 * HashKey using karatsuba
  1055. movdqa \XMM5, \TMP4
  1056. pshufd $78, \XMM5, \TMP6
  1057. pxor \XMM5, \TMP6
  1058. paddd ONE(%rip), \XMM0 # INCR CNT
  1059. movdqu HashKey_4(%arg2), \TMP5
  1060. pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1
  1061. movdqa \XMM0, \XMM1
  1062. paddd ONE(%rip), \XMM0 # INCR CNT
  1063. movdqa \XMM0, \XMM2
  1064. paddd ONE(%rip), \XMM0 # INCR CNT
  1065. movdqa \XMM0, \XMM3
  1066. paddd ONE(%rip), \XMM0 # INCR CNT
  1067. movdqa \XMM0, \XMM4
  1068. pshufb %xmm15, \XMM1 # perform a 16 byte swap
  1069. pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0
  1070. pshufb %xmm15, \XMM2 # perform a 16 byte swap
  1071. pshufb %xmm15, \XMM3 # perform a 16 byte swap
  1072. pshufb %xmm15, \XMM4 # perform a 16 byte swap
  1073. pxor (%arg1), \XMM1
  1074. pxor (%arg1), \XMM2
  1075. pxor (%arg1), \XMM3
  1076. pxor (%arg1), \XMM4
  1077. movdqu HashKey_4_k(%arg2), \TMP5
  1078. pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
  1079. movaps 0x10(%arg1), \TMP1
  1080. aesenc \TMP1, \XMM1 # Round 1
  1081. aesenc \TMP1, \XMM2
  1082. aesenc \TMP1, \XMM3
  1083. aesenc \TMP1, \XMM4
  1084. movaps 0x20(%arg1), \TMP1
  1085. aesenc \TMP1, \XMM1 # Round 2
  1086. aesenc \TMP1, \XMM2
  1087. aesenc \TMP1, \XMM3
  1088. aesenc \TMP1, \XMM4
  1089. movdqa \XMM6, \TMP1
  1090. pshufd $78, \XMM6, \TMP2
  1091. pxor \XMM6, \TMP2
  1092. movdqu HashKey_3(%arg2), \TMP5
  1093. pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
  1094. movaps 0x30(%arg1), \TMP3
  1095. aesenc \TMP3, \XMM1 # Round 3
  1096. aesenc \TMP3, \XMM2
  1097. aesenc \TMP3, \XMM3
  1098. aesenc \TMP3, \XMM4
  1099. pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0
  1100. movaps 0x40(%arg1), \TMP3
  1101. aesenc \TMP3, \XMM1 # Round 4
  1102. aesenc \TMP3, \XMM2
  1103. aesenc \TMP3, \XMM3
  1104. aesenc \TMP3, \XMM4
  1105. movdqu HashKey_3_k(%arg2), \TMP5
  1106. pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  1107. movaps 0x50(%arg1), \TMP3
  1108. aesenc \TMP3, \XMM1 # Round 5
  1109. aesenc \TMP3, \XMM2
  1110. aesenc \TMP3, \XMM3
  1111. aesenc \TMP3, \XMM4
  1112. pxor \TMP1, \TMP4
  1113. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  1114. pxor \XMM6, \XMM5
  1115. pxor \TMP2, \TMP6
  1116. movdqa \XMM7, \TMP1
  1117. pshufd $78, \XMM7, \TMP2
  1118. pxor \XMM7, \TMP2
  1119. movdqu HashKey_2(%arg2), \TMP5
  1120. # Multiply TMP5 * HashKey using karatsuba
  1121. pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  1122. movaps 0x60(%arg1), \TMP3
  1123. aesenc \TMP3, \XMM1 # Round 6
  1124. aesenc \TMP3, \XMM2
  1125. aesenc \TMP3, \XMM3
  1126. aesenc \TMP3, \XMM4
  1127. pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0
  1128. movaps 0x70(%arg1), \TMP3
  1129. aesenc \TMP3, \XMM1 # Round 7
  1130. aesenc \TMP3, \XMM2
  1131. aesenc \TMP3, \XMM3
  1132. aesenc \TMP3, \XMM4
  1133. movdqu HashKey_2_k(%arg2), \TMP5
  1134. pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  1135. movaps 0x80(%arg1), \TMP3
  1136. aesenc \TMP3, \XMM1 # Round 8
  1137. aesenc \TMP3, \XMM2
  1138. aesenc \TMP3, \XMM3
  1139. aesenc \TMP3, \XMM4
  1140. pxor \TMP1, \TMP4
  1141. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  1142. pxor \XMM7, \XMM5
  1143. pxor \TMP2, \TMP6
  1144. # Multiply XMM8 * HashKey
  1145. # XMM8 and TMP5 hold the values for the two operands
  1146. movdqa \XMM8, \TMP1
  1147. pshufd $78, \XMM8, \TMP2
  1148. pxor \XMM8, \TMP2
  1149. movdqu HashKey(%arg2), \TMP5
  1150. pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  1151. movaps 0x90(%arg1), \TMP3
  1152. aesenc \TMP3, \XMM1 # Round 9
  1153. aesenc \TMP3, \XMM2
  1154. aesenc \TMP3, \XMM3
  1155. aesenc \TMP3, \XMM4
  1156. pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0
  1157. lea 0xa0(%arg1),%r10
  1158. mov keysize,%eax
  1159. shr $2,%eax # 128->4, 192->6, 256->8
  1160. sub $4,%eax # 128->0, 192->2, 256->4
  1161. jz aes_loop_par_dec_done\@
  1162. aes_loop_par_dec\@:
  1163. MOVADQ (%r10),\TMP3
  1164. .irpc index, 1234
  1165. aesenc \TMP3, %xmm\index
  1166. .endr
  1167. add $16,%r10
  1168. sub $1,%eax
  1169. jnz aes_loop_par_dec\@
  1170. aes_loop_par_dec_done\@:
  1171. MOVADQ (%r10), \TMP3
  1172. aesenclast \TMP3, \XMM1 # last round
  1173. aesenclast \TMP3, \XMM2
  1174. aesenclast \TMP3, \XMM3
  1175. aesenclast \TMP3, \XMM4
  1176. movdqu HashKey_k(%arg2), \TMP5
  1177. pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  1178. movdqu (%arg4,%r11,1), \TMP3
  1179. pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
  1180. movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer
  1181. movdqa \TMP3, \XMM1
  1182. movdqu 16(%arg4,%r11,1), \TMP3
  1183. pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
  1184. movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer
  1185. movdqa \TMP3, \XMM2
  1186. movdqu 32(%arg4,%r11,1), \TMP3
  1187. pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
  1188. movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer
  1189. movdqa \TMP3, \XMM3
  1190. movdqu 48(%arg4,%r11,1), \TMP3
  1191. pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
  1192. movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer
  1193. movdqa \TMP3, \XMM4
  1194. pshufb %xmm15, \XMM1 # perform a 16 byte swap
  1195. pshufb %xmm15, \XMM2 # perform a 16 byte swap
  1196. pshufb %xmm15, \XMM3 # perform a 16 byte swap
  1197. pshufb %xmm15, \XMM4 # perform a 16 byte swap
  1198. pxor \TMP4, \TMP1
  1199. pxor \XMM8, \XMM5
  1200. pxor \TMP6, \TMP2
  1201. pxor \TMP1, \TMP2
  1202. pxor \XMM5, \TMP2
  1203. movdqa \TMP2, \TMP3
  1204. pslldq $8, \TMP3 # left shift TMP3 2 DWs
  1205. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  1206. pxor \TMP3, \XMM5
  1207. pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
  1208. # first phase of reduction
  1209. movdqa \XMM5, \TMP2
  1210. movdqa \XMM5, \TMP3
  1211. movdqa \XMM5, \TMP4
  1212. # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
  1213. pslld $31, \TMP2 # packed left shift << 31
  1214. pslld $30, \TMP3 # packed left shift << 30
  1215. pslld $25, \TMP4 # packed left shift << 25
  1216. pxor \TMP3, \TMP2 # xor the shifted versions
  1217. pxor \TMP4, \TMP2
  1218. movdqa \TMP2, \TMP5
  1219. psrldq $4, \TMP5 # right shift T5 1 DW
  1220. pslldq $12, \TMP2 # left shift T2 3 DWs
  1221. pxor \TMP2, \XMM5
  1222. # second phase of reduction
  1223. movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
  1224. movdqa \XMM5,\TMP3
  1225. movdqa \XMM5,\TMP4
  1226. psrld $1, \TMP2 # packed right shift >>1
  1227. psrld $2, \TMP3 # packed right shift >>2
  1228. psrld $7, \TMP4 # packed right shift >>7
  1229. pxor \TMP3,\TMP2 # xor the shifted versions
  1230. pxor \TMP4,\TMP2
  1231. pxor \TMP5, \TMP2
  1232. pxor \TMP2, \XMM5
  1233. pxor \TMP1, \XMM5 # result is in XMM5
  1234. pxor \XMM5, \XMM1
  1235. .endm
  1236. /* GHASH the last 4 ciphertext blocks. */
  1237. .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
  1238. TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
  1239. # Multiply TMP6 * HashKey (using Karatsuba)
  1240. movdqa \XMM1, \TMP6
  1241. pshufd $78, \XMM1, \TMP2
  1242. pxor \XMM1, \TMP2
  1243. movdqu HashKey_4(%arg2), \TMP5
  1244. pclmulqdq $0x11, \TMP5, \TMP6 # TMP6 = a1*b1
  1245. pclmulqdq $0x00, \TMP5, \XMM1 # XMM1 = a0*b0
  1246. movdqu HashKey_4_k(%arg2), \TMP4
  1247. pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  1248. movdqa \XMM1, \XMMDst
  1249. movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1250. # Multiply XMM2 * HashKey_3 (using Karatsuba)
  1251. movdqa \XMM2, \TMP1
  1252. pshufd $78, \XMM2, \TMP2
  1253. pxor \XMM2, \TMP2
  1254. movdqu HashKey_3(%arg2), \TMP5
  1255. pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  1256. pclmulqdq $0x00, \TMP5, \XMM2 # XMM2 = a0*b0
  1257. movdqu HashKey_3_k(%arg2), \TMP4
  1258. pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  1259. pxor \TMP1, \TMP6
  1260. pxor \XMM2, \XMMDst
  1261. pxor \TMP2, \XMM1
  1262. # results accumulated in TMP6, XMMDst, XMM1
1263. # Multiply XMM3 * HashKey_2 (using Karatsuba)
  1264. movdqa \XMM3, \TMP1
  1265. pshufd $78, \XMM3, \TMP2
  1266. pxor \XMM3, \TMP2
  1267. movdqu HashKey_2(%arg2), \TMP5
  1268. pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  1269. pclmulqdq $0x00, \TMP5, \XMM3 # XMM3 = a0*b0
  1270. movdqu HashKey_2_k(%arg2), \TMP4
  1271. pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  1272. pxor \TMP1, \TMP6
  1273. pxor \XMM3, \XMMDst
  1274. pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1275. # Multiply XMM4 * HashKey (using Karatsuba)
  1276. movdqa \XMM4, \TMP1
  1277. pshufd $78, \XMM4, \TMP2
  1278. pxor \XMM4, \TMP2
  1279. movdqu HashKey(%arg2), \TMP5
  1280. pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  1281. pclmulqdq $0x00, \TMP5, \XMM4 # XMM4 = a0*b0
  1282. movdqu HashKey_k(%arg2), \TMP4
  1283. pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  1284. pxor \TMP1, \TMP6
  1285. pxor \XMM4, \XMMDst
  1286. pxor \XMM1, \TMP2
  1287. pxor \TMP6, \TMP2
  1288. pxor \XMMDst, \TMP2
  1289. # middle section of the temp results combined as in karatsuba algorithm
  1290. movdqa \TMP2, \TMP4
  1291. pslldq $8, \TMP4 # left shift TMP4 2 DWs
  1292. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  1293. pxor \TMP4, \XMMDst
  1294. pxor \TMP2, \TMP6
  1295. # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
  1296. # first phase of the reduction
  1297. movdqa \XMMDst, \TMP2
  1298. movdqa \XMMDst, \TMP3
  1299. movdqa \XMMDst, \TMP4
  1300. # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1301. pslld $31, \TMP2 # packed left shifting << 31
1302. pslld $30, \TMP3 # packed left shifting << 30
1303. pslld $25, \TMP4 # packed left shifting << 25
  1304. pxor \TMP3, \TMP2 # xor the shifted versions
  1305. pxor \TMP4, \TMP2
  1306. movdqa \TMP2, \TMP7
  1307. psrldq $4, \TMP7 # right shift TMP7 1 DW
  1308. pslldq $12, \TMP2 # left shift TMP2 3 DWs
  1309. pxor \TMP2, \XMMDst
  1310. # second phase of the reduction
  1311. movdqa \XMMDst, \TMP2
  1312. # make 3 copies of XMMDst for doing 3 shift operations
  1313. movdqa \XMMDst, \TMP3
  1314. movdqa \XMMDst, \TMP4
1315. psrld $1, \TMP2 # packed right shift >> 1
1316. psrld $2, \TMP3 # packed right shift >> 2
1317. psrld $7, \TMP4 # packed right shift >> 7
  1318. pxor \TMP3, \TMP2 # xor the shifted versions
  1319. pxor \TMP4, \TMP2
  1320. pxor \TMP7, \TMP2
  1321. pxor \TMP2, \XMMDst
  1322. pxor \TMP6, \XMMDst # reduced result is in XMMDst
  1323. .endm
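/*
 * Why GHASH_LAST_4 pairs XMM1..XMM4 with HashKey_4..HashKey: a short
 * derivation (not in the original comments) from the GHASH recurrence
 * Y_i = (Y_{i-1} xor C_i) * H, expanded over four blocks:
 *
 *	Y_4 = ((((Y_0 xor C_1)*H xor C_2)*H xor C_3)*H xor C_4)*H
 *	    = (Y_0 xor C_1)*H^4 xor C_2*H^3 xor C_3*H^2 xor C_4*H
 *
 * The caller folds the running hash into XMM1 (see the pxor \XMM5, \XMM1 at
 * the end of the parallel macro above), so a single pass with the precomputed
 * powers HashKey_4..HashKey gives the same result as four sequential
 * multiplications by H.
 */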
  1324. /* Encryption of a single block
  1325. * uses eax & r10
  1326. */
  1327. .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
  1328. pxor (%arg1), \XMM0
  1329. mov keysize,%eax
  1330. shr $2,%eax # 128->4, 192->6, 256->8
  1331. add $5,%eax # 128->9, 192->11, 256->13
  1332. lea 16(%arg1), %r10 # get first expanded key address
  1333. _esb_loop_\@:
  1334. MOVADQ (%r10),\TMP1
  1335. aesenc \TMP1,\XMM0
  1336. add $16,%r10
  1337. sub $1,%eax
  1338. jnz _esb_loop_\@
  1339. MOVADQ (%r10),\TMP1
  1340. aesenclast \TMP1,\XMM0
  1341. .endm
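/*
 * The same round-count computation as ENCRYPT_SINGLE_BLOCK, shown as a C
 * intrinsics sketch (illustrative only; assumes the expanded round keys are
 * laid out as consecutive 16-byte entries, as this file expects at (%arg1)):
 * keysize/4 + 5 gives the number of aesenc rounds (9/11/13), followed by one
 * aesenclast with the final round key.
 *
 *	#include <immintrin.h>
 *
 *	// key_len is 16, 24 or 32; round_keys[] holds the expanded schedule
 *	static __m128i aes_encrypt_block(const __m128i *round_keys, int key_len,
 *					 __m128i block)
 *	{
 *		int nrounds = key_len / 4 + 5;          // 9, 11 or 13 aesenc rounds
 *		int r;
 *
 *		block = _mm_xor_si128(block, round_keys[0]);    // round 0
 *		for (r = 1; r <= nrounds; r++)
 *			block = _mm_aesenc_si128(block, round_keys[r]);
 *		return _mm_aesenclast_si128(block, round_keys[nrounds + 1]);
 *	}
 */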
  1342. /*****************************************************************************
  1343. * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
  1344. * struct gcm_context_data *data
  1345. * // Context data
1346. * u8 *out, // Plaintext output. Decrypt in-place is allowed.
  1347. * const u8 *in, // Ciphertext input
  1348. * u64 plaintext_len, // Length of data in bytes for decryption.
  1349. * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
  1350. * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
  1351. * // concatenated with 0x00000001. 16-byte aligned pointer.
  1352. * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
  1353. * const u8 *aad, // Additional Authentication Data (AAD)
  1354. * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
  1355. * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
  1356. * // given authentication tag and only return the plaintext if they match.
  1357. * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
  1358. * // (most likely), 12 or 8.
  1359. *
  1360. * Assumptions:
  1361. *
  1362. * keys:
1363. * keys are pre-expanded and aligned to 16 bytes. We are using the first (encryption)
1364. * set of round keys in void *aes_ctx: 11 for AES-128, 13 for AES-192, 15 for AES-256
  1365. *
  1366. * iv:
  1367. * 0 1 2 3
  1368. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1369. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1370. * | Salt (From the SA) |
  1371. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1372. * | Initialization Vector |
  1373. * | (This is the sequence number from IPSec header) |
  1374. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1375. * | 0x1 |
  1376. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1377. *
  1378. *
  1379. *
  1380. * AAD:
  1381. * AAD padded to 128 bits with 0
  1382. * for example, assume AAD is a u32 vector
  1383. *
  1384. * if AAD is 8 bytes:
  1385. * AAD[3] = {A0, A1};
  1386. * padded AAD in xmm register = {A1 A0 0 0}
  1387. *
  1388. * 0 1 2 3
  1389. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1390. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1391. * | SPI (A1) |
  1392. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1393. * | 32-bit Sequence Number (A0) |
  1394. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1395. * | 0x0 |
  1396. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1397. *
  1398. * AAD Format with 32-bit Sequence Number
  1399. *
  1400. * if AAD is 12 bytes:
  1401. * AAD[3] = {A0, A1, A2};
  1402. * padded AAD in xmm register = {A2 A1 A0 0}
  1403. *
  1404. * 0 1 2 3
  1405. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1406. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1409. * | SPI (A2) |
  1410. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1411. * | 64-bit Extended Sequence Number {A1,A0} |
  1412. * | |
  1413. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1414. * | 0x0 |
  1415. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1416. *
  1417. * AAD Format with 64-bit Extended Sequence Number
  1418. *
  1419. * poly = x^128 + x^127 + x^126 + x^121 + 1
  1420. *
  1421. *****************************************************************************/
  1422. SYM_FUNC_START(aesni_gcm_dec)
  1423. FUNC_SAVE
  1424. GCM_INIT %arg6, arg7, arg8, arg9
  1425. GCM_ENC_DEC dec
  1426. GCM_COMPLETE arg10, arg11
  1427. FUNC_RESTORE
  1428. RET
  1429. SYM_FUNC_END(aesni_gcm_dec)
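/*
 * A small C sketch of how a caller could assemble the iv argument described
 * above (salt || IV || 0x00000001); the function name is illustrative:
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void rfc4106_build_j0(uint8_t j0[16], const uint8_t salt[4],
 *				     const uint8_t iv[8])
 *	{
 *		memcpy(j0, salt, 4);            // 4-byte salt from the SA
 *		memcpy(j0 + 4, iv, 8);          // 8-byte IV from the ESP payload
 *		j0[12] = 0;                     // trailing 32-bit counter,
 *		j0[13] = 0;                     // big endian ...
 *		j0[14] = 0;
 *		j0[15] = 1;                     // ... == 0x00000001
 *	}
 */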
  1430. /*****************************************************************************
  1431. * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
  1432. * struct gcm_context_data *data
  1433. * // Context data
  1434. * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
  1435. * const u8 *in, // Plaintext input
  1436. * u64 plaintext_len, // Length of data in bytes for encryption.
  1437. * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
  1438. * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
  1439. * // concatenated with 0x00000001. 16-byte aligned pointer.
  1440. * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
  1441. * const u8 *aad, // Additional Authentication Data (AAD)
  1442. * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
  1443. * u8 *auth_tag, // Authenticated Tag output.
  1444. * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
  1445. * // 12 or 8.
  1446. *
  1447. * Assumptions:
  1448. *
  1449. * keys:
1450. * keys are pre-expanded and aligned to 16 bytes. We are using the first (encryption)
1451. * set of round keys in void *aes_ctx: 11 for AES-128, 13 for AES-192, 15 for AES-256
  1452. *
  1453. *
  1454. * iv:
  1455. * 0 1 2 3
  1456. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1457. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1458. * | Salt (From the SA) |
  1459. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1460. * | Initialization Vector |
  1461. * | (This is the sequence number from IPSec header) |
  1462. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1463. * | 0x1 |
  1464. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1465. *
  1466. *
  1467. *
  1468. * AAD:
  1469. * AAD padded to 128 bits with 0
  1470. * for example, assume AAD is a u32 vector
  1471. *
  1472. * if AAD is 8 bytes:
  1473. * AAD[3] = {A0, A1};
  1474. * padded AAD in xmm register = {A1 A0 0 0}
  1475. *
  1476. * 0 1 2 3
  1477. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1478. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1479. * | SPI (A1) |
  1480. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1481. * | 32-bit Sequence Number (A0) |
  1482. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1483. * | 0x0 |
  1484. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1485. *
  1486. * AAD Format with 32-bit Sequence Number
  1487. *
  1488. * if AAD is 12 bytes:
  1489. * AAD[3] = {A0, A1, A2};
  1490. * padded AAD in xmm register = {A2 A1 A0 0}
  1491. *
  1492. * 0 1 2 3
  1493. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1494. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1495. * | SPI (A2) |
  1496. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1497. * | 64-bit Extended Sequence Number {A1,A0} |
  1498. * | |
  1499. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1500. * | 0x0 |
  1501. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1502. *
  1503. * AAD Format with 64-bit Extended Sequence Number
  1504. *
  1505. * poly = x^128 + x^127 + x^126 + x^121 + 1
  1506. ***************************************************************************/
  1507. SYM_FUNC_START(aesni_gcm_enc)
  1508. FUNC_SAVE
  1509. GCM_INIT %arg6, arg7, arg8, arg9
  1510. GCM_ENC_DEC enc
  1511. GCM_COMPLETE arg10, arg11
  1512. FUNC_RESTORE
  1513. RET
  1514. SYM_FUNC_END(aesni_gcm_enc)
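/*
 * The AAD handling described above only needs the 8- or 12-byte AAD
 * zero-padded to a full 16-byte block; a trivial C sketch (name illustrative):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void pad_aad_block(uint8_t block[16], const uint8_t *aad,
 *				  size_t aad_len)
 *	{
 *		memset(block, 0, 16);           // pad to 128 bits with 0
 *		memcpy(block, aad, aad_len);    // aad_len is 8 or 12 for RFC4106
 *	}
 */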
  1515. /*****************************************************************************
  1516. * void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
  1517. * struct gcm_context_data *data,
  1518. * // context data
  1519. * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
  1520. * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
  1521. * // concatenated with 0x00000001. 16-byte aligned pointer.
  1522. * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
  1523. * const u8 *aad, // Additional Authentication Data (AAD)
  1524. * u64 aad_len) // Length of AAD in bytes.
  1525. */
  1526. SYM_FUNC_START(aesni_gcm_init)
  1527. FUNC_SAVE
1528. GCM_INIT %arg3, %arg4, %arg5, %arg6
  1529. FUNC_RESTORE
  1530. RET
  1531. SYM_FUNC_END(aesni_gcm_init)
  1532. /*****************************************************************************
  1533. * void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
  1534. * struct gcm_context_data *data,
  1535. * // context data
  1536. * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
  1537. * const u8 *in, // Plaintext input
1538. * u64 plaintext_len); // Length of data in bytes for encryption.
  1539. */
  1540. SYM_FUNC_START(aesni_gcm_enc_update)
  1541. FUNC_SAVE
  1542. GCM_ENC_DEC enc
  1543. FUNC_RESTORE
  1544. RET
  1545. SYM_FUNC_END(aesni_gcm_enc_update)
  1546. /*****************************************************************************
  1547. * void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
  1548. * struct gcm_context_data *data,
  1549. * // context data
1550. * u8 *out, // Plaintext output. Decrypt in-place is allowed.
1551. * const u8 *in, // Ciphertext input
1552. * u64 plaintext_len); // Length of data in bytes for decryption.
  1553. */
  1554. SYM_FUNC_START(aesni_gcm_dec_update)
  1555. FUNC_SAVE
  1556. GCM_ENC_DEC dec
  1557. FUNC_RESTORE
  1558. RET
  1559. SYM_FUNC_END(aesni_gcm_dec_update)
  1560. /*****************************************************************************
  1561. * void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
  1562. * struct gcm_context_data *data,
  1563. * // context data
  1564. * u8 *auth_tag, // Authenticated Tag output.
  1565. * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
  1566. * // 12 or 8.
  1567. */
  1568. SYM_FUNC_START(aesni_gcm_finalize)
  1569. FUNC_SAVE
  1570. GCM_COMPLETE %arg3 %arg4
  1571. FUNC_RESTORE
  1572. RET
  1573. SYM_FUNC_END(aesni_gcm_finalize)
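/*
 * Taken together, the three entry points above form a streaming interface.
 * A C sketch of one possible call sequence (the declarations mirror the
 * comment blocks; struct gcm_context_data is owned by the C glue code and is
 * left opaque here, and the helper name is illustrative):
 *
 *	#include <stdint.h>
 *
 *	typedef uint8_t u8;
 *	typedef uint64_t u64;
 *
 *	struct gcm_context_data;        // real layout lives in the C glue code
 *
 *	void aesni_gcm_init(void *aes_ctx, struct gcm_context_data *data, u8 *iv,
 *			    u8 *hash_subkey, const u8 *aad, u64 aad_len);
 *	void aesni_gcm_enc_update(void *aes_ctx, struct gcm_context_data *data,
 *				  u8 *out, const u8 *in, u64 plaintext_len);
 *	void aesni_gcm_finalize(void *aes_ctx, struct gcm_context_data *data,
 *				u8 *auth_tag, u64 auth_tag_len);
 *
 *	// Encrypt one message in a single update; the caller provides every
 *	// buffer, including the storage behind *data.
 *	static void gcm_encrypt_one(void *aes_ctx, struct gcm_context_data *data,
 *				    u8 *j0, u8 *hash_subkey,
 *				    const u8 *aad, u64 aad_len,
 *				    u8 *dst, const u8 *src, u64 len, u8 tag[16])
 *	{
 *		aesni_gcm_init(aes_ctx, data, j0, hash_subkey, aad, aad_len);
 *		aesni_gcm_enc_update(aes_ctx, data, dst, src, len);
 *		aesni_gcm_finalize(aes_ctx, data, tag, 16);
 *	}
 */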
  1574. #endif
  1575. SYM_FUNC_START_LOCAL(_key_expansion_256a)
  1576. pshufd $0b11111111, %xmm1, %xmm1
  1577. shufps $0b00010000, %xmm0, %xmm4
  1578. pxor %xmm4, %xmm0
  1579. shufps $0b10001100, %xmm0, %xmm4
  1580. pxor %xmm4, %xmm0
  1581. pxor %xmm1, %xmm0
  1582. movaps %xmm0, (TKEYP)
  1583. add $0x10, TKEYP
  1584. RET
  1585. SYM_FUNC_END(_key_expansion_256a)
  1586. SYM_FUNC_ALIAS_LOCAL(_key_expansion_128, _key_expansion_256a)
  1587. SYM_FUNC_START_LOCAL(_key_expansion_192a)
  1588. pshufd $0b01010101, %xmm1, %xmm1
  1589. shufps $0b00010000, %xmm0, %xmm4
  1590. pxor %xmm4, %xmm0
  1591. shufps $0b10001100, %xmm0, %xmm4
  1592. pxor %xmm4, %xmm0
  1593. pxor %xmm1, %xmm0
  1594. movaps %xmm2, %xmm5
  1595. movaps %xmm2, %xmm6
  1596. pslldq $4, %xmm5
  1597. pshufd $0b11111111, %xmm0, %xmm3
  1598. pxor %xmm3, %xmm2
  1599. pxor %xmm5, %xmm2
  1600. movaps %xmm0, %xmm1
  1601. shufps $0b01000100, %xmm0, %xmm6
  1602. movaps %xmm6, (TKEYP)
  1603. shufps $0b01001110, %xmm2, %xmm1
  1604. movaps %xmm1, 0x10(TKEYP)
  1605. add $0x20, TKEYP
  1606. RET
  1607. SYM_FUNC_END(_key_expansion_192a)
  1608. SYM_FUNC_START_LOCAL(_key_expansion_192b)
  1609. pshufd $0b01010101, %xmm1, %xmm1
  1610. shufps $0b00010000, %xmm0, %xmm4
  1611. pxor %xmm4, %xmm0
  1612. shufps $0b10001100, %xmm0, %xmm4
  1613. pxor %xmm4, %xmm0
  1614. pxor %xmm1, %xmm0
  1615. movaps %xmm2, %xmm5
  1616. pslldq $4, %xmm5
  1617. pshufd $0b11111111, %xmm0, %xmm3
  1618. pxor %xmm3, %xmm2
  1619. pxor %xmm5, %xmm2
  1620. movaps %xmm0, (TKEYP)
  1621. add $0x10, TKEYP
  1622. RET
  1623. SYM_FUNC_END(_key_expansion_192b)
  1624. SYM_FUNC_START_LOCAL(_key_expansion_256b)
  1625. pshufd $0b10101010, %xmm1, %xmm1
  1626. shufps $0b00010000, %xmm2, %xmm4
  1627. pxor %xmm4, %xmm2
  1628. shufps $0b10001100, %xmm2, %xmm4
  1629. pxor %xmm4, %xmm2
  1630. pxor %xmm1, %xmm2
  1631. movaps %xmm2, (TKEYP)
  1632. add $0x10, TKEYP
  1633. RET
  1634. SYM_FUNC_END(_key_expansion_256b)
  1635. /*
  1636. * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
  1637. * unsigned int key_len)
  1638. */
  1639. SYM_FUNC_START(aesni_set_key)
  1640. FRAME_BEGIN
  1641. #ifndef __x86_64__
  1642. pushl KEYP
  1643. movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
  1644. movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
  1645. movl (FRAME_OFFSET+16)(%esp), %edx # key_len
  1646. #endif
  1647. movups (UKEYP), %xmm0 # user key (first 16 bytes)
  1648. movaps %xmm0, (KEYP)
  1649. lea 0x10(KEYP), TKEYP # key addr
  1650. movl %edx, 480(KEYP)
  1651. pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
  1652. cmp $24, %dl
  1653. jb .Lenc_key128
  1654. je .Lenc_key192
  1655. movups 0x10(UKEYP), %xmm2 # other user key
  1656. movaps %xmm2, (TKEYP)
  1657. add $0x10, TKEYP
  1658. aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
  1659. call _key_expansion_256a
  1660. aeskeygenassist $0x1, %xmm0, %xmm1
  1661. call _key_expansion_256b
  1662. aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
  1663. call _key_expansion_256a
  1664. aeskeygenassist $0x2, %xmm0, %xmm1
  1665. call _key_expansion_256b
  1666. aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
  1667. call _key_expansion_256a
  1668. aeskeygenassist $0x4, %xmm0, %xmm1
  1669. call _key_expansion_256b
  1670. aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
  1671. call _key_expansion_256a
  1672. aeskeygenassist $0x8, %xmm0, %xmm1
  1673. call _key_expansion_256b
  1674. aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
  1675. call _key_expansion_256a
  1676. aeskeygenassist $0x10, %xmm0, %xmm1
  1677. call _key_expansion_256b
  1678. aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
  1679. call _key_expansion_256a
  1680. aeskeygenassist $0x20, %xmm0, %xmm1
  1681. call _key_expansion_256b
  1682. aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
  1683. call _key_expansion_256a
  1684. jmp .Ldec_key
  1685. .Lenc_key192:
  1686. movq 0x10(UKEYP), %xmm2 # other user key
  1687. aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
  1688. call _key_expansion_192a
  1689. aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
  1690. call _key_expansion_192b
  1691. aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
  1692. call _key_expansion_192a
  1693. aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
  1694. call _key_expansion_192b
  1695. aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
  1696. call _key_expansion_192a
  1697. aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
  1698. call _key_expansion_192b
  1699. aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
  1700. call _key_expansion_192a
  1701. aeskeygenassist $0x80, %xmm2, %xmm1 # round 8
  1702. call _key_expansion_192b
  1703. jmp .Ldec_key
  1704. .Lenc_key128:
  1705. aeskeygenassist $0x1, %xmm0, %xmm1 # round 1
  1706. call _key_expansion_128
  1707. aeskeygenassist $0x2, %xmm0, %xmm1 # round 2
  1708. call _key_expansion_128
  1709. aeskeygenassist $0x4, %xmm0, %xmm1 # round 3
  1710. call _key_expansion_128
  1711. aeskeygenassist $0x8, %xmm0, %xmm1 # round 4
  1712. call _key_expansion_128
  1713. aeskeygenassist $0x10, %xmm0, %xmm1 # round 5
  1714. call _key_expansion_128
  1715. aeskeygenassist $0x20, %xmm0, %xmm1 # round 6
  1716. call _key_expansion_128
  1717. aeskeygenassist $0x40, %xmm0, %xmm1 # round 7
  1718. call _key_expansion_128
  1719. aeskeygenassist $0x80, %xmm0, %xmm1 # round 8
  1720. call _key_expansion_128
  1721. aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9
  1722. call _key_expansion_128
  1723. aeskeygenassist $0x36, %xmm0, %xmm1 # round 10
  1724. call _key_expansion_128
  1725. .Ldec_key:
  1726. sub $0x10, TKEYP
  1727. movaps (KEYP), %xmm0
  1728. movaps (TKEYP), %xmm1
  1729. movaps %xmm0, 240(TKEYP)
  1730. movaps %xmm1, 240(KEYP)
  1731. add $0x10, KEYP
  1732. lea 240-16(TKEYP), UKEYP
  1733. .align 4
  1734. .Ldec_key_loop:
  1735. movaps (KEYP), %xmm0
  1736. aesimc %xmm0, %xmm1
  1737. movaps %xmm1, (UKEYP)
  1738. add $0x10, KEYP
  1739. sub $0x10, UKEYP
  1740. cmp TKEYP, KEYP
  1741. jb .Ldec_key_loop
  1742. xor AREG, AREG
  1743. #ifndef __x86_64__
  1744. popl KEYP
  1745. #endif
  1746. FRAME_END
  1747. RET
  1748. SYM_FUNC_END(aesni_set_key)
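/*
 * What the .Ldec_key loop above computes, as a C intrinsics sketch
 * (illustrative): the decryption schedule is the encryption schedule in
 * reverse order, with AESIMC (InvMixColumns) applied to every round key
 * except the first and last.
 *
 *	#include <immintrin.h>
 *
 *	// nrounds is 10, 12 or 14; enc[] holds round keys 0..nrounds
 *	static void make_dec_key_schedule(__m128i *dec, const __m128i *enc,
 *					  int nrounds)
 *	{
 *		int i;
 *
 *		dec[0] = enc[nrounds];
 *		for (i = 1; i < nrounds; i++)
 *			dec[i] = _mm_aesimc_si128(enc[nrounds - i]);
 *		dec[nrounds] = enc[0];
 *	}
 */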
  1749. /*
  1750. * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
  1751. */
  1752. SYM_FUNC_START(aesni_enc)
  1753. FRAME_BEGIN
  1754. #ifndef __x86_64__
  1755. pushl KEYP
  1756. pushl KLEN
  1757. movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
  1758. movl (FRAME_OFFSET+16)(%esp), OUTP # dst
  1759. movl (FRAME_OFFSET+20)(%esp), INP # src
  1760. #endif
  1761. movl 480(KEYP), KLEN # key length
  1762. movups (INP), STATE # input
  1763. call _aesni_enc1
  1764. movups STATE, (OUTP) # output
  1765. #ifndef __x86_64__
  1766. popl KLEN
  1767. popl KEYP
  1768. #endif
  1769. FRAME_END
  1770. RET
  1771. SYM_FUNC_END(aesni_enc)
  1772. /*
  1773. * _aesni_enc1: internal ABI
  1774. * input:
  1775. * KEYP: key struct pointer
1776. * KLEN: key length (16, 24 or 32 bytes)
  1777. * STATE: initial state (input)
  1778. * output:
1779. * STATE: final state (output)
  1780. * changed:
  1781. * KEY
  1782. * TKEYP (T1)
  1783. */
  1784. SYM_FUNC_START_LOCAL(_aesni_enc1)
  1785. movaps (KEYP), KEY # key
  1786. mov KEYP, TKEYP
  1787. pxor KEY, STATE # round 0
  1788. add $0x30, TKEYP
  1789. cmp $24, KLEN
  1790. jb .Lenc128
  1791. lea 0x20(TKEYP), TKEYP
  1792. je .Lenc192
  1793. add $0x20, TKEYP
  1794. movaps -0x60(TKEYP), KEY
  1795. aesenc KEY, STATE
  1796. movaps -0x50(TKEYP), KEY
  1797. aesenc KEY, STATE
  1798. .align 4
  1799. .Lenc192:
  1800. movaps -0x40(TKEYP), KEY
  1801. aesenc KEY, STATE
  1802. movaps -0x30(TKEYP), KEY
  1803. aesenc KEY, STATE
  1804. .align 4
  1805. .Lenc128:
  1806. movaps -0x20(TKEYP), KEY
  1807. aesenc KEY, STATE
  1808. movaps -0x10(TKEYP), KEY
  1809. aesenc KEY, STATE
  1810. movaps (TKEYP), KEY
  1811. aesenc KEY, STATE
  1812. movaps 0x10(TKEYP), KEY
  1813. aesenc KEY, STATE
  1814. movaps 0x20(TKEYP), KEY
  1815. aesenc KEY, STATE
  1816. movaps 0x30(TKEYP), KEY
  1817. aesenc KEY, STATE
  1818. movaps 0x40(TKEYP), KEY
  1819. aesenc KEY, STATE
  1820. movaps 0x50(TKEYP), KEY
  1821. aesenc KEY, STATE
  1822. movaps 0x60(TKEYP), KEY
  1823. aesenc KEY, STATE
  1824. movaps 0x70(TKEYP), KEY
  1825. aesenclast KEY, STATE
  1826. RET
  1827. SYM_FUNC_END(_aesni_enc1)
  1828. /*
  1829. * _aesni_enc4: internal ABI
  1830. * input:
  1831. * KEYP: key struct pointer
1832. * KLEN: key length (16, 24 or 32 bytes)
  1833. * STATE1: initial state (input)
  1834. * STATE2
  1835. * STATE3
  1836. * STATE4
  1837. * output:
1838. * STATE1: final state (output)
  1839. * STATE2
  1840. * STATE3
  1841. * STATE4
  1842. * changed:
  1843. * KEY
  1844. * TKEYP (T1)
  1845. */
  1846. SYM_FUNC_START_LOCAL(_aesni_enc4)
  1847. movaps (KEYP), KEY # key
  1848. mov KEYP, TKEYP
  1849. pxor KEY, STATE1 # round 0
  1850. pxor KEY, STATE2
  1851. pxor KEY, STATE3
  1852. pxor KEY, STATE4
  1853. add $0x30, TKEYP
  1854. cmp $24, KLEN
  1855. jb .L4enc128
  1856. lea 0x20(TKEYP), TKEYP
  1857. je .L4enc192
  1858. add $0x20, TKEYP
  1859. movaps -0x60(TKEYP), KEY
  1860. aesenc KEY, STATE1
  1861. aesenc KEY, STATE2
  1862. aesenc KEY, STATE3
  1863. aesenc KEY, STATE4
  1864. movaps -0x50(TKEYP), KEY
  1865. aesenc KEY, STATE1
  1866. aesenc KEY, STATE2
  1867. aesenc KEY, STATE3
  1868. aesenc KEY, STATE4
  1869. #.align 4
  1870. .L4enc192:
  1871. movaps -0x40(TKEYP), KEY
  1872. aesenc KEY, STATE1
  1873. aesenc KEY, STATE2
  1874. aesenc KEY, STATE3
  1875. aesenc KEY, STATE4
  1876. movaps -0x30(TKEYP), KEY
  1877. aesenc KEY, STATE1
  1878. aesenc KEY, STATE2
  1879. aesenc KEY, STATE3
  1880. aesenc KEY, STATE4
  1881. #.align 4
  1882. .L4enc128:
  1883. movaps -0x20(TKEYP), KEY
  1884. aesenc KEY, STATE1
  1885. aesenc KEY, STATE2
  1886. aesenc KEY, STATE3
  1887. aesenc KEY, STATE4
  1888. movaps -0x10(TKEYP), KEY
  1889. aesenc KEY, STATE1
  1890. aesenc KEY, STATE2
  1891. aesenc KEY, STATE3
  1892. aesenc KEY, STATE4
  1893. movaps (TKEYP), KEY
  1894. aesenc KEY, STATE1
  1895. aesenc KEY, STATE2
  1896. aesenc KEY, STATE3
  1897. aesenc KEY, STATE4
  1898. movaps 0x10(TKEYP), KEY
  1899. aesenc KEY, STATE1
  1900. aesenc KEY, STATE2
  1901. aesenc KEY, STATE3
  1902. aesenc KEY, STATE4
  1903. movaps 0x20(TKEYP), KEY
  1904. aesenc KEY, STATE1
  1905. aesenc KEY, STATE2
  1906. aesenc KEY, STATE3
  1907. aesenc KEY, STATE4
  1908. movaps 0x30(TKEYP), KEY
  1909. aesenc KEY, STATE1
  1910. aesenc KEY, STATE2
  1911. aesenc KEY, STATE3
  1912. aesenc KEY, STATE4
  1913. movaps 0x40(TKEYP), KEY
  1914. aesenc KEY, STATE1
  1915. aesenc KEY, STATE2
  1916. aesenc KEY, STATE3
  1917. aesenc KEY, STATE4
  1918. movaps 0x50(TKEYP), KEY
  1919. aesenc KEY, STATE1
  1920. aesenc KEY, STATE2
  1921. aesenc KEY, STATE3
  1922. aesenc KEY, STATE4
  1923. movaps 0x60(TKEYP), KEY
  1924. aesenc KEY, STATE1
  1925. aesenc KEY, STATE2
  1926. aesenc KEY, STATE3
  1927. aesenc KEY, STATE4
  1928. movaps 0x70(TKEYP), KEY
  1929. aesenclast KEY, STATE1 # last round
  1930. aesenclast KEY, STATE2
  1931. aesenclast KEY, STATE3
  1932. aesenclast KEY, STATE4
  1933. RET
  1934. SYM_FUNC_END(_aesni_enc4)
  1935. /*
  1936. * void aesni_dec (const void *ctx, u8 *dst, const u8 *src)
  1937. */
  1938. SYM_FUNC_START(aesni_dec)
  1939. FRAME_BEGIN
  1940. #ifndef __x86_64__
  1941. pushl KEYP
  1942. pushl KLEN
  1943. movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
  1944. movl (FRAME_OFFSET+16)(%esp), OUTP # dst
  1945. movl (FRAME_OFFSET+20)(%esp), INP # src
  1946. #endif
  1947. mov 480(KEYP), KLEN # key length
  1948. add $240, KEYP
  1949. movups (INP), STATE # input
  1950. call _aesni_dec1
  1951. movups STATE, (OUTP) #output
  1952. #ifndef __x86_64__
  1953. popl KLEN
  1954. popl KEYP
  1955. #endif
  1956. FRAME_END
  1957. RET
  1958. SYM_FUNC_END(aesni_dec)
  1959. /*
  1960. * _aesni_dec1: internal ABI
  1961. * input:
  1962. * KEYP: key struct pointer
  1963. * KLEN: key length
  1964. * STATE: initial state (input)
  1965. * output:
1966. * STATE: final state (output)
  1967. * changed:
  1968. * KEY
  1969. * TKEYP (T1)
  1970. */
  1971. SYM_FUNC_START_LOCAL(_aesni_dec1)
  1972. movaps (KEYP), KEY # key
  1973. mov KEYP, TKEYP
  1974. pxor KEY, STATE # round 0
  1975. add $0x30, TKEYP
  1976. cmp $24, KLEN
  1977. jb .Ldec128
  1978. lea 0x20(TKEYP), TKEYP
  1979. je .Ldec192
  1980. add $0x20, TKEYP
  1981. movaps -0x60(TKEYP), KEY
  1982. aesdec KEY, STATE
  1983. movaps -0x50(TKEYP), KEY
  1984. aesdec KEY, STATE
  1985. .align 4
  1986. .Ldec192:
  1987. movaps -0x40(TKEYP), KEY
  1988. aesdec KEY, STATE
  1989. movaps -0x30(TKEYP), KEY
  1990. aesdec KEY, STATE
  1991. .align 4
  1992. .Ldec128:
  1993. movaps -0x20(TKEYP), KEY
  1994. aesdec KEY, STATE
  1995. movaps -0x10(TKEYP), KEY
  1996. aesdec KEY, STATE
  1997. movaps (TKEYP), KEY
  1998. aesdec KEY, STATE
  1999. movaps 0x10(TKEYP), KEY
  2000. aesdec KEY, STATE
  2001. movaps 0x20(TKEYP), KEY
  2002. aesdec KEY, STATE
  2003. movaps 0x30(TKEYP), KEY
  2004. aesdec KEY, STATE
  2005. movaps 0x40(TKEYP), KEY
  2006. aesdec KEY, STATE
  2007. movaps 0x50(TKEYP), KEY
  2008. aesdec KEY, STATE
  2009. movaps 0x60(TKEYP), KEY
  2010. aesdec KEY, STATE
  2011. movaps 0x70(TKEYP), KEY
  2012. aesdeclast KEY, STATE
  2013. RET
  2014. SYM_FUNC_END(_aesni_dec1)
  2015. /*
  2016. * _aesni_dec4: internal ABI
  2017. * input:
  2018. * KEYP: key struct pointer
  2019. * KLEN: key length
  2020. * STATE1: initial state (input)
  2021. * STATE2
  2022. * STATE3
  2023. * STATE4
  2024. * output:
2025. * STATE1: final state (output)
  2026. * STATE2
  2027. * STATE3
  2028. * STATE4
  2029. * changed:
  2030. * KEY
  2031. * TKEYP (T1)
  2032. */
  2033. SYM_FUNC_START_LOCAL(_aesni_dec4)
  2034. movaps (KEYP), KEY # key
  2035. mov KEYP, TKEYP
  2036. pxor KEY, STATE1 # round 0
  2037. pxor KEY, STATE2
  2038. pxor KEY, STATE3
  2039. pxor KEY, STATE4
  2040. add $0x30, TKEYP
  2041. cmp $24, KLEN
  2042. jb .L4dec128
  2043. lea 0x20(TKEYP), TKEYP
  2044. je .L4dec192
  2045. add $0x20, TKEYP
  2046. movaps -0x60(TKEYP), KEY
  2047. aesdec KEY, STATE1
  2048. aesdec KEY, STATE2
  2049. aesdec KEY, STATE3
  2050. aesdec KEY, STATE4
  2051. movaps -0x50(TKEYP), KEY
  2052. aesdec KEY, STATE1
  2053. aesdec KEY, STATE2
  2054. aesdec KEY, STATE3
  2055. aesdec KEY, STATE4
  2056. .align 4
  2057. .L4dec192:
  2058. movaps -0x40(TKEYP), KEY
  2059. aesdec KEY, STATE1
  2060. aesdec KEY, STATE2
  2061. aesdec KEY, STATE3
  2062. aesdec KEY, STATE4
  2063. movaps -0x30(TKEYP), KEY
  2064. aesdec KEY, STATE1
  2065. aesdec KEY, STATE2
  2066. aesdec KEY, STATE3
  2067. aesdec KEY, STATE4
  2068. .align 4
  2069. .L4dec128:
  2070. movaps -0x20(TKEYP), KEY
  2071. aesdec KEY, STATE1
  2072. aesdec KEY, STATE2
  2073. aesdec KEY, STATE3
  2074. aesdec KEY, STATE4
  2075. movaps -0x10(TKEYP), KEY
  2076. aesdec KEY, STATE1
  2077. aesdec KEY, STATE2
  2078. aesdec KEY, STATE3
  2079. aesdec KEY, STATE4
  2080. movaps (TKEYP), KEY
  2081. aesdec KEY, STATE1
  2082. aesdec KEY, STATE2
  2083. aesdec KEY, STATE3
  2084. aesdec KEY, STATE4
  2085. movaps 0x10(TKEYP), KEY
  2086. aesdec KEY, STATE1
  2087. aesdec KEY, STATE2
  2088. aesdec KEY, STATE3
  2089. aesdec KEY, STATE4
  2090. movaps 0x20(TKEYP), KEY
  2091. aesdec KEY, STATE1
  2092. aesdec KEY, STATE2
  2093. aesdec KEY, STATE3
  2094. aesdec KEY, STATE4
  2095. movaps 0x30(TKEYP), KEY
  2096. aesdec KEY, STATE1
  2097. aesdec KEY, STATE2
  2098. aesdec KEY, STATE3
  2099. aesdec KEY, STATE4
  2100. movaps 0x40(TKEYP), KEY
  2101. aesdec KEY, STATE1
  2102. aesdec KEY, STATE2
  2103. aesdec KEY, STATE3
  2104. aesdec KEY, STATE4
  2105. movaps 0x50(TKEYP), KEY
  2106. aesdec KEY, STATE1
  2107. aesdec KEY, STATE2
  2108. aesdec KEY, STATE3
  2109. aesdec KEY, STATE4
  2110. movaps 0x60(TKEYP), KEY
  2111. aesdec KEY, STATE1
  2112. aesdec KEY, STATE2
  2113. aesdec KEY, STATE3
  2114. aesdec KEY, STATE4
  2115. movaps 0x70(TKEYP), KEY
  2116. aesdeclast KEY, STATE1 # last round
  2117. aesdeclast KEY, STATE2
  2118. aesdeclast KEY, STATE3
  2119. aesdeclast KEY, STATE4
  2120. RET
  2121. SYM_FUNC_END(_aesni_dec4)
  2122. /*
2123. * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
  2124. * size_t len)
  2125. */
  2126. SYM_FUNC_START(aesni_ecb_enc)
  2127. FRAME_BEGIN
  2128. #ifndef __x86_64__
  2129. pushl LEN
  2130. pushl KEYP
  2131. pushl KLEN
  2132. movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
  2133. movl (FRAME_OFFSET+20)(%esp), OUTP # dst
  2134. movl (FRAME_OFFSET+24)(%esp), INP # src
  2135. movl (FRAME_OFFSET+28)(%esp), LEN # len
  2136. #endif
  2137. test LEN, LEN # check length
  2138. jz .Lecb_enc_ret
  2139. mov 480(KEYP), KLEN
  2140. cmp $16, LEN
  2141. jb .Lecb_enc_ret
  2142. cmp $64, LEN
  2143. jb .Lecb_enc_loop1
  2144. .align 4
  2145. .Lecb_enc_loop4:
  2146. movups (INP), STATE1
  2147. movups 0x10(INP), STATE2
  2148. movups 0x20(INP), STATE3
  2149. movups 0x30(INP), STATE4
  2150. call _aesni_enc4
  2151. movups STATE1, (OUTP)
  2152. movups STATE2, 0x10(OUTP)
  2153. movups STATE3, 0x20(OUTP)
  2154. movups STATE4, 0x30(OUTP)
  2155. sub $64, LEN
  2156. add $64, INP
  2157. add $64, OUTP
  2158. cmp $64, LEN
  2159. jge .Lecb_enc_loop4
  2160. cmp $16, LEN
  2161. jb .Lecb_enc_ret
  2162. .align 4
  2163. .Lecb_enc_loop1:
  2164. movups (INP), STATE1
  2165. call _aesni_enc1
  2166. movups STATE1, (OUTP)
  2167. sub $16, LEN
  2168. add $16, INP
  2169. add $16, OUTP
  2170. cmp $16, LEN
  2171. jge .Lecb_enc_loop1
  2172. .Lecb_enc_ret:
  2173. #ifndef __x86_64__
  2174. popl KLEN
  2175. popl KEYP
  2176. popl LEN
  2177. #endif
  2178. FRAME_END
  2179. RET
  2180. SYM_FUNC_END(aesni_ecb_enc)
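/*
 * The driver structure of the loops above (4 blocks at a time while at least
 * 64 bytes remain, then single blocks), in C form; block_fn is an
 * illustrative stand-in for the 1-block/4-block helpers, and trailing bytes
 * shorter than a block are skipped just as in the assembly:
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	typedef void (*block_fn_t)(const void *ctx, uint8_t *dst, const uint8_t *src);
 *
 *	static void ecb_walk(const void *ctx, uint8_t *dst, const uint8_t *src,
 *			     size_t len, block_fn_t block_fn)
 *	{
 *		while (len >= 64) {
 *			int i;
 *
 *			for (i = 0; i < 4; i++)   // _aesni_enc4 runs these in parallel
 *				block_fn(ctx, dst + 16 * i, src + 16 * i);
 *			src += 64;
 *			dst += 64;
 *			len -= 64;
 *		}
 *		while (len >= 16) {
 *			block_fn(ctx, dst, src);
 *			src += 16;
 *			dst += 16;
 *			len -= 16;
 *		}
 *	}
 */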
  2181. /*
2182. * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
  2183. * size_t len);
  2184. */
  2185. SYM_FUNC_START(aesni_ecb_dec)
  2186. FRAME_BEGIN
  2187. #ifndef __x86_64__
  2188. pushl LEN
  2189. pushl KEYP
  2190. pushl KLEN
  2191. movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
  2192. movl (FRAME_OFFSET+20)(%esp), OUTP # dst
  2193. movl (FRAME_OFFSET+24)(%esp), INP # src
  2194. movl (FRAME_OFFSET+28)(%esp), LEN # len
  2195. #endif
  2196. test LEN, LEN
  2197. jz .Lecb_dec_ret
  2198. mov 480(KEYP), KLEN
  2199. add $240, KEYP
  2200. cmp $16, LEN
  2201. jb .Lecb_dec_ret
  2202. cmp $64, LEN
  2203. jb .Lecb_dec_loop1
  2204. .align 4
  2205. .Lecb_dec_loop4:
  2206. movups (INP), STATE1
  2207. movups 0x10(INP), STATE2
  2208. movups 0x20(INP), STATE3
  2209. movups 0x30(INP), STATE4
  2210. call _aesni_dec4
  2211. movups STATE1, (OUTP)
  2212. movups STATE2, 0x10(OUTP)
  2213. movups STATE3, 0x20(OUTP)
  2214. movups STATE4, 0x30(OUTP)
  2215. sub $64, LEN
  2216. add $64, INP
  2217. add $64, OUTP
  2218. cmp $64, LEN
  2219. jge .Lecb_dec_loop4
  2220. cmp $16, LEN
  2221. jb .Lecb_dec_ret
  2222. .align 4
  2223. .Lecb_dec_loop1:
  2224. movups (INP), STATE1
  2225. call _aesni_dec1
  2226. movups STATE1, (OUTP)
  2227. sub $16, LEN
  2228. add $16, INP
  2229. add $16, OUTP
  2230. cmp $16, LEN
  2231. jge .Lecb_dec_loop1
  2232. .Lecb_dec_ret:
  2233. #ifndef __x86_64__
  2234. popl KLEN
  2235. popl KEYP
  2236. popl LEN
  2237. #endif
  2238. FRAME_END
  2239. RET
  2240. SYM_FUNC_END(aesni_ecb_dec)
  2241. /*
2242. * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
  2243. * size_t len, u8 *iv)
  2244. */
  2245. SYM_FUNC_START(aesni_cbc_enc)
  2246. FRAME_BEGIN
  2247. #ifndef __x86_64__
  2248. pushl IVP
  2249. pushl LEN
  2250. pushl KEYP
  2251. pushl KLEN
  2252. movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
  2253. movl (FRAME_OFFSET+24)(%esp), OUTP # dst
  2254. movl (FRAME_OFFSET+28)(%esp), INP # src
  2255. movl (FRAME_OFFSET+32)(%esp), LEN # len
  2256. movl (FRAME_OFFSET+36)(%esp), IVP # iv
  2257. #endif
  2258. cmp $16, LEN
  2259. jb .Lcbc_enc_ret
  2260. mov 480(KEYP), KLEN
  2261. movups (IVP), STATE # load iv as initial state
  2262. .align 4
  2263. .Lcbc_enc_loop:
  2264. movups (INP), IN # load input
  2265. pxor IN, STATE
  2266. call _aesni_enc1
  2267. movups STATE, (OUTP) # store output
  2268. sub $16, LEN
  2269. add $16, INP
  2270. add $16, OUTP
  2271. cmp $16, LEN
  2272. jge .Lcbc_enc_loop
  2273. movups STATE, (IVP)
  2274. .Lcbc_enc_ret:
  2275. #ifndef __x86_64__
  2276. popl KLEN
  2277. popl KEYP
  2278. popl LEN
  2279. popl IVP
  2280. #endif
  2281. FRAME_END
  2282. RET
  2283. SYM_FUNC_END(aesni_cbc_enc)
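/*
 * CBC encryption as implemented by .Lcbc_enc_loop, in C (encrypt_block is an
 * illustrative stand-in for _aesni_enc1): each plaintext block is XORed into
 * the running state, encrypted and written out, and the final state is handed
 * back through *iv as the next chaining value.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	typedef void (*encrypt_block_t)(const void *ctx, uint8_t *dst, const uint8_t *src);
 *
 *	static void cbc_encrypt_walk(const void *ctx, uint8_t *dst, const uint8_t *src,
 *				     size_t len, uint8_t iv[16],
 *				     encrypt_block_t encrypt_block)
 *	{
 *		uint8_t state[16];
 *
 *		if (len < 16)                   // same early-out as the assembly
 *			return;
 *		memcpy(state, iv, 16);
 *		while (len >= 16) {
 *			int i;
 *
 *			for (i = 0; i < 16; i++)
 *				state[i] ^= src[i];
 *			encrypt_block(ctx, state, state);
 *			memcpy(dst, state, 16);
 *			src += 16;
 *			dst += 16;
 *			len -= 16;
 *		}
 *		memcpy(iv, state, 16);          // chaining value for the caller
 *	}
 */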
  2284. /*
2285. * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
  2286. * size_t len, u8 *iv)
  2287. */
  2288. SYM_FUNC_START(aesni_cbc_dec)
  2289. FRAME_BEGIN
  2290. #ifndef __x86_64__
  2291. pushl IVP
  2292. pushl LEN
  2293. pushl KEYP
  2294. pushl KLEN
  2295. movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
  2296. movl (FRAME_OFFSET+24)(%esp), OUTP # dst
  2297. movl (FRAME_OFFSET+28)(%esp), INP # src
  2298. movl (FRAME_OFFSET+32)(%esp), LEN # len
  2299. movl (FRAME_OFFSET+36)(%esp), IVP # iv
  2300. #endif
  2301. cmp $16, LEN
  2302. jb .Lcbc_dec_just_ret
  2303. mov 480(KEYP), KLEN
  2304. add $240, KEYP
  2305. movups (IVP), IV
  2306. cmp $64, LEN
  2307. jb .Lcbc_dec_loop1
  2308. .align 4
  2309. .Lcbc_dec_loop4:
  2310. movups (INP), IN1
  2311. movaps IN1, STATE1
  2312. movups 0x10(INP), IN2
  2313. movaps IN2, STATE2
  2314. #ifdef __x86_64__
  2315. movups 0x20(INP), IN3
  2316. movaps IN3, STATE3
  2317. movups 0x30(INP), IN4
  2318. movaps IN4, STATE4
  2319. #else
  2320. movups 0x20(INP), IN1
  2321. movaps IN1, STATE3
  2322. movups 0x30(INP), IN2
  2323. movaps IN2, STATE4
  2324. #endif
  2325. call _aesni_dec4
  2326. pxor IV, STATE1
  2327. #ifdef __x86_64__
  2328. pxor IN1, STATE2
  2329. pxor IN2, STATE3
  2330. pxor IN3, STATE4
  2331. movaps IN4, IV
  2332. #else
  2333. pxor IN1, STATE4
  2334. movaps IN2, IV
  2335. movups (INP), IN1
  2336. pxor IN1, STATE2
  2337. movups 0x10(INP), IN2
  2338. pxor IN2, STATE3
  2339. #endif
  2340. movups STATE1, (OUTP)
  2341. movups STATE2, 0x10(OUTP)
  2342. movups STATE3, 0x20(OUTP)
  2343. movups STATE4, 0x30(OUTP)
  2344. sub $64, LEN
  2345. add $64, INP
  2346. add $64, OUTP
  2347. cmp $64, LEN
  2348. jge .Lcbc_dec_loop4
  2349. cmp $16, LEN
  2350. jb .Lcbc_dec_ret
  2351. .align 4
  2352. .Lcbc_dec_loop1:
  2353. movups (INP), IN
  2354. movaps IN, STATE
  2355. call _aesni_dec1
  2356. pxor IV, STATE
  2357. movups STATE, (OUTP)
  2358. movaps IN, IV
  2359. sub $16, LEN
  2360. add $16, INP
  2361. add $16, OUTP
  2362. cmp $16, LEN
  2363. jge .Lcbc_dec_loop1
  2364. .Lcbc_dec_ret:
  2365. movups IV, (IVP)
  2366. .Lcbc_dec_just_ret:
  2367. #ifndef __x86_64__
  2368. popl KLEN
  2369. popl KEYP
  2370. popl LEN
  2371. popl IVP
  2372. #endif
  2373. FRAME_END
  2374. RET
  2375. SYM_FUNC_END(aesni_cbc_dec)
  2376. /*
2377. * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
  2378. * size_t len, u8 *iv)
  2379. */
  2380. SYM_FUNC_START(aesni_cts_cbc_enc)
  2381. FRAME_BEGIN
  2382. #ifndef __x86_64__
  2383. pushl IVP
  2384. pushl LEN
  2385. pushl KEYP
  2386. pushl KLEN
  2387. movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
  2388. movl (FRAME_OFFSET+24)(%esp), OUTP # dst
  2389. movl (FRAME_OFFSET+28)(%esp), INP # src
  2390. movl (FRAME_OFFSET+32)(%esp), LEN # len
  2391. movl (FRAME_OFFSET+36)(%esp), IVP # iv
  2392. lea .Lcts_permute_table, T1
  2393. #else
  2394. lea .Lcts_permute_table(%rip), T1
  2395. #endif
  2396. mov 480(KEYP), KLEN
  2397. movups (IVP), STATE
  2398. sub $16, LEN
  2399. mov T1, IVP
  2400. add $32, IVP
  2401. add LEN, T1
  2402. sub LEN, IVP
  2403. movups (T1), %xmm4
  2404. movups (IVP), %xmm5
  2405. movups (INP), IN1
  2406. add LEN, INP
  2407. movups (INP), IN2
  2408. pxor IN1, STATE
  2409. call _aesni_enc1
  2410. pshufb %xmm5, IN2
  2411. pxor STATE, IN2
  2412. pshufb %xmm4, STATE
  2413. add OUTP, LEN
  2414. movups STATE, (LEN)
  2415. movaps IN2, STATE
  2416. call _aesni_enc1
  2417. movups STATE, (OUTP)
  2418. #ifndef __x86_64__
  2419. popl KLEN
  2420. popl KEYP
  2421. popl LEN
  2422. popl IVP
  2423. #endif
  2424. FRAME_END
  2425. RET
  2426. SYM_FUNC_END(aesni_cts_cbc_enc)
  2427. /*
2428. * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
  2429. * size_t len, u8 *iv)
  2430. */
  2431. SYM_FUNC_START(aesni_cts_cbc_dec)
  2432. FRAME_BEGIN
  2433. #ifndef __x86_64__
  2434. pushl IVP
  2435. pushl LEN
  2436. pushl KEYP
  2437. pushl KLEN
  2438. movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
  2439. movl (FRAME_OFFSET+24)(%esp), OUTP # dst
  2440. movl (FRAME_OFFSET+28)(%esp), INP # src
  2441. movl (FRAME_OFFSET+32)(%esp), LEN # len
  2442. movl (FRAME_OFFSET+36)(%esp), IVP # iv
  2443. lea .Lcts_permute_table, T1
  2444. #else
  2445. lea .Lcts_permute_table(%rip), T1
  2446. #endif
  2447. mov 480(KEYP), KLEN
  2448. add $240, KEYP
  2449. movups (IVP), IV
  2450. sub $16, LEN
  2451. mov T1, IVP
  2452. add $32, IVP
  2453. add LEN, T1
  2454. sub LEN, IVP
  2455. movups (T1), %xmm4
  2456. movups (INP), STATE
  2457. add LEN, INP
  2458. movups (INP), IN1
  2459. call _aesni_dec1
  2460. movaps STATE, IN2
  2461. pshufb %xmm4, STATE
  2462. pxor IN1, STATE
  2463. add OUTP, LEN
  2464. movups STATE, (LEN)
  2465. movups (IVP), %xmm0
  2466. pshufb %xmm0, IN1
  2467. pblendvb IN2, IN1
  2468. movaps IN1, STATE
  2469. call _aesni_dec1
  2470. pxor IV, STATE
  2471. movups STATE, (OUTP)
  2472. #ifndef __x86_64__
  2473. popl KLEN
  2474. popl KEYP
  2475. popl LEN
  2476. popl IVP
  2477. #endif
  2478. FRAME_END
  2479. RET
  2480. SYM_FUNC_END(aesni_cts_cbc_dec)
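/*
 * Both CTS routines above lean on .Lcts_permute_table (defined just below)
 * and pshufb. A C model of that step (the table copy mirrors the .rodata
 * bytes; the names are illustrative): loading the 16-byte window at offset
 * 32 - r gives a mask that moves the last r source bytes to the front of the
 * result and zeroes the remainder, which is how the partial final block is
 * zero-padded.
 *
 *	#include <stdint.h>
 *
 *	static const uint8_t cts_permute_table[48] = {
 *		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 *		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 *		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
 *		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
 *		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 *		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 *	};
 *
 *	// byte-level model of pshufb: top bit set -> 0, else select src[low 4 bits]
 *	static void pshufb_model(uint8_t dst[16], const uint8_t src[16],
 *				 const uint8_t mask[16])
 *	{
 *		int i;
 *
 *		for (i = 0; i < 16; i++)
 *			dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0f];
 *	}
 *
 *	// keep the last r bytes of src, left-aligned and zero-padded (1 <= r <= 16)
 *	static void keep_last_r(uint8_t dst[16], const uint8_t src[16], int r)
 *	{
 *		pshufb_model(dst, src, cts_permute_table + 32 - r);
 *	}
 */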
  2481. .pushsection .rodata
  2482. .align 16
  2483. .Lcts_permute_table:
  2484. .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
  2485. .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
  2486. .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
  2487. .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
  2488. .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
  2489. .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
  2490. #ifdef __x86_64__
  2491. .Lbswap_mask:
  2492. .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
  2493. #endif
  2494. .popsection
  2495. #ifdef __x86_64__
  2496. /*
  2497. * _aesni_inc_init: internal ABI
2498. * set up the registers used by _aesni_inc
  2499. * input:
  2500. * IV
  2501. * output:
  2502. * CTR: == IV, in little endian
  2503. * TCTR_LOW: == lower qword of CTR
  2504. * INC: == 1, in little endian
  2505. * BSWAP_MASK == endian swapping mask
  2506. */
  2507. SYM_FUNC_START_LOCAL(_aesni_inc_init)
  2508. movaps .Lbswap_mask, BSWAP_MASK
  2509. movaps IV, CTR
  2510. pshufb BSWAP_MASK, CTR
  2511. mov $1, TCTR_LOW
  2512. movq TCTR_LOW, INC
  2513. movq CTR, TCTR_LOW
  2514. RET
  2515. SYM_FUNC_END(_aesni_inc_init)
  2516. /*
  2517. * _aesni_inc: internal ABI
  2518. * Increase IV by 1, IV is in big endian
  2519. * input:
  2520. * IV
  2521. * CTR: == IV, in little endian
  2522. * TCTR_LOW: == lower qword of CTR
  2523. * INC: == 1, in little endian
  2524. * BSWAP_MASK == endian swapping mask
  2525. * output:
2526. * IV: increased by 1
  2527. * changed:
  2528. * CTR: == output IV, in little endian
  2529. * TCTR_LOW: == lower qword of CTR
  2530. */
  2531. SYM_FUNC_START_LOCAL(_aesni_inc)
  2532. paddq INC, CTR
  2533. add $1, TCTR_LOW
  2534. jnc .Linc_low
  2535. pslldq $8, INC
  2536. paddq INC, CTR
  2537. psrldq $8, INC
  2538. .Linc_low:
  2539. movaps CTR, IV
  2540. pshufb BSWAP_MASK, IV
  2541. RET
  2542. SYM_FUNC_END(_aesni_inc)
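/*
 * Viewed on the byte level, _aesni_inc_init/_aesni_inc add 1 to the IV as a
 * 128-bit big-endian counter; the byte-swapped copy kept in CTR/TCTR_LOW just
 * lets the common case be a single 64-bit add with no shuffle. A C model of
 * the increment (name illustrative):
 *
 *	#include <stdint.h>
 *
 *	static void ctr128_inc_be(uint8_t iv[16])
 *	{
 *		int i;
 *
 *		for (i = 15; i >= 0; i--)
 *			if (++iv[i] != 0)       // stop once a byte does not wrap
 *				break;
 *	}
 */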
  2543. /*
  2544. * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
  2545. * size_t len, u8 *iv)
  2546. */
  2547. SYM_FUNC_START(aesni_ctr_enc)
  2548. FRAME_BEGIN
  2549. cmp $16, LEN
  2550. jb .Lctr_enc_just_ret
  2551. mov 480(KEYP), KLEN
  2552. movups (IVP), IV
  2553. call _aesni_inc_init
  2554. cmp $64, LEN
  2555. jb .Lctr_enc_loop1
  2556. .align 4
  2557. .Lctr_enc_loop4:
  2558. movaps IV, STATE1
  2559. call _aesni_inc
  2560. movups (INP), IN1
  2561. movaps IV, STATE2
  2562. call _aesni_inc
  2563. movups 0x10(INP), IN2
  2564. movaps IV, STATE3
  2565. call _aesni_inc
  2566. movups 0x20(INP), IN3
  2567. movaps IV, STATE4
  2568. call _aesni_inc
  2569. movups 0x30(INP), IN4
  2570. call _aesni_enc4
  2571. pxor IN1, STATE1
  2572. movups STATE1, (OUTP)
  2573. pxor IN2, STATE2
  2574. movups STATE2, 0x10(OUTP)
  2575. pxor IN3, STATE3
  2576. movups STATE3, 0x20(OUTP)
  2577. pxor IN4, STATE4
  2578. movups STATE4, 0x30(OUTP)
  2579. sub $64, LEN
  2580. add $64, INP
  2581. add $64, OUTP
  2582. cmp $64, LEN
  2583. jge .Lctr_enc_loop4
  2584. cmp $16, LEN
  2585. jb .Lctr_enc_ret
  2586. .align 4
  2587. .Lctr_enc_loop1:
  2588. movaps IV, STATE
  2589. call _aesni_inc
  2590. movups (INP), IN
  2591. call _aesni_enc1
  2592. pxor IN, STATE
  2593. movups STATE, (OUTP)
  2594. sub $16, LEN
  2595. add $16, INP
  2596. add $16, OUTP
  2597. cmp $16, LEN
  2598. jge .Lctr_enc_loop1
  2599. .Lctr_enc_ret:
  2600. movups IV, (IVP)
  2601. .Lctr_enc_just_ret:
  2602. FRAME_END
  2603. RET
  2604. SYM_FUNC_END(aesni_ctr_enc)
  2605. #endif
  2606. .section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
  2607. .align 16
  2608. .Lgf128mul_x_ble_mask:
  2609. .octa 0x00000000000000010000000000000087
  2610. .previous
  2611. /*
  2612. * _aesni_gf128mul_x_ble: internal ABI
  2613. * Multiply in GF(2^128) for XTS IVs
  2614. * input:
  2615. * IV: current IV
  2616. * GF128MUL_MASK == mask with 0x87 and 0x01
  2617. * output:
  2618. * IV: next IV
  2619. * changed:
  2620. * CTR: == temporary value
  2621. */
  2622. #define _aesni_gf128mul_x_ble() \
  2623. pshufd $0x13, IV, KEY; \
  2624. paddq IV, IV; \
  2625. psrad $31, KEY; \
  2626. pand GF128MUL_MASK, KEY; \
  2627. pxor KEY, IV;
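/*
 * The macro above is the SSE form of multiplying the XTS tweak by x in
 * GF(2^128) under the little-endian block convention: the bit shifted out of
 * byte 15 folds back into byte 0 as 0x87 (GF128MUL_MASK supplies both the
 * 0x87 and the lane-crossing carry bit). A C sketch, assuming the two halves
 * are loaded little-endian (names illustrative):
 *
 *	#include <stdint.h>
 *
 *	struct xts_tweak { uint64_t lo, hi; };  // lo = bytes 0..7 of the block
 *
 *	static void gf128mul_x_ble_model(struct xts_tweak *t)
 *	{
 *		uint64_t carry = t->hi >> 63;   // bit 127 about to fall off
 *
 *		t->hi = (t->hi << 1) | (t->lo >> 63);
 *		t->lo = (t->lo << 1) ^ (carry * 0x87);
 *	}
 */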
  2628. /*
  2629. * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
  2630. * const u8 *src, unsigned int len, le128 *iv)
  2631. */
  2632. SYM_FUNC_START(aesni_xts_encrypt)
  2633. FRAME_BEGIN
  2634. #ifndef __x86_64__
  2635. pushl IVP
  2636. pushl LEN
  2637. pushl KEYP
  2638. pushl KLEN
  2639. movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
  2640. movl (FRAME_OFFSET+24)(%esp), OUTP # dst
  2641. movl (FRAME_OFFSET+28)(%esp), INP # src
  2642. movl (FRAME_OFFSET+32)(%esp), LEN # len
  2643. movl (FRAME_OFFSET+36)(%esp), IVP # iv
  2644. movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
  2645. #else
  2646. movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
  2647. #endif
  2648. movups (IVP), IV
  2649. mov 480(KEYP), KLEN
  2650. .Lxts_enc_loop4:
  2651. sub $64, LEN
  2652. jl .Lxts_enc_1x
  2653. movdqa IV, STATE1
  2654. movdqu 0x00(INP), IN
  2655. pxor IN, STATE1
  2656. movdqu IV, 0x00(OUTP)
  2657. _aesni_gf128mul_x_ble()
  2658. movdqa IV, STATE2
  2659. movdqu 0x10(INP), IN
  2660. pxor IN, STATE2
  2661. movdqu IV, 0x10(OUTP)
  2662. _aesni_gf128mul_x_ble()
  2663. movdqa IV, STATE3
  2664. movdqu 0x20(INP), IN
  2665. pxor IN, STATE3
  2666. movdqu IV, 0x20(OUTP)
  2667. _aesni_gf128mul_x_ble()
  2668. movdqa IV, STATE4
  2669. movdqu 0x30(INP), IN
  2670. pxor IN, STATE4
  2671. movdqu IV, 0x30(OUTP)
  2672. call _aesni_enc4
  2673. movdqu 0x00(OUTP), IN
  2674. pxor IN, STATE1
  2675. movdqu STATE1, 0x00(OUTP)
  2676. movdqu 0x10(OUTP), IN
  2677. pxor IN, STATE2
  2678. movdqu STATE2, 0x10(OUTP)
  2679. movdqu 0x20(OUTP), IN
  2680. pxor IN, STATE3
  2681. movdqu STATE3, 0x20(OUTP)
  2682. movdqu 0x30(OUTP), IN
  2683. pxor IN, STATE4
  2684. movdqu STATE4, 0x30(OUTP)
  2685. _aesni_gf128mul_x_ble()
  2686. add $64, INP
  2687. add $64, OUTP
  2688. test LEN, LEN
  2689. jnz .Lxts_enc_loop4
  2690. .Lxts_enc_ret_iv:
  2691. movups IV, (IVP)
  2692. .Lxts_enc_ret:
  2693. #ifndef __x86_64__
  2694. popl KLEN
  2695. popl KEYP
  2696. popl LEN
  2697. popl IVP
  2698. #endif
  2699. FRAME_END
  2700. RET
  2701. .Lxts_enc_1x:
  2702. add $64, LEN
  2703. jz .Lxts_enc_ret_iv
  2704. sub $16, LEN
  2705. jl .Lxts_enc_cts4
  2706. .Lxts_enc_loop1:
  2707. movdqu (INP), STATE
  2708. pxor IV, STATE
  2709. call _aesni_enc1
  2710. pxor IV, STATE
  2711. _aesni_gf128mul_x_ble()
  2712. test LEN, LEN
  2713. jz .Lxts_enc_out
  2714. add $16, INP
  2715. sub $16, LEN
  2716. jl .Lxts_enc_cts1
  2717. movdqu STATE, (OUTP)
  2718. add $16, OUTP
  2719. jmp .Lxts_enc_loop1
  2720. .Lxts_enc_out:
  2721. movdqu STATE, (OUTP)
  2722. jmp .Lxts_enc_ret_iv
  2723. .Lxts_enc_cts4:
  2724. movdqa STATE4, STATE
  2725. sub $16, OUTP
  2726. .Lxts_enc_cts1:
  2727. #ifndef __x86_64__
  2728. lea .Lcts_permute_table, T1
  2729. #else
  2730. lea .Lcts_permute_table(%rip), T1
  2731. #endif
  2732. add LEN, INP /* rewind input pointer */
  2733. add $16, LEN /* # bytes in final block */
  2734. movups (INP), IN1
  2735. mov T1, IVP
  2736. add $32, IVP
  2737. add LEN, T1
  2738. sub LEN, IVP
  2739. add OUTP, LEN
  2740. movups (T1), %xmm4
  2741. movaps STATE, IN2
  2742. pshufb %xmm4, STATE
  2743. movups STATE, (LEN)
  2744. movups (IVP), %xmm0
  2745. pshufb %xmm0, IN1
  2746. pblendvb IN2, IN1
  2747. movaps IN1, STATE
  2748. pxor IV, STATE
  2749. call _aesni_enc1
  2750. pxor IV, STATE
  2751. movups STATE, (OUTP)
  2752. jmp .Lxts_enc_ret
  2753. SYM_FUNC_END(aesni_xts_encrypt)
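/*
 * The full-block path of aesni_xts_encrypt in C form (encrypt_block is an
 * illustrative stand-in for _aesni_enc1/_aesni_enc4; the ciphertext-stealing
 * tail handled at .Lxts_enc_cts1 is left out): C = E_K(P xor T) xor T, then
 * T := T * x for the next block.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	typedef void (*encrypt_block_t)(const void *ctx, uint8_t *dst, const uint8_t *src);
 *
 *	static void xts_encrypt_blocks(const void *ctx, uint8_t *dst, const uint8_t *src,
 *				       size_t nblocks, uint8_t tweak[16],
 *				       encrypt_block_t encrypt_block)
 *	{
 *		size_t n;
 *
 *		for (n = 0; n < nblocks; n++) {
 *			uint8_t buf[16];
 *			int i, carry;
 *
 *			for (i = 0; i < 16; i++)
 *				buf[i] = src[16 * n + i] ^ tweak[i];
 *			encrypt_block(ctx, buf, buf);
 *			for (i = 0; i < 16; i++)
 *				dst[16 * n + i] = buf[i] ^ tweak[i];
 *
 *			// tweak *= x in GF(2^128), little-endian block convention
 *			carry = tweak[15] >> 7;
 *			for (i = 15; i > 0; i--)
 *				tweak[i] = (uint8_t)((tweak[i] << 1) | (tweak[i - 1] >> 7));
 *			tweak[0] = (uint8_t)((tweak[0] << 1) ^ (carry ? 0x87 : 0));
 *		}
 *	}
 */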
  2754. /*
  2755. * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
  2756. * const u8 *src, unsigned int len, le128 *iv)
  2757. */
  2758. SYM_FUNC_START(aesni_xts_decrypt)
  2759. FRAME_BEGIN
  2760. #ifndef __x86_64__
  2761. pushl IVP
  2762. pushl LEN
  2763. pushl KEYP
  2764. pushl KLEN
  2765. movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
  2766. movl (FRAME_OFFSET+24)(%esp), OUTP # dst
  2767. movl (FRAME_OFFSET+28)(%esp), INP # src
  2768. movl (FRAME_OFFSET+32)(%esp), LEN # len
  2769. movl (FRAME_OFFSET+36)(%esp), IVP # iv
  2770. movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
  2771. #else
  2772. movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
  2773. #endif
  2774. movups (IVP), IV
  2775. mov 480(KEYP), KLEN
  2776. add $240, KEYP
  2777. test $15, LEN
  2778. jz .Lxts_dec_loop4
  2779. sub $16, LEN
  2780. .Lxts_dec_loop4:
  2781. sub $64, LEN
  2782. jl .Lxts_dec_1x
  2783. movdqa IV, STATE1
  2784. movdqu 0x00(INP), IN
  2785. pxor IN, STATE1
  2786. movdqu IV, 0x00(OUTP)
  2787. _aesni_gf128mul_x_ble()
  2788. movdqa IV, STATE2
  2789. movdqu 0x10(INP), IN
  2790. pxor IN, STATE2
  2791. movdqu IV, 0x10(OUTP)
  2792. _aesni_gf128mul_x_ble()
  2793. movdqa IV, STATE3
  2794. movdqu 0x20(INP), IN
  2795. pxor IN, STATE3
  2796. movdqu IV, 0x20(OUTP)
  2797. _aesni_gf128mul_x_ble()
  2798. movdqa IV, STATE4
  2799. movdqu 0x30(INP), IN
  2800. pxor IN, STATE4
  2801. movdqu IV, 0x30(OUTP)
  2802. call _aesni_dec4
  2803. movdqu 0x00(OUTP), IN
  2804. pxor IN, STATE1
  2805. movdqu STATE1, 0x00(OUTP)
  2806. movdqu 0x10(OUTP), IN
  2807. pxor IN, STATE2
  2808. movdqu STATE2, 0x10(OUTP)
  2809. movdqu 0x20(OUTP), IN
  2810. pxor IN, STATE3
  2811. movdqu STATE3, 0x20(OUTP)
  2812. movdqu 0x30(OUTP), IN
  2813. pxor IN, STATE4
  2814. movdqu STATE4, 0x30(OUTP)
  2815. _aesni_gf128mul_x_ble()
  2816. add $64, INP
  2817. add $64, OUTP
  2818. test LEN, LEN
  2819. jnz .Lxts_dec_loop4
  2820. .Lxts_dec_ret_iv:
  2821. movups IV, (IVP)
  2822. .Lxts_dec_ret:
  2823. #ifndef __x86_64__
  2824. popl KLEN
  2825. popl KEYP
  2826. popl LEN
  2827. popl IVP
  2828. #endif
  2829. FRAME_END
  2830. RET
  2831. .Lxts_dec_1x:
  2832. add $64, LEN
  2833. jz .Lxts_dec_ret_iv
  2834. .Lxts_dec_loop1:
  2835. movdqu (INP), STATE
  2836. add $16, INP
  2837. sub $16, LEN
  2838. jl .Lxts_dec_cts1
  2839. pxor IV, STATE
  2840. call _aesni_dec1
  2841. pxor IV, STATE
  2842. _aesni_gf128mul_x_ble()
  2843. test LEN, LEN
  2844. jz .Lxts_dec_out
  2845. movdqu STATE, (OUTP)
  2846. add $16, OUTP
  2847. jmp .Lxts_dec_loop1
  2848. .Lxts_dec_out:
  2849. movdqu STATE, (OUTP)
  2850. jmp .Lxts_dec_ret_iv
  2851. .Lxts_dec_cts1:
  2852. movdqa IV, STATE4
  2853. _aesni_gf128mul_x_ble()
  2854. pxor IV, STATE
  2855. call _aesni_dec1
  2856. pxor IV, STATE
  2857. #ifndef __x86_64__
  2858. lea .Lcts_permute_table, T1
  2859. #else
  2860. lea .Lcts_permute_table(%rip), T1
  2861. #endif
  2862. add LEN, INP /* rewind input pointer */
  2863. add $16, LEN /* # bytes in final block */
  2864. movups (INP), IN1
  2865. mov T1, IVP
  2866. add $32, IVP
  2867. add LEN, T1
  2868. sub LEN, IVP
  2869. add OUTP, LEN
  2870. movups (T1), %xmm4
  2871. movaps STATE, IN2
  2872. pshufb %xmm4, STATE
  2873. movups STATE, (LEN)
  2874. movups (IVP), %xmm0
  2875. pshufb %xmm0, IN1
  2876. pblendvb IN2, IN1
  2877. movaps IN1, STATE
  2878. pxor STATE4, STATE
  2879. call _aesni_dec1
  2880. pxor STATE4, STATE
  2881. movups STATE, (OUTP)
  2882. jmp .Lxts_dec_ret
  2883. SYM_FUNC_END(aesni_xts_decrypt)