aesni-intel_avx-x86_64.S

  1. ########################################################################
  2. # Copyright (c) 2013, Intel Corporation
  3. #
  4. # This software is available to you under a choice of one of two
  5. # licenses. You may choose to be licensed under the terms of the GNU
  6. # General Public License (GPL) Version 2, available from the file
  7. # COPYING in the main directory of this source tree, or the
  8. # OpenIB.org BSD license below:
  9. #
  10. # Redistribution and use in source and binary forms, with or without
  11. # modification, are permitted provided that the following conditions are
  12. # met:
  13. #
  14. # * Redistributions of source code must retain the above copyright
  15. # notice, this list of conditions and the following disclaimer.
  16. #
  17. # * Redistributions in binary form must reproduce the above copyright
  18. # notice, this list of conditions and the following disclaimer in the
  19. # documentation and/or other materials provided with the
  20. # distribution.
  21. #
  22. # * Neither the name of the Intel Corporation nor the names of its
  23. # contributors may be used to endorse or promote products derived from
  24. # this software without specific prior written permission.
  25. #
  26. #
  27. # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
  28. # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  29. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  30. # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
  31. # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  32. # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  33. # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR
  34. # PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  35. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  36. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  37. # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  38. ########################################################################
  39. ##
  40. ## Authors:
  41. ## Erdinc Ozturk <[email protected]>
  42. ## Vinodh Gopal <[email protected]>
  43. ## James Guilford <[email protected]>
  44. ## Tim Chen <[email protected]>
  45. ##
  46. ## References:
  47. ## This code was derived and highly optimized from the code described in the paper:
  48. ## Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
  49. ## on Intel Architecture Processors. August, 2010
  50. ## The details of the implementation are explained in:
  51. ## Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
  52. ## on Intel Architecture Processors. October, 2012.
  53. ##
  54. ## Assumptions:
  55. ##
  56. ##
  57. ##
  58. ## iv:
  59. ## 0 1 2 3
  60. ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  61. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  62. ## | Salt (From the SA) |
  63. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  64. ## | Initialization Vector |
  65. ## | (This is the sequence number from IPSec header) |
  66. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  67. ## | 0x1 |
  68. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  69. ##
  70. ##
  71. ##
  72. ## AAD:
  73. ## AAD padded to 128 bits with 0
  74. ## for example, assume AAD is a u32 vector
  75. ##
  76. ## if AAD is 8 bytes:
  77. ## AAD[3] = {A0, A1}#
  78. ## padded AAD in xmm register = {A1 A0 0 0}
  79. ##
  80. ## 0 1 2 3
  81. ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  82. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  83. ## | SPI (A1) |
  84. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  85. ## | 32-bit Sequence Number (A0) |
  86. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  87. ## | 0x0 |
  88. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  89. ##
  90. ## AAD Format with 32-bit Sequence Number
  91. ##
  92. ## if AAD is 12 bytes:
  93. ## AAD[3] = {A0, A1, A2}#
  94. ## padded AAD in xmm register = {A2 A1 A0 0}
  95. ##
  96. ## 0 1 2 3
  97. ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  98. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  99. ## | SPI (A2) |
  100. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  101. ## | 64-bit Extended Sequence Number {A1,A0} |
  102. ## | |
  103. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  104. ## | 0x0 |
  105. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  106. ##
  107. ## AAD Format with 64-bit Extended Sequence Number
  108. ##
  109. ##
  110. ## aadLen:
  111. ## from the definition of the spec, aadLen can only be 8 or 12 bytes.
  112. ## The code additionally supports aadLen of length 16 bytes.
  113. ##
  114. ## TLen:
  115. ## from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
  116. ##
  117. ## poly = x^128 + x^127 + x^126 + x^121 + 1
  118. ## throughout the code, one-tab and two-tab indentation is used: one tab is
  119. ## for the GHASH part, two tabs are for the AES part.
  120. ##
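##
## Illustrative example (hypothetical values): with a 4-byte salt of
## 0x01020304 and an 8-byte IPsec sequence number of 0x1112131415161718,
## the 16-byte counter block described above is, in network byte order,
##   01 02 03 04 11 12 13 14 15 16 17 18 00 00 00 01
## i.e. the trailing 32-bit word is the block counter and starts at 1.
##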
  121. #include <linux/linkage.h>
  122. # constants in mergeable sections, linker can reorder and merge
  123. .section .rodata.cst16.POLY, "aM", @progbits, 16
  124. .align 16
  125. POLY: .octa 0xC2000000000000000000000000000001
  126. .section .rodata.cst16.POLY2, "aM", @progbits, 16
  127. .align 16
  128. POLY2: .octa 0xC20000000000000000000001C2000000
  129. .section .rodata.cst16.TWOONE, "aM", @progbits, 16
  130. .align 16
  131. TWOONE: .octa 0x00000001000000000000000000000001
  132. .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
  133. .align 16
  134. SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
  135. .section .rodata.cst16.ONE, "aM", @progbits, 16
  136. .align 16
  137. ONE: .octa 0x00000000000000000000000000000001
  138. .section .rodata.cst16.ONEf, "aM", @progbits, 16
  139. .align 16
  140. ONEf: .octa 0x01000000000000000000000000000000
  141. # order of these constants should not change.
  142. # more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
  143. .section .rodata, "a", @progbits
  144. .align 16
  145. SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
  146. ALL_F: .octa 0xffffffffffffffffffffffffffffffff
  147. .octa 0x00000000000000000000000000000000
  148. .section .rodata
  149. .align 16
  150. .type aad_shift_arr, @object
  151. .size aad_shift_arr, 272
  152. aad_shift_arr:
  153. .octa 0xffffffffffffffffffffffffffffffff
  154. .octa 0xffffffffffffffffffffffffffffff0C
  155. .octa 0xffffffffffffffffffffffffffff0D0C
  156. .octa 0xffffffffffffffffffffffffff0E0D0C
  157. .octa 0xffffffffffffffffffffffff0F0E0D0C
  158. .octa 0xffffffffffffffffffffff0C0B0A0908
  159. .octa 0xffffffffffffffffffff0D0C0B0A0908
  160. .octa 0xffffffffffffffffff0E0D0C0B0A0908
  161. .octa 0xffffffffffffffff0F0E0D0C0B0A0908
  162. .octa 0xffffffffffffff0C0B0A090807060504
  163. .octa 0xffffffffffff0D0C0B0A090807060504
  164. .octa 0xffffffffff0E0D0C0B0A090807060504
  165. .octa 0xffffffff0F0E0D0C0B0A090807060504
  166. .octa 0xffffff0C0B0A09080706050403020100
  167. .octa 0xffff0D0C0B0A09080706050403020100
  168. .octa 0xff0E0D0C0B0A09080706050403020100
  169. .octa 0x0F0E0D0C0B0A09080706050403020100
  170. .text
  171. #define AadHash 16*0
  172. #define AadLen 16*1
  173. #define InLen (16*1)+8
  174. #define PBlockEncKey 16*2
  175. #define OrigIV 16*3
  176. #define CurCount 16*4
  177. #define PBlockLen 16*5
  178. HashKey = 16*6 # store HashKey <<1 mod poly here
  179. HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here
  180. HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here
  181. HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here
  182. HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here
  183. HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here
  184. HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here
  185. HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here
  186. HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
  187. HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
  188. HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
  189. HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
  190. HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
  191. HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
  192. HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
  193. HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
  194. #define arg1 %rdi
  195. #define arg2 %rsi
  196. #define arg3 %rdx
  197. #define arg4 %rcx
  198. #define arg5 %r8
  199. #define arg6 %r9
  200. #define keysize 2*15*16(arg1)
  201. i = 0
  202. j = 0
  203. out_order = 0
  204. in_order = 1
  205. DEC = 0
  206. ENC = 1
  207. .macro define_reg r n
  208. reg_\r = %xmm\n
  209. .endm
  210. .macro setreg
  211. .altmacro
  212. define_reg i %i
  213. define_reg j %j
  214. .noaltmacro
  215. .endm
  216. TMP1 = 16*0 # Temporary storage for AAD
  217. TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
  218. TMP3 = 16*2 # Temporary storage for AES State 3
  219. TMP4 = 16*3 # Temporary storage for AES State 4
  220. TMP5 = 16*4 # Temporary storage for AES State 5
  221. TMP6 = 16*5 # Temporary storage for AES State 6
  222. TMP7 = 16*6 # Temporary storage for AES State 7
  223. TMP8 = 16*7 # Temporary storage for AES State 8
  224. VARIABLE_OFFSET = 16*8
  225. ################################
  226. # Utility Macros
  227. ################################
  228. .macro FUNC_SAVE
  229. push %r12
  230. push %r13
  231. push %r15
  232. push %rbp
  233. mov %rsp, %rbp
  234. sub $VARIABLE_OFFSET, %rsp
  235. and $~63, %rsp # align rsp to 64 bytes
  236. .endm
  237. .macro FUNC_RESTORE
  238. mov %rbp, %rsp
  239. pop %rbp
  240. pop %r15
  241. pop %r13
  242. pop %r12
  243. .endm
  244. # Encryption of a single block
  245. .macro ENCRYPT_SINGLE_BLOCK REP XMM0
  246. vpxor (arg1), \XMM0, \XMM0
  247. i = 1
  248. setreg
  249. .rep \REP
  250. vaesenc 16*i(arg1), \XMM0, \XMM0
  251. i = (i+1)
  252. setreg
  253. .endr
  254. vaesenclast 16*i(arg1), \XMM0, \XMM0
  255. .endm
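# Usage sketch (illustrative): for an AES-128 key schedule laid out as
# consecutive 16-byte round keys at (arg1), REP would be 9 (11 for
# AES-192, 13 for AES-256), so
#   ENCRYPT_SINGLE_BLOCK 9, %xmm9
# performs the round-0 whitening vpxor, nine vaesenc rounds and one
# vaesenclast, i.e. a full 10-round AES-128 encryption of %xmm9 in place.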
  256. # combined for GCM encrypt and decrypt functions
  257. # clobbering all xmm registers
  258. # clobbering r10, r11, r12, r13, r15, rax
  259. .macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
  260. vmovdqu AadHash(arg2), %xmm8
  261. vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey
  262. add arg5, InLen(arg2)
  263. # initialize the data pointer offset as zero
  264. xor %r11d, %r11d
  265. PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
  266. sub %r11, arg5
  267. mov arg5, %r13 # save the number of bytes of plaintext/ciphertext
  268. and $-16, %r13 # r13 = r13 - (r13 mod 16)
  269. mov %r13, %r12
  270. shr $4, %r12
  271. and $7, %r12
  272. jz _initial_num_blocks_is_0\@
  273. cmp $7, %r12
  274. je _initial_num_blocks_is_7\@
  275. cmp $6, %r12
  276. je _initial_num_blocks_is_6\@
  277. cmp $5, %r12
  278. je _initial_num_blocks_is_5\@
  279. cmp $4, %r12
  280. je _initial_num_blocks_is_4\@
  281. cmp $3, %r12
  282. je _initial_num_blocks_is_3\@
  283. cmp $2, %r12
  284. je _initial_num_blocks_is_2\@
  285. jmp _initial_num_blocks_is_1\@
  286. _initial_num_blocks_is_7\@:
  287. \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  288. sub $16*7, %r13
  289. jmp _initial_blocks_encrypted\@
  290. _initial_num_blocks_is_6\@:
  291. \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  292. sub $16*6, %r13
  293. jmp _initial_blocks_encrypted\@
  294. _initial_num_blocks_is_5\@:
  295. \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  296. sub $16*5, %r13
  297. jmp _initial_blocks_encrypted\@
  298. _initial_num_blocks_is_4\@:
  299. \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  300. sub $16*4, %r13
  301. jmp _initial_blocks_encrypted\@
  302. _initial_num_blocks_is_3\@:
  303. \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  304. sub $16*3, %r13
  305. jmp _initial_blocks_encrypted\@
  306. _initial_num_blocks_is_2\@:
  307. \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  308. sub $16*2, %r13
  309. jmp _initial_blocks_encrypted\@
  310. _initial_num_blocks_is_1\@:
  311. \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  312. sub $16*1, %r13
  313. jmp _initial_blocks_encrypted\@
  314. _initial_num_blocks_is_0\@:
  315. \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  316. _initial_blocks_encrypted\@:
  317. test %r13, %r13
  318. je _zero_cipher_left\@
  319. sub $128, %r13
  320. je _eight_cipher_left\@
  321. vmovd %xmm9, %r15d
  322. and $255, %r15d
  323. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  324. _encrypt_by_8_new\@:
  325. cmp $(255-8), %r15d
  326. jg _encrypt_by_8\@
  327. add $8, %r15b
  328. \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
  329. add $128, %r11
  330. sub $128, %r13
  331. jne _encrypt_by_8_new\@
  332. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  333. jmp _eight_cipher_left\@
  334. _encrypt_by_8\@:
  335. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  336. add $8, %r15b
  337. \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
  338. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  339. add $128, %r11
  340. sub $128, %r13
  341. jne _encrypt_by_8_new\@
  342. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  343. _eight_cipher_left\@:
  344. \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
  345. _zero_cipher_left\@:
  346. vmovdqu %xmm14, AadHash(arg2)
  347. vmovdqu %xmm9, CurCount(arg2)
  348. # check for 0 length
  349. mov arg5, %r13
  350. and $15, %r13 # r13 = (arg5 mod 16)
  351. je _multiple_of_16_bytes\@
  352. # handle the last <16 Byte block separately
  353. mov %r13, PBlockLen(arg2)
  354. vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
  355. vmovdqu %xmm9, CurCount(arg2)
  356. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  357. ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn)
  358. vmovdqu %xmm9, PBlockEncKey(arg2)
  359. cmp $16, arg5
  360. jge _large_enough_update\@
  361. lea (arg4,%r11,1), %r10
  362. mov %r13, %r12
  363. READ_PARTIAL_BLOCK %r10 %r12 %xmm1
  364. lea SHIFT_MASK+16(%rip), %r12
  365. sub %r13, %r12 # adjust the shuffle mask pointer to be
  366. # able to shift 16-r13 bytes (r13 is the
  367. # number of bytes in plaintext mod 16)
  368. jmp _final_ghash_mul\@
  369. _large_enough_update\@:
  370. sub $16, %r11
  371. add %r13, %r11
  372. # read in the last <16 Byte block
  373. vmovdqu (arg4, %r11, 1), %xmm1
  374. sub %r13, %r11
  375. add $16, %r11
  376. lea SHIFT_MASK+16(%rip), %r12
  377. # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
  378. # (r13 is the number of bytes in plaintext mod 16)
  379. sub %r13, %r12
  380. # get the appropriate shuffle mask
  381. vmovdqu (%r12), %xmm2
  382. # shift right 16-r13 bytes
  383. vpshufb %xmm2, %xmm1, %xmm1
  384. _final_ghash_mul\@:
  385. .if \ENC_DEC == DEC
  386. vmovdqa %xmm1, %xmm2
  387. vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
  388. vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
  389. # mask out top 16-r13 bytes of xmm9
  390. vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
  391. vpand %xmm1, %xmm2, %xmm2
  392. vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
  393. vpxor %xmm2, %xmm14, %xmm14
  394. vmovdqu %xmm14, AadHash(arg2)
  395. .else
  396. vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
  397. vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
  398. # mask out top 16-r13 bytes of xmm9
  399. vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
  400. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  401. vpxor %xmm9, %xmm14, %xmm14
  402. vmovdqu %xmm14, AadHash(arg2)
  403. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
  404. .endif
  405. #############################
  406. # output r13 Bytes
  407. vmovq %xmm9, %rax
  408. cmp $8, %r13
  409. jle _less_than_8_bytes_left\@
  410. mov %rax, (arg3 , %r11)
  411. add $8, %r11
  412. vpsrldq $8, %xmm9, %xmm9
  413. vmovq %xmm9, %rax
  414. sub $8, %r13
  415. _less_than_8_bytes_left\@:
  416. movb %al, (arg3 , %r11)
  417. add $1, %r11
  418. shr $8, %rax
  419. sub $1, %r13
  420. jne _less_than_8_bytes_left\@
  421. #############################
  422. _multiple_of_16_bytes\@:
  423. .endm
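# Flow summary for GCM_ENC_DEC: PARTIAL_BLOCK first consumes any bytes
# left over from a previous update call, INITIAL_BLOCKS then handles
# (full blocks) mod 8 so the remainder is a multiple of 128 bytes, the
# _encrypt_by_8 loop encrypts 8 blocks per iteration while GHASHing the
# 8 ciphertext blocks produced by the previous iteration, and
# GHASH_LAST_8 folds the final 8 ciphertext blocks into the hash before
# the <16-byte tail is handled in _zero_cipher_left.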
  424. # GCM_COMPLETE: finishes the GHASH update for any last partial block and computes the tag
  425. # Output: Authentication Tag (AUTH_TAG)
  426. # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
  427. .macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
  428. vmovdqu AadHash(arg2), %xmm14
  429. vmovdqu HashKey(arg2), %xmm13
  430. mov PBlockLen(arg2), %r12
  431. test %r12, %r12
  432. je _partial_done\@
  433. #GHASH computation for the last <16 Byte block
  434. \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
  435. _partial_done\@:
  436. mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes)
  437. shl $3, %r12 # convert into number of bits
  438. vmovd %r12d, %xmm15 # len(A) in xmm15
  439. mov InLen(arg2), %r12
  440. shl $3, %r12 # len(C) in bits (*128)
  441. vmovq %r12, %xmm1
  442. vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
  443. vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
  444. vpxor %xmm15, %xmm14, %xmm14
  445. \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
  446. vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
  447. vmovdqu OrigIV(arg2), %xmm9
  448. ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0)
  449. vpxor %xmm14, %xmm9, %xmm9
  450. _return_T\@:
  451. mov \AUTH_TAG, %r10 # r10 = authTag
  452. mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len
  453. cmp $16, %r11
  454. je _T_16\@
  455. cmp $8, %r11
  456. jl _T_4\@
  457. _T_8\@:
  458. vmovq %xmm9, %rax
  459. mov %rax, (%r10)
  460. add $8, %r10
  461. sub $8, %r11
  462. vpsrldq $8, %xmm9, %xmm9
  463. test %r11, %r11
  464. je _return_T_done\@
  465. _T_4\@:
  466. vmovd %xmm9, %eax
  467. mov %eax, (%r10)
  468. add $4, %r10
  469. sub $4, %r11
  470. vpsrldq $4, %xmm9, %xmm9
  471. test %r11, %r11
  472. je _return_T_done\@
  473. _T_123\@:
  474. vmovd %xmm9, %eax
  475. cmp $2, %r11
  476. jl _T_1\@
  477. mov %ax, (%r10)
  478. cmp $2, %r11
  479. je _return_T_done\@
  480. add $2, %r10
  481. sar $16, %eax
  482. _T_1\@:
  483. mov %al, (%r10)
  484. jmp _return_T_done\@
  485. _T_16\@:
  486. vmovdqu %xmm9, (%r10)
  487. _return_T_done\@:
  488. .endm
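# In GCM terms, the block assembled above is len(A) || len(C), both in
# bits; it is XORed into the running hash, multiplied by H one last time,
# byte-swapped, and combined as T = E(K, Y0) XOR GHASH, which the _T_*
# paths then truncate to the requested AUTH_TAG_LEN (16, 12, 8 or fewer
# bytes).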
  489. .macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
  490. mov \AAD, %r10 # r10 = AAD
  491. mov \AADLEN, %r12 # r12 = aadLen
  492. mov %r12, %r11
  493. vpxor \T8, \T8, \T8
  494. vpxor \T7, \T7, \T7
  495. cmp $16, %r11
  496. jl _get_AAD_rest8\@
  497. _get_AAD_blocks\@:
  498. vmovdqu (%r10), \T7
  499. vpshufb SHUF_MASK(%rip), \T7, \T7
  500. vpxor \T7, \T8, \T8
  501. \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6
  502. add $16, %r10
  503. sub $16, %r12
  504. sub $16, %r11
  505. cmp $16, %r11
  506. jge _get_AAD_blocks\@
  507. vmovdqu \T8, \T7
  508. test %r11, %r11
  509. je _get_AAD_done\@
  510. vpxor \T7, \T7, \T7
  511. /* read the last <16B of AAD. since we have at least 4B of
  512. data right after the AAD (the ICV, and maybe some CT), we can
  513. read 4B/8B blocks safely, and then get rid of the extra stuff */
  514. _get_AAD_rest8\@:
  515. cmp $4, %r11
  516. jle _get_AAD_rest4\@
  517. movq (%r10), \T1
  518. add $8, %r10
  519. sub $8, %r11
  520. vpslldq $8, \T1, \T1
  521. vpsrldq $8, \T7, \T7
  522. vpxor \T1, \T7, \T7
  523. jmp _get_AAD_rest8\@
  524. _get_AAD_rest4\@:
  525. test %r11, %r11
  526. jle _get_AAD_rest0\@
  527. mov (%r10), %eax
  528. movq %rax, \T1
  529. add $4, %r10
  530. sub $4, %r11
  531. vpslldq $12, \T1, \T1
  532. vpsrldq $4, \T7, \T7
  533. vpxor \T1, \T7, \T7
  534. _get_AAD_rest0\@:
  535. /* finalize: shift out the extra bytes we read, and align
  536. left. since pslldq can only shift by an immediate, we use
  537. vpshufb and an array of shuffle masks */
  538. movq %r12, %r11
  539. salq $4, %r11
  540. vmovdqu aad_shift_arr(%r11), \T1
  541. vpshufb \T1, \T7, \T7
  542. _get_AAD_rest_final\@:
  543. vpshufb SHUF_MASK(%rip), \T7, \T7
  544. vpxor \T8, \T7, \T7
  545. \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6
  546. _get_AAD_done\@:
  547. vmovdqu \T7, AadHash(arg2)
  548. .endm
  549. .macro INIT GHASH_MUL PRECOMPUTE
  550. mov arg6, %r11
  551. mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
  552. xor %r11d, %r11d
  553. mov %r11, InLen(arg2) # ctx_data.in_length = 0
  554. mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
  555. mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
  556. mov arg3, %rax
  557. movdqu (%rax), %xmm0
  558. movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
  559. vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
  560. movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
  561. vmovdqu (arg4), %xmm6 # xmm6 = HashKey
  562. vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
  563. ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
  564. vmovdqa %xmm6, %xmm2
  565. vpsllq $1, %xmm6, %xmm6
  566. vpsrlq $63, %xmm2, %xmm2
  567. vmovdqa %xmm2, %xmm1
  568. vpslldq $8, %xmm2, %xmm2
  569. vpsrldq $8, %xmm1, %xmm1
  570. vpor %xmm2, %xmm6, %xmm6
  571. #reduction
  572. vpshufd $0b00100100, %xmm1, %xmm2
  573. vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
  574. vpand POLY(%rip), %xmm2, %xmm2
  575. vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
  576. #######################################################################
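# Note on the reduction above: %xmm1 holds the bit shifted out of the top
# of HashKey; the vpshufd/vpcmpeqd pair against TWOONE turns it into a
# dword mask, and vpand keeps the POLY constant only when that bit was
# set, so the conditional XOR of the reduction polynomial needs no branch.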
  577. vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly
  578. CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
  579. \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
  580. .endm
  581. # Reads DLEN bytes starting at DPTR and stores in XMMDst
  582. # where 0 < DLEN < 16
  583. # Clobbers %rax, DLEN
  584. .macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
  585. vpxor \XMMDst, \XMMDst, \XMMDst
  586. cmp $8, \DLEN
  587. jl _read_lt8_\@
  588. mov (\DPTR), %rax
  589. vpinsrq $0, %rax, \XMMDst, \XMMDst
  590. sub $8, \DLEN
  591. jz _done_read_partial_block_\@
  592. xor %eax, %eax
  593. _read_next_byte_\@:
  594. shl $8, %rax
  595. mov 7(\DPTR, \DLEN, 1), %al
  596. dec \DLEN
  597. jnz _read_next_byte_\@
  598. vpinsrq $1, %rax, \XMMDst, \XMMDst
  599. jmp _done_read_partial_block_\@
  600. _read_lt8_\@:
  601. xor %eax, %eax
  602. _read_next_byte_lt8_\@:
  603. shl $8, %rax
  604. mov -1(\DPTR, \DLEN, 1), %al
  605. dec \DLEN
  606. jnz _read_next_byte_lt8_\@
  607. vpinsrq $0, %rax, \XMMDst, \XMMDst
  608. _done_read_partial_block_\@:
  609. .endm
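# Worked example: with DLEN = 5 the _read_lt8_ path is taken; the loop
# reads DPTR[4] first and DPTR[0] last, so %rax ends up holding the five
# bytes with DPTR[0] in its least significant byte, and vpinsrq $0 places
# them in the low quadword of XMMDst (the upper bytes remain zero).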
  610. # PARTIAL_BLOCK: handles encryption/decryption and the GHASH update for partial
  611. # blocks carried over between update calls.
  612. # Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
  613. # Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
  614. # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
  615. .macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
  616. AAD_HASH ENC_DEC
  617. mov PBlockLen(arg2), %r13
  618. test %r13, %r13
  619. je _partial_block_done_\@ # Leave Macro if no partial blocks
  620. # Read in input data without over reading
  621. cmp $16, \PLAIN_CYPH_LEN
  622. jl _fewer_than_16_bytes_\@
  623. vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
  624. jmp _data_read_\@
  625. _fewer_than_16_bytes_\@:
  626. lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
  627. mov \PLAIN_CYPH_LEN, %r12
  628. READ_PARTIAL_BLOCK %r10 %r12 %xmm1
  629. mov PBlockLen(arg2), %r13
  630. _data_read_\@: # Finished reading in data
  631. vmovdqu PBlockEncKey(arg2), %xmm9
  632. vmovdqu HashKey(arg2), %xmm13
  633. lea SHIFT_MASK(%rip), %r12
  634. # adjust the shuffle mask pointer to be able to shift r13 bytes
  635. # (16-r13 is the number of bytes in plaintext mod 16)
  636. add %r13, %r12
  637. vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
  638. vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes
  639. .if \ENC_DEC == DEC
  640. vmovdqa %xmm1, %xmm3
  641. pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
  642. mov \PLAIN_CYPH_LEN, %r10
  643. add %r13, %r10
  644. # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
  645. sub $16, %r10
  646. # Determine if the partial block is not being filled completely and
  647. # adjust the shift mask accordingly
  648. jge _no_extra_mask_1_\@
  649. sub %r10, %r12
  650. _no_extra_mask_1_\@:
  651. vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
  652. # get the appropriate mask to mask out bottom r13 bytes of xmm9
  653. vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9
  654. vpand %xmm1, %xmm3, %xmm3
  655. vmovdqa SHUF_MASK(%rip), %xmm10
  656. vpshufb %xmm10, %xmm3, %xmm3
  657. vpshufb %xmm2, %xmm3, %xmm3
  658. vpxor %xmm3, \AAD_HASH, \AAD_HASH
  659. test %r10, %r10
  660. jl _partial_incomplete_1_\@
  661. # GHASH computation for the last <16 Byte block
  662. \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
  663. xor %eax,%eax
  664. mov %rax, PBlockLen(arg2)
  665. jmp _dec_done_\@
  666. _partial_incomplete_1_\@:
  667. add \PLAIN_CYPH_LEN, PBlockLen(arg2)
  668. _dec_done_\@:
  669. vmovdqu \AAD_HASH, AadHash(arg2)
  670. .else
  671. vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
  672. mov \PLAIN_CYPH_LEN, %r10
  673. add %r13, %r10
  674. # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
  675. sub $16, %r10
  676. # Determine if the partial block is not being filled completely and
  677. # adjust the shift mask accordingly
  678. jge _no_extra_mask_2_\@
  679. sub %r10, %r12
  680. _no_extra_mask_2_\@:
  681. vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
  682. # get the appropriate mask to mask out bottom r13 bytes of xmm9
  683. vpand %xmm1, %xmm9, %xmm9
  684. vmovdqa SHUF_MASK(%rip), %xmm1
  685. vpshufb %xmm1, %xmm9, %xmm9
  686. vpshufb %xmm2, %xmm9, %xmm9
  687. vpxor %xmm9, \AAD_HASH, \AAD_HASH
  688. test %r10, %r10
  689. jl _partial_incomplete_2_\@
  690. # GHASH computation for the last <16 Byte block
  691. \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
  692. xor %eax,%eax
  693. mov %rax, PBlockLen(arg2)
  694. jmp _encode_done_\@
  695. _partial_incomplete_2_\@:
  696. add \PLAIN_CYPH_LEN, PBlockLen(arg2)
  697. _encode_done_\@:
  698. vmovdqu \AAD_HASH, AadHash(arg2)
  699. vmovdqa SHUF_MASK(%rip), %xmm10
  700. # shuffle xmm9 back to output as ciphertext
  701. vpshufb %xmm10, %xmm9, %xmm9
  702. vpshufb %xmm2, %xmm9, %xmm9
  703. .endif
  704. # output encrypted Bytes
  705. test %r10, %r10
  706. jl _partial_fill_\@
  707. mov %r13, %r12
  708. mov $16, %r13
  709. # Set r13 to be the number of bytes to write out
  710. sub %r12, %r13
  711. jmp _count_set_\@
  712. _partial_fill_\@:
  713. mov \PLAIN_CYPH_LEN, %r13
  714. _count_set_\@:
  715. vmovdqa %xmm9, %xmm0
  716. vmovq %xmm0, %rax
  717. cmp $8, %r13
  718. jle _less_than_8_bytes_left_\@
  719. mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
  720. add $8, \DATA_OFFSET
  721. psrldq $8, %xmm0
  722. vmovq %xmm0, %rax
  723. sub $8, %r13
  724. _less_than_8_bytes_left_\@:
  725. movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
  726. add $1, \DATA_OFFSET
  727. shr $8, %rax
  728. sub $1, %r13
  729. jne _less_than_8_bytes_left_\@
  730. _partial_block_done_\@:
  731. .endm # PARTIAL_BLOCK
  732. ###############################################################################
  733. # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
  734. # Input: A and B (128-bits each, bit-reflected)
  735. # Output: C = A*B*x mod poly, (i.e. >>1 )
  736. # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
  737. # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
  738. ###############################################################################
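# The macro below uses one level of Karatsuba: writing A = a1:a0 and
# B = b1:b0 as 64-bit halves, A*B = a1*b1*x^128 +
# [(a1+a0)*(b1+b0) + a1*b1 + a0*b0]*x^64 + a0*b0 over GF(2), so only
# three vpclmulqdq multiplications are needed; the two reduction phases
# that follow fold the 256-bit product back modulo the GHASH polynomial.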
  739. .macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
  740. vpshufd $0b01001110, \GH, \T2
  741. vpshufd $0b01001110, \HK, \T3
  742. vpxor \GH , \T2, \T2 # T2 = (a1+a0)
  743. vpxor \HK , \T3, \T3 # T3 = (b1+b0)
  744. vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
  745. vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
  746. vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
  747. vpxor \GH, \T2,\T2
  748. vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
  749. vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
  750. vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
  751. vpxor \T3, \GH, \GH
  752. vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
  753. #first phase of the reduction
  754. vpslld $31, \GH, \T2 # packed right shifting << 31
  755. vpslld $30, \GH, \T3 # packed right shifting << 30
  756. vpslld $25, \GH, \T4 # packed right shifting << 25
  757. vpxor \T3, \T2, \T2 # xor the shifted versions
  758. vpxor \T4, \T2, \T2
  759. vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
  760. vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
  761. vpxor \T2, \GH, \GH # first phase of the reduction complete
  762. #second phase of the reduction
  763. vpsrld $1,\GH, \T2 # packed left shifting >> 1
  764. vpsrld $2,\GH, \T3 # packed left shifting >> 2
  765. vpsrld $7,\GH, \T4 # packed left shifting >> 7
  766. vpxor \T3, \T2, \T2 # xor the shifted versions
  767. vpxor \T4, \T2, \T2
  768. vpxor \T5, \T2, \T2
  769. vpxor \T2, \GH, \GH
  770. vpxor \T1, \GH, \GH # the result is in GH
  771. .endm
  772. .macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
  773. # HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
  774. vmovdqa \HK, \T5
  775. vpshufd $0b01001110, \T5, \T1
  776. vpxor \T5, \T1, \T1
  777. vmovdqu \T1, HashKey_k(arg2)
  778. GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
  779. vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
  780. vpshufd $0b01001110, \T5, \T1
  781. vpxor \T5, \T1, \T1
  782. vmovdqu \T1, HashKey_2_k(arg2)
  783. GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
  784. vmovdqu \T5, HashKey_3(arg2)
  785. vpshufd $0b01001110, \T5, \T1
  786. vpxor \T5, \T1, \T1
  787. vmovdqu \T1, HashKey_3_k(arg2)
  788. GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
  789. vmovdqu \T5, HashKey_4(arg2)
  790. vpshufd $0b01001110, \T5, \T1
  791. vpxor \T5, \T1, \T1
  792. vmovdqu \T1, HashKey_4_k(arg2)
  793. GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
  794. vmovdqu \T5, HashKey_5(arg2)
  795. vpshufd $0b01001110, \T5, \T1
  796. vpxor \T5, \T1, \T1
  797. vmovdqu \T1, HashKey_5_k(arg2)
  798. GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
  799. vmovdqu \T5, HashKey_6(arg2)
  800. vpshufd $0b01001110, \T5, \T1
  801. vpxor \T5, \T1, \T1
  802. vmovdqu \T1, HashKey_6_k(arg2)
  803. GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
  804. vmovdqu \T5, HashKey_7(arg2)
  805. vpshufd $0b01001110, \T5, \T1
  806. vpxor \T5, \T1, \T1
  807. vmovdqu \T1, HashKey_7_k(arg2)
  808. GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
  809. vmovdqu \T5, HashKey_8(arg2)
  810. vpshufd $0b01001110, \T5, \T1
  811. vpxor \T5, \T1, \T1
  812. vmovdqu \T1, HashKey_8_k(arg2)
  813. .endm
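# Keeping HashKey^1..HashKey^8 (and their Karatsuba halves) in the context
# lets the 8-block routines below compute
#   X1*H^8 + X2*H^7 + ... + X8*H
# in one pass, i.e. eight independent multiplications whose partial
# products are accumulated before a single reduction, instead of eight
# serial GHASH_MUL calls.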
  814. ## if a = number of total plaintext bytes
  815. ## b = floor(a/16)
  816. ## num_initial_blocks = b mod 8
  817. ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
  818. ## r10, r11, r12, rax are clobbered
  819. ## arg1, arg2, arg3, arg4 are used as pointers only, not modified
  820. .macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
  821. i = (8-\num_initial_blocks)
  822. setreg
  823. vmovdqu AadHash(arg2), reg_i
  824. # start AES for num_initial_blocks blocks
  825. vmovdqu CurCount(arg2), \CTR
  826. i = (9-\num_initial_blocks)
  827. setreg
  828. .rep \num_initial_blocks
  829. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  830. vmovdqa \CTR, reg_i
  831. vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
  832. i = (i+1)
  833. setreg
  834. .endr
  835. vmovdqa (arg1), \T_key
  836. i = (9-\num_initial_blocks)
  837. setreg
  838. .rep \num_initial_blocks
  839. vpxor \T_key, reg_i, reg_i
  840. i = (i+1)
  841. setreg
  842. .endr
  843. j = 1
  844. setreg
  845. .rep \REP
  846. vmovdqa 16*j(arg1), \T_key
  847. i = (9-\num_initial_blocks)
  848. setreg
  849. .rep \num_initial_blocks
  850. vaesenc \T_key, reg_i, reg_i
  851. i = (i+1)
  852. setreg
  853. .endr
  854. j = (j+1)
  855. setreg
  856. .endr
  857. vmovdqa 16*j(arg1), \T_key
  858. i = (9-\num_initial_blocks)
  859. setreg
  860. .rep \num_initial_blocks
  861. vaesenclast \T_key, reg_i, reg_i
  862. i = (i+1)
  863. setreg
  864. .endr
  865. i = (9-\num_initial_blocks)
  866. setreg
  867. .rep \num_initial_blocks
  868. vmovdqu (arg4, %r11), \T1
  869. vpxor \T1, reg_i, reg_i
  870. vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks
  871. add $16, %r11
  872. .if \ENC_DEC == DEC
  873. vmovdqa \T1, reg_i
  874. .endif
  875. vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
  876. i = (i+1)
  877. setreg
  878. .endr
  879. i = (8-\num_initial_blocks)
  880. j = (9-\num_initial_blocks)
  881. setreg
  882. .rep \num_initial_blocks
  883. vpxor reg_i, reg_j, reg_j
  884. GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
  885. i = (i+1)
  886. j = (j+1)
  887. setreg
  888. .endr
  889. # XMM8 has the combined result here
  890. vmovdqa \XMM8, TMP1(%rsp)
  891. vmovdqa \XMM8, \T3
  892. cmp $128, %r13
  893. jl _initial_blocks_done\@ # no need for precomputed constants
  894. ###############################################################################
  895. # HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
  896. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  897. vmovdqa \CTR, \XMM1
  898. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  899. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  900. vmovdqa \CTR, \XMM2
  901. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  902. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  903. vmovdqa \CTR, \XMM3
  904. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  905. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  906. vmovdqa \CTR, \XMM4
  907. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  908. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  909. vmovdqa \CTR, \XMM5
  910. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  911. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  912. vmovdqa \CTR, \XMM6
  913. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  914. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  915. vmovdqa \CTR, \XMM7
  916. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  917. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  918. vmovdqa \CTR, \XMM8
  919. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  920. vmovdqa (arg1), \T_key
  921. vpxor \T_key, \XMM1, \XMM1
  922. vpxor \T_key, \XMM2, \XMM2
  923. vpxor \T_key, \XMM3, \XMM3
  924. vpxor \T_key, \XMM4, \XMM4
  925. vpxor \T_key, \XMM5, \XMM5
  926. vpxor \T_key, \XMM6, \XMM6
  927. vpxor \T_key, \XMM7, \XMM7
  928. vpxor \T_key, \XMM8, \XMM8
  929. i = 1
  930. setreg
  931. .rep \REP # do REP rounds
  932. vmovdqa 16*i(arg1), \T_key
  933. vaesenc \T_key, \XMM1, \XMM1
  934. vaesenc \T_key, \XMM2, \XMM2
  935. vaesenc \T_key, \XMM3, \XMM3
  936. vaesenc \T_key, \XMM4, \XMM4
  937. vaesenc \T_key, \XMM5, \XMM5
  938. vaesenc \T_key, \XMM6, \XMM6
  939. vaesenc \T_key, \XMM7, \XMM7
  940. vaesenc \T_key, \XMM8, \XMM8
  941. i = (i+1)
  942. setreg
  943. .endr
  944. vmovdqa 16*i(arg1), \T_key
  945. vaesenclast \T_key, \XMM1, \XMM1
  946. vaesenclast \T_key, \XMM2, \XMM2
  947. vaesenclast \T_key, \XMM3, \XMM3
  948. vaesenclast \T_key, \XMM4, \XMM4
  949. vaesenclast \T_key, \XMM5, \XMM5
  950. vaesenclast \T_key, \XMM6, \XMM6
  951. vaesenclast \T_key, \XMM7, \XMM7
  952. vaesenclast \T_key, \XMM8, \XMM8
  953. vmovdqu (arg4, %r11), \T1
  954. vpxor \T1, \XMM1, \XMM1
  955. vmovdqu \XMM1, (arg3 , %r11)
  956. .if \ENC_DEC == DEC
  957. vmovdqa \T1, \XMM1
  958. .endif
  959. vmovdqu 16*1(arg4, %r11), \T1
  960. vpxor \T1, \XMM2, \XMM2
  961. vmovdqu \XMM2, 16*1(arg3 , %r11)
  962. .if \ENC_DEC == DEC
  963. vmovdqa \T1, \XMM2
  964. .endif
  965. vmovdqu 16*2(arg4, %r11), \T1
  966. vpxor \T1, \XMM3, \XMM3
  967. vmovdqu \XMM3, 16*2(arg3 , %r11)
  968. .if \ENC_DEC == DEC
  969. vmovdqa \T1, \XMM3
  970. .endif
  971. vmovdqu 16*3(arg4, %r11), \T1
  972. vpxor \T1, \XMM4, \XMM4
  973. vmovdqu \XMM4, 16*3(arg3 , %r11)
  974. .if \ENC_DEC == DEC
  975. vmovdqa \T1, \XMM4
  976. .endif
  977. vmovdqu 16*4(arg4, %r11), \T1
  978. vpxor \T1, \XMM5, \XMM5
  979. vmovdqu \XMM5, 16*4(arg3 , %r11)
  980. .if \ENC_DEC == DEC
  981. vmovdqa \T1, \XMM5
  982. .endif
  983. vmovdqu 16*5(arg4, %r11), \T1
  984. vpxor \T1, \XMM6, \XMM6
  985. vmovdqu \XMM6, 16*5(arg3 , %r11)
  986. .if \ENC_DEC == DEC
  987. vmovdqa \T1, \XMM6
  988. .endif
  989. vmovdqu 16*6(arg4, %r11), \T1
  990. vpxor \T1, \XMM7, \XMM7
  991. vmovdqu \XMM7, 16*6(arg3 , %r11)
  992. .if \ENC_DEC == DEC
  993. vmovdqa \T1, \XMM7
  994. .endif
  995. vmovdqu 16*7(arg4, %r11), \T1
  996. vpxor \T1, \XMM8, \XMM8
  997. vmovdqu \XMM8, 16*7(arg3 , %r11)
  998. .if \ENC_DEC == DEC
  999. vmovdqa \T1, \XMM8
  1000. .endif
  1001. add $128, %r11
  1002. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  1003. vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
  1004. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  1005. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  1006. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  1007. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  1008. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  1009. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  1010. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  1011. ###############################################################################
  1012. _initial_blocks_done\@:
  1013. .endm
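# Note: when at least 128 bytes remain (cmp $128, %r13 above), the second
# half of this macro already encrypts and outputs the next 8 blocks,
# leaving their byte-swapped ciphertext in XMM1-XMM8 (with the running
# hash folded into XMM1) so the first iteration of the 8-block main loop
# can GHASH them while encrypting the following 8 blocks.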
  1014. # encrypt 8 blocks at a time
  1015. # ghash the 8 previously encrypted ciphertext blocks
  1016. # arg1, arg2, arg3, arg4 are used as pointers only, not modified
  1017. # r11 is the data offset value
  1018. .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
  1019. vmovdqa \XMM1, \T2
  1020. vmovdqa \XMM2, TMP2(%rsp)
  1021. vmovdqa \XMM3, TMP3(%rsp)
  1022. vmovdqa \XMM4, TMP4(%rsp)
  1023. vmovdqa \XMM5, TMP5(%rsp)
  1024. vmovdqa \XMM6, TMP6(%rsp)
  1025. vmovdqa \XMM7, TMP7(%rsp)
  1026. vmovdqa \XMM8, TMP8(%rsp)
  1027. .if \loop_idx == in_order
  1028. vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
  1029. vpaddd ONE(%rip), \XMM1, \XMM2
  1030. vpaddd ONE(%rip), \XMM2, \XMM3
  1031. vpaddd ONE(%rip), \XMM3, \XMM4
  1032. vpaddd ONE(%rip), \XMM4, \XMM5
  1033. vpaddd ONE(%rip), \XMM5, \XMM6
  1034. vpaddd ONE(%rip), \XMM6, \XMM7
  1035. vpaddd ONE(%rip), \XMM7, \XMM8
  1036. vmovdqa \XMM8, \CTR
  1037. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  1038. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  1039. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  1040. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  1041. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  1042. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  1043. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  1044. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  1045. .else
  1046. vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
  1047. vpaddd ONEf(%rip), \XMM1, \XMM2
  1048. vpaddd ONEf(%rip), \XMM2, \XMM3
  1049. vpaddd ONEf(%rip), \XMM3, \XMM4
  1050. vpaddd ONEf(%rip), \XMM4, \XMM5
  1051. vpaddd ONEf(%rip), \XMM5, \XMM6
  1052. vpaddd ONEf(%rip), \XMM6, \XMM7
  1053. vpaddd ONEf(%rip), \XMM7, \XMM8
  1054. vmovdqa \XMM8, \CTR
  1055. .endif
  1056. #######################################################################
  1057. vmovdqu (arg1), \T1
  1058. vpxor \T1, \XMM1, \XMM1
  1059. vpxor \T1, \XMM2, \XMM2
  1060. vpxor \T1, \XMM3, \XMM3
  1061. vpxor \T1, \XMM4, \XMM4
  1062. vpxor \T1, \XMM5, \XMM5
  1063. vpxor \T1, \XMM6, \XMM6
  1064. vpxor \T1, \XMM7, \XMM7
  1065. vpxor \T1, \XMM8, \XMM8
  1066. #######################################################################
  1067. vmovdqu 16*1(arg1), \T1
  1068. vaesenc \T1, \XMM1, \XMM1
  1069. vaesenc \T1, \XMM2, \XMM2
  1070. vaesenc \T1, \XMM3, \XMM3
  1071. vaesenc \T1, \XMM4, \XMM4
  1072. vaesenc \T1, \XMM5, \XMM5
  1073. vaesenc \T1, \XMM6, \XMM6
  1074. vaesenc \T1, \XMM7, \XMM7
  1075. vaesenc \T1, \XMM8, \XMM8
  1076. vmovdqu 16*2(arg1), \T1
  1077. vaesenc \T1, \XMM1, \XMM1
  1078. vaesenc \T1, \XMM2, \XMM2
  1079. vaesenc \T1, \XMM3, \XMM3
  1080. vaesenc \T1, \XMM4, \XMM4
  1081. vaesenc \T1, \XMM5, \XMM5
  1082. vaesenc \T1, \XMM6, \XMM6
  1083. vaesenc \T1, \XMM7, \XMM7
  1084. vaesenc \T1, \XMM8, \XMM8
  1085. #######################################################################
  1086. vmovdqu HashKey_8(arg2), \T5
  1087. vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
  1088. vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
  1089. vpshufd $0b01001110, \T2, \T6
  1090. vpxor \T2, \T6, \T6
  1091. vmovdqu HashKey_8_k(arg2), \T5
  1092. vpclmulqdq $0x00, \T5, \T6, \T6
  1093. vmovdqu 16*3(arg1), \T1
  1094. vaesenc \T1, \XMM1, \XMM1
  1095. vaesenc \T1, \XMM2, \XMM2
  1096. vaesenc \T1, \XMM3, \XMM3
  1097. vaesenc \T1, \XMM4, \XMM4
  1098. vaesenc \T1, \XMM5, \XMM5
  1099. vaesenc \T1, \XMM6, \XMM6
  1100. vaesenc \T1, \XMM7, \XMM7
  1101. vaesenc \T1, \XMM8, \XMM8
  1102. vmovdqa TMP2(%rsp), \T1
  1103. vmovdqu HashKey_7(arg2), \T5
  1104. vpclmulqdq $0x11, \T5, \T1, \T3
  1105. vpxor \T3, \T4, \T4
  1106. vpclmulqdq $0x00, \T5, \T1, \T3
  1107. vpxor \T3, \T7, \T7
  1108. vpshufd $0b01001110, \T1, \T3
  1109. vpxor \T1, \T3, \T3
  1110. vmovdqu HashKey_7_k(arg2), \T5
  1111. vpclmulqdq $0x10, \T5, \T3, \T3
  1112. vpxor \T3, \T6, \T6
  1113. vmovdqu 16*4(arg1), \T1
  1114. vaesenc \T1, \XMM1, \XMM1
  1115. vaesenc \T1, \XMM2, \XMM2
  1116. vaesenc \T1, \XMM3, \XMM3
  1117. vaesenc \T1, \XMM4, \XMM4
  1118. vaesenc \T1, \XMM5, \XMM5
  1119. vaesenc \T1, \XMM6, \XMM6
  1120. vaesenc \T1, \XMM7, \XMM7
  1121. vaesenc \T1, \XMM8, \XMM8
  1122. #######################################################################
  1123. vmovdqa TMP3(%rsp), \T1
  1124. vmovdqu HashKey_6(arg2), \T5
  1125. vpclmulqdq $0x11, \T5, \T1, \T3
  1126. vpxor \T3, \T4, \T4
  1127. vpclmulqdq $0x00, \T5, \T1, \T3
  1128. vpxor \T3, \T7, \T7
  1129. vpshufd $0b01001110, \T1, \T3
  1130. vpxor \T1, \T3, \T3
  1131. vmovdqu HashKey_6_k(arg2), \T5
  1132. vpclmulqdq $0x10, \T5, \T3, \T3
  1133. vpxor \T3, \T6, \T6
  1134. vmovdqu 16*5(arg1), \T1
  1135. vaesenc \T1, \XMM1, \XMM1
  1136. vaesenc \T1, \XMM2, \XMM2
  1137. vaesenc \T1, \XMM3, \XMM3
  1138. vaesenc \T1, \XMM4, \XMM4
  1139. vaesenc \T1, \XMM5, \XMM5
  1140. vaesenc \T1, \XMM6, \XMM6
  1141. vaesenc \T1, \XMM7, \XMM7
  1142. vaesenc \T1, \XMM8, \XMM8
  1143. vmovdqa TMP4(%rsp), \T1
  1144. vmovdqu HashKey_5(arg2), \T5
  1145. vpclmulqdq $0x11, \T5, \T1, \T3
  1146. vpxor \T3, \T4, \T4
  1147. vpclmulqdq $0x00, \T5, \T1, \T3
  1148. vpxor \T3, \T7, \T7
  1149. vpshufd $0b01001110, \T1, \T3
  1150. vpxor \T1, \T3, \T3
  1151. vmovdqu HashKey_5_k(arg2), \T5
  1152. vpclmulqdq $0x10, \T5, \T3, \T3
  1153. vpxor \T3, \T6, \T6
  1154. vmovdqu 16*6(arg1), \T1
  1155. vaesenc \T1, \XMM1, \XMM1
  1156. vaesenc \T1, \XMM2, \XMM2
  1157. vaesenc \T1, \XMM3, \XMM3
  1158. vaesenc \T1, \XMM4, \XMM4
  1159. vaesenc \T1, \XMM5, \XMM5
  1160. vaesenc \T1, \XMM6, \XMM6
  1161. vaesenc \T1, \XMM7, \XMM7
  1162. vaesenc \T1, \XMM8, \XMM8
  1163. vmovdqa TMP5(%rsp), \T1
  1164. vmovdqu HashKey_4(arg2), \T5
  1165. vpclmulqdq $0x11, \T5, \T1, \T3
  1166. vpxor \T3, \T4, \T4
  1167. vpclmulqdq $0x00, \T5, \T1, \T3
  1168. vpxor \T3, \T7, \T7
  1169. vpshufd $0b01001110, \T1, \T3
  1170. vpxor \T1, \T3, \T3
  1171. vmovdqu HashKey_4_k(arg2), \T5
  1172. vpclmulqdq $0x10, \T5, \T3, \T3
  1173. vpxor \T3, \T6, \T6
  1174. vmovdqu 16*7(arg1), \T1
  1175. vaesenc \T1, \XMM1, \XMM1
  1176. vaesenc \T1, \XMM2, \XMM2
  1177. vaesenc \T1, \XMM3, \XMM3
  1178. vaesenc \T1, \XMM4, \XMM4
  1179. vaesenc \T1, \XMM5, \XMM5
  1180. vaesenc \T1, \XMM6, \XMM6
  1181. vaesenc \T1, \XMM7, \XMM7
  1182. vaesenc \T1, \XMM8, \XMM8
  1183. vmovdqa TMP6(%rsp), \T1
  1184. vmovdqu HashKey_3(arg2), \T5
  1185. vpclmulqdq $0x11, \T5, \T1, \T3
  1186. vpxor \T3, \T4, \T4
  1187. vpclmulqdq $0x00, \T5, \T1, \T3
  1188. vpxor \T3, \T7, \T7
  1189. vpshufd $0b01001110, \T1, \T3
  1190. vpxor \T1, \T3, \T3
  1191. vmovdqu HashKey_3_k(arg2), \T5
  1192. vpclmulqdq $0x10, \T5, \T3, \T3
  1193. vpxor \T3, \T6, \T6
  1194. vmovdqu 16*8(arg1), \T1
  1195. vaesenc \T1, \XMM1, \XMM1
  1196. vaesenc \T1, \XMM2, \XMM2
  1197. vaesenc \T1, \XMM3, \XMM3
  1198. vaesenc \T1, \XMM4, \XMM4
  1199. vaesenc \T1, \XMM5, \XMM5
  1200. vaesenc \T1, \XMM6, \XMM6
  1201. vaesenc \T1, \XMM7, \XMM7
  1202. vaesenc \T1, \XMM8, \XMM8
  1203. vmovdqa TMP7(%rsp), \T1
  1204. vmovdqu HashKey_2(arg2), \T5
  1205. vpclmulqdq $0x11, \T5, \T1, \T3
  1206. vpxor \T3, \T4, \T4
  1207. vpclmulqdq $0x00, \T5, \T1, \T3
  1208. vpxor \T3, \T7, \T7
  1209. vpshufd $0b01001110, \T1, \T3
  1210. vpxor \T1, \T3, \T3
  1211. vmovdqu HashKey_2_k(arg2), \T5
  1212. vpclmulqdq $0x10, \T5, \T3, \T3
  1213. vpxor \T3, \T6, \T6
  1214. #######################################################################
  1215. vmovdqu 16*9(arg1), \T5
  1216. vaesenc \T5, \XMM1, \XMM1
  1217. vaesenc \T5, \XMM2, \XMM2
  1218. vaesenc \T5, \XMM3, \XMM3
  1219. vaesenc \T5, \XMM4, \XMM4
  1220. vaesenc \T5, \XMM5, \XMM5
  1221. vaesenc \T5, \XMM6, \XMM6
  1222. vaesenc \T5, \XMM7, \XMM7
  1223. vaesenc \T5, \XMM8, \XMM8
  1224. vmovdqa TMP8(%rsp), \T1
  1225. vmovdqu HashKey(arg2), \T5
  1226. vpclmulqdq $0x11, \T5, \T1, \T3
  1227. vpxor \T3, \T4, \T4
  1228. vpclmulqdq $0x00, \T5, \T1, \T3
  1229. vpxor \T3, \T7, \T7
  1230. vpshufd $0b01001110, \T1, \T3
  1231. vpxor \T1, \T3, \T3
  1232. vmovdqu HashKey_k(arg2), \T5
  1233. vpclmulqdq $0x10, \T5, \T3, \T3
  1234. vpxor \T3, \T6, \T6
  1235. vpxor \T4, \T6, \T6
  1236. vpxor \T7, \T6, \T6
  1237. vmovdqu 16*10(arg1), \T5
  1238. i = 11
  1239. setreg
  1240. .rep (\REP-9)
  1241. vaesenc \T5, \XMM1, \XMM1
  1242. vaesenc \T5, \XMM2, \XMM2
  1243. vaesenc \T5, \XMM3, \XMM3
  1244. vaesenc \T5, \XMM4, \XMM4
  1245. vaesenc \T5, \XMM5, \XMM5
  1246. vaesenc \T5, \XMM6, \XMM6
  1247. vaesenc \T5, \XMM7, \XMM7
  1248. vaesenc \T5, \XMM8, \XMM8
  1249. vmovdqu 16*i(arg1), \T5
  1250. i = i + 1
  1251. setreg
  1252. .endr
  1253. i = 0
  1254. j = 1
  1255. setreg
  1256. .rep 8
  1257. vpxor 16*i(arg4, %r11), \T5, \T2
  1258. .if \ENC_DEC == ENC
  1259. vaesenclast \T2, reg_j, reg_j
  1260. .else
  1261. vaesenclast \T2, reg_j, \T3
  1262. vmovdqu 16*i(arg4, %r11), reg_j
  1263. vmovdqu \T3, 16*i(arg3, %r11)
  1264. .endif
  1265. i = (i+1)
  1266. j = (j+1)
  1267. setreg
  1268. .endr
  1269. #######################################################################
  1270. vpslldq $8, \T6, \T3 # shift-L T6 by 2 DWs into T3
  1271. vpsrldq $8, \T6, \T6 # shift-R T6 by 2 DWs
  1272. vpxor \T3, \T7, \T7
  1273. vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
  1274. #######################################################################
  1275. #first phase of the reduction
  1276. #######################################################################
  1277. vpslld $31, \T7, \T2 # packed left shift of T7 by 31 bits
  1278. vpslld $30, \T7, \T3 # packed left shift of T7 by 30 bits
  1279. vpslld $25, \T7, \T4 # packed left shift of T7 by 25 bits
  1280. vpxor \T3, \T2, \T2 # xor the shifted versions
  1281. vpxor \T4, \T2, \T2
  1282. vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
  1283. vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
  1284. vpxor \T2, \T7, \T7 # first phase of the reduction complete
  1285. #######################################################################
  1286. .if \ENC_DEC == ENC
  1287. vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
  1288. vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
  1289. vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
  1290. vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
  1291. vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
  1292. vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
  1293. vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
  1294. vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
  1295. .endif
  1296. #######################################################################
  1297. #second phase of the reduction
  1298. vpsrld $1, \T7, \T2 # packed right shift of T7 by 1 bit
  1299. vpsrld $2, \T7, \T3 # packed right shift of T7 by 2 bits
  1300. vpsrld $7, \T7, \T4 # packed right shift of T7 by 7 bits
  1301. vpxor \T3, \T2, \T2 # xor the shifted versions
  1302. vpxor \T4, \T2, \T2
  1303. vpxor \T1, \T2, \T2
  1304. vpxor \T2, \T7, \T7
  1305. vpxor \T7, \T6, \T6 # the result is in T6
  1306. #######################################################################
  1307. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  1308. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  1309. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  1310. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  1311. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  1312. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  1313. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  1314. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  1315. vpxor \T6, \XMM1, \XMM1
  1316. .endm
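# Note on the shift-based reduction above (also used by GHASH_LAST_8_AVX
# below): in the bit-reflected domain the reduction polynomial's terms sit at
# bits 128, 127, 126, 121 and 0 (the "(128,127,126,121,0)" of the GHASH_MUL
# headers), i.e. at distances 1, 2 and 7 below bit 128.  The first phase
# applies those terms to the low half with per-lane left shifts of 31, 30 and
# 25 (32-1, 32-2, 32-7); the second phase adds the complementary right shifts
# by 1, 2 and 7, and the vpsrldq/vpslldq byte shifts move the pieces that
# cross 32-bit lane boundaries.  (A rough description; the exact derivation
# follows Intel's carry-less multiplication / GCM white paper.)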
  1317. # GHASH the last 8 ciphertext blocks.
  1318. .macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
  1319. ## Karatsuba Method
  1320. vpshufd $0b01001110, \XMM1, \T2
  1321. vpxor \XMM1, \T2, \T2
  1322. vmovdqu HashKey_8(arg2), \T5
  1323. vpclmulqdq $0x11, \T5, \XMM1, \T6
  1324. vpclmulqdq $0x00, \T5, \XMM1, \T7
  1325. vmovdqu HashKey_8_k(arg2), \T3
  1326. vpclmulqdq $0x00, \T3, \T2, \XMM1
  1327. ######################
  1328. vpshufd $0b01001110, \XMM2, \T2
  1329. vpxor \XMM2, \T2, \T2
  1330. vmovdqu HashKey_7(arg2), \T5
  1331. vpclmulqdq $0x11, \T5, \XMM2, \T4
  1332. vpxor \T4, \T6, \T6
  1333. vpclmulqdq $0x00, \T5, \XMM2, \T4
  1334. vpxor \T4, \T7, \T7
  1335. vmovdqu HashKey_7_k(arg2), \T3
  1336. vpclmulqdq $0x00, \T3, \T2, \T2
  1337. vpxor \T2, \XMM1, \XMM1
  1338. ######################
  1339. vpshufd $0b01001110, \XMM3, \T2
  1340. vpxor \XMM3, \T2, \T2
  1341. vmovdqu HashKey_6(arg2), \T5
  1342. vpclmulqdq $0x11, \T5, \XMM3, \T4
  1343. vpxor \T4, \T6, \T6
  1344. vpclmulqdq $0x00, \T5, \XMM3, \T4
  1345. vpxor \T4, \T7, \T7
  1346. vmovdqu HashKey_6_k(arg2), \T3
  1347. vpclmulqdq $0x00, \T3, \T2, \T2
  1348. vpxor \T2, \XMM1, \XMM1
  1349. ######################
  1350. vpshufd $0b01001110, \XMM4, \T2
  1351. vpxor \XMM4, \T2, \T2
  1352. vmovdqu HashKey_5(arg2), \T5
  1353. vpclmulqdq $0x11, \T5, \XMM4, \T4
  1354. vpxor \T4, \T6, \T6
  1355. vpclmulqdq $0x00, \T5, \XMM4, \T4
  1356. vpxor \T4, \T7, \T7
  1357. vmovdqu HashKey_5_k(arg2), \T3
  1358. vpclmulqdq $0x00, \T3, \T2, \T2
  1359. vpxor \T2, \XMM1, \XMM1
  1360. ######################
  1361. vpshufd $0b01001110, \XMM5, \T2
  1362. vpxor \XMM5, \T2, \T2
  1363. vmovdqu HashKey_4(arg2), \T5
  1364. vpclmulqdq $0x11, \T5, \XMM5, \T4
  1365. vpxor \T4, \T6, \T6
  1366. vpclmulqdq $0x00, \T5, \XMM5, \T4
  1367. vpxor \T4, \T7, \T7
  1368. vmovdqu HashKey_4_k(arg2), \T3
  1369. vpclmulqdq $0x00, \T3, \T2, \T2
  1370. vpxor \T2, \XMM1, \XMM1
  1371. ######################
  1372. vpshufd $0b01001110, \XMM6, \T2
  1373. vpxor \XMM6, \T2, \T2
  1374. vmovdqu HashKey_3(arg2), \T5
  1375. vpclmulqdq $0x11, \T5, \XMM6, \T4
  1376. vpxor \T4, \T6, \T6
  1377. vpclmulqdq $0x00, \T5, \XMM6, \T4
  1378. vpxor \T4, \T7, \T7
  1379. vmovdqu HashKey_3_k(arg2), \T3
  1380. vpclmulqdq $0x00, \T3, \T2, \T2
  1381. vpxor \T2, \XMM1, \XMM1
  1382. ######################
  1383. vpshufd $0b01001110, \XMM7, \T2
  1384. vpxor \XMM7, \T2, \T2
  1385. vmovdqu HashKey_2(arg2), \T5
  1386. vpclmulqdq $0x11, \T5, \XMM7, \T4
  1387. vpxor \T4, \T6, \T6
  1388. vpclmulqdq $0x00, \T5, \XMM7, \T4
  1389. vpxor \T4, \T7, \T7
  1390. vmovdqu HashKey_2_k(arg2), \T3
  1391. vpclmulqdq $0x00, \T3, \T2, \T2
  1392. vpxor \T2, \XMM1, \XMM1
  1393. ######################
  1394. vpshufd $0b01001110, \XMM8, \T2
  1395. vpxor \XMM8, \T2, \T2
  1396. vmovdqu HashKey(arg2), \T5
  1397. vpclmulqdq $0x11, \T5, \XMM8, \T4
  1398. vpxor \T4, \T6, \T6
  1399. vpclmulqdq $0x00, \T5, \XMM8, \T4
  1400. vpxor \T4, \T7, \T7
  1401. vmovdqu HashKey_k(arg2), \T3
  1402. vpclmulqdq $0x00, \T3, \T2, \T2
  1403. vpxor \T2, \XMM1, \XMM1
  1404. vpxor \T6, \XMM1, \XMM1
  1405. vpxor \T7, \XMM1, \T2
  1406. vpslldq $8, \T2, \T4
  1407. vpsrldq $8, \T2, \T2
  1408. vpxor \T4, \T7, \T7
  1409. vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
  1410. # the accumulated carry-less multiplications
  1411. #######################################################################
  1412. #first phase of the reduction
  1413. vpslld $31, \T7, \T2 # packed left shift of T7 by 31 bits
  1414. vpslld $30, \T7, \T3 # packed left shift of T7 by 30 bits
  1415. vpslld $25, \T7, \T4 # packed left shift of T7 by 25 bits
  1416. vpxor \T3, \T2, \T2 # xor the shifted versions
  1417. vpxor \T4, \T2, \T2
  1418. vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
  1419. vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
  1420. vpxor \T2, \T7, \T7 # first phase of the reduction complete
  1421. #######################################################################
  1422. #second phase of the reduction
  1423. vpsrld $1, \T7, \T2 # packed right shift of T7 by 1 bit
  1424. vpsrld $2, \T7, \T3 # packed right shift of T7 by 2 bits
  1425. vpsrld $7, \T7, \T4 # packed right shift of T7 by 7 bits
  1426. vpxor \T3, \T2, \T2 # xor the shifted versions
  1427. vpxor \T4, \T2, \T2
  1428. vpxor \T1, \T2, \T2
  1429. vpxor \T2, \T7, \T7
  1430. vpxor \T7, \T6, \T6 # the result is in T6
  1431. .endm
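# The "Karatsuba Method" above uses three carry-less multiplies per block
# instead of four.  With H = (h1:h0) and X = (x1:x0) split into 64-bit halves
# (all additions are XORs):
#
#   (h1:h0)*(x1:x0) = h1*x1*2^128
#                   + ((h1^h0)*(x1^x0) ^ h1*x1 ^ h0*x0)*2^64
#                   + h0*x0
#
# T6 accumulates the h1*x1 terms, T7 the h0*x0 terms, and XMM1 the
# (h1^h0)*(x1^x0) terms (the precomputed HashKey_i_k slots hold h1^h0); the
# two vpxor instructions before the byte shifts recover the middle 128 bits.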
  1432. #############################################################
  1433. #void aesni_gcm_init_avx_gen2
  1434. # (gcm_data *my_ctx_data,
  1435. # gcm_context_data *data,
  1436. # u8 *iv, /* Pre-counter block j0: 4 byte salt
  1437. # (from Security Association) concatenated with 8 byte
  1438. # Initialisation Vector (from IPSec ESP Payload)
  1439. # concatenated with 0x00000001. 16-byte aligned pointer. */
  1440. # u8 *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
  1441. # const u8 *aad, /* Additional Authentication Data (AAD)*/
  1442. # u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
  1443. #############################################################
  1444. SYM_FUNC_START(aesni_gcm_init_avx_gen2)
  1445. FUNC_SAVE
  1446. INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
  1447. FUNC_RESTORE
  1448. RET
  1449. SYM_FUNC_END(aesni_gcm_init_avx_gen2)
  1450. ###############################################################################
  1451. #void aesni_gcm_enc_update_avx_gen2(
  1452. # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
  1453. # gcm_context_data *data,
  1454. # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
  1455. # const u8 *in, /* Plaintext input */
  1456. # u64 plaintext_len) /* Length of data in Bytes for encryption. */
  1457. ###############################################################################
  1458. SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
  1459. FUNC_SAVE
  1460. mov keysize, %eax
  1461. cmp $32, %eax
  1462. je key_256_enc_update
  1463. cmp $16, %eax
  1464. je key_128_enc_update
  1465. # must be 192
  1466. GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
  1467. FUNC_RESTORE
  1468. RET
  1469. key_128_enc_update:
  1470. GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
  1471. FUNC_RESTORE
  1472. RET
  1473. key_256_enc_update:
  1474. GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
  1475. FUNC_RESTORE
  1476. RET
  1477. SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
  1478. ###############################################################################
  1479. #void aesni_gcm_dec_update_avx_gen2(
  1480. # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
  1481. # gcm_context_data *data,
  1482. # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
  1483. # const u8 *in, /* Ciphertext input */
  1484. # u64 plaintext_len) /* Length of data in Bytes for decryption. */
  1485. ###############################################################################
  1486. SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
  1487. FUNC_SAVE
  1488. mov keysize,%eax
  1489. cmp $32, %eax
  1490. je key_256_dec_update
  1491. cmp $16, %eax
  1492. je key_128_dec_update
  1493. # must be 192
  1494. GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
  1495. FUNC_RESTORE
  1496. RET
  1497. key_128_dec_update:
  1498. GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
  1499. FUNC_RESTORE
  1500. RET
  1501. key_256_dec_update:
  1502. GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
  1503. FUNC_RESTORE
  1504. RET
  1505. SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
  1506. ###############################################################################
  1507. #void aesni_gcm_finalize_avx_gen2(
  1508. # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
  1509. # gcm_context_data *data,
  1510. # u8 *auth_tag, /* Authenticated Tag output. */
  1511. # u64 auth_tag_len) /* Authenticated Tag Length in bytes.
  1512. # Valid values are 16 (most likely), 12 or 8. */
  1513. ###############################################################################
  1514. SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
  1515. FUNC_SAVE
  1516. mov keysize,%eax
  1517. cmp $32, %eax
  1518. je key_256_finalize
  1519. cmp $16, %eax
  1520. je key_128_finalize
  1521. # must be 192
  1522. GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
  1523. FUNC_RESTORE
  1524. RET
  1525. key_128_finalize:
  1526. GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
  1527. FUNC_RESTORE
  1528. RET
  1529. key_256_finalize:
  1530. GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
  1531. FUNC_RESTORE
  1532. RET
  1533. SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
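# A rough sketch (not the kernel's actual glue code) of how the gen2 entry
# points above are meant to be driven from C; the authoritative prototypes
# live in arch/x86/crypto/aesni-intel_glue.c, and key_ctx/out/in/chunk_len
# etc. are placeholder names:
#
#   struct gcm_context_data data;
#
#   aesni_gcm_init_avx_gen2(key_ctx, &data, iv, hash_subkey, aad, aad_len);
#   while (bytes_left) {            /* *_update may be called repeatedly */
#           aesni_gcm_enc_update_avx_gen2(key_ctx, &data, out, in, chunk_len);
#           /* ...advance out/in, reduce bytes_left... */
#   }
#   aesni_gcm_finalize_avx_gen2(key_ctx, &data, auth_tag, auth_tag_len);
#
# Decryption follows the same sequence with aesni_gcm_dec_update_avx_gen2.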
  1534. ###############################################################################
  1535. # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
  1536. # Input: A and B (128-bits each, bit-reflected)
  1537. # Output: C = A*B*x mod poly (i.e. the product shifted right by one bit)
  1538. # To compute GH = GH*HashKey mod poly, pass HK = HashKey<<1 mod poly as input;
  1539. # then GH = GH*HK*x mod poly, which is equivalent to GH*HashKey mod poly.
  1540. ###############################################################################
  1541. .macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
  1542. vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
  1543. vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
  1544. vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
  1545. vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
  1546. vpxor \T3, \GH, \GH
  1547. vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
  1548. vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
  1549. vpxor \T3, \T1, \T1
  1550. vpxor \T2, \GH, \GH
  1551. #######################################################################
  1552. #first phase of the reduction
  1553. vmovdqa POLY2(%rip), \T3
  1554. vpclmulqdq $0x01, \GH, \T3, \T2
  1555. vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
  1556. vpxor \T2, \GH, \GH # first phase of the reduction complete
  1557. #######################################################################
  1558. #second phase of the reduction
  1559. vpclmulqdq $0x00, \GH, \T3, \T2
  1560. vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
  1561. vpclmulqdq $0x10, \GH, \T3, \GH
  1562. vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
  1563. vpxor \T2, \GH, \GH # second phase of the reduction complete
  1564. #######################################################################
  1565. vpxor \T1, \GH, \GH # the result is in GH
  1566. .endm
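# For reference, a minimal bit-at-a-time C model (an illustrative sketch, not
# kernel code) of the field multiplication that GHASH_MUL_AVX2 implements
# with vpclmulqdq.  It uses the plain GHASH convention of NIST SP 800-38D:
# bit 0 is the MSB of byte 0, and the reduction polynomial is
# x^128 + x^7 + x^2 + x + 1, whose bit-reflected form is the
# "(128,127,126,121,0)" named above.  The <<1 / byte-swapped representation
# used by the assembly is an optimization on top of this math.
#
#   #include <stdint.h>
#   #include <string.h>
#
#   /* Z = X * Y in GF(2^128), GHASH bit order */
#   static void ghash_gf128_mul(const uint8_t X[16], const uint8_t Y[16],
#                               uint8_t Z[16])
#   {
#           uint8_t V[16];
#           int i, j, lsb;
#
#           memcpy(V, Y, 16);
#           memset(Z, 0, 16);
#           for (i = 0; i < 128; i++) {
#                   if (X[i / 8] & (0x80 >> (i % 8)))       /* bit i of X */
#                           for (j = 0; j < 16; j++)
#                                   Z[j] ^= V[j];
#                   lsb = V[15] & 1;                        /* V = V * x  */
#                   for (j = 15; j > 0; j--)
#                           V[j] = (V[j] >> 1) | (V[j - 1] << 7);
#                   V[0] >>= 1;
#                   if (lsb)
#                           V[0] ^= 0xE1;   /* 0b11100001: reflected poly */
#           }
#   }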
  1567. .macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
  1568. # Precompute HashKey^2 .. HashKey^8. Unlike the AVX path, no HashKey_i_k values are stored; the gen4 code rebuilds the Karatsuba middle terms on the fly.
  1569. vmovdqa \HK, \T5
  1570. GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
  1571. vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
  1572. GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
  1573. vmovdqu \T5, HashKey_3(arg2)
  1574. GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
  1575. vmovdqu \T5, HashKey_4(arg2)
  1576. GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
  1577. vmovdqu \T5, HashKey_5(arg2)
  1578. GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
  1579. vmovdqu \T5, HashKey_6(arg2)
  1580. GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
  1581. vmovdqu \T5, HashKey_7(arg2)
  1582. GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
  1583. vmovdqu \T5, HashKey_8(arg2)
  1584. .endm
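# What PRECOMPUTE_AVX2 leaves in the context (matching the comments above):
#
#   HashKey(arg2)   = H<<1 mod poly        (already stored before this runs)
#   HashKey_i(arg2) = HashKey^i<<1 mod poly, i = 2..8, built by repeatedly
#                     multiplying the previous power by HashKey
#
# so the 8-way loops can fold eight blocks with one multiply per block:
#   X1*H^8 ^ X2*H^7 ^ ... ^ X8*H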
  1585. ## if a = number of total plaintext bytes
  1586. ## b = floor(a/16)
  1587. ## num_initial_blocks = b mod 8 (worked example below)
  1588. ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
  1589. ## r10, r11, r12, rax are clobbered
  1590. ## arg1, arg2, arg3, arg4 are used as pointers only, not modified
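## For example (a hypothetical call): a = 200 plaintext bytes gives
## b = floor(200/16) = 12 full blocks, so num_initial_blocks = 12 mod 8 = 4;
## four blocks are handled here one at a time, and the remaining eight full
## blocks are left for the 8-way parallel loop.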
  1591. .macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
  1592. i = (8-\num_initial_blocks)
  1593. setreg
  1594. vmovdqu AadHash(arg2), reg_i
  1595. # start AES for num_initial_blocks blocks
  1596. vmovdqu CurCount(arg2), \CTR
  1597. i = (9-\num_initial_blocks)
  1598. setreg
  1599. .rep \num_initial_blocks
  1600. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1601. vmovdqa \CTR, reg_i
  1602. vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
  1603. i = (i+1)
  1604. setreg
  1605. .endr
  1606. vmovdqa (arg1), \T_key
  1607. i = (9-\num_initial_blocks)
  1608. setreg
  1609. .rep \num_initial_blocks
  1610. vpxor \T_key, reg_i, reg_i
  1611. i = (i+1)
  1612. setreg
  1613. .endr
  1614. j = 1
  1615. setreg
  1616. .rep \REP
  1617. vmovdqa 16*j(arg1), \T_key
  1618. i = (9-\num_initial_blocks)
  1619. setreg
  1620. .rep \num_initial_blocks
  1621. vaesenc \T_key, reg_i, reg_i
  1622. i = (i+1)
  1623. setreg
  1624. .endr
  1625. j = (j+1)
  1626. setreg
  1627. .endr
  1628. vmovdqa 16*j(arg1), \T_key
  1629. i = (9-\num_initial_blocks)
  1630. setreg
  1631. .rep \num_initial_blocks
  1632. vaesenclast \T_key, reg_i, reg_i
  1633. i = (i+1)
  1634. setreg
  1635. .endr
  1636. i = (9-\num_initial_blocks)
  1637. setreg
  1638. .rep \num_initial_blocks
  1639. vmovdqu (arg4, %r11), \T1
  1640. vpxor \T1, reg_i, reg_i
  1641. vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for
  1642. # num_initial_blocks blocks
  1643. add $16, %r11
  1644. .if \ENC_DEC == DEC
  1645. vmovdqa \T1, reg_i
  1646. .endif
  1647. vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
  1648. i = (i+1)
  1649. setreg
  1650. .endr
  1651. i = (8-\num_initial_blocks)
  1652. j = (9-\num_initial_blocks)
  1653. setreg
  1654. .rep \num_initial_blocks
  1655. vpxor reg_i, reg_j, reg_j
  1656. GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
  1657. i = (i+1)
  1658. j = (j+1)
  1659. setreg
  1660. .endr
  1661. # XMM8 has the combined result here
  1662. vmovdqa \XMM8, TMP1(%rsp)
  1663. vmovdqa \XMM8, \T3
  1664. cmp $128, %r13
  1665. jl _initial_blocks_done\@ # fewer than 128 bytes left: skip the 8-block pipeline setup below
  1666. ###############################################################################
  1667. # prepare and encrypt 8 counter blocks ahead of the main 8-way loop so it always has ciphertext to GHASH
  1668. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1669. vmovdqa \CTR, \XMM1
  1670. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  1671. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1672. vmovdqa \CTR, \XMM2
  1673. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  1674. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1675. vmovdqa \CTR, \XMM3
  1676. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  1677. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1678. vmovdqa \CTR, \XMM4
  1679. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  1680. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1681. vmovdqa \CTR, \XMM5
  1682. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  1683. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1684. vmovdqa \CTR, \XMM6
  1685. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  1686. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1687. vmovdqa \CTR, \XMM7
  1688. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  1689. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1690. vmovdqa \CTR, \XMM8
  1691. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  1692. vmovdqa (arg1), \T_key
  1693. vpxor \T_key, \XMM1, \XMM1
  1694. vpxor \T_key, \XMM2, \XMM2
  1695. vpxor \T_key, \XMM3, \XMM3
  1696. vpxor \T_key, \XMM4, \XMM4
  1697. vpxor \T_key, \XMM5, \XMM5
  1698. vpxor \T_key, \XMM6, \XMM6
  1699. vpxor \T_key, \XMM7, \XMM7
  1700. vpxor \T_key, \XMM8, \XMM8
  1701. i = 1
  1702. setreg
  1703. .rep \REP # do REP rounds
  1704. vmovdqa 16*i(arg1), \T_key
  1705. vaesenc \T_key, \XMM1, \XMM1
  1706. vaesenc \T_key, \XMM2, \XMM2
  1707. vaesenc \T_key, \XMM3, \XMM3
  1708. vaesenc \T_key, \XMM4, \XMM4
  1709. vaesenc \T_key, \XMM5, \XMM5
  1710. vaesenc \T_key, \XMM6, \XMM6
  1711. vaesenc \T_key, \XMM7, \XMM7
  1712. vaesenc \T_key, \XMM8, \XMM8
  1713. i = (i+1)
  1714. setreg
  1715. .endr
  1716. vmovdqa 16*i(arg1), \T_key
  1717. vaesenclast \T_key, \XMM1, \XMM1
  1718. vaesenclast \T_key, \XMM2, \XMM2
  1719. vaesenclast \T_key, \XMM3, \XMM3
  1720. vaesenclast \T_key, \XMM4, \XMM4
  1721. vaesenclast \T_key, \XMM5, \XMM5
  1722. vaesenclast \T_key, \XMM6, \XMM6
  1723. vaesenclast \T_key, \XMM7, \XMM7
  1724. vaesenclast \T_key, \XMM8, \XMM8
  1725. vmovdqu (arg4, %r11), \T1
  1726. vpxor \T1, \XMM1, \XMM1
  1727. vmovdqu \XMM1, (arg3 , %r11)
  1728. .if \ENC_DEC == DEC
  1729. vmovdqa \T1, \XMM1
  1730. .endif
  1731. vmovdqu 16*1(arg4, %r11), \T1
  1732. vpxor \T1, \XMM2, \XMM2
  1733. vmovdqu \XMM2, 16*1(arg3 , %r11)
  1734. .if \ENC_DEC == DEC
  1735. vmovdqa \T1, \XMM2
  1736. .endif
  1737. vmovdqu 16*2(arg4, %r11), \T1
  1738. vpxor \T1, \XMM3, \XMM3
  1739. vmovdqu \XMM3, 16*2(arg3 , %r11)
  1740. .if \ENC_DEC == DEC
  1741. vmovdqa \T1, \XMM3
  1742. .endif
  1743. vmovdqu 16*3(arg4, %r11), \T1
  1744. vpxor \T1, \XMM4, \XMM4
  1745. vmovdqu \XMM4, 16*3(arg3 , %r11)
  1746. .if \ENC_DEC == DEC
  1747. vmovdqa \T1, \XMM4
  1748. .endif
  1749. vmovdqu 16*4(arg4, %r11), \T1
  1750. vpxor \T1, \XMM5, \XMM5
  1751. vmovdqu \XMM5, 16*4(arg3 , %r11)
  1752. .if \ENC_DEC == DEC
  1753. vmovdqa \T1, \XMM5
  1754. .endif
  1755. vmovdqu 16*5(arg4, %r11), \T1
  1756. vpxor \T1, \XMM6, \XMM6
  1757. vmovdqu \XMM6, 16*5(arg3 , %r11)
  1758. .if \ENC_DEC == DEC
  1759. vmovdqa \T1, \XMM6
  1760. .endif
  1761. vmovdqu 16*6(arg4, %r11), \T1
  1762. vpxor \T1, \XMM7, \XMM7
  1763. vmovdqu \XMM7, 16*6(arg3 , %r11)
  1764. .if \ENC_DEC == DEC
  1765. vmovdqa \T1, \XMM7
  1766. .endif
  1767. vmovdqu 16*7(arg4, %r11), \T1
  1768. vpxor \T1, \XMM8, \XMM8
  1769. vmovdqu \XMM8, 16*7(arg3 , %r11)
  1770. .if \ENC_DEC == DEC
  1771. vmovdqa \T1, \XMM8
  1772. .endif
  1773. add $128, %r11
  1774. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  1775. vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
  1776. # the corresponding ciphertext
  1777. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  1778. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  1779. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  1780. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  1781. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  1782. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  1783. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  1784. ###############################################################################
  1785. _initial_blocks_done\@:
  1786. .endm
  1787. # encrypt 8 blocks at a time
  1788. # ghash the 8 previously encrypted ciphertext blocks
  1789. # arg1, arg2, arg3, arg4 are used as pointers only, not modified
  1790. # r11 is the data offset value
  1791. .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
  1792. vmovdqa \XMM1, \T2
  1793. vmovdqa \XMM2, TMP2(%rsp)
  1794. vmovdqa \XMM3, TMP3(%rsp)
  1795. vmovdqa \XMM4, TMP4(%rsp)
  1796. vmovdqa \XMM5, TMP5(%rsp)
  1797. vmovdqa \XMM6, TMP6(%rsp)
  1798. vmovdqa \XMM7, TMP7(%rsp)
  1799. vmovdqa \XMM8, TMP8(%rsp)
  1800. .if \loop_idx == in_order
  1801. vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
  1802. vpaddd ONE(%rip), \XMM1, \XMM2
  1803. vpaddd ONE(%rip), \XMM2, \XMM3
  1804. vpaddd ONE(%rip), \XMM3, \XMM4
  1805. vpaddd ONE(%rip), \XMM4, \XMM5
  1806. vpaddd ONE(%rip), \XMM5, \XMM6
  1807. vpaddd ONE(%rip), \XMM6, \XMM7
  1808. vpaddd ONE(%rip), \XMM7, \XMM8
  1809. vmovdqa \XMM8, \CTR
  1810. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  1811. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  1812. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  1813. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  1814. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  1815. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  1816. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  1817. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  1818. .else
  1819. vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
  1820. vpaddd ONEf(%rip), \XMM1, \XMM2
  1821. vpaddd ONEf(%rip), \XMM2, \XMM3
  1822. vpaddd ONEf(%rip), \XMM3, \XMM4
  1823. vpaddd ONEf(%rip), \XMM4, \XMM5
  1824. vpaddd ONEf(%rip), \XMM5, \XMM6
  1825. vpaddd ONEf(%rip), \XMM6, \XMM7
  1826. vpaddd ONEf(%rip), \XMM7, \XMM8
  1827. vmovdqa \XMM8, \CTR
  1828. .endif
  1829. #######################################################################
  1830. vmovdqu (arg1), \T1
  1831. vpxor \T1, \XMM1, \XMM1
  1832. vpxor \T1, \XMM2, \XMM2
  1833. vpxor \T1, \XMM3, \XMM3
  1834. vpxor \T1, \XMM4, \XMM4
  1835. vpxor \T1, \XMM5, \XMM5
  1836. vpxor \T1, \XMM6, \XMM6
  1837. vpxor \T1, \XMM7, \XMM7
  1838. vpxor \T1, \XMM8, \XMM8
  1839. #######################################################################
  1840. vmovdqu 16*1(arg1), \T1
  1841. vaesenc \T1, \XMM1, \XMM1
  1842. vaesenc \T1, \XMM2, \XMM2
  1843. vaesenc \T1, \XMM3, \XMM3
  1844. vaesenc \T1, \XMM4, \XMM4
  1845. vaesenc \T1, \XMM5, \XMM5
  1846. vaesenc \T1, \XMM6, \XMM6
  1847. vaesenc \T1, \XMM7, \XMM7
  1848. vaesenc \T1, \XMM8, \XMM8
  1849. vmovdqu 16*2(arg1), \T1
  1850. vaesenc \T1, \XMM1, \XMM1
  1851. vaesenc \T1, \XMM2, \XMM2
  1852. vaesenc \T1, \XMM3, \XMM3
  1853. vaesenc \T1, \XMM4, \XMM4
  1854. vaesenc \T1, \XMM5, \XMM5
  1855. vaesenc \T1, \XMM6, \XMM6
  1856. vaesenc \T1, \XMM7, \XMM7
  1857. vaesenc \T1, \XMM8, \XMM8
  1858. #######################################################################
  1859. vmovdqu HashKey_8(arg2), \T5
  1860. vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
  1861. vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
  1862. vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0
  1863. vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1
  1864. vpxor \T5, \T6, \T6
  1865. vmovdqu 16*3(arg1), \T1
  1866. vaesenc \T1, \XMM1, \XMM1
  1867. vaesenc \T1, \XMM2, \XMM2
  1868. vaesenc \T1, \XMM3, \XMM3
  1869. vaesenc \T1, \XMM4, \XMM4
  1870. vaesenc \T1, \XMM5, \XMM5
  1871. vaesenc \T1, \XMM6, \XMM6
  1872. vaesenc \T1, \XMM7, \XMM7
  1873. vaesenc \T1, \XMM8, \XMM8
  1874. vmovdqa TMP2(%rsp), \T1
  1875. vmovdqu HashKey_7(arg2), \T5
  1876. vpclmulqdq $0x11, \T5, \T1, \T3
  1877. vpxor \T3, \T4, \T4
  1878. vpclmulqdq $0x00, \T5, \T1, \T3
  1879. vpxor \T3, \T7, \T7
  1880. vpclmulqdq $0x01, \T5, \T1, \T3
  1881. vpxor \T3, \T6, \T6
  1882. vpclmulqdq $0x10, \T5, \T1, \T3
  1883. vpxor \T3, \T6, \T6
  1884. vmovdqu 16*4(arg1), \T1
  1885. vaesenc \T1, \XMM1, \XMM1
  1886. vaesenc \T1, \XMM2, \XMM2
  1887. vaesenc \T1, \XMM3, \XMM3
  1888. vaesenc \T1, \XMM4, \XMM4
  1889. vaesenc \T1, \XMM5, \XMM5
  1890. vaesenc \T1, \XMM6, \XMM6
  1891. vaesenc \T1, \XMM7, \XMM7
  1892. vaesenc \T1, \XMM8, \XMM8
  1893. #######################################################################
  1894. vmovdqa TMP3(%rsp), \T1
  1895. vmovdqu HashKey_6(arg2), \T5
  1896. vpclmulqdq $0x11, \T5, \T1, \T3
  1897. vpxor \T3, \T4, \T4
  1898. vpclmulqdq $0x00, \T5, \T1, \T3
  1899. vpxor \T3, \T7, \T7
  1900. vpclmulqdq $0x01, \T5, \T1, \T3
  1901. vpxor \T3, \T6, \T6
  1902. vpclmulqdq $0x10, \T5, \T1, \T3
  1903. vpxor \T3, \T6, \T6
  1904. vmovdqu 16*5(arg1), \T1
  1905. vaesenc \T1, \XMM1, \XMM1
  1906. vaesenc \T1, \XMM2, \XMM2
  1907. vaesenc \T1, \XMM3, \XMM3
  1908. vaesenc \T1, \XMM4, \XMM4
  1909. vaesenc \T1, \XMM5, \XMM5
  1910. vaesenc \T1, \XMM6, \XMM6
  1911. vaesenc \T1, \XMM7, \XMM7
  1912. vaesenc \T1, \XMM8, \XMM8
  1913. vmovdqa TMP4(%rsp), \T1
  1914. vmovdqu HashKey_5(arg2), \T5
  1915. vpclmulqdq $0x11, \T5, \T1, \T3
  1916. vpxor \T3, \T4, \T4
  1917. vpclmulqdq $0x00, \T5, \T1, \T3
  1918. vpxor \T3, \T7, \T7
  1919. vpclmulqdq $0x01, \T5, \T1, \T3
  1920. vpxor \T3, \T6, \T6
  1921. vpclmulqdq $0x10, \T5, \T1, \T3
  1922. vpxor \T3, \T6, \T6
  1923. vmovdqu 16*6(arg1), \T1
  1924. vaesenc \T1, \XMM1, \XMM1
  1925. vaesenc \T1, \XMM2, \XMM2
  1926. vaesenc \T1, \XMM3, \XMM3
  1927. vaesenc \T1, \XMM4, \XMM4
  1928. vaesenc \T1, \XMM5, \XMM5
  1929. vaesenc \T1, \XMM6, \XMM6
  1930. vaesenc \T1, \XMM7, \XMM7
  1931. vaesenc \T1, \XMM8, \XMM8
  1932. vmovdqa TMP5(%rsp), \T1
  1933. vmovdqu HashKey_4(arg2), \T5
  1934. vpclmulqdq $0x11, \T5, \T1, \T3
  1935. vpxor \T3, \T4, \T4
  1936. vpclmulqdq $0x00, \T5, \T1, \T3
  1937. vpxor \T3, \T7, \T7
  1938. vpclmulqdq $0x01, \T5, \T1, \T3
  1939. vpxor \T3, \T6, \T6
  1940. vpclmulqdq $0x10, \T5, \T1, \T3
  1941. vpxor \T3, \T6, \T6
  1942. vmovdqu 16*7(arg1), \T1
  1943. vaesenc \T1, \XMM1, \XMM1
  1944. vaesenc \T1, \XMM2, \XMM2
  1945. vaesenc \T1, \XMM3, \XMM3
  1946. vaesenc \T1, \XMM4, \XMM4
  1947. vaesenc \T1, \XMM5, \XMM5
  1948. vaesenc \T1, \XMM6, \XMM6
  1949. vaesenc \T1, \XMM7, \XMM7
  1950. vaesenc \T1, \XMM8, \XMM8
  1951. vmovdqa TMP6(%rsp), \T1
  1952. vmovdqu HashKey_3(arg2), \T5
  1953. vpclmulqdq $0x11, \T5, \T1, \T3
  1954. vpxor \T3, \T4, \T4
  1955. vpclmulqdq $0x00, \T5, \T1, \T3
  1956. vpxor \T3, \T7, \T7
  1957. vpclmulqdq $0x01, \T5, \T1, \T3
  1958. vpxor \T3, \T6, \T6
  1959. vpclmulqdq $0x10, \T5, \T1, \T3
  1960. vpxor \T3, \T6, \T6
  1961. vmovdqu 16*8(arg1), \T1
  1962. vaesenc \T1, \XMM1, \XMM1
  1963. vaesenc \T1, \XMM2, \XMM2
  1964. vaesenc \T1, \XMM3, \XMM3
  1965. vaesenc \T1, \XMM4, \XMM4
  1966. vaesenc \T1, \XMM5, \XMM5
  1967. vaesenc \T1, \XMM6, \XMM6
  1968. vaesenc \T1, \XMM7, \XMM7
  1969. vaesenc \T1, \XMM8, \XMM8
  1970. vmovdqa TMP7(%rsp), \T1
  1971. vmovdqu HashKey_2(arg2), \T5
  1972. vpclmulqdq $0x11, \T5, \T1, \T3
  1973. vpxor \T3, \T4, \T4
  1974. vpclmulqdq $0x00, \T5, \T1, \T3
  1975. vpxor \T3, \T7, \T7
  1976. vpclmulqdq $0x01, \T5, \T1, \T3
  1977. vpxor \T3, \T6, \T6
  1978. vpclmulqdq $0x10, \T5, \T1, \T3
  1979. vpxor \T3, \T6, \T6
  1980. #######################################################################
  1981. vmovdqu 16*9(arg1), \T5
  1982. vaesenc \T5, \XMM1, \XMM1
  1983. vaesenc \T5, \XMM2, \XMM2
  1984. vaesenc \T5, \XMM3, \XMM3
  1985. vaesenc \T5, \XMM4, \XMM4
  1986. vaesenc \T5, \XMM5, \XMM5
  1987. vaesenc \T5, \XMM6, \XMM6
  1988. vaesenc \T5, \XMM7, \XMM7
  1989. vaesenc \T5, \XMM8, \XMM8
  1990. vmovdqa TMP8(%rsp), \T1
  1991. vmovdqu HashKey(arg2), \T5
  1992. vpclmulqdq $0x00, \T5, \T1, \T3
  1993. vpxor \T3, \T7, \T7
  1994. vpclmulqdq $0x01, \T5, \T1, \T3
  1995. vpxor \T3, \T6, \T6
  1996. vpclmulqdq $0x10, \T5, \T1, \T3
  1997. vpxor \T3, \T6, \T6
  1998. vpclmulqdq $0x11, \T5, \T1, \T3
  1999. vpxor \T3, \T4, \T1
  2000. vmovdqu 16*10(arg1), \T5
  2001. i = 11
  2002. setreg
  2003. .rep (\REP-9)
  2004. vaesenc \T5, \XMM1, \XMM1
  2005. vaesenc \T5, \XMM2, \XMM2
  2006. vaesenc \T5, \XMM3, \XMM3
  2007. vaesenc \T5, \XMM4, \XMM4
  2008. vaesenc \T5, \XMM5, \XMM5
  2009. vaesenc \T5, \XMM6, \XMM6
  2010. vaesenc \T5, \XMM7, \XMM7
  2011. vaesenc \T5, \XMM8, \XMM8
  2012. vmovdqu 16*i(arg1), \T5
  2013. i = i + 1
  2014. setreg
  2015. .endr
  2016. i = 0
  2017. j = 1
  2018. setreg
  2019. .rep 8
  2020. vpxor 16*i(arg4, %r11), \T5, \T2
  2021. .if \ENC_DEC == ENC
  2022. vaesenclast \T2, reg_j, reg_j
  2023. .else
  2024. vaesenclast \T2, reg_j, \T3
  2025. vmovdqu 16*i(arg4, %r11), reg_j
  2026. vmovdqu \T3, 16*i(arg3, %r11)
  2027. .endif
  2028. i = (i+1)
  2029. j = (j+1)
  2030. setreg
  2031. .endr
  2032. #######################################################################
  2033. vpslldq $8, \T6, \T3 # shift-L T6 by 2 DWs into T3
  2034. vpsrldq $8, \T6, \T6 # shift-R T6 by 2 DWs
  2035. vpxor \T3, \T7, \T7
  2036. vpxor \T6, \T1, \T1 # accumulate the results in T1:T7
  2037. #######################################################################
  2038. #first phase of the reduction
  2039. vmovdqa POLY2(%rip), \T3
  2040. vpclmulqdq $0x01, \T7, \T3, \T2
  2041. vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
  2042. vpxor \T2, \T7, \T7 # first phase of the reduction complete
  2043. #######################################################################
  2044. .if \ENC_DEC == ENC
  2045. vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
  2046. vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
  2047. vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
  2048. vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
  2049. vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
  2050. vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
  2051. vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
  2052. vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
  2053. .endif
  2054. #######################################################################
  2055. #second phase of the reduction
  2056. vpclmulqdq $0x00, \T7, \T3, \T2
  2057. vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
  2058. vpclmulqdq $0x10, \T7, \T3, \T4
  2059. vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
  2060. vpxor \T2, \T4, \T4 # second phase of the reduction complete
  2061. #######################################################################
  2062. vpxor \T4, \T1, \T1 # the result is in T1
  2063. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  2064. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  2065. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  2066. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  2067. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  2068. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  2069. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  2070. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  2071. vpxor \T1, \XMM1, \XMM1
  2072. .endm
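# Note: unlike GHASH_8_ENCRYPT_8_PARALLEL_AVX, the loop above skips the
# Karatsuba split and issues four vpclmulqdq per block (selectors 0x00, 0x01,
# 0x10, 0x11), and its reduction multiplies by the POLY2 constant instead of
# using the shift sequence; the Karatsuba form reappears in GHASH_LAST_8_AVX2
# below, where the h1^h0 halves are rebuilt with vpshufd rather than loaded
# from the HashKey_i_k slots.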
  2073. # GHASH the last 8 ciphertext blocks.
  2074. .macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
  2075. ## Karatsuba Method
  2076. vmovdqu HashKey_8(arg2), \T5
  2077. vpshufd $0b01001110, \XMM1, \T2
  2078. vpshufd $0b01001110, \T5, \T3
  2079. vpxor \XMM1, \T2, \T2
  2080. vpxor \T5, \T3, \T3
  2081. vpclmulqdq $0x11, \T5, \XMM1, \T6
  2082. vpclmulqdq $0x00, \T5, \XMM1, \T7
  2083. vpclmulqdq $0x00, \T3, \T2, \XMM1
  2084. ######################
  2085. vmovdqu HashKey_7(arg2), \T5
  2086. vpshufd $0b01001110, \XMM2, \T2
  2087. vpshufd $0b01001110, \T5, \T3
  2088. vpxor \XMM2, \T2, \T2
  2089. vpxor \T5, \T3, \T3
  2090. vpclmulqdq $0x11, \T5, \XMM2, \T4
  2091. vpxor \T4, \T6, \T6
  2092. vpclmulqdq $0x00, \T5, \XMM2, \T4
  2093. vpxor \T4, \T7, \T7
  2094. vpclmulqdq $0x00, \T3, \T2, \T2
  2095. vpxor \T2, \XMM1, \XMM1
  2096. ######################
  2097. vmovdqu HashKey_6(arg2), \T5
  2098. vpshufd $0b01001110, \XMM3, \T2
  2099. vpshufd $0b01001110, \T5, \T3
  2100. vpxor \XMM3, \T2, \T2
  2101. vpxor \T5, \T3, \T3
  2102. vpclmulqdq $0x11, \T5, \XMM3, \T4
  2103. vpxor \T4, \T6, \T6
  2104. vpclmulqdq $0x00, \T5, \XMM3, \T4
  2105. vpxor \T4, \T7, \T7
  2106. vpclmulqdq $0x00, \T3, \T2, \T2
  2107. vpxor \T2, \XMM1, \XMM1
  2108. ######################
  2109. vmovdqu HashKey_5(arg2), \T5
  2110. vpshufd $0b01001110, \XMM4, \T2
  2111. vpshufd $0b01001110, \T5, \T3
  2112. vpxor \XMM4, \T2, \T2
  2113. vpxor \T5, \T3, \T3
  2114. vpclmulqdq $0x11, \T5, \XMM4, \T4
  2115. vpxor \T4, \T6, \T6
  2116. vpclmulqdq $0x00, \T5, \XMM4, \T4
  2117. vpxor \T4, \T7, \T7
  2118. vpclmulqdq $0x00, \T3, \T2, \T2
  2119. vpxor \T2, \XMM1, \XMM1
  2120. ######################
  2121. vmovdqu HashKey_4(arg2), \T5
  2122. vpshufd $0b01001110, \XMM5, \T2
  2123. vpshufd $0b01001110, \T5, \T3
  2124. vpxor \XMM5, \T2, \T2
  2125. vpxor \T5, \T3, \T3
  2126. vpclmulqdq $0x11, \T5, \XMM5, \T4
  2127. vpxor \T4, \T6, \T6
  2128. vpclmulqdq $0x00, \T5, \XMM5, \T4
  2129. vpxor \T4, \T7, \T7
  2130. vpclmulqdq $0x00, \T3, \T2, \T2
  2131. vpxor \T2, \XMM1, \XMM1
  2132. ######################
  2133. vmovdqu HashKey_3(arg2), \T5
  2134. vpshufd $0b01001110, \XMM6, \T2
  2135. vpshufd $0b01001110, \T5, \T3
  2136. vpxor \XMM6, \T2, \T2
  2137. vpxor \T5, \T3, \T3
  2138. vpclmulqdq $0x11, \T5, \XMM6, \T4
  2139. vpxor \T4, \T6, \T6
  2140. vpclmulqdq $0x00, \T5, \XMM6, \T4
  2141. vpxor \T4, \T7, \T7
  2142. vpclmulqdq $0x00, \T3, \T2, \T2
  2143. vpxor \T2, \XMM1, \XMM1
  2144. ######################
  2145. vmovdqu HashKey_2(arg2), \T5
  2146. vpshufd $0b01001110, \XMM7, \T2
  2147. vpshufd $0b01001110, \T5, \T3
  2148. vpxor \XMM7, \T2, \T2
  2149. vpxor \T5, \T3, \T3
  2150. vpclmulqdq $0x11, \T5, \XMM7, \T4
  2151. vpxor \T4, \T6, \T6
  2152. vpclmulqdq $0x00, \T5, \XMM7, \T4
  2153. vpxor \T4, \T7, \T7
  2154. vpclmulqdq $0x00, \T3, \T2, \T2
  2155. vpxor \T2, \XMM1, \XMM1
  2156. ######################
  2157. vmovdqu HashKey(arg2), \T5
  2158. vpshufd $0b01001110, \XMM8, \T2
  2159. vpshufd $0b01001110, \T5, \T3
  2160. vpxor \XMM8, \T2, \T2
  2161. vpxor \T5, \T3, \T3
  2162. vpclmulqdq $0x11, \T5, \XMM8, \T4
  2163. vpxor \T4, \T6, \T6
  2164. vpclmulqdq $0x00, \T5, \XMM8, \T4
  2165. vpxor \T4, \T7, \T7
  2166. vpclmulqdq $0x00, \T3, \T2, \T2
  2167. vpxor \T2, \XMM1, \XMM1
  2168. vpxor \T6, \XMM1, \XMM1
  2169. vpxor \T7, \XMM1, \T2
  2170. vpslldq $8, \T2, \T4
  2171. vpsrldq $8, \T2, \T2
  2172. vpxor \T4, \T7, \T7
  2173. vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the
  2174. # accumulated carry-less multiplications
  2175. #######################################################################
  2176. #first phase of the reduction
  2177. vmovdqa POLY2(%rip), \T3
  2178. vpclmulqdq $0x01, \T7, \T3, \T2
  2179. vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
  2180. vpxor \T2, \T7, \T7 # first phase of the reduction complete
  2181. #######################################################################
  2182. #second phase of the reduction
  2183. vpclmulqdq $0x00, \T7, \T3, \T2
  2184. vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
  2185. vpclmulqdq $0x10, \T7, \T3, \T4
  2186. vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
  2187. vpxor \T2, \T4, \T4 # second phase of the reduction complete
  2188. #######################################################################
  2189. vpxor \T4, \T6, \T6 # the result is in T6
  2190. .endm
  2191. #############################################################
  2192. #void aesni_gcm_init_avx_gen4
  2193. # (gcm_data *my_ctx_data,
  2194. # gcm_context_data *data,
  2195. # u8 *iv, /* Pre-counter block j0: 4 byte salt
  2196. # (from Security Association) concatenated with 8 byte
  2197. # Initialisation Vector (from IPSec ESP Payload)
  2198. # concatenated with 0x00000001. 16-byte aligned pointer. */
  2199. # u8 *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
  2200. # const u8 *aad, /* Additional Authentication Data (AAD)*/
  2201. # u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
  2202. #############################################################
  2203. SYM_FUNC_START(aesni_gcm_init_avx_gen4)
  2204. FUNC_SAVE
  2205. INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
  2206. FUNC_RESTORE
  2207. RET
  2208. SYM_FUNC_END(aesni_gcm_init_avx_gen4)
  2209. ###############################################################################
  2210. #void aesni_gcm_enc_update_avx_gen4(
  2211. # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
  2212. # gcm_context_data *data,
  2213. # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
  2214. # const u8 *in, /* Plaintext input */
  2215. # u64 plaintext_len) /* Length of data in Bytes for encryption. */
  2216. ###############################################################################
  2217. SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
  2218. FUNC_SAVE
  2219. mov keysize,%eax
  2220. cmp $32, %eax
  2221. je key_256_enc_update4
  2222. cmp $16, %eax
  2223. je key_128_enc_update4
  2224. # must be 192
  2225. GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
  2226. FUNC_RESTORE
  2227. RET
  2228. key_128_enc_update4:
  2229. GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
  2230. FUNC_RESTORE
  2231. RET
  2232. key_256_enc_update4:
  2233. GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
  2234. FUNC_RESTORE
  2235. RET
  2236. SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
  2237. ###############################################################################
  2238. #void aesni_gcm_dec_update_avx_gen4(
  2239. # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
  2240. # gcm_context_data *data,
  2241. # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
  2242. # const u8 *in, /* Ciphertext input */
  2243. # u64 plaintext_len) /* Length of data in Bytes for decryption. */
  2244. ###############################################################################
  2245. SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
  2246. FUNC_SAVE
  2247. mov keysize,%eax
  2248. cmp $32, %eax
  2249. je key_256_dec_update4
  2250. cmp $16, %eax
  2251. je key_128_dec_update4
  2252. # must be 192
  2253. GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
  2254. FUNC_RESTORE
  2255. RET
  2256. key_128_dec_update4:
  2257. GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
  2258. FUNC_RESTORE
  2259. RET
  2260. key_256_dec_update4:
  2261. GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
  2262. FUNC_RESTORE
  2263. RET
  2264. SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
  2265. ###############################################################################
  2266. #void aesni_gcm_finalize_avx_gen4(
  2267. # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
  2268. # gcm_context_data *data,
  2269. # u8 *auth_tag, /* Authenticated Tag output. */
  2270. # u64 auth_tag_len) /* Authenticated Tag Length in bytes.
  2271. # Valid values are 16 (most likely), 12 or 8. */
  2272. ###############################################################################
  2273. SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
  2274. FUNC_SAVE
  2275. mov keysize,%eax
  2276. cmp $32, %eax
  2277. je key_256_finalize4
  2278. cmp $16, %eax
  2279. je key_128_finalize4
  2280. # must be 192
  2281. GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
  2282. FUNC_RESTORE
  2283. RET
  2284. key_128_finalize4:
  2285. GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
  2286. FUNC_RESTORE
  2287. RET
  2288. key_256_finalize4:
  2289. GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
  2290. FUNC_RESTORE
  2291. RET
  2292. SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)