  1. #!/usr/bin/env perl
  2. # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
  3. #
  4. # Copyright (C) 2017-2018 Samuel Neves <[email protected]>. All Rights Reserved.
  5. # Copyright (C) 2017-2019 Jason A. Donenfeld <[email protected]>. All Rights Reserved.
  6. # Copyright (C) 2006-2017 CRYPTOGAMS by <[email protected]>. All Rights Reserved.
  7. #
  8. # This code is taken from the OpenSSL project but the author, Andy Polyakov,
  9. # has relicensed it under the licenses specified in the SPDX header above.
  10. # The original headers, including the original license headers, are
  11. # included below for completeness.
  12. #
  13. # ====================================================================
  14. # Written by Andy Polyakov <[email protected]> for the OpenSSL
  15. # project. The module is, however, dual licensed under OpenSSL and
  16. # CRYPTOGAMS licenses depending on where you obtain it. For further
  17. # details see http://www.openssl.org/~appro/cryptogams/.
  18. # ====================================================================
  19. #
  20. # This module implements Poly1305 hash for x86_64.
  21. #
  22. # March 2015
  23. #
  24. # Initial release.
  25. #
  26. # December 2016
  27. #
  28. # Add AVX512F+VL+BW code path.
  29. #
  30. # November 2017
  31. #
  32. # Convert the AVX512F+VL+BW code path to pure AVX512F, so that it can
  33. # be executed even on Knights Landing. The trigger for this change was
  34. # the observation that AVX512 code paths can negatively affect overall
  35. # Skylake-X system performance. Since we are likely to suppress the
  36. # AVX512F capability flag [at least on Skylake-X], the conversion serves
  37. # as a kind of "investment protection". Note that the next *lake
  38. # processor, Cannonlake, has an AVX512IFMA code path to execute...
  39. #
  40. # Numbers are cycles per processed byte with poly1305_blocks alone,
  41. # measured with rdtsc at fixed clock frequency.
  42. #
  43. #                IALU/gcc-4.8(*)  AVX(**)  AVX2  AVX-512
  44. # P4             4.46/+120%       -
  45. # Core 2         2.41/+90%        -
  46. # Westmere       1.88/+120%       -
  47. # Sandy Bridge   1.39/+140%       1.10
  48. # Haswell        1.14/+175%       1.11     0.65
  49. # Skylake[-X]    1.13/+120%       0.96     0.51  [0.35]
  50. # Silvermont     2.83/+95%        -
  51. # Knights L      3.60/?           1.65     1.10  0.41(***)
  52. # Goldmont       1.70/+180%       -
  53. # VIA Nano       1.82/+150%       -
  54. # Sledgehammer   1.38/+160%       -
  55. # Bulldozer      2.30/+130%       0.97
  56. # Ryzen          1.15/+200%       1.08     1.18
  57. #
  58. # (*) improvement coefficients relative to clang are more modest,
  59. # around 50% on most processors; in both cases the baseline is
  60. # __int128 code;
  61. # (**) an SSE2 implementation was attempted, but among non-AVX processors
  62. # it was faster than the integer-only code only on older Intel P4 and
  63. # Core processors, by 30-50% (less on newer parts), while being slower
  64. # on contemporary ones, for example almost 2x slower on Atom; as the
  65. # former are naturally disappearing, SSE2 is deemed unnecessary;
  66. # (***) strangely enough, performance seems to vary from core to core;
  67. # the listed result is the best case;
  68. $flavour = shift;
  69. $output = shift;
  70. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  71. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  72. $kernel=0; $kernel=1 if (!$flavour && !$output);
  73. if (!$kernel) {
  74. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  75. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  76. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  77. die "can't locate x86_64-xlate.pl";
  78. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
  79. *STDOUT=*OUT;
  80. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  81. =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  82. $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
  83. }
  84. if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  85. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
  86. $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
  87. $avx += 1 if ($1==2.11 && $2>=8);
  88. }
  89. if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  90. `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  91. $avx = ($1>=10) + ($1>=11);
  92. }
  93. if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
  94. $avx = ($2>=3.0) + ($2>3.0);
  95. }
  96. } else {
  97. $avx = 4; # The kernel uses ifdefs for this.
  98. }
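# Annotation (inferred from the $avx gates below): $avx >= 1 enables the
# AVX code path, $avx > 1 additionally AVX2, $avx > 2 the AVX-512 variant,
# and $avx > 3 the base 2^44 code; the kernel build hard-codes 4 and
# selects the actual path with its own ifdefs, as noted above.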
  99. sub declare_function() {
  100. my ($name, $align, $nargs) = @_;
  101. if($kernel) {
  102. $code .= ".align $align\n";
  103. $code .= "SYM_FUNC_START($name)\n";
  104. $code .= ".L$name:\n";
  105. } else {
  106. $code .= ".globl $name\n";
  107. $code .= ".type $name,\@function,$nargs\n";
  108. $code .= ".align $align\n";
  109. $code .= "$name:\n";
  110. }
  111. }
  112. sub end_function() {
  113. my ($name) = @_;
  114. if($kernel) {
  115. $code .= "SYM_FUNC_END($name)\n";
  116. } else {
  117. $code .= ".size $name,.-$name\n";
  118. }
  119. }
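# For example (annotation), &declare_function("poly1305_init_x86_64", 32, 3)
# emits, for the kernel build,
#	.align 32
#	SYM_FUNC_START(poly1305_init_x86_64)
#	.Lpoly1305_init_x86_64:
# and the .globl/.type/.align/label form otherwise.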
  120. $code.=<<___ if $kernel;
  121. #include <linux/linkage.h>
  122. ___
  123. if ($avx) {
  124. $code.=<<___ if $kernel;
  125. .section .rodata
  126. ___
  127. $code.=<<___;
  128. .align 64
  129. .Lconst:
  130. .Lmask24:
  131. .long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
  132. .L129:
  133. .long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
  134. .Lmask26:
  135. .long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
  136. .Lpermd_avx2:
  137. .long 2,2,2,3,2,0,2,1
  138. .Lpermd_avx512:
  139. .long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
  140. .L2_44_inp_permd:
  141. .long 0,1,1,2,2,3,7,7
  142. .L2_44_inp_shift:
  143. .quad 0,12,24,64
  144. .L2_44_mask:
  145. .quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
  146. .L2_44_shift_rgt:
  147. .quad 44,44,42,64
  148. .L2_44_shift_lft:
  149. .quad 8,8,10,64
  150. .align 64
  151. .Lx_mask44:
  152. .quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
  153. .quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
  154. .Lx_mask42:
  155. .quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
  156. .quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
  157. ___
  158. }
  159. $code.=<<___ if (!$kernel);
  160. .asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  161. .align 16
  162. ___
  163. my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
  164. my ($mac,$nonce)=($inp,$len); # *_emit arguments
  165. my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
  166. my ($h0,$h1,$h2)=("%r14","%rbx","%r10");
  167. sub poly1305_iteration {
  168. # input: copy of $r1 in %rax, $h0-$h2, $r0-$r1
  169. # output: $h0-$h2 *= $r0-$r1
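#
# A sketch of the arithmetic below (annotation): with h = h0 + 2^64*h1 +
# 2^128*h2 and r = r0 + 2^64*r1, the 3x2 schoolbook product is folded
# modulo p = 2^130 - 5 on the fly.  Since 2^130 = 5 (mod p), a partial
# product of weight 2^128 can be moved down two limbs at the cost of a
# factor 5/4, which is exactly what s1 = r1 + (r1 >> 2) = 5*r1/4 supplies
# (exact because clamping clears the low two bits of r1).  The tail then
# folds the excess of the top limb back in: with d3 holding bits 128 and
# up,
#	h0 += (d3 & ~3) + (d3 >> 2)	# = 5*(d3 >> 2)
#	h2  = d3 & 3
# so the partially reduced hash always fits in h0..h2 with only a couple
# of bits left in h2.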
  170. $code.=<<___;
  171. mulq $h0 # h0*r1
  172. mov %rax,$d2
  173. mov $r0,%rax
  174. mov %rdx,$d3
  175. mulq $h0 # h0*r0
  176. mov %rax,$h0 # future $h0
  177. mov $r0,%rax
  178. mov %rdx,$d1
  179. mulq $h1 # h1*r0
  180. add %rax,$d2
  181. mov $s1,%rax
  182. adc %rdx,$d3
  183. mulq $h1 # h1*s1
  184. mov $h2,$h1 # borrow $h1
  185. add %rax,$h0
  186. adc %rdx,$d1
  187. imulq $s1,$h1 # h2*s1
  188. add $h1,$d2
  189. mov $d1,$h1
  190. adc \$0,$d3
  191. imulq $r0,$h2 # h2*r0
  192. add $d2,$h1
  193. mov \$-4,%rax # mask value
  194. adc $h2,$d3
  195. and $d3,%rax # last reduction step
  196. mov $d3,$h2
  197. shr \$2,$d3
  198. and \$3,$h2
  199. add $d3,%rax
  200. add %rax,$h0
  201. adc \$0,$h1
  202. adc \$0,$h2
  203. ___
  204. }
  205. ########################################################################
  206. # Layout of the opaque area is as follows.
  207. #
  208. # unsigned __int64 h[3]; # current hash value base 2^64
  209. # unsigned __int64 r[2]; # key value base 2^64
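#
# An illustrative C view of those offsets (annotation; the struct name is
# mine, not part of any real interface):
#
#	struct poly1305_ctx_base2_64 {	/* byte offsets */
#		unsigned __int64 h[3];	/*  0,  8, 16  */
#		unsigned __int64 r[2];	/* 24, 32      */
#	};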
  210. $code.=<<___;
  211. .text
  212. ___
  213. $code.=<<___ if (!$kernel);
  214. .extern OPENSSL_ia32cap_P
  215. .globl poly1305_init_x86_64
  216. .hidden poly1305_init_x86_64
  217. .globl poly1305_blocks_x86_64
  218. .hidden poly1305_blocks_x86_64
  219. .globl poly1305_emit_x86_64
  220. .hidden poly1305_emit_x86_64
  221. ___
  222. &declare_function("poly1305_init_x86_64", 32, 3);
  223. $code.=<<___;
  224. xor %eax,%eax
  225. mov %rax,0($ctx) # initialize hash value
  226. mov %rax,8($ctx)
  227. mov %rax,16($ctx)
  228. test $inp,$inp
  229. je .Lno_key
  230. ___
  231. $code.=<<___ if (!$kernel);
  232. lea poly1305_blocks_x86_64(%rip),%r10
  233. lea poly1305_emit_x86_64(%rip),%r11
  234. ___
  235. $code.=<<___ if (!$kernel && $avx);
  236. mov OPENSSL_ia32cap_P+4(%rip),%r9
  237. lea poly1305_blocks_avx(%rip),%rax
  238. lea poly1305_emit_avx(%rip),%rcx
  239. bt \$`60-32`,%r9 # AVX?
  240. cmovc %rax,%r10
  241. cmovc %rcx,%r11
  242. ___
  243. $code.=<<___ if (!$kernel && $avx>1);
  244. lea poly1305_blocks_avx2(%rip),%rax
  245. bt \$`5+32`,%r9 # AVX2?
  246. cmovc %rax,%r10
  247. ___
  248. $code.=<<___ if (!$kernel && $avx>3);
  249. mov \$`(1<<31|1<<21|1<<16)`,%rax
  250. shr \$32,%r9
  251. and %rax,%r9
  252. cmp %rax,%r9
  253. je .Linit_base2_44
  254. ___
  255. $code.=<<___;
  256. mov \$0x0ffffffc0fffffff,%rax
  257. mov \$0x0ffffffc0ffffffc,%rcx
  258. and 0($inp),%rax
  259. and 8($inp),%rcx
  260. mov %rax,24($ctx)
  261. mov %rcx,32($ctx)
  262. ___
  263. $code.=<<___ if (!$kernel && $flavour !~ /elf32/);
  264. mov %r10,0(%rdx)
  265. mov %r11,8(%rdx)
  266. ___
  267. $code.=<<___ if (!$kernel && $flavour =~ /elf32/);
  268. mov %r10d,0(%rdx)
  269. mov %r11d,4(%rdx)
  270. ___
  271. $code.=<<___;
  272. mov \$1,%eax
  273. .Lno_key:
  274. RET
  275. ___
  276. &end_function("poly1305_init_x86_64");
  277. &declare_function("poly1305_blocks_x86_64", 32, 4);
  278. $code.=<<___;
  279. .cfi_startproc
  280. .Lblocks:
  281. shr \$4,$len
  282. jz .Lno_data # too short
  283. push %rbx
  284. .cfi_push %rbx
  285. push %r12
  286. .cfi_push %r12
  287. push %r13
  288. .cfi_push %r13
  289. push %r14
  290. .cfi_push %r14
  291. push %r15
  292. .cfi_push %r15
  293. push $ctx
  294. .cfi_push $ctx
  295. .Lblocks_body:
  296. mov $len,%r15 # reassign $len
  297. mov 24($ctx),$r0 # load r
  298. mov 32($ctx),$s1
  299. mov 0($ctx),$h0 # load hash value
  300. mov 8($ctx),$h1
  301. mov 16($ctx),$h2
  302. mov $s1,$r1
  303. shr \$2,$s1
  304. mov $r1,%rax
  305. add $r1,$s1 # s1 = r1 + (r1 >> 2)
  306. jmp .Loop
  307. .align 32
  308. .Loop:
  309. add 0($inp),$h0 # accumulate input
  310. adc 8($inp),$h1
  311. lea 16($inp),$inp
  312. adc $padbit,$h2
  313. ___
  314. &poly1305_iteration();
  315. $code.=<<___;
  316. mov $r1,%rax
  317. dec %r15 # len-=16
  318. jnz .Loop
  319. mov 0(%rsp),$ctx
  320. .cfi_restore $ctx
  321. mov $h0,0($ctx) # store hash value
  322. mov $h1,8($ctx)
  323. mov $h2,16($ctx)
  324. mov 8(%rsp),%r15
  325. .cfi_restore %r15
  326. mov 16(%rsp),%r14
  327. .cfi_restore %r14
  328. mov 24(%rsp),%r13
  329. .cfi_restore %r13
  330. mov 32(%rsp),%r12
  331. .cfi_restore %r12
  332. mov 40(%rsp),%rbx
  333. .cfi_restore %rbx
  334. lea 48(%rsp),%rsp
  335. .cfi_adjust_cfa_offset -48
  336. .Lno_data:
  337. .Lblocks_epilogue:
  338. RET
  339. .cfi_endproc
  340. ___
  341. &end_function("poly1305_blocks_x86_64");
  342. &declare_function("poly1305_emit_x86_64", 32, 3);
  343. $code.=<<___;
  344. .Lemit:
  345. mov 0($ctx),%r8 # load hash value
  346. mov 8($ctx),%r9
  347. mov 16($ctx),%r10
  348. mov %r8,%rax
  349. add \$5,%r8 # compare to modulus
  350. mov %r9,%rcx
  351. adc \$0,%r9
  352. adc \$0,%r10
  353. shr \$2,%r10 # did 130-bit value overflow?
  354. cmovnz %r8,%rax
  355. cmovnz %r9,%rcx
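# annotation: the sequence above adds 5 to the 130-bit hash and, via the
# cmovs, keeps the sum only if that addition carries into bit 130, i.e.
# only if h >= 2^130 - 5; keeping the low 128 bits of h + 5 is then the
# same as subtracting p = 2^130 - 5, so this is the final conditional
# reduction before the nonce is added.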
  356. add 0($nonce),%rax # accumulate nonce
  357. adc 8($nonce),%rcx
  358. mov %rax,0($mac) # write result
  359. mov %rcx,8($mac)
  360. RET
  361. ___
  362. &end_function("poly1305_emit_x86_64");
  363. if ($avx) {
  364. ########################################################################
  365. # Layout of the opaque area is as follows.
  366. #
  367. # unsigned __int32 h[5]; # current hash value base 2^26
  368. # unsigned __int32 is_base2_26;
  369. # unsigned __int64 r[2]; # key value base 2^64
  370. # unsigned __int64 pad;
  371. # struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
  372. #
  373. # where r^n are the base 2^26 digits of powers of the multiplier key. There
  374. # are 5 digits, but the last four are interleaved with their multiples of 5,
  375. # totalling 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
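#
# An illustrative C view of the same area (annotation; struct/field names
# are mine, offsets follow the code below):
#
#	struct poly1305_ctx_base2_26 {			/* byte offsets */
#		unsigned __int32 h[5];			/*   0 .. 19   */
#		unsigned __int32 is_base2_26;		/*  20         */
#		unsigned __int64 r[2];			/*  24, 32     */
#		unsigned __int64 pad;			/*  40         */
#		struct { unsigned __int32 r2, r1, r4, r3; } table[9];
#							/*  48 .. 191  */
#	};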
  376. my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
  377. map("%xmm$_",(0..15));
  378. $code.=<<___;
  379. .type __poly1305_block,\@abi-omnipotent
  380. .align 32
  381. __poly1305_block:
  382. push $ctx
  383. ___
  384. &poly1305_iteration();
  385. $code.=<<___;
  386. pop $ctx
  387. RET
  388. .size __poly1305_block,.-__poly1305_block
  389. .type __poly1305_init_avx,\@abi-omnipotent
  390. .align 32
  391. __poly1305_init_avx:
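# annotation: computes r^2, r^3 and r^4 with the scalar block routine and
# stores them, together with r itself, as interleaved base 2^26 digits
# (plus the 5*digit multiples) in the table described above, for use by
# the vector code paths.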
  392. push %rbp
  393. mov %rsp,%rbp
  394. mov $r0,$h0
  395. mov $r1,$h1
  396. xor $h2,$h2
  397. lea 48+64($ctx),$ctx # size optimization
  398. mov $r1,%rax
  399. call __poly1305_block # r^2
  400. mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26
  401. mov \$0x3ffffff,%edx
  402. mov $h0,$d1
  403. and $h0#d,%eax
  404. mov $r0,$d2
  405. and $r0#d,%edx
  406. mov %eax,`16*0+0-64`($ctx)
  407. shr \$26,$d1
  408. mov %edx,`16*0+4-64`($ctx)
  409. shr \$26,$d2
  410. mov \$0x3ffffff,%eax
  411. mov \$0x3ffffff,%edx
  412. and $d1#d,%eax
  413. and $d2#d,%edx
  414. mov %eax,`16*1+0-64`($ctx)
  415. lea (%rax,%rax,4),%eax # *5
  416. mov %edx,`16*1+4-64`($ctx)
  417. lea (%rdx,%rdx,4),%edx # *5
  418. mov %eax,`16*2+0-64`($ctx)
  419. shr \$26,$d1
  420. mov %edx,`16*2+4-64`($ctx)
  421. shr \$26,$d2
  422. mov $h1,%rax
  423. mov $r1,%rdx
  424. shl \$12,%rax
  425. shl \$12,%rdx
  426. or $d1,%rax
  427. or $d2,%rdx
  428. and \$0x3ffffff,%eax
  429. and \$0x3ffffff,%edx
  430. mov %eax,`16*3+0-64`($ctx)
  431. lea (%rax,%rax,4),%eax # *5
  432. mov %edx,`16*3+4-64`($ctx)
  433. lea (%rdx,%rdx,4),%edx # *5
  434. mov %eax,`16*4+0-64`($ctx)
  435. mov $h1,$d1
  436. mov %edx,`16*4+4-64`($ctx)
  437. mov $r1,$d2
  438. mov \$0x3ffffff,%eax
  439. mov \$0x3ffffff,%edx
  440. shr \$14,$d1
  441. shr \$14,$d2
  442. and $d1#d,%eax
  443. and $d2#d,%edx
  444. mov %eax,`16*5+0-64`($ctx)
  445. lea (%rax,%rax,4),%eax # *5
  446. mov %edx,`16*5+4-64`($ctx)
  447. lea (%rdx,%rdx,4),%edx # *5
  448. mov %eax,`16*6+0-64`($ctx)
  449. shr \$26,$d1
  450. mov %edx,`16*6+4-64`($ctx)
  451. shr \$26,$d2
  452. mov $h2,%rax
  453. shl \$24,%rax
  454. or %rax,$d1
  455. mov $d1#d,`16*7+0-64`($ctx)
  456. lea ($d1,$d1,4),$d1 # *5
  457. mov $d2#d,`16*7+4-64`($ctx)
  458. lea ($d2,$d2,4),$d2 # *5
  459. mov $d1#d,`16*8+0-64`($ctx)
  460. mov $d2#d,`16*8+4-64`($ctx)
  461. mov $r1,%rax
  462. call __poly1305_block # r^3
  463. mov \$0x3ffffff,%eax # save r^3 base 2^26
  464. mov $h0,$d1
  465. and $h0#d,%eax
  466. shr \$26,$d1
  467. mov %eax,`16*0+12-64`($ctx)
  468. mov \$0x3ffffff,%edx
  469. and $d1#d,%edx
  470. mov %edx,`16*1+12-64`($ctx)
  471. lea (%rdx,%rdx,4),%edx # *5
  472. shr \$26,$d1
  473. mov %edx,`16*2+12-64`($ctx)
  474. mov $h1,%rax
  475. shl \$12,%rax
  476. or $d1,%rax
  477. and \$0x3ffffff,%eax
  478. mov %eax,`16*3+12-64`($ctx)
  479. lea (%rax,%rax,4),%eax # *5
  480. mov $h1,$d1
  481. mov %eax,`16*4+12-64`($ctx)
  482. mov \$0x3ffffff,%edx
  483. shr \$14,$d1
  484. and $d1#d,%edx
  485. mov %edx,`16*5+12-64`($ctx)
  486. lea (%rdx,%rdx,4),%edx # *5
  487. shr \$26,$d1
  488. mov %edx,`16*6+12-64`($ctx)
  489. mov $h2,%rax
  490. shl \$24,%rax
  491. or %rax,$d1
  492. mov $d1#d,`16*7+12-64`($ctx)
  493. lea ($d1,$d1,4),$d1 # *5
  494. mov $d1#d,`16*8+12-64`($ctx)
  495. mov $r1,%rax
  496. call __poly1305_block # r^4
  497. mov \$0x3ffffff,%eax # save r^4 base 2^26
  498. mov $h0,$d1
  499. and $h0#d,%eax
  500. shr \$26,$d1
  501. mov %eax,`16*0+8-64`($ctx)
  502. mov \$0x3ffffff,%edx
  503. and $d1#d,%edx
  504. mov %edx,`16*1+8-64`($ctx)
  505. lea (%rdx,%rdx,4),%edx # *5
  506. shr \$26,$d1
  507. mov %edx,`16*2+8-64`($ctx)
  508. mov $h1,%rax
  509. shl \$12,%rax
  510. or $d1,%rax
  511. and \$0x3ffffff,%eax
  512. mov %eax,`16*3+8-64`($ctx)
  513. lea (%rax,%rax,4),%eax # *5
  514. mov $h1,$d1
  515. mov %eax,`16*4+8-64`($ctx)
  516. mov \$0x3ffffff,%edx
  517. shr \$14,$d1
  518. and $d1#d,%edx
  519. mov %edx,`16*5+8-64`($ctx)
  520. lea (%rdx,%rdx,4),%edx # *5
  521. shr \$26,$d1
  522. mov %edx,`16*6+8-64`($ctx)
  523. mov $h2,%rax
  524. shl \$24,%rax
  525. or %rax,$d1
  526. mov $d1#d,`16*7+8-64`($ctx)
  527. lea ($d1,$d1,4),$d1 # *5
  528. mov $d1#d,`16*8+8-64`($ctx)
  529. lea -48-64($ctx),$ctx # size [de-]optimization
  530. pop %rbp
  531. RET
  532. .size __poly1305_init_avx,.-__poly1305_init_avx
  533. ___
  534. &declare_function("poly1305_blocks_avx", 32, 4);
  535. $code.=<<___;
  536. .cfi_startproc
  537. mov 20($ctx),%r8d # is_base2_26
  538. cmp \$128,$len
  539. jae .Lblocks_avx
  540. test %r8d,%r8d
  541. jz .Lblocks
  542. .Lblocks_avx:
  543. and \$-16,$len
  544. jz .Lno_data_avx
  545. vzeroupper
  546. test %r8d,%r8d
  547. jz .Lbase2_64_avx
  548. test \$31,$len
  549. jz .Leven_avx
  550. push %rbp
  551. .cfi_push %rbp
  552. mov %rsp,%rbp
  553. push %rbx
  554. .cfi_push %rbx
  555. push %r12
  556. .cfi_push %r12
  557. push %r13
  558. .cfi_push %r13
  559. push %r14
  560. .cfi_push %r14
  561. push %r15
  562. .cfi_push %r15
  563. .Lblocks_avx_body:
  564. mov $len,%r15 # reassign $len
  565. mov 0($ctx),$d1 # load hash value
  566. mov 8($ctx),$d2
  567. mov 16($ctx),$h2#d
  568. mov 24($ctx),$r0 # load r
  569. mov 32($ctx),$s1
  570. ################################# base 2^26 -> base 2^64
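# annotation: the five 26-bit digits h[0..4] just loaded (digits 0/1 in
# the first 64-bit word, 2/3 in the second, digit 4 in the last 32-bit
# word) are repacked into three 64-bit limbs as
#   h0 = h[0] + (h[1] << 26) + (h[2] << 52)          (low 64 bits)
#   h1 = (h[2] >> 12) + (h[3] << 14) + (h[4] << 40)  (plus carries)
#   h2 = (h[4] >> 24)                                (plus carry)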
  571. mov $d1#d,$h0#d
  572. and \$`-1*(1<<31)`,$d1
  573. mov $d2,$r1 # borrow $r1
  574. mov $d2#d,$h1#d
  575. and \$`-1*(1<<31)`,$d2
  576. shr \$6,$d1
  577. shl \$52,$r1
  578. add $d1,$h0
  579. shr \$12,$h1
  580. shr \$18,$d2
  581. add $r1,$h0
  582. adc $d2,$h1
  583. mov $h2,$d1
  584. shl \$40,$d1
  585. shr \$24,$h2
  586. add $d1,$h1
  587. adc \$0,$h2 # can be partially reduced...
  588. mov \$-4,$d2 # ... so reduce
  589. mov $h2,$d1
  590. and $h2,$d2
  591. shr \$2,$d1
  592. and \$3,$h2
  593. add $d2,$d1 # =*5
  594. add $d1,$h0
  595. adc \$0,$h1
  596. adc \$0,$h2
  597. mov $s1,$r1
  598. mov $s1,%rax
  599. shr \$2,$s1
  600. add $r1,$s1 # s1 = r1 + (r1 >> 2)
  601. add 0($inp),$h0 # accumulate input
  602. adc 8($inp),$h1
  603. lea 16($inp),$inp
  604. adc $padbit,$h2
  605. call __poly1305_block
  606. test $padbit,$padbit # if $padbit is zero,
  607. jz .Lstore_base2_64_avx # store hash in base 2^64 format
  608. ################################# base 2^64 -> base 2^26
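# annotation: the inverse split of the three 64-bit limbs into 26-bit
# digits, matching the h[0]..h[4] comments below:
#   h[0] = h0 & 0x3ffffff
#   h[1] = (h0 >> 26) & 0x3ffffff
#   h[2] = ((h0 >> 52) | (h1 << 12)) & 0x3ffffff
#   h[3] = (h1 >> 14) & 0x3ffffff
#   h[4] = (h1 >> 40) | (h2 << 24)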
  609. mov $h0,%rax
  610. mov $h0,%rdx
  611. shr \$52,$h0
  612. mov $h1,$r0
  613. mov $h1,$r1
  614. shr \$26,%rdx
  615. and \$0x3ffffff,%rax # h[0]
  616. shl \$12,$r0
  617. and \$0x3ffffff,%rdx # h[1]
  618. shr \$14,$h1
  619. or $r0,$h0
  620. shl \$24,$h2
  621. and \$0x3ffffff,$h0 # h[2]
  622. shr \$40,$r1
  623. and \$0x3ffffff,$h1 # h[3]
  624. or $r1,$h2 # h[4]
  625. sub \$16,%r15
  626. jz .Lstore_base2_26_avx
  627. vmovd %rax#d,$H0
  628. vmovd %rdx#d,$H1
  629. vmovd $h0#d,$H2
  630. vmovd $h1#d,$H3
  631. vmovd $h2#d,$H4
  632. jmp .Lproceed_avx
  633. .align 32
  634. .Lstore_base2_64_avx:
  635. mov $h0,0($ctx)
  636. mov $h1,8($ctx)
  637. mov $h2,16($ctx) # note that is_base2_26 is zeroed
  638. jmp .Ldone_avx
  639. .align 16
  640. .Lstore_base2_26_avx:
  641. mov %rax#d,0($ctx) # store hash value base 2^26
  642. mov %rdx#d,4($ctx)
  643. mov $h0#d,8($ctx)
  644. mov $h1#d,12($ctx)
  645. mov $h2#d,16($ctx)
  646. .align 16
  647. .Ldone_avx:
  648. pop %r15
  649. .cfi_restore %r15
  650. pop %r14
  651. .cfi_restore %r14
  652. pop %r13
  653. .cfi_restore %r13
  654. pop %r12
  655. .cfi_restore %r12
  656. pop %rbx
  657. .cfi_restore %rbx
  658. pop %rbp
  659. .cfi_restore %rbp
  660. .Lno_data_avx:
  661. .Lblocks_avx_epilogue:
  662. RET
  663. .cfi_endproc
  664. .align 32
  665. .Lbase2_64_avx:
  666. .cfi_startproc
  667. push %rbp
  668. .cfi_push %rbp
  669. mov %rsp,%rbp
  670. push %rbx
  671. .cfi_push %rbx
  672. push %r12
  673. .cfi_push %r12
  674. push %r13
  675. .cfi_push %r13
  676. push %r14
  677. .cfi_push %r14
  678. push %r15
  679. .cfi_push %r15
  680. .Lbase2_64_avx_body:
  681. mov $len,%r15 # reassign $len
  682. mov 24($ctx),$r0 # load r
  683. mov 32($ctx),$s1
  684. mov 0($ctx),$h0 # load hash value
  685. mov 8($ctx),$h1
  686. mov 16($ctx),$h2#d
  687. mov $s1,$r1
  688. mov $s1,%rax
  689. shr \$2,$s1
  690. add $r1,$s1 # s1 = r1 + (r1 >> 2)
  691. test \$31,$len
  692. jz .Linit_avx
  693. add 0($inp),$h0 # accumulate input
  694. adc 8($inp),$h1
  695. lea 16($inp),$inp
  696. adc $padbit,$h2
  697. sub \$16,%r15
  698. call __poly1305_block
  699. .Linit_avx:
  700. ################################# base 2^64 -> base 2^26
  701. mov $h0,%rax
  702. mov $h0,%rdx
  703. shr \$52,$h0
  704. mov $h1,$d1
  705. mov $h1,$d2
  706. shr \$26,%rdx
  707. and \$0x3ffffff,%rax # h[0]
  708. shl \$12,$d1
  709. and \$0x3ffffff,%rdx # h[1]
  710. shr \$14,$h1
  711. or $d1,$h0
  712. shl \$24,$h2
  713. and \$0x3ffffff,$h0 # h[2]
  714. shr \$40,$d2
  715. and \$0x3ffffff,$h1 # h[3]
  716. or $d2,$h2 # h[4]
  717. vmovd %rax#d,$H0
  718. vmovd %rdx#d,$H1
  719. vmovd $h0#d,$H2
  720. vmovd $h1#d,$H3
  721. vmovd $h2#d,$H4
  722. movl \$1,20($ctx) # set is_base2_26
  723. call __poly1305_init_avx
  724. .Lproceed_avx:
  725. mov %r15,$len
  726. pop %r15
  727. .cfi_restore %r15
  728. pop %r14
  729. .cfi_restore %r14
  730. pop %r13
  731. .cfi_restore %r13
  732. pop %r12
  733. .cfi_restore %r12
  734. pop %rbx
  735. .cfi_restore %rbx
  736. pop %rbp
  737. .cfi_restore %rbp
  738. .Lbase2_64_avx_epilogue:
  739. jmp .Ldo_avx
  740. .cfi_endproc
  741. .align 32
  742. .Leven_avx:
  743. .cfi_startproc
  744. vmovd 4*0($ctx),$H0 # load hash value
  745. vmovd 4*1($ctx),$H1
  746. vmovd 4*2($ctx),$H2
  747. vmovd 4*3($ctx),$H3
  748. vmovd 4*4($ctx),$H4
  749. .Ldo_avx:
  750. ___
  751. $code.=<<___ if (!$win64);
  752. lea 8(%rsp),%r10
  753. .cfi_def_cfa_register %r10
  754. and \$-32,%rsp
  755. sub \$-8,%rsp
  756. lea -0x58(%rsp),%r11
  757. sub \$0x178,%rsp
  758. ___
  759. $code.=<<___ if ($win64);
  760. lea -0xf8(%rsp),%r11
  761. sub \$0x218,%rsp
  762. vmovdqa %xmm6,0x50(%r11)
  763. vmovdqa %xmm7,0x60(%r11)
  764. vmovdqa %xmm8,0x70(%r11)
  765. vmovdqa %xmm9,0x80(%r11)
  766. vmovdqa %xmm10,0x90(%r11)
  767. vmovdqa %xmm11,0xa0(%r11)
  768. vmovdqa %xmm12,0xb0(%r11)
  769. vmovdqa %xmm13,0xc0(%r11)
  770. vmovdqa %xmm14,0xd0(%r11)
  771. vmovdqa %xmm15,0xe0(%r11)
  772. .Ldo_avx_body:
  773. ___
  774. $code.=<<___;
  775. sub \$64,$len
  776. lea -32($inp),%rax
  777. cmovc %rax,$inp
  778. vmovdqu `16*3`($ctx),$D4 # preload r0^2
  779. lea `16*3+64`($ctx),$ctx # size optimization
  780. lea .Lconst(%rip),%rcx
  781. ################################################################
  782. # load input
  783. vmovdqu 16*2($inp),$T0
  784. vmovdqu 16*3($inp),$T1
  785. vmovdqa 64(%rcx),$MASK # .Lmask26
  786. vpsrldq \$6,$T0,$T2 # splat input
  787. vpsrldq \$6,$T1,$T3
  788. vpunpckhqdq $T1,$T0,$T4 # 4
  789. vpunpcklqdq $T1,$T0,$T0 # 0:1
  790. vpunpcklqdq $T3,$T2,$T3 # 2:3
  791. vpsrlq \$40,$T4,$T4 # 4
  792. vpsrlq \$26,$T0,$T1
  793. vpand $MASK,$T0,$T0 # 0
  794. vpsrlq \$4,$T3,$T2
  795. vpand $MASK,$T1,$T1 # 1
  796. vpsrlq \$30,$T3,$T3
  797. vpand $MASK,$T2,$T2 # 2
  798. vpand $MASK,$T3,$T3 # 3
  799. vpor 32(%rcx),$T4,$T4 # padbit, yes, always
  800. jbe .Lskip_loop_avx
  801. # expand and copy pre-calculated table to stack
  802. vmovdqu `16*1-64`($ctx),$D1
  803. vmovdqu `16*2-64`($ctx),$D2
  804. vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434
  805. vpshufd \$0x44,$D4,$D0 # xx12 -> 1212
  806. vmovdqa $D3,-0x90(%r11)
  807. vmovdqa $D0,0x00(%rsp)
  808. vpshufd \$0xEE,$D1,$D4
  809. vmovdqu `16*3-64`($ctx),$D0
  810. vpshufd \$0x44,$D1,$D1
  811. vmovdqa $D4,-0x80(%r11)
  812. vmovdqa $D1,0x10(%rsp)
  813. vpshufd \$0xEE,$D2,$D3
  814. vmovdqu `16*4-64`($ctx),$D1
  815. vpshufd \$0x44,$D2,$D2
  816. vmovdqa $D3,-0x70(%r11)
  817. vmovdqa $D2,0x20(%rsp)
  818. vpshufd \$0xEE,$D0,$D4
  819. vmovdqu `16*5-64`($ctx),$D2
  820. vpshufd \$0x44,$D0,$D0
  821. vmovdqa $D4,-0x60(%r11)
  822. vmovdqa $D0,0x30(%rsp)
  823. vpshufd \$0xEE,$D1,$D3
  824. vmovdqu `16*6-64`($ctx),$D0
  825. vpshufd \$0x44,$D1,$D1
  826. vmovdqa $D3,-0x50(%r11)
  827. vmovdqa $D1,0x40(%rsp)
  828. vpshufd \$0xEE,$D2,$D4
  829. vmovdqu `16*7-64`($ctx),$D1
  830. vpshufd \$0x44,$D2,$D2
  831. vmovdqa $D4,-0x40(%r11)
  832. vmovdqa $D2,0x50(%rsp)
  833. vpshufd \$0xEE,$D0,$D3
  834. vmovdqu `16*8-64`($ctx),$D2
  835. vpshufd \$0x44,$D0,$D0
  836. vmovdqa $D3,-0x30(%r11)
  837. vmovdqa $D0,0x60(%rsp)
  838. vpshufd \$0xEE,$D1,$D4
  839. vpshufd \$0x44,$D1,$D1
  840. vmovdqa $D4,-0x20(%r11)
  841. vmovdqa $D1,0x70(%rsp)
  842. vpshufd \$0xEE,$D2,$D3
  843. vmovdqa 0x00(%rsp),$D4 # preload r0^2
  844. vpshufd \$0x44,$D2,$D2
  845. vmovdqa $D3,-0x10(%r11)
  846. vmovdqa $D2,0x80(%rsp)
  847. jmp .Loop_avx
  848. .align 32
  849. .Loop_avx:
  850. ################################################################
  851. # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
  852. # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
  853. # \___________________/
  854. # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
  855. # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
  856. # \___________________/ \____________________/
  857. #
  858. # Note that we start with inp[2:3]*r^2. This is because it
  859. # doesn't depend on the reduction in the previous iteration.
  860. ################################################################
  861. # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
  862. # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
  863. # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
  864. # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
  865. # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
  866. #
  867. # though note that $Tx and $Hx are "reversed" in this section,
  868. # and $D4 is preloaded with r0^2...
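# annotation: the 5*rk factors implement the modular wrap-around in base
# 2^26: any partial product of weight 2^130 or higher is reduced via
# 2^130 = 5 (mod p), i.e. multiplied by 5 and moved down five digits,
# which is why 5*r1..5*r4 are kept precomputed in the table.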
  869. vpmuludq $T0,$D4,$D0 # d0 = h0*r0
  870. vpmuludq $T1,$D4,$D1 # d1 = h1*r0
  871. vmovdqa $H2,0x20(%r11) # offload hash
  872. vpmuludq $T2,$D4,$D2 # d3 = h2*r0
  873. vmovdqa 0x10(%rsp),$H2 # r1^2
  874. vpmuludq $T3,$D4,$D3 # d3 = h3*r0
  875. vpmuludq $T4,$D4,$D4 # d4 = h4*r0
  876. vmovdqa $H0,0x00(%r11) #
  877. vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1
  878. vmovdqa $H1,0x10(%r11) #
  879. vpmuludq $T3,$H2,$H1 # h3*r1
  880. vpaddq $H0,$D0,$D0 # d0 += h4*s1
  881. vpaddq $H1,$D4,$D4 # d4 += h3*r1
  882. vmovdqa $H3,0x30(%r11) #
  883. vpmuludq $T2,$H2,$H0 # h2*r1
  884. vpmuludq $T1,$H2,$H1 # h1*r1
  885. vpaddq $H0,$D3,$D3 # d3 += h2*r1
  886. vmovdqa 0x30(%rsp),$H3 # r2^2
  887. vpaddq $H1,$D2,$D2 # d2 += h1*r1
  888. vmovdqa $H4,0x40(%r11) #
  889. vpmuludq $T0,$H2,$H2 # h0*r1
  890. vpmuludq $T2,$H3,$H0 # h2*r2
  891. vpaddq $H2,$D1,$D1 # d1 += h0*r1
  892. vmovdqa 0x40(%rsp),$H4 # s2^2
  893. vpaddq $H0,$D4,$D4 # d4 += h2*r2
  894. vpmuludq $T1,$H3,$H1 # h1*r2
  895. vpmuludq $T0,$H3,$H3 # h0*r2
  896. vpaddq $H1,$D3,$D3 # d3 += h1*r2
  897. vmovdqa 0x50(%rsp),$H2 # r3^2
  898. vpaddq $H3,$D2,$D2 # d2 += h0*r2
  899. vpmuludq $T4,$H4,$H0 # h4*s2
  900. vpmuludq $T3,$H4,$H4 # h3*s2
  901. vpaddq $H0,$D1,$D1 # d1 += h4*s2
  902. vmovdqa 0x60(%rsp),$H3 # s3^2
  903. vpaddq $H4,$D0,$D0 # d0 += h3*s2
  904. vmovdqa 0x80(%rsp),$H4 # s4^2
  905. vpmuludq $T1,$H2,$H1 # h1*r3
  906. vpmuludq $T0,$H2,$H2 # h0*r3
  907. vpaddq $H1,$D4,$D4 # d4 += h1*r3
  908. vpaddq $H2,$D3,$D3 # d3 += h0*r3
  909. vpmuludq $T4,$H3,$H0 # h4*s3
  910. vpmuludq $T3,$H3,$H1 # h3*s3
  911. vpaddq $H0,$D2,$D2 # d2 += h4*s3
  912. vmovdqu 16*0($inp),$H0 # load input
  913. vpaddq $H1,$D1,$D1 # d1 += h3*s3
  914. vpmuludq $T2,$H3,$H3 # h2*s3
  915. vpmuludq $T2,$H4,$T2 # h2*s4
  916. vpaddq $H3,$D0,$D0 # d0 += h2*s3
  917. vmovdqu 16*1($inp),$H1 #
  918. vpaddq $T2,$D1,$D1 # d1 += h2*s4
  919. vpmuludq $T3,$H4,$T3 # h3*s4
  920. vpmuludq $T4,$H4,$T4 # h4*s4
  921. vpsrldq \$6,$H0,$H2 # splat input
  922. vpaddq $T3,$D2,$D2 # d2 += h3*s4
  923. vpaddq $T4,$D3,$D3 # d3 += h4*s4
  924. vpsrldq \$6,$H1,$H3 #
  925. vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4
  926. vpmuludq $T1,$H4,$T0 # h1*s4
  927. vpunpckhqdq $H1,$H0,$H4 # 4
  928. vpaddq $T4,$D4,$D4 # d4 += h0*r4
  929. vmovdqa -0x90(%r11),$T4 # r0^4
  930. vpaddq $T0,$D0,$D0 # d0 += h1*s4
  931. vpunpcklqdq $H1,$H0,$H0 # 0:1
  932. vpunpcklqdq $H3,$H2,$H3 # 2:3
  933. #vpsrlq \$40,$H4,$H4 # 4
  934. vpsrldq \$`40/8`,$H4,$H4 # 4
  935. vpsrlq \$26,$H0,$H1
  936. vpand $MASK,$H0,$H0 # 0
  937. vpsrlq \$4,$H3,$H2
  938. vpand $MASK,$H1,$H1 # 1
  939. vpand 0(%rcx),$H4,$H4 # .Lmask24
  940. vpsrlq \$30,$H3,$H3
  941. vpand $MASK,$H2,$H2 # 2
  942. vpand $MASK,$H3,$H3 # 3
  943. vpor 32(%rcx),$H4,$H4 # padbit, yes, always
  944. vpaddq 0x00(%r11),$H0,$H0 # add hash value
  945. vpaddq 0x10(%r11),$H1,$H1
  946. vpaddq 0x20(%r11),$H2,$H2
  947. vpaddq 0x30(%r11),$H3,$H3
  948. vpaddq 0x40(%r11),$H4,$H4
  949. lea 16*2($inp),%rax
  950. lea 16*4($inp),$inp
  951. sub \$64,$len
  952. cmovc %rax,$inp
  953. ################################################################
  954. # Now we accumulate (inp[0:1]+hash)*r^4
  955. ################################################################
  956. # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
  957. # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
  958. # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
  959. # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
  960. # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
  961. vpmuludq $H0,$T4,$T0 # h0*r0
  962. vpmuludq $H1,$T4,$T1 # h1*r0
  963. vpaddq $T0,$D0,$D0
  964. vpaddq $T1,$D1,$D1
  965. vmovdqa -0x80(%r11),$T2 # r1^4
  966. vpmuludq $H2,$T4,$T0 # h2*r0
  967. vpmuludq $H3,$T4,$T1 # h3*r0
  968. vpaddq $T0,$D2,$D2
  969. vpaddq $T1,$D3,$D3
  970. vpmuludq $H4,$T4,$T4 # h4*r0
  971. vpmuludq -0x70(%r11),$H4,$T0 # h4*s1
  972. vpaddq $T4,$D4,$D4
  973. vpaddq $T0,$D0,$D0 # d0 += h4*s1
  974. vpmuludq $H2,$T2,$T1 # h2*r1
  975. vpmuludq $H3,$T2,$T0 # h3*r1
  976. vpaddq $T1,$D3,$D3 # d3 += h2*r1
  977. vmovdqa -0x60(%r11),$T3 # r2^4
  978. vpaddq $T0,$D4,$D4 # d4 += h3*r1
  979. vpmuludq $H1,$T2,$T1 # h1*r1
  980. vpmuludq $H0,$T2,$T2 # h0*r1
  981. vpaddq $T1,$D2,$D2 # d2 += h1*r1
  982. vpaddq $T2,$D1,$D1 # d1 += h0*r1
  983. vmovdqa -0x50(%r11),$T4 # s2^4
  984. vpmuludq $H2,$T3,$T0 # h2*r2
  985. vpmuludq $H1,$T3,$T1 # h1*r2
  986. vpaddq $T0,$D4,$D4 # d4 += h2*r2
  987. vpaddq $T1,$D3,$D3 # d3 += h1*r2
  988. vmovdqa -0x40(%r11),$T2 # r3^4
  989. vpmuludq $H0,$T3,$T3 # h0*r2
  990. vpmuludq $H4,$T4,$T0 # h4*s2
  991. vpaddq $T3,$D2,$D2 # d2 += h0*r2
  992. vpaddq $T0,$D1,$D1 # d1 += h4*s2
  993. vmovdqa -0x30(%r11),$T3 # s3^4
  994. vpmuludq $H3,$T4,$T4 # h3*s2
  995. vpmuludq $H1,$T2,$T1 # h1*r3
  996. vpaddq $T4,$D0,$D0 # d0 += h3*s2
  997. vmovdqa -0x10(%r11),$T4 # s4^4
  998. vpaddq $T1,$D4,$D4 # d4 += h1*r3
  999. vpmuludq $H0,$T2,$T2 # h0*r3
  1000. vpmuludq $H4,$T3,$T0 # h4*s3
  1001. vpaddq $T2,$D3,$D3 # d3 += h0*r3
  1002. vpaddq $T0,$D2,$D2 # d2 += h4*s3
  1003. vmovdqu 16*2($inp),$T0 # load input
  1004. vpmuludq $H3,$T3,$T2 # h3*s3
  1005. vpmuludq $H2,$T3,$T3 # h2*s3
  1006. vpaddq $T2,$D1,$D1 # d1 += h3*s3
  1007. vmovdqu 16*3($inp),$T1 #
  1008. vpaddq $T3,$D0,$D0 # d0 += h2*s3
  1009. vpmuludq $H2,$T4,$H2 # h2*s4
  1010. vpmuludq $H3,$T4,$H3 # h3*s4
  1011. vpsrldq \$6,$T0,$T2 # splat input
  1012. vpaddq $H2,$D1,$D1 # d1 += h2*s4
  1013. vpmuludq $H4,$T4,$H4 # h4*s4
  1014. vpsrldq \$6,$T1,$T3 #
  1015. vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
  1016. vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
  1017. vpmuludq -0x20(%r11),$H0,$H4 # h0*r4
  1018. vpmuludq $H1,$T4,$H0
  1019. vpunpckhqdq $T1,$T0,$T4 # 4
  1020. vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
  1021. vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
  1022. vpunpcklqdq $T1,$T0,$T0 # 0:1
  1023. vpunpcklqdq $T3,$T2,$T3 # 2:3
  1024. #vpsrlq \$40,$T4,$T4 # 4
  1025. vpsrldq \$`40/8`,$T4,$T4 # 4
  1026. vpsrlq \$26,$T0,$T1
  1027. vmovdqa 0x00(%rsp),$D4 # preload r0^2
  1028. vpand $MASK,$T0,$T0 # 0
  1029. vpsrlq \$4,$T3,$T2
  1030. vpand $MASK,$T1,$T1 # 1
  1031. vpand 0(%rcx),$T4,$T4 # .Lmask24
  1032. vpsrlq \$30,$T3,$T3
  1033. vpand $MASK,$T2,$T2 # 2
  1034. vpand $MASK,$T3,$T3 # 3
  1035. vpor 32(%rcx),$T4,$T4 # padbit, yes, always
  1036. ################################################################
  1037. # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
  1038. # and P. Schwabe
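# annotation: carries are propagated along two interleaved chains,
#   h3 -> h4, h0 -> h1, h4 -> h0 (times 5), h1 -> h2, h2 -> h3,
#   h0 -> h1, h3 -> h4
# leaving each digit only slightly above 26 bits, instead of running a
# full carry sweep after every multiplication.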
  1039. vpsrlq \$26,$H3,$D3
  1040. vpand $MASK,$H3,$H3
  1041. vpaddq $D3,$H4,$H4 # h3 -> h4
  1042. vpsrlq \$26,$H0,$D0
  1043. vpand $MASK,$H0,$H0
  1044. vpaddq $D0,$D1,$H1 # h0 -> h1
  1045. vpsrlq \$26,$H4,$D0
  1046. vpand $MASK,$H4,$H4
  1047. vpsrlq \$26,$H1,$D1
  1048. vpand $MASK,$H1,$H1
  1049. vpaddq $D1,$H2,$H2 # h1 -> h2
  1050. vpaddq $D0,$H0,$H0
  1051. vpsllq \$2,$D0,$D0
  1052. vpaddq $D0,$H0,$H0 # h4 -> h0
  1053. vpsrlq \$26,$H2,$D2
  1054. vpand $MASK,$H2,$H2
  1055. vpaddq $D2,$H3,$H3 # h2 -> h3
  1056. vpsrlq \$26,$H0,$D0
  1057. vpand $MASK,$H0,$H0
  1058. vpaddq $D0,$H1,$H1 # h0 -> h1
  1059. vpsrlq \$26,$H3,$D3
  1060. vpand $MASK,$H3,$H3
  1061. vpaddq $D3,$H4,$H4 # h3 -> h4
  1062. ja .Loop_avx
  1063. .Lskip_loop_avx:
  1064. ################################################################
  1065. # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
  1066. vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2
  1067. add \$32,$len
  1068. jnz .Long_tail_avx
  1069. vpaddq $H2,$T2,$T2
  1070. vpaddq $H0,$T0,$T0
  1071. vpaddq $H1,$T1,$T1
  1072. vpaddq $H3,$T3,$T3
  1073. vpaddq $H4,$T4,$T4
  1074. .Long_tail_avx:
  1075. vmovdqa $H2,0x20(%r11)
  1076. vmovdqa $H0,0x00(%r11)
  1077. vmovdqa $H1,0x10(%r11)
  1078. vmovdqa $H3,0x30(%r11)
  1079. vmovdqa $H4,0x40(%r11)
  1080. # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
  1081. # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
  1082. # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
  1083. # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
  1084. # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
  1085. vpmuludq $T2,$D4,$D2 # d2 = h2*r0
  1086. vpmuludq $T0,$D4,$D0 # d0 = h0*r0
  1087. vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n
  1088. vpmuludq $T1,$D4,$D1 # d1 = h1*r0
  1089. vpmuludq $T3,$D4,$D3 # d3 = h3*r0
  1090. vpmuludq $T4,$D4,$D4 # d4 = h4*r0
  1091. vpmuludq $T3,$H2,$H0 # h3*r1
  1092. vpaddq $H0,$D4,$D4 # d4 += h3*r1
  1093. vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n
  1094. vpmuludq $T2,$H2,$H1 # h2*r1
  1095. vpaddq $H1,$D3,$D3 # d3 += h2*r1
  1096. vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
  1097. vpmuludq $T1,$H2,$H0 # h1*r1
  1098. vpaddq $H0,$D2,$D2 # d2 += h1*r1
  1099. vpmuludq $T0,$H2,$H2 # h0*r1
  1100. vpaddq $H2,$D1,$D1 # d1 += h0*r1
  1101. vpmuludq $T4,$H3,$H3 # h4*s1
  1102. vpaddq $H3,$D0,$D0 # d0 += h4*s1
  1103. vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n
  1104. vpmuludq $T2,$H4,$H1 # h2*r2
  1105. vpaddq $H1,$D4,$D4 # d4 += h2*r2
  1106. vpmuludq $T1,$H4,$H0 # h1*r2
  1107. vpaddq $H0,$D3,$D3 # d3 += h1*r2
  1108. vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n
  1109. vpmuludq $T0,$H4,$H4 # h0*r2
  1110. vpaddq $H4,$D2,$D2 # d2 += h0*r2
  1111. vpmuludq $T4,$H2,$H1 # h4*s2
  1112. vpaddq $H1,$D1,$D1 # d1 += h4*s2
  1113. vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n
  1114. vpmuludq $T3,$H2,$H2 # h3*s2
  1115. vpaddq $H2,$D0,$D0 # d0 += h3*s2
  1116. vpmuludq $T1,$H3,$H0 # h1*r3
  1117. vpaddq $H0,$D4,$D4 # d4 += h1*r3
  1118. vpmuludq $T0,$H3,$H3 # h0*r3
  1119. vpaddq $H3,$D3,$D3 # d3 += h0*r3
  1120. vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n
  1121. vpmuludq $T4,$H4,$H1 # h4*s3
  1122. vpaddq $H1,$D2,$D2 # d2 += h4*s3
  1123. vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n
  1124. vpmuludq $T3,$H4,$H0 # h3*s3
  1125. vpaddq $H0,$D1,$D1 # d1 += h3*s3
  1126. vpmuludq $T2,$H4,$H4 # h2*s3
  1127. vpaddq $H4,$D0,$D0 # d0 += h2*s3
  1128. vpmuludq $T0,$H2,$H2 # h0*r4
  1129. vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4
  1130. vpmuludq $T4,$H3,$H1 # h4*s4
  1131. vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4
  1132. vpmuludq $T3,$H3,$H0 # h3*s4
  1133. vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4
  1134. vpmuludq $T2,$H3,$H1 # h2*s4
  1135. vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4
  1136. vpmuludq $T1,$H3,$H3 # h1*s4
  1137. vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4
  1138. jz .Lshort_tail_avx
  1139. vmovdqu 16*0($inp),$H0 # load input
  1140. vmovdqu 16*1($inp),$H1
  1141. vpsrldq \$6,$H0,$H2 # splat input
  1142. vpsrldq \$6,$H1,$H3
  1143. vpunpckhqdq $H1,$H0,$H4 # 4
  1144. vpunpcklqdq $H1,$H0,$H0 # 0:1
  1145. vpunpcklqdq $H3,$H2,$H3 # 2:3
  1146. vpsrlq \$40,$H4,$H4 # 4
  1147. vpsrlq \$26,$H0,$H1
  1148. vpand $MASK,$H0,$H0 # 0
  1149. vpsrlq \$4,$H3,$H2
  1150. vpand $MASK,$H1,$H1 # 1
  1151. vpsrlq \$30,$H3,$H3
  1152. vpand $MASK,$H2,$H2 # 2
  1153. vpand $MASK,$H3,$H3 # 3
  1154. vpor 32(%rcx),$H4,$H4 # padbit, yes, always
  1155. vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4
  1156. vpaddq 0x00(%r11),$H0,$H0
  1157. vpaddq 0x10(%r11),$H1,$H1
  1158. vpaddq 0x20(%r11),$H2,$H2
  1159. vpaddq 0x30(%r11),$H3,$H3
  1160. vpaddq 0x40(%r11),$H4,$H4
  1161. ################################################################
  1162. # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
  1163. vpmuludq $H0,$T4,$T0 # h0*r0
  1164. vpaddq $T0,$D0,$D0 # d0 += h0*r0
  1165. vpmuludq $H1,$T4,$T1 # h1*r0
  1166. vpaddq $T1,$D1,$D1 # d1 += h1*r0
  1167. vpmuludq $H2,$T4,$T0 # h2*r0
  1168. vpaddq $T0,$D2,$D2 # d2 += h2*r0
  1169. vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n
  1170. vpmuludq $H3,$T4,$T1 # h3*r0
  1171. vpaddq $T1,$D3,$D3 # d3 += h3*r0
  1172. vpmuludq $H4,$T4,$T4 # h4*r0
  1173. vpaddq $T4,$D4,$D4 # d4 += h4*r0
  1174. vpmuludq $H3,$T2,$T0 # h3*r1
  1175. vpaddq $T0,$D4,$D4 # d4 += h3*r1
  1176. vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1
  1177. vpmuludq $H2,$T2,$T1 # h2*r1
  1178. vpaddq $T1,$D3,$D3 # d3 += h2*r1
  1179. vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
  1180. vpmuludq $H1,$T2,$T0 # h1*r1
  1181. vpaddq $T0,$D2,$D2 # d2 += h1*r1
  1182. vpmuludq $H0,$T2,$T2 # h0*r1
  1183. vpaddq $T2,$D1,$D1 # d1 += h0*r1
  1184. vpmuludq $H4,$T3,$T3 # h4*s1
  1185. vpaddq $T3,$D0,$D0 # d0 += h4*s1
  1186. vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2
  1187. vpmuludq $H2,$T4,$T1 # h2*r2
  1188. vpaddq $T1,$D4,$D4 # d4 += h2*r2
  1189. vpmuludq $H1,$T4,$T0 # h1*r2
  1190. vpaddq $T0,$D3,$D3 # d3 += h1*r2
  1191. vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3
  1192. vpmuludq $H0,$T4,$T4 # h0*r2
  1193. vpaddq $T4,$D2,$D2 # d2 += h0*r2
  1194. vpmuludq $H4,$T2,$T1 # h4*s2
  1195. vpaddq $T1,$D1,$D1 # d1 += h4*s2
  1196. vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3
  1197. vpmuludq $H3,$T2,$T2 # h3*s2
  1198. vpaddq $T2,$D0,$D0 # d0 += h3*s2
  1199. vpmuludq $H1,$T3,$T0 # h1*r3
  1200. vpaddq $T0,$D4,$D4 # d4 += h1*r3
  1201. vpmuludq $H0,$T3,$T3 # h0*r3
  1202. vpaddq $T3,$D3,$D3 # d3 += h0*r3
  1203. vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4
  1204. vpmuludq $H4,$T4,$T1 # h4*s3
  1205. vpaddq $T1,$D2,$D2 # d2 += h4*s3
  1206. vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4
  1207. vpmuludq $H3,$T4,$T0 # h3*s3
  1208. vpaddq $T0,$D1,$D1 # d1 += h3*s3
  1209. vpmuludq $H2,$T4,$T4 # h2*s3
  1210. vpaddq $T4,$D0,$D0 # d0 += h2*s3
  1211. vpmuludq $H0,$T2,$T2 # h0*r4
  1212. vpaddq $T2,$D4,$D4 # d4 += h0*r4
  1213. vpmuludq $H4,$T3,$T1 # h4*s4
  1214. vpaddq $T1,$D3,$D3 # d3 += h4*s4
  1215. vpmuludq $H3,$T3,$T0 # h3*s4
  1216. vpaddq $T0,$D2,$D2 # d2 += h3*s4
  1217. vpmuludq $H2,$T3,$T1 # h2*s4
  1218. vpaddq $T1,$D1,$D1 # d1 += h2*s4
  1219. vpmuludq $H1,$T3,$T3 # h1*s4
  1220. vpaddq $T3,$D0,$D0 # d0 += h1*s4
  1221. .Lshort_tail_avx:
  1222. ################################################################
  1223. # horizontal addition
  1224. vpsrldq \$8,$D4,$T4
  1225. vpsrldq \$8,$D3,$T3
  1226. vpsrldq \$8,$D1,$T1
  1227. vpsrldq \$8,$D0,$T0
  1228. vpsrldq \$8,$D2,$T2
  1229. vpaddq $T3,$D3,$D3
  1230. vpaddq $T4,$D4,$D4
  1231. vpaddq $T0,$D0,$D0
  1232. vpaddq $T1,$D1,$D1
  1233. vpaddq $T2,$D2,$D2
  1234. ################################################################
  1235. # lazy reduction
  1236. vpsrlq \$26,$D3,$H3
  1237. vpand $MASK,$D3,$D3
  1238. vpaddq $H3,$D4,$D4 # h3 -> h4
  1239. vpsrlq \$26,$D0,$H0
  1240. vpand $MASK,$D0,$D0
  1241. vpaddq $H0,$D1,$D1 # h0 -> h1
  1242. vpsrlq \$26,$D4,$H4
  1243. vpand $MASK,$D4,$D4
  1244. vpsrlq \$26,$D1,$H1
  1245. vpand $MASK,$D1,$D1
  1246. vpaddq $H1,$D2,$D2 # h1 -> h2
  1247. vpaddq $H4,$D0,$D0
  1248. vpsllq \$2,$H4,$H4
  1249. vpaddq $H4,$D0,$D0 # h4 -> h0
  1250. vpsrlq \$26,$D2,$H2
  1251. vpand $MASK,$D2,$D2
  1252. vpaddq $H2,$D3,$D3 # h2 -> h3
  1253. vpsrlq \$26,$D0,$H0
  1254. vpand $MASK,$D0,$D0
  1255. vpaddq $H0,$D1,$D1 # h0 -> h1
  1256. vpsrlq \$26,$D3,$H3
  1257. vpand $MASK,$D3,$D3
  1258. vpaddq $H3,$D4,$D4 # h3 -> h4
  1259. vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
  1260. vmovd $D1,`4*1-48-64`($ctx)
  1261. vmovd $D2,`4*2-48-64`($ctx)
  1262. vmovd $D3,`4*3-48-64`($ctx)
  1263. vmovd $D4,`4*4-48-64`($ctx)
  1264. ___
  1265. $code.=<<___ if ($win64);
  1266. vmovdqa 0x50(%r11),%xmm6
  1267. vmovdqa 0x60(%r11),%xmm7
  1268. vmovdqa 0x70(%r11),%xmm8
  1269. vmovdqa 0x80(%r11),%xmm9
  1270. vmovdqa 0x90(%r11),%xmm10
  1271. vmovdqa 0xa0(%r11),%xmm11
  1272. vmovdqa 0xb0(%r11),%xmm12
  1273. vmovdqa 0xc0(%r11),%xmm13
  1274. vmovdqa 0xd0(%r11),%xmm14
  1275. vmovdqa 0xe0(%r11),%xmm15
  1276. lea 0xf8(%r11),%rsp
  1277. .Ldo_avx_epilogue:
  1278. ___
  1279. $code.=<<___ if (!$win64);
  1280. lea -8(%r10),%rsp
  1281. .cfi_def_cfa_register %rsp
  1282. ___
  1283. $code.=<<___;
  1284. vzeroupper
  1285. RET
  1286. .cfi_endproc
  1287. ___
  1288. &end_function("poly1305_blocks_avx");
  1289. &declare_function("poly1305_emit_avx", 32, 3);
  1290. $code.=<<___;
  1291. cmpl \$0,20($ctx) # is_base2_26?
  1292. je .Lemit
  1293. mov 0($ctx),%eax # load hash value base 2^26
  1294. mov 4($ctx),%ecx
  1295. mov 8($ctx),%r8d
  1296. mov 12($ctx),%r11d
  1297. mov 16($ctx),%r10d
  1298. shl \$26,%rcx # base 2^26 -> base 2^64
  1299. mov %r8,%r9
  1300. shl \$52,%r8
  1301. add %rcx,%rax
  1302. shr \$12,%r9
  1303. add %rax,%r8 # h0
  1304. adc \$0,%r9
  1305. shl \$14,%r11
  1306. mov %r10,%rax
  1307. shr \$24,%r10
  1308. add %r11,%r9
  1309. shl \$40,%rax
  1310. add %rax,%r9 # h1
  1311. adc \$0,%r10 # h2
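# (what was just rebuilt, informally:
#   h = h[0] + h[1]*2^26 + h[2]*2^52 + h[3]*2^78 + h[4]*2^104
# packed into r8 = bits 0..63, r9 = bits 64..127, r10 = bits 128 and up;
# e.g. h[2]*2^52 lands with its low 12 bits in r8 via the shl 52 and the
# rest in r9 via the shr 12, and likewise for the other shl/shr pairs.)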
  1312. mov %r10,%rax # could be partially reduced, so reduce
  1313. mov %r10,%rcx
  1314. and \$3,%r10
  1315. shr \$2,%rax
  1316. and \$-4,%rcx
  1317. add %rcx,%rax
  1318. add %rax,%r8
  1319. adc \$0,%r9
  1320. adc \$0,%r10
  1321. mov %r8,%rax
  1322. add \$5,%r8 # compare to modulus
  1323. mov %r9,%rcx
  1324. adc \$0,%r9
  1325. adc \$0,%r10
  1326. shr \$2,%r10 # did 130-bit value overflow?
  1327. cmovnz %r8,%rax
  1328. cmovnz %r9,%rcx
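# (the idea: compute both h and h+5; if h+5 reaches 2^130 -- visible as a
# non-zero result of the shr 2 on the top word -- then h was at least
# 2^130-5 and the low bits of h+5 are exactly h mod p, so cmovnz keeps
# them; otherwise h itself is already the canonical residue.)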
  1329. add 0($nonce),%rax # accumulate nonce
  1330. adc 8($nonce),%rcx
  1331. mov %rax,0($mac) # write result
  1332. mov %rcx,8($mac)
  1333. RET
  1334. ___
  1335. &end_function("poly1305_emit_avx");
  1336. if ($avx>1) {
  1337. my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
  1338. map("%ymm$_",(0..15));
  1339. my $S4=$MASK;
  1340. sub poly1305_blocks_avxN {
  1341. my ($avx512) = @_;
  1342. my $suffix = $avx512 ? "_avx512" : "";
  1343. $code.=<<___;
  1344. .cfi_startproc
  1345. mov 20($ctx),%r8d # is_base2_26
  1346. cmp \$128,$len
  1347. jae .Lblocks_avx2$suffix
  1348. test %r8d,%r8d
  1349. jz .Lblocks
  1350. .Lblocks_avx2$suffix:
  1351. and \$-16,$len
  1352. jz .Lno_data_avx2$suffix
  1353. vzeroupper
  1354. test %r8d,%r8d
  1355. jz .Lbase2_64_avx2$suffix
  1356. test \$63,$len
  1357. jz .Leven_avx2$suffix
  1358. push %rbp
  1359. .cfi_push %rbp
  1360. mov %rsp,%rbp
  1361. push %rbx
  1362. .cfi_push %rbx
  1363. push %r12
  1364. .cfi_push %r12
  1365. push %r13
  1366. .cfi_push %r13
  1367. push %r14
  1368. .cfi_push %r14
  1369. push %r15
  1370. .cfi_push %r15
  1371. .Lblocks_avx2_body$suffix:
  1372. mov $len,%r15 # reassign $len
  1373. mov 0($ctx),$d1 # load hash value
  1374. mov 8($ctx),$d2
  1375. mov 16($ctx),$h2#d
  1376. mov 24($ctx),$r0 # load r
  1377. mov 32($ctx),$s1
  1378. ################################# base 2^26 -> base 2^64
  1379. mov $d1#d,$h0#d
  1380. and \$`-1*(1<<31)`,$d1
  1381. mov $d2,$r1 # borrow $r1
  1382. mov $d2#d,$h1#d
  1383. and \$`-1*(1<<31)`,$d2
  1384. shr \$6,$d1
  1385. shl \$52,$r1
  1386. add $d1,$h0
  1387. shr \$12,$h1
  1388. shr \$18,$d2
  1389. add $r1,$h0
  1390. adc $d2,$h1
  1391. mov $h2,$d1
  1392. shl \$40,$d1
  1393. shr \$24,$h2
  1394. add $d1,$h1
  1395. adc \$0,$h2 # can be partially reduced...
  1396. mov \$-4,$d2 # ... so reduce
  1397. mov $h2,$d1
  1398. and $h2,$d2
  1399. shr \$2,$d1
  1400. and \$3,$h2
  1401. add $d2,$d1 # =*5
  1402. add $d1,$h0
  1403. adc \$0,$h1
  1404. adc \$0,$h2
  1405. mov $s1,$r1
  1406. mov $s1,%rax
  1407. shr \$2,$s1
  1408. add $r1,$s1 # s1 = r1 + (r1 >> 2)
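# (key clamping forces the low 2 bits of r1 to zero, so r1 + (r1 >> 2) is
# exactly 5*r1/4; roughly, cross terms of weight 2^128 = 2^130/4 can then
# be folded straight down to weight 2^0 by multiplying with s1, the factor
# 5 from 2^130 == 5 mod 2^130-5 and the division by 4 already built in.)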
  1409. .Lbase2_26_pre_avx2$suffix:
  1410. add 0($inp),$h0 # accumulate input
  1411. adc 8($inp),$h1
  1412. lea 16($inp),$inp
  1413. adc $padbit,$h2
  1414. sub \$16,%r15
  1415. call __poly1305_block
  1416. mov $r1,%rax
  1417. test \$63,%r15
  1418. jnz .Lbase2_26_pre_avx2$suffix
  1419. test $padbit,$padbit # if $padbit is zero,
  1420. jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format
  1421. ################################# base 2^64 -> base 2^26
  1422. mov $h0,%rax
  1423. mov $h0,%rdx
  1424. shr \$52,$h0
  1425. mov $h1,$r0
  1426. mov $h1,$r1
  1427. shr \$26,%rdx
  1428. and \$0x3ffffff,%rax # h[0]
  1429. shl \$12,$r0
  1430. and \$0x3ffffff,%rdx # h[1]
  1431. shr \$14,$h1
  1432. or $r0,$h0
  1433. shl \$24,$h2
  1434. and \$0x3ffffff,$h0 # h[2]
  1435. shr \$40,$r1
  1436. and \$0x3ffffff,$h1 # h[3]
  1437. or $r1,$h2 # h[4]
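# (the split just performed, with h0,h1,h2 naming the base 2^64 words:
#   h[0] =  h0         & (2^26-1)
#   h[1] = (h0 >> 26)  & (2^26-1)
#   h[2] = (h0 >> 52 | h1 << 12) & (2^26-1)
#   h[3] = (h1 >> 14)  & (2^26-1)
#   h[4] = (h1 >> 40) | (h2 << 24) )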
  1438. test %r15,%r15
  1439. jz .Lstore_base2_26_avx2$suffix
  1440. vmovd %rax#d,%x#$H0
  1441. vmovd %rdx#d,%x#$H1
  1442. vmovd $h0#d,%x#$H2
  1443. vmovd $h1#d,%x#$H3
  1444. vmovd $h2#d,%x#$H4
  1445. jmp .Lproceed_avx2$suffix
  1446. .align 32
  1447. .Lstore_base2_64_avx2$suffix:
  1448. mov $h0,0($ctx)
  1449. mov $h1,8($ctx)
  1450. mov $h2,16($ctx) # note that is_base2_26 is zeroed
  1451. jmp .Ldone_avx2$suffix
  1452. .align 16
  1453. .Lstore_base2_26_avx2$suffix:
  1454. mov %rax#d,0($ctx) # store hash value base 2^26
  1455. mov %rdx#d,4($ctx)
  1456. mov $h0#d,8($ctx)
  1457. mov $h1#d,12($ctx)
  1458. mov $h2#d,16($ctx)
  1459. .align 16
  1460. .Ldone_avx2$suffix:
  1461. pop %r15
  1462. .cfi_restore %r15
  1463. pop %r14
  1464. .cfi_restore %r14
  1465. pop %r13
  1466. .cfi_restore %r13
  1467. pop %r12
  1468. .cfi_restore %r12
  1469. pop %rbx
  1470. .cfi_restore %rbx
  1471. pop %rbp
  1472. .cfi_restore %rbp
  1473. .Lno_data_avx2$suffix:
  1474. .Lblocks_avx2_epilogue$suffix:
  1475. RET
  1476. .cfi_endproc
  1477. .align 32
  1478. .Lbase2_64_avx2$suffix:
  1479. .cfi_startproc
  1480. push %rbp
  1481. .cfi_push %rbp
  1482. mov %rsp,%rbp
  1483. push %rbx
  1484. .cfi_push %rbx
  1485. push %r12
  1486. .cfi_push %r12
  1487. push %r13
  1488. .cfi_push %r13
  1489. push %r14
  1490. .cfi_push %r14
  1491. push %r15
  1492. .cfi_push %r15
  1493. .Lbase2_64_avx2_body$suffix:
  1494. mov $len,%r15 # reassign $len
  1495. mov 24($ctx),$r0 # load r
  1496. mov 32($ctx),$s1
  1497. mov 0($ctx),$h0 # load hash value
  1498. mov 8($ctx),$h1
  1499. mov 16($ctx),$h2#d
  1500. mov $s1,$r1
  1501. mov $s1,%rax
  1502. shr \$2,$s1
  1503. add $r1,$s1 # s1 = r1 + (r1 >> 2)
  1504. test \$63,$len
  1505. jz .Linit_avx2$suffix
  1506. .Lbase2_64_pre_avx2$suffix:
  1507. add 0($inp),$h0 # accumulate input
  1508. adc 8($inp),$h1
  1509. lea 16($inp),$inp
  1510. adc $padbit,$h2
  1511. sub \$16,%r15
  1512. call __poly1305_block
  1513. mov $r1,%rax
  1514. test \$63,%r15
  1515. jnz .Lbase2_64_pre_avx2$suffix
  1516. .Linit_avx2$suffix:
  1517. ################################# base 2^64 -> base 2^26
  1518. mov $h0,%rax
  1519. mov $h0,%rdx
  1520. shr \$52,$h0
  1521. mov $h1,$d1
  1522. mov $h1,$d2
  1523. shr \$26,%rdx
  1524. and \$0x3ffffff,%rax # h[0]
  1525. shl \$12,$d1
  1526. and \$0x3ffffff,%rdx # h[1]
  1527. shr \$14,$h1
  1528. or $d1,$h0
  1529. shl \$24,$h2
  1530. and \$0x3ffffff,$h0 # h[2]
  1531. shr \$40,$d2
  1532. and \$0x3ffffff,$h1 # h[3]
  1533. or $d2,$h2 # h[4]
  1534. vmovd %rax#d,%x#$H0
  1535. vmovd %rdx#d,%x#$H1
  1536. vmovd $h0#d,%x#$H2
  1537. vmovd $h1#d,%x#$H3
  1538. vmovd $h2#d,%x#$H4
  1539. movl \$1,20($ctx) # set is_base2_26
  1540. call __poly1305_init_avx
  1541. .Lproceed_avx2$suffix:
  1542. mov %r15,$len # restore $len
  1543. ___
  1544. $code.=<<___ if (!$kernel);
  1545. mov OPENSSL_ia32cap_P+8(%rip),%r9d
  1546. mov \$`(1<<31|1<<30|1<<16)`,%r11d
  1547. ___
  1548. $code.=<<___;
  1549. pop %r15
  1550. .cfi_restore %r15
  1551. pop %r14
  1552. .cfi_restore %r14
  1553. pop %r13
  1554. .cfi_restore %r13
  1555. pop %r12
  1556. .cfi_restore %r12
  1557. pop %rbx
  1558. .cfi_restore %rbx
  1559. pop %rbp
  1560. .cfi_restore %rbp
  1561. .Lbase2_64_avx2_epilogue$suffix:
  1562. jmp .Ldo_avx2$suffix
  1563. .cfi_endproc
  1564. .align 32
  1565. .Leven_avx2$suffix:
  1566. .cfi_startproc
  1567. ___
  1568. $code.=<<___ if (!$kernel);
  1569. mov OPENSSL_ia32cap_P+8(%rip),%r9d
  1570. ___
  1571. $code.=<<___;
  1572. vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
  1573. vmovd 4*1($ctx),%x#$H1
  1574. vmovd 4*2($ctx),%x#$H2
  1575. vmovd 4*3($ctx),%x#$H3
  1576. vmovd 4*4($ctx),%x#$H4
  1577. .Ldo_avx2$suffix:
  1578. ___
  1579. $code.=<<___ if (!$kernel && $avx>2);
  1580. cmp \$512,$len
  1581. jb .Lskip_avx512
  1582. and %r11d,%r9d
  1583. test \$`1<<16`,%r9d # check for AVX512F
  1584. jnz .Lblocks_avx512
  1585. .Lskip_avx512$suffix:
  1586. ___
  1587. $code.=<<___ if ($avx > 2 && $avx512 && $kernel);
  1588. cmp \$512,$len
  1589. jae .Lblocks_avx512
  1590. ___
  1591. $code.=<<___ if (!$win64);
  1592. lea 8(%rsp),%r10
  1593. .cfi_def_cfa_register %r10
  1594. sub \$0x128,%rsp
  1595. ___
  1596. $code.=<<___ if ($win64);
  1597. lea 8(%rsp),%r10
  1598. sub \$0x1c8,%rsp
  1599. vmovdqa %xmm6,-0xb0(%r10)
  1600. vmovdqa %xmm7,-0xa0(%r10)
  1601. vmovdqa %xmm8,-0x90(%r10)
  1602. vmovdqa %xmm9,-0x80(%r10)
  1603. vmovdqa %xmm10,-0x70(%r10)
  1604. vmovdqa %xmm11,-0x60(%r10)
  1605. vmovdqa %xmm12,-0x50(%r10)
  1606. vmovdqa %xmm13,-0x40(%r10)
  1607. vmovdqa %xmm14,-0x30(%r10)
  1608. vmovdqa %xmm15,-0x20(%r10)
  1609. .Ldo_avx2_body$suffix:
  1610. ___
  1611. $code.=<<___;
  1612. lea .Lconst(%rip),%rcx
  1613. lea 48+64($ctx),$ctx # size optimization
  1614. vmovdqa 96(%rcx),$T0 # .Lpermd_avx2
  1615. # expand and copy pre-calculated table to stack
  1616. vmovdqu `16*0-64`($ctx),%x#$T2
  1617. and \$-512,%rsp
  1618. vmovdqu `16*1-64`($ctx),%x#$T3
  1619. vmovdqu `16*2-64`($ctx),%x#$T4
  1620. vmovdqu `16*3-64`($ctx),%x#$D0
  1621. vmovdqu `16*4-64`($ctx),%x#$D1
  1622. vmovdqu `16*5-64`($ctx),%x#$D2
  1623. lea 0x90(%rsp),%rax # size optimization
  1624. vmovdqu `16*6-64`($ctx),%x#$D3
  1625. vpermd $T2,$T0,$T2 # 00003412 -> 14243444
  1626. vmovdqu `16*7-64`($ctx),%x#$D4
  1627. vpermd $T3,$T0,$T3
  1628. vmovdqu `16*8-64`($ctx),%x#$MASK
  1629. vpermd $T4,$T0,$T4
  1630. vmovdqa $T2,0x00(%rsp)
  1631. vpermd $D0,$T0,$D0
  1632. vmovdqa $T3,0x20-0x90(%rax)
  1633. vpermd $D1,$T0,$D1
  1634. vmovdqa $T4,0x40-0x90(%rax)
  1635. vpermd $D2,$T0,$D2
  1636. vmovdqa $D0,0x60-0x90(%rax)
  1637. vpermd $D3,$T0,$D3
  1638. vmovdqa $D1,0x80-0x90(%rax)
  1639. vpermd $D4,$T0,$D4
  1640. vmovdqa $D2,0xa0-0x90(%rax)
  1641. vpermd $MASK,$T0,$MASK
  1642. vmovdqa $D3,0xc0-0x90(%rax)
  1643. vmovdqa $D4,0xe0-0x90(%rax)
  1644. vmovdqa $MASK,0x100-0x90(%rax)
  1645. vmovdqa 64(%rcx),$MASK # .Lmask26
  1646. ################################################################
  1647. # load input
  1648. vmovdqu 16*0($inp),%x#$T0
  1649. vmovdqu 16*1($inp),%x#$T1
  1650. vinserti128 \$1,16*2($inp),$T0,$T0
  1651. vinserti128 \$1,16*3($inp),$T1,$T1
  1652. lea 16*4($inp),$inp
  1653. vpsrldq \$6,$T0,$T2 # splat input
  1654. vpsrldq \$6,$T1,$T3
  1655. vpunpckhqdq $T1,$T0,$T4 # 4
  1656. vpunpcklqdq $T3,$T2,$T2 # 2:3
  1657. vpunpcklqdq $T1,$T0,$T0 # 0:1
  1658. vpsrlq \$30,$T2,$T3
  1659. vpsrlq \$4,$T2,$T2
  1660. vpsrlq \$26,$T0,$T1
  1661. vpsrlq \$40,$T4,$T4 # 4
  1662. vpand $MASK,$T2,$T2 # 2
  1663. vpand $MASK,$T0,$T0 # 0
  1664. vpand $MASK,$T1,$T1 # 1
  1665. vpand $MASK,$T3,$T3 # 3
  1666. vpor 32(%rcx),$T4,$T4 # padbit, yes, always
  1667. vpaddq $H2,$T2,$H2 # accumulate input
  1668. sub \$64,$len
  1669. jz .Ltail_avx2$suffix
  1670. jmp .Loop_avx2$suffix
  1671. .align 32
  1672. .Loop_avx2$suffix:
  1673. ################################################################
  1674. # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
  1675. # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
  1676. # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
  1677. # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
  1678. # \________/\__________/
  1679. ################################################################
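# (informally: each of the four qword lanes accumulates every 4th block;
# multiplying all lanes by r^4 per iteration and, in the tail, by r^4, r^3,
# r^2, r^1 respectively means that summing the lanes afterwards reproduces
# the usual serial Horner evaluation, just four blocks at a time.)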
  1680. #vpaddq $H2,$T2,$H2 # accumulate input
  1681. vpaddq $H0,$T0,$H0
  1682. vmovdqa `32*0`(%rsp),$T0 # r0^4
  1683. vpaddq $H1,$T1,$H1
  1684. vmovdqa `32*1`(%rsp),$T1 # r1^4
  1685. vpaddq $H3,$T3,$H3
  1686. vmovdqa `32*3`(%rsp),$T2 # r2^4
  1687. vpaddq $H4,$T4,$H4
  1688. vmovdqa `32*6-0x90`(%rax),$T3 # s3^4
  1689. vmovdqa `32*8-0x90`(%rax),$S4 # s4^4
  1690. # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
  1691. # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
  1692. # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
  1693. # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
  1694. # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
  1695. #
# however, as h2 is "chronologically" the first one available, pull the
# corresponding operations up, so the order becomes:
  1698. #
  1699. # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
  1700. # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
  1701. # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
  1702. # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
  1703. # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
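# (bound check, roughly: every h limb is at most a few bits over 2^26 and
# every r/s table entry is under 2^29 (sN = 5*rN), so each of the products
# below is under 2^56 and a sum of five of them stays far below 2^64; no
# intermediate carries are needed anywhere in this block.)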
  1704. vpmuludq $H2,$T0,$D2 # d2 = h2*r0
  1705. vpmuludq $H2,$T1,$D3 # d3 = h2*r1
  1706. vpmuludq $H2,$T2,$D4 # d4 = h2*r2
  1707. vpmuludq $H2,$T3,$D0 # d0 = h2*s3
  1708. vpmuludq $H2,$S4,$D1 # d1 = h2*s4
  1709. vpmuludq $H0,$T1,$T4 # h0*r1
  1710. vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp
  1711. vpaddq $T4,$D1,$D1 # d1 += h0*r1
  1712. vpaddq $H2,$D2,$D2 # d2 += h1*r1
  1713. vpmuludq $H3,$T1,$T4 # h3*r1
  1714. vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1
  1715. vpaddq $T4,$D4,$D4 # d4 += h3*r1
  1716. vpaddq $H2,$D0,$D0 # d0 += h4*s1
  1717. vmovdqa `32*4-0x90`(%rax),$T1 # s2
  1718. vpmuludq $H0,$T0,$T4 # h0*r0
  1719. vpmuludq $H1,$T0,$H2 # h1*r0
  1720. vpaddq $T4,$D0,$D0 # d0 += h0*r0
  1721. vpaddq $H2,$D1,$D1 # d1 += h1*r0
  1722. vpmuludq $H3,$T0,$T4 # h3*r0
  1723. vpmuludq $H4,$T0,$H2 # h4*r0
  1724. vmovdqu 16*0($inp),%x#$T0 # load input
  1725. vpaddq $T4,$D3,$D3 # d3 += h3*r0
  1726. vpaddq $H2,$D4,$D4 # d4 += h4*r0
  1727. vinserti128 \$1,16*2($inp),$T0,$T0
  1728. vpmuludq $H3,$T1,$T4 # h3*s2
  1729. vpmuludq $H4,$T1,$H2 # h4*s2
  1730. vmovdqu 16*1($inp),%x#$T1
  1731. vpaddq $T4,$D0,$D0 # d0 += h3*s2
  1732. vpaddq $H2,$D1,$D1 # d1 += h4*s2
  1733. vmovdqa `32*5-0x90`(%rax),$H2 # r3
  1734. vpmuludq $H1,$T2,$T4 # h1*r2
  1735. vpmuludq $H0,$T2,$T2 # h0*r2
  1736. vpaddq $T4,$D3,$D3 # d3 += h1*r2
  1737. vpaddq $T2,$D2,$D2 # d2 += h0*r2
  1738. vinserti128 \$1,16*3($inp),$T1,$T1
  1739. lea 16*4($inp),$inp
  1740. vpmuludq $H1,$H2,$T4 # h1*r3
  1741. vpmuludq $H0,$H2,$H2 # h0*r3
  1742. vpsrldq \$6,$T0,$T2 # splat input
  1743. vpaddq $T4,$D4,$D4 # d4 += h1*r3
  1744. vpaddq $H2,$D3,$D3 # d3 += h0*r3
  1745. vpmuludq $H3,$T3,$T4 # h3*s3
  1746. vpmuludq $H4,$T3,$H2 # h4*s3
  1747. vpsrldq \$6,$T1,$T3
  1748. vpaddq $T4,$D1,$D1 # d1 += h3*s3
  1749. vpaddq $H2,$D2,$D2 # d2 += h4*s3
  1750. vpunpckhqdq $T1,$T0,$T4 # 4
  1751. vpmuludq $H3,$S4,$H3 # h3*s4
  1752. vpmuludq $H4,$S4,$H4 # h4*s4
  1753. vpunpcklqdq $T1,$T0,$T0 # 0:1
  1754. vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
  1755. vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
  1756. vpunpcklqdq $T3,$T2,$T3 # 2:3
  1757. vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4
  1758. vpmuludq $H1,$S4,$H0 # h1*s4
  1759. vmovdqa 64(%rcx),$MASK # .Lmask26
  1760. vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
  1761. vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
  1762. ################################################################
  1763. # lazy reduction (interleaved with tail of input splat)
  1764. vpsrlq \$26,$H3,$D3
  1765. vpand $MASK,$H3,$H3
  1766. vpaddq $D3,$H4,$H4 # h3 -> h4
  1767. vpsrlq \$26,$H0,$D0
  1768. vpand $MASK,$H0,$H0
  1769. vpaddq $D0,$D1,$H1 # h0 -> h1
  1770. vpsrlq \$26,$H4,$D4
  1771. vpand $MASK,$H4,$H4
  1772. vpsrlq \$4,$T3,$T2
  1773. vpsrlq \$26,$H1,$D1
  1774. vpand $MASK,$H1,$H1
  1775. vpaddq $D1,$H2,$H2 # h1 -> h2
  1776. vpaddq $D4,$H0,$H0
  1777. vpsllq \$2,$D4,$D4
  1778. vpaddq $D4,$H0,$H0 # h4 -> h0
  1779. vpand $MASK,$T2,$T2 # 2
  1780. vpsrlq \$26,$T0,$T1
  1781. vpsrlq \$26,$H2,$D2
  1782. vpand $MASK,$H2,$H2
  1783. vpaddq $D2,$H3,$H3 # h2 -> h3
  1784. vpaddq $T2,$H2,$H2 # modulo-scheduled
  1785. vpsrlq \$30,$T3,$T3
  1786. vpsrlq \$26,$H0,$D0
  1787. vpand $MASK,$H0,$H0
  1788. vpaddq $D0,$H1,$H1 # h0 -> h1
  1789. vpsrlq \$40,$T4,$T4 # 4
  1790. vpsrlq \$26,$H3,$D3
  1791. vpand $MASK,$H3,$H3
  1792. vpaddq $D3,$H4,$H4 # h3 -> h4
  1793. vpand $MASK,$T0,$T0 # 0
  1794. vpand $MASK,$T1,$T1 # 1
  1795. vpand $MASK,$T3,$T3 # 3
  1796. vpor 32(%rcx),$T4,$T4 # padbit, yes, always
  1797. sub \$64,$len
  1798. jnz .Loop_avx2$suffix
  1799. .byte 0x66,0x90
  1800. .Ltail_avx2$suffix:
  1801. ################################################################
# while the above multiplications were by r^4 in all lanes, in the last
# iteration we multiply the least significant lane by r^4 and the most
# significant one by r, so this is a copy of the above except that
# references to the precomputed table are displaced by 4...
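# (roughly: the rows of the stack table hold r^4,r^3,r^2,r^1 -- and the
# matching 5*r values -- as 32-bit elements, so reading each row 4 bytes
# off its aligned start shifts every lane down to the next lower power:
# the lowest lane still sees r^4 while the highest sees r^1, matching the
# per-lane powers required by the ladder at the top of the loop.)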
  1806. #vpaddq $H2,$T2,$H2 # accumulate input
  1807. vpaddq $H0,$T0,$H0
  1808. vmovdqu `32*0+4`(%rsp),$T0 # r0^4
  1809. vpaddq $H1,$T1,$H1
  1810. vmovdqu `32*1+4`(%rsp),$T1 # r1^4
  1811. vpaddq $H3,$T3,$H3
  1812. vmovdqu `32*3+4`(%rsp),$T2 # r2^4
  1813. vpaddq $H4,$T4,$H4
  1814. vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4
  1815. vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4
  1816. vpmuludq $H2,$T0,$D2 # d2 = h2*r0
  1817. vpmuludq $H2,$T1,$D3 # d3 = h2*r1
  1818. vpmuludq $H2,$T2,$D4 # d4 = h2*r2
  1819. vpmuludq $H2,$T3,$D0 # d0 = h2*s3
  1820. vpmuludq $H2,$S4,$D1 # d1 = h2*s4
  1821. vpmuludq $H0,$T1,$T4 # h0*r1
  1822. vpmuludq $H1,$T1,$H2 # h1*r1
  1823. vpaddq $T4,$D1,$D1 # d1 += h0*r1
  1824. vpaddq $H2,$D2,$D2 # d2 += h1*r1
  1825. vpmuludq $H3,$T1,$T4 # h3*r1
  1826. vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1
  1827. vpaddq $T4,$D4,$D4 # d4 += h3*r1
  1828. vpaddq $H2,$D0,$D0 # d0 += h4*s1
  1829. vpmuludq $H0,$T0,$T4 # h0*r0
  1830. vpmuludq $H1,$T0,$H2 # h1*r0
  1831. vpaddq $T4,$D0,$D0 # d0 += h0*r0
  1832. vmovdqu `32*4+4-0x90`(%rax),$T1 # s2
  1833. vpaddq $H2,$D1,$D1 # d1 += h1*r0
  1834. vpmuludq $H3,$T0,$T4 # h3*r0
  1835. vpmuludq $H4,$T0,$H2 # h4*r0
  1836. vpaddq $T4,$D3,$D3 # d3 += h3*r0
  1837. vpaddq $H2,$D4,$D4 # d4 += h4*r0
  1838. vpmuludq $H3,$T1,$T4 # h3*s2
  1839. vpmuludq $H4,$T1,$H2 # h4*s2
  1840. vpaddq $T4,$D0,$D0 # d0 += h3*s2
  1841. vpaddq $H2,$D1,$D1 # d1 += h4*s2
  1842. vmovdqu `32*5+4-0x90`(%rax),$H2 # r3
  1843. vpmuludq $H1,$T2,$T4 # h1*r2
  1844. vpmuludq $H0,$T2,$T2 # h0*r2
  1845. vpaddq $T4,$D3,$D3 # d3 += h1*r2
  1846. vpaddq $T2,$D2,$D2 # d2 += h0*r2
  1847. vpmuludq $H1,$H2,$T4 # h1*r3
  1848. vpmuludq $H0,$H2,$H2 # h0*r3
  1849. vpaddq $T4,$D4,$D4 # d4 += h1*r3
  1850. vpaddq $H2,$D3,$D3 # d3 += h0*r3
  1851. vpmuludq $H3,$T3,$T4 # h3*s3
  1852. vpmuludq $H4,$T3,$H2 # h4*s3
  1853. vpaddq $T4,$D1,$D1 # d1 += h3*s3
  1854. vpaddq $H2,$D2,$D2 # d2 += h4*s3
  1855. vpmuludq $H3,$S4,$H3 # h3*s4
  1856. vpmuludq $H4,$S4,$H4 # h4*s4
  1857. vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
  1858. vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
  1859. vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4
  1860. vpmuludq $H1,$S4,$H0 # h1*s4
  1861. vmovdqa 64(%rcx),$MASK # .Lmask26
  1862. vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
  1863. vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
  1864. ################################################################
  1865. # horizontal addition
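# (the four qword lanes now hold four partial hashes that only need to be
# summed: vpsrldq 8 adds the odd qword of each 128-bit half onto the even
# one, then vpermq 0x2 brings the third qword down so the following vpaddq
# folds the upper half onto the lower; the totals end up in lane 0, which
# is all the vmovd stores below care about.)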
  1866. vpsrldq \$8,$D1,$T1
  1867. vpsrldq \$8,$H2,$T2
  1868. vpsrldq \$8,$H3,$T3
  1869. vpsrldq \$8,$H4,$T4
  1870. vpsrldq \$8,$H0,$T0
  1871. vpaddq $T1,$D1,$D1
  1872. vpaddq $T2,$H2,$H2
  1873. vpaddq $T3,$H3,$H3
  1874. vpaddq $T4,$H4,$H4
  1875. vpaddq $T0,$H0,$H0
  1876. vpermq \$0x2,$H3,$T3
  1877. vpermq \$0x2,$H4,$T4
  1878. vpermq \$0x2,$H0,$T0
  1879. vpermq \$0x2,$D1,$T1
  1880. vpermq \$0x2,$H2,$T2
  1881. vpaddq $T3,$H3,$H3
  1882. vpaddq $T4,$H4,$H4
  1883. vpaddq $T0,$H0,$H0
  1884. vpaddq $T1,$D1,$D1
  1885. vpaddq $T2,$H2,$H2
  1886. ################################################################
  1887. # lazy reduction
  1888. vpsrlq \$26,$H3,$D3
  1889. vpand $MASK,$H3,$H3
  1890. vpaddq $D3,$H4,$H4 # h3 -> h4
  1891. vpsrlq \$26,$H0,$D0
  1892. vpand $MASK,$H0,$H0
  1893. vpaddq $D0,$D1,$H1 # h0 -> h1
  1894. vpsrlq \$26,$H4,$D4
  1895. vpand $MASK,$H4,$H4
  1896. vpsrlq \$26,$H1,$D1
  1897. vpand $MASK,$H1,$H1
  1898. vpaddq $D1,$H2,$H2 # h1 -> h2
  1899. vpaddq $D4,$H0,$H0
  1900. vpsllq \$2,$D4,$D4
  1901. vpaddq $D4,$H0,$H0 # h4 -> h0
  1902. vpsrlq \$26,$H2,$D2
  1903. vpand $MASK,$H2,$H2
  1904. vpaddq $D2,$H3,$H3 # h2 -> h3
  1905. vpsrlq \$26,$H0,$D0
  1906. vpand $MASK,$H0,$H0
  1907. vpaddq $D0,$H1,$H1 # h0 -> h1
  1908. vpsrlq \$26,$H3,$D3
  1909. vpand $MASK,$H3,$H3
  1910. vpaddq $D3,$H4,$H4 # h3 -> h4
  1911. vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
  1912. vmovd %x#$H1,`4*1-48-64`($ctx)
  1913. vmovd %x#$H2,`4*2-48-64`($ctx)
  1914. vmovd %x#$H3,`4*3-48-64`($ctx)
  1915. vmovd %x#$H4,`4*4-48-64`($ctx)
  1916. ___
  1917. $code.=<<___ if ($win64);
  1918. vmovdqa -0xb0(%r10),%xmm6
  1919. vmovdqa -0xa0(%r10),%xmm7
  1920. vmovdqa -0x90(%r10),%xmm8
  1921. vmovdqa -0x80(%r10),%xmm9
  1922. vmovdqa -0x70(%r10),%xmm10
  1923. vmovdqa -0x60(%r10),%xmm11
  1924. vmovdqa -0x50(%r10),%xmm12
  1925. vmovdqa -0x40(%r10),%xmm13
  1926. vmovdqa -0x30(%r10),%xmm14
  1927. vmovdqa -0x20(%r10),%xmm15
  1928. lea -8(%r10),%rsp
  1929. .Ldo_avx2_epilogue$suffix:
  1930. ___
  1931. $code.=<<___ if (!$win64);
  1932. lea -8(%r10),%rsp
  1933. .cfi_def_cfa_register %rsp
  1934. ___
  1935. $code.=<<___;
  1936. vzeroupper
  1937. RET
  1938. .cfi_endproc
  1939. ___
  1940. if($avx > 2 && $avx512) {
  1941. my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
  1942. my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
  1943. my $PADBIT="%zmm30";
  1944. map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain
  1945. map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
  1946. map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
  1947. map(s/%y/%z/,($MASK));
  1948. $code.=<<___;
  1949. .cfi_startproc
  1950. .Lblocks_avx512:
  1951. mov \$15,%eax
  1952. kmovw %eax,%k2
  1953. ___
  1954. $code.=<<___ if (!$win64);
  1955. lea 8(%rsp),%r10
  1956. .cfi_def_cfa_register %r10
  1957. sub \$0x128,%rsp
  1958. ___
  1959. $code.=<<___ if ($win64);
  1960. lea 8(%rsp),%r10
  1961. sub \$0x1c8,%rsp
  1962. vmovdqa %xmm6,-0xb0(%r10)
  1963. vmovdqa %xmm7,-0xa0(%r10)
  1964. vmovdqa %xmm8,-0x90(%r10)
  1965. vmovdqa %xmm9,-0x80(%r10)
  1966. vmovdqa %xmm10,-0x70(%r10)
  1967. vmovdqa %xmm11,-0x60(%r10)
  1968. vmovdqa %xmm12,-0x50(%r10)
  1969. vmovdqa %xmm13,-0x40(%r10)
  1970. vmovdqa %xmm14,-0x30(%r10)
  1971. vmovdqa %xmm15,-0x20(%r10)
  1972. .Ldo_avx512_body:
  1973. ___
  1974. $code.=<<___;
  1975. lea .Lconst(%rip),%rcx
  1976. lea 48+64($ctx),$ctx # size optimization
  1977. vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2
  1978. # expand pre-calculated table
  1979. vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0}
  1980. and \$-512,%rsp
  1981. vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1}
  1982. mov \$0x20,%rax
  1983. vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1}
  1984. vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2}
  1985. vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2}
  1986. vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3}
  1987. vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3}
  1988. vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4}
  1989. vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4}
  1990. vpermd $D0,$T2,$R0 # 00003412 -> 14243444
  1991. vpbroadcastq 64(%rcx),$MASK # .Lmask26
  1992. vpermd $D1,$T2,$R1
  1993. vpermd $T0,$T2,$S1
  1994. vpermd $D2,$T2,$R2
  1995. vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0
  1996. vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304
  1997. vpermd $T1,$T2,$S2
  1998. vmovdqu64 $R1,0x00(%rsp,%rax){%k2}
  1999. vpsrlq \$32,$R1,$T1
  2000. vpermd $D3,$T2,$R3
  2001. vmovdqa64 $S1,0x40(%rsp){%k2}
  2002. vpermd $T3,$T2,$S3
  2003. vpermd $D4,$T2,$R4
  2004. vmovdqu64 $R2,0x40(%rsp,%rax){%k2}
  2005. vpermd $T4,$T2,$S4
  2006. vmovdqa64 $S2,0x80(%rsp){%k2}
  2007. vmovdqu64 $R3,0x80(%rsp,%rax){%k2}
  2008. vmovdqa64 $S3,0xc0(%rsp){%k2}
  2009. vmovdqu64 $R4,0xc0(%rsp,%rax){%k2}
  2010. vmovdqa64 $S4,0x100(%rsp){%k2}
  2011. ################################################################
  2012. # calculate 5th through 8th powers of the key
  2013. #
  2014. # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
  2015. # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
  2016. # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3
  2017. # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4
  2018. # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0
  2019. vpmuludq $T0,$R0,$D0 # d0 = r0'*r0
  2020. vpmuludq $T0,$R1,$D1 # d1 = r0'*r1
  2021. vpmuludq $T0,$R2,$D2 # d2 = r0'*r2
  2022. vpmuludq $T0,$R3,$D3 # d3 = r0'*r3
  2023. vpmuludq $T0,$R4,$D4 # d4 = r0'*r4
  2024. vpsrlq \$32,$R2,$T2
  2025. vpmuludq $T1,$S4,$M0
  2026. vpmuludq $T1,$R0,$M1
  2027. vpmuludq $T1,$R1,$M2
  2028. vpmuludq $T1,$R2,$M3
  2029. vpmuludq $T1,$R3,$M4
  2030. vpsrlq \$32,$R3,$T3
  2031. vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4
  2032. vpaddq $M1,$D1,$D1 # d1 += r1'*r0
  2033. vpaddq $M2,$D2,$D2 # d2 += r1'*r1
  2034. vpaddq $M3,$D3,$D3 # d3 += r1'*r2
  2035. vpaddq $M4,$D4,$D4 # d4 += r1'*r3
  2036. vpmuludq $T2,$S3,$M0
  2037. vpmuludq $T2,$S4,$M1
  2038. vpmuludq $T2,$R1,$M3
  2039. vpmuludq $T2,$R2,$M4
  2040. vpmuludq $T2,$R0,$M2
  2041. vpsrlq \$32,$R4,$T4
  2042. vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3
  2043. vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4
  2044. vpaddq $M3,$D3,$D3 # d3 += r2'*r1
  2045. vpaddq $M4,$D4,$D4 # d4 += r2'*r2
  2046. vpaddq $M2,$D2,$D2 # d2 += r2'*r0
  2047. vpmuludq $T3,$S2,$M0
  2048. vpmuludq $T3,$R0,$M3
  2049. vpmuludq $T3,$R1,$M4
  2050. vpmuludq $T3,$S3,$M1
  2051. vpmuludq $T3,$S4,$M2
  2052. vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2
  2053. vpaddq $M3,$D3,$D3 # d3 += r3'*r0
  2054. vpaddq $M4,$D4,$D4 # d4 += r3'*r1
  2055. vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3
  2056. vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4
  2057. vpmuludq $T4,$S4,$M3
  2058. vpmuludq $T4,$R0,$M4
  2059. vpmuludq $T4,$S1,$M0
  2060. vpmuludq $T4,$S2,$M1
  2061. vpmuludq $T4,$S3,$M2
vpaddq $M3,$D3,$D3 # d3 += r4'*5*r4
vpaddq $M4,$D4,$D4 # d4 += r4'*r0
vpaddq $M0,$D0,$D0 # d0 += r4'*5*r1
vpaddq $M1,$D1,$D1 # d1 += r4'*5*r2
vpaddq $M2,$D2,$D2 # d2 += r4'*5*r3
  2067. ################################################################
  2068. # load input
  2069. vmovdqu64 16*0($inp),%z#$T3
  2070. vmovdqu64 16*4($inp),%z#$T4
  2071. lea 16*8($inp),$inp
  2072. ################################################################
  2073. # lazy reduction
  2074. vpsrlq \$26,$D3,$M3
  2075. vpandq $MASK,$D3,$D3
  2076. vpaddq $M3,$D4,$D4 # d3 -> d4
  2077. vpsrlq \$26,$D0,$M0
  2078. vpandq $MASK,$D0,$D0
  2079. vpaddq $M0,$D1,$D1 # d0 -> d1
  2080. vpsrlq \$26,$D4,$M4
  2081. vpandq $MASK,$D4,$D4
  2082. vpsrlq \$26,$D1,$M1
  2083. vpandq $MASK,$D1,$D1
  2084. vpaddq $M1,$D2,$D2 # d1 -> d2
  2085. vpaddq $M4,$D0,$D0
  2086. vpsllq \$2,$M4,$M4
  2087. vpaddq $M4,$D0,$D0 # d4 -> d0
  2088. vpsrlq \$26,$D2,$M2
  2089. vpandq $MASK,$D2,$D2
  2090. vpaddq $M2,$D3,$D3 # d2 -> d3
  2091. vpsrlq \$26,$D0,$M0
  2092. vpandq $MASK,$D0,$D0
  2093. vpaddq $M0,$D1,$D1 # d0 -> d1
  2094. vpsrlq \$26,$D3,$M3
  2095. vpandq $MASK,$D3,$D3
  2096. vpaddq $M3,$D4,$D4 # d3 -> d4
  2097. ################################################################
  2098. # at this point we have 14243444 in $R0-$S4 and 05060708 in
  2099. # $D0-$D4, ...
  2100. vpunpcklqdq $T4,$T3,$T0 # transpose input
  2101. vpunpckhqdq $T4,$T3,$T4
  2102. # ... since input 64-bit lanes are ordered as 73625140, we could
  2103. # "vperm" it to 76543210 (here and in each loop iteration), *or*
  2104. # we could just flow along, hence the goal for $R0-$S4 is
  2105. # 1858286838784888 ...
  2106. vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512:
  2107. mov \$0x7777,%eax
  2108. kmovw %eax,%k1
  2109. vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4---
  2110. vpermd $R1,$M0,$R1
  2111. vpermd $R2,$M0,$R2
  2112. vpermd $R3,$M0,$R3
  2113. vpermd $R4,$M0,$R4
  2114. vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888
  2115. vpermd $D1,$M0,${R1}{%k1}
  2116. vpermd $D2,$M0,${R2}{%k1}
  2117. vpermd $D3,$M0,${R3}{%k1}
  2118. vpermd $D4,$M0,${R4}{%k1}
  2119. vpslld \$2,$R1,$S1 # *5
  2120. vpslld \$2,$R2,$S2
  2121. vpslld \$2,$R3,$S3
  2122. vpslld \$2,$R4,$S4
  2123. vpaddd $R1,$S1,$S1
  2124. vpaddd $R2,$S2,$S2
  2125. vpaddd $R3,$S3,$S3
  2126. vpaddd $R4,$S4,$S4
  2127. vpbroadcastq 32(%rcx),$PADBIT # .L129
  2128. vpsrlq \$52,$T0,$T2 # splat input
  2129. vpsllq \$12,$T4,$T3
  2130. vporq $T3,$T2,$T2
  2131. vpsrlq \$26,$T0,$T1
  2132. vpsrlq \$14,$T4,$T3
  2133. vpsrlq \$40,$T4,$T4 # 4
  2134. vpandq $MASK,$T2,$T2 # 2
  2135. vpandq $MASK,$T0,$T0 # 0
  2136. #vpandq $MASK,$T1,$T1 # 1
  2137. #vpandq $MASK,$T3,$T3 # 3
  2138. #vporq $PADBIT,$T4,$T4 # padbit, yes, always
  2139. vpaddq $H2,$T2,$H2 # accumulate input
  2140. sub \$192,$len
  2141. jbe .Ltail_avx512
  2142. jmp .Loop_avx512
  2143. .align 32
  2144. .Loop_avx512:
  2145. ################################################################
  2146. # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
  2147. # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
  2148. # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
  2149. # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
  2150. # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
  2151. # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
  2152. # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
  2153. # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
  2154. # \________/\___________/
  2155. ################################################################
  2156. #vpaddq $H2,$T2,$H2 # accumulate input
  2157. # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
  2158. # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
  2159. # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
  2160. # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
  2161. # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
  2162. #
# however, as h2 is "chronologically" the first one available, pull the
# corresponding operations up, so the order becomes:
  2165. #
  2166. # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4
  2167. # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0
  2168. # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
  2169. # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2
  2170. # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
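# (h2 can start first because the previous iteration already folded the
# input's limb 2 into it -- the "modulo-scheduled" vpaddq at the bottom of
# the loop; limbs 1, 3 and 4 still need their final mask/padbit applied,
# which is why those vpandq/vporq are interleaved with the first few
# multiplications below.)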
  2171. vpmuludq $H2,$R1,$D3 # d3 = h2*r1
  2172. vpaddq $H0,$T0,$H0
  2173. vpmuludq $H2,$R2,$D4 # d4 = h2*r2
  2174. vpandq $MASK,$T1,$T1 # 1
  2175. vpmuludq $H2,$S3,$D0 # d0 = h2*s3
  2176. vpandq $MASK,$T3,$T3 # 3
  2177. vpmuludq $H2,$S4,$D1 # d1 = h2*s4
  2178. vporq $PADBIT,$T4,$T4 # padbit, yes, always
  2179. vpmuludq $H2,$R0,$D2 # d2 = h2*r0
  2180. vpaddq $H1,$T1,$H1 # accumulate input
  2181. vpaddq $H3,$T3,$H3
  2182. vpaddq $H4,$T4,$H4
  2183. vmovdqu64 16*0($inp),$T3 # load input
  2184. vmovdqu64 16*4($inp),$T4
  2185. lea 16*8($inp),$inp
  2186. vpmuludq $H0,$R3,$M3
  2187. vpmuludq $H0,$R4,$M4
  2188. vpmuludq $H0,$R0,$M0
  2189. vpmuludq $H0,$R1,$M1
  2190. vpaddq $M3,$D3,$D3 # d3 += h0*r3
  2191. vpaddq $M4,$D4,$D4 # d4 += h0*r4
  2192. vpaddq $M0,$D0,$D0 # d0 += h0*r0
  2193. vpaddq $M1,$D1,$D1 # d1 += h0*r1
  2194. vpmuludq $H1,$R2,$M3
  2195. vpmuludq $H1,$R3,$M4
  2196. vpmuludq $H1,$S4,$M0
  2197. vpmuludq $H0,$R2,$M2
  2198. vpaddq $M3,$D3,$D3 # d3 += h1*r2
  2199. vpaddq $M4,$D4,$D4 # d4 += h1*r3
  2200. vpaddq $M0,$D0,$D0 # d0 += h1*s4
  2201. vpaddq $M2,$D2,$D2 # d2 += h0*r2
  2202. vpunpcklqdq $T4,$T3,$T0 # transpose input
  2203. vpunpckhqdq $T4,$T3,$T4
  2204. vpmuludq $H3,$R0,$M3
  2205. vpmuludq $H3,$R1,$M4
  2206. vpmuludq $H1,$R0,$M1
  2207. vpmuludq $H1,$R1,$M2
  2208. vpaddq $M3,$D3,$D3 # d3 += h3*r0
  2209. vpaddq $M4,$D4,$D4 # d4 += h3*r1
  2210. vpaddq $M1,$D1,$D1 # d1 += h1*r0
  2211. vpaddq $M2,$D2,$D2 # d2 += h1*r1
  2212. vpmuludq $H4,$S4,$M3
  2213. vpmuludq $H4,$R0,$M4
  2214. vpmuludq $H3,$S2,$M0
  2215. vpmuludq $H3,$S3,$M1
  2216. vpaddq $M3,$D3,$D3 # d3 += h4*s4
  2217. vpmuludq $H3,$S4,$M2
  2218. vpaddq $M4,$D4,$D4 # d4 += h4*r0
  2219. vpaddq $M0,$D0,$D0 # d0 += h3*s2
  2220. vpaddq $M1,$D1,$D1 # d1 += h3*s3
  2221. vpaddq $M2,$D2,$D2 # d2 += h3*s4
  2222. vpmuludq $H4,$S1,$M0
  2223. vpmuludq $H4,$S2,$M1
  2224. vpmuludq $H4,$S3,$M2
  2225. vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2
vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3
  2228. ################################################################
  2229. # lazy reduction (interleaved with input splat)
  2230. vpsrlq \$52,$T0,$T2 # splat input
  2231. vpsllq \$12,$T4,$T3
  2232. vpsrlq \$26,$D3,$H3
  2233. vpandq $MASK,$D3,$D3
  2234. vpaddq $H3,$D4,$H4 # h3 -> h4
  2235. vporq $T3,$T2,$T2
  2236. vpsrlq \$26,$H0,$D0
  2237. vpandq $MASK,$H0,$H0
  2238. vpaddq $D0,$H1,$H1 # h0 -> h1
  2239. vpandq $MASK,$T2,$T2 # 2
  2240. vpsrlq \$26,$H4,$D4
  2241. vpandq $MASK,$H4,$H4
  2242. vpsrlq \$26,$H1,$D1
  2243. vpandq $MASK,$H1,$H1
  2244. vpaddq $D1,$H2,$H2 # h1 -> h2
  2245. vpaddq $D4,$H0,$H0
  2246. vpsllq \$2,$D4,$D4
  2247. vpaddq $D4,$H0,$H0 # h4 -> h0
  2248. vpaddq $T2,$H2,$H2 # modulo-scheduled
  2249. vpsrlq \$26,$T0,$T1
  2250. vpsrlq \$26,$H2,$D2
  2251. vpandq $MASK,$H2,$H2
  2252. vpaddq $D2,$D3,$H3 # h2 -> h3
  2253. vpsrlq \$14,$T4,$T3
  2254. vpsrlq \$26,$H0,$D0
  2255. vpandq $MASK,$H0,$H0
  2256. vpaddq $D0,$H1,$H1 # h0 -> h1
  2257. vpsrlq \$40,$T4,$T4 # 4
  2258. vpsrlq \$26,$H3,$D3
  2259. vpandq $MASK,$H3,$H3
  2260. vpaddq $D3,$H4,$H4 # h3 -> h4
  2261. vpandq $MASK,$T0,$T0 # 0
  2262. #vpandq $MASK,$T1,$T1 # 1
  2263. #vpandq $MASK,$T3,$T3 # 3
  2264. #vporq $PADBIT,$T4,$T4 # padbit, yes, always
  2265. sub \$128,$len
  2266. ja .Loop_avx512
  2267. .Ltail_avx512:
  2268. ################################################################
# while the above multiplications were by r^8 in all lanes, in the last
# iteration we multiply the least significant lane by r^8 and the most
# significant one by r, which is why the table gets shifted...
  2272. vpsrlq \$32,$R0,$R0 # 0105020603070408
  2273. vpsrlq \$32,$R1,$R1
  2274. vpsrlq \$32,$R2,$R2
  2275. vpsrlq \$32,$S3,$S3
  2276. vpsrlq \$32,$S4,$S4
  2277. vpsrlq \$32,$R3,$R3
  2278. vpsrlq \$32,$R4,$R4
  2279. vpsrlq \$32,$S1,$S1
  2280. vpsrlq \$32,$S2,$S2
  2281. ################################################################
# load either the next or the last 64 bytes of input
  2283. lea ($inp,$len),$inp
  2284. #vpaddq $H2,$T2,$H2 # accumulate input
  2285. vpaddq $H0,$T0,$H0
  2286. vpmuludq $H2,$R1,$D3 # d3 = h2*r1
  2287. vpmuludq $H2,$R2,$D4 # d4 = h2*r2
  2288. vpmuludq $H2,$S3,$D0 # d0 = h2*s3
  2289. vpandq $MASK,$T1,$T1 # 1
  2290. vpmuludq $H2,$S4,$D1 # d1 = h2*s4
  2291. vpandq $MASK,$T3,$T3 # 3
  2292. vpmuludq $H2,$R0,$D2 # d2 = h2*r0
  2293. vporq $PADBIT,$T4,$T4 # padbit, yes, always
  2294. vpaddq $H1,$T1,$H1 # accumulate input
  2295. vpaddq $H3,$T3,$H3
  2296. vpaddq $H4,$T4,$H4
  2297. vmovdqu 16*0($inp),%x#$T0
  2298. vpmuludq $H0,$R3,$M3
  2299. vpmuludq $H0,$R4,$M4
  2300. vpmuludq $H0,$R0,$M0
  2301. vpmuludq $H0,$R1,$M1
  2302. vpaddq $M3,$D3,$D3 # d3 += h0*r3
  2303. vpaddq $M4,$D4,$D4 # d4 += h0*r4
  2304. vpaddq $M0,$D0,$D0 # d0 += h0*r0
  2305. vpaddq $M1,$D1,$D1 # d1 += h0*r1
  2306. vmovdqu 16*1($inp),%x#$T1
  2307. vpmuludq $H1,$R2,$M3
  2308. vpmuludq $H1,$R3,$M4
  2309. vpmuludq $H1,$S4,$M0
  2310. vpmuludq $H0,$R2,$M2
  2311. vpaddq $M3,$D3,$D3 # d3 += h1*r2
  2312. vpaddq $M4,$D4,$D4 # d4 += h1*r3
  2313. vpaddq $M0,$D0,$D0 # d0 += h1*s4
  2314. vpaddq $M2,$D2,$D2 # d2 += h0*r2
  2315. vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0
  2316. vpmuludq $H3,$R0,$M3
  2317. vpmuludq $H3,$R1,$M4
  2318. vpmuludq $H1,$R0,$M1
  2319. vpmuludq $H1,$R1,$M2
  2320. vpaddq $M3,$D3,$D3 # d3 += h3*r0
  2321. vpaddq $M4,$D4,$D4 # d4 += h3*r1
  2322. vpaddq $M1,$D1,$D1 # d1 += h1*r0
  2323. vpaddq $M2,$D2,$D2 # d2 += h1*r1
  2324. vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1
  2325. vpmuludq $H4,$S4,$M3
  2326. vpmuludq $H4,$R0,$M4
  2327. vpmuludq $H3,$S2,$M0
  2328. vpmuludq $H3,$S3,$M1
  2329. vpmuludq $H3,$S4,$M2
  2330. vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4
  2331. vpaddq $M4,$D4,$D4 # d4 += h4*r0
  2332. vpaddq $M0,$D0,$D0 # d0 += h3*s2
  2333. vpaddq $M1,$D1,$D1 # d1 += h3*s3
  2334. vpaddq $M2,$D2,$D2 # d2 += h3*s4
  2335. vpmuludq $H4,$S1,$M0
  2336. vpmuludq $H4,$S2,$M1
  2337. vpmuludq $H4,$S3,$M2
  2338. vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2
vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3
  2341. ################################################################
  2342. # horizontal addition
  2343. mov \$1,%eax
  2344. vpermq \$0xb1,$H3,$D3
  2345. vpermq \$0xb1,$D4,$H4
  2346. vpermq \$0xb1,$H0,$D0
  2347. vpermq \$0xb1,$H1,$D1
  2348. vpermq \$0xb1,$H2,$D2
  2349. vpaddq $D3,$H3,$H3
  2350. vpaddq $D4,$H4,$H4
  2351. vpaddq $D0,$H0,$H0
  2352. vpaddq $D1,$H1,$H1
  2353. vpaddq $D2,$H2,$H2
  2354. kmovw %eax,%k3
  2355. vpermq \$0x2,$H3,$D3
  2356. vpermq \$0x2,$H4,$D4
  2357. vpermq \$0x2,$H0,$D0
  2358. vpermq \$0x2,$H1,$D1
  2359. vpermq \$0x2,$H2,$D2
  2360. vpaddq $D3,$H3,$H3
  2361. vpaddq $D4,$H4,$H4
  2362. vpaddq $D0,$H0,$H0
  2363. vpaddq $D1,$H1,$H1
  2364. vpaddq $D2,$H2,$H2
  2365. vextracti64x4 \$0x1,$H3,%y#$D3
  2366. vextracti64x4 \$0x1,$H4,%y#$D4
  2367. vextracti64x4 \$0x1,$H0,%y#$D0
  2368. vextracti64x4 \$0x1,$H1,%y#$D1
  2369. vextracti64x4 \$0x1,$H2,%y#$D2
  2370. vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case
  2371. vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2
  2372. vpaddq $D0,$H0,${H0}{%k3}{z}
  2373. vpaddq $D1,$H1,${H1}{%k3}{z}
  2374. vpaddq $D2,$H2,${H2}{%k3}{z}
  2375. ___
  2376. map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
  2377. map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
  2378. $code.=<<___;
  2379. ################################################################
  2380. # lazy reduction (interleaved with input splat)
  2381. vpsrlq \$26,$H3,$D3
  2382. vpand $MASK,$H3,$H3
  2383. vpsrldq \$6,$T0,$T2 # splat input
  2384. vpsrldq \$6,$T1,$T3
  2385. vpunpckhqdq $T1,$T0,$T4 # 4
  2386. vpaddq $D3,$H4,$H4 # h3 -> h4
  2387. vpsrlq \$26,$H0,$D0
  2388. vpand $MASK,$H0,$H0
  2389. vpunpcklqdq $T3,$T2,$T2 # 2:3
  2390. vpunpcklqdq $T1,$T0,$T0 # 0:1
  2391. vpaddq $D0,$H1,$H1 # h0 -> h1
  2392. vpsrlq \$26,$H4,$D4
  2393. vpand $MASK,$H4,$H4
  2394. vpsrlq \$26,$H1,$D1
  2395. vpand $MASK,$H1,$H1
  2396. vpsrlq \$30,$T2,$T3
  2397. vpsrlq \$4,$T2,$T2
  2398. vpaddq $D1,$H2,$H2 # h1 -> h2
  2399. vpaddq $D4,$H0,$H0
  2400. vpsllq \$2,$D4,$D4
  2401. vpsrlq \$26,$T0,$T1
  2402. vpsrlq \$40,$T4,$T4 # 4
  2403. vpaddq $D4,$H0,$H0 # h4 -> h0
  2404. vpsrlq \$26,$H2,$D2
  2405. vpand $MASK,$H2,$H2
  2406. vpand $MASK,$T2,$T2 # 2
  2407. vpand $MASK,$T0,$T0 # 0
  2408. vpaddq $D2,$H3,$H3 # h2 -> h3
  2409. vpsrlq \$26,$H0,$D0
  2410. vpand $MASK,$H0,$H0
  2411. vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2
  2412. vpand $MASK,$T1,$T1 # 1
  2413. vpaddq $D0,$H1,$H1 # h0 -> h1
  2414. vpsrlq \$26,$H3,$D3
  2415. vpand $MASK,$H3,$H3
  2416. vpand $MASK,$T3,$T3 # 3
  2417. vpor 32(%rcx),$T4,$T4 # padbit, yes, always
  2418. vpaddq $D3,$H4,$H4 # h3 -> h4
  2419. lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2
  2420. add \$64,$len
  2421. jnz .Ltail_avx2$suffix
  2422. vpsubq $T2,$H2,$H2 # undo input accumulation
  2423. vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
  2424. vmovd %x#$H1,`4*1-48-64`($ctx)
  2425. vmovd %x#$H2,`4*2-48-64`($ctx)
  2426. vmovd %x#$H3,`4*3-48-64`($ctx)
  2427. vmovd %x#$H4,`4*4-48-64`($ctx)
  2428. vzeroall
  2429. ___
  2430. $code.=<<___ if ($win64);
  2431. movdqa -0xb0(%r10),%xmm6
  2432. movdqa -0xa0(%r10),%xmm7
  2433. movdqa -0x90(%r10),%xmm8
  2434. movdqa -0x80(%r10),%xmm9
  2435. movdqa -0x70(%r10),%xmm10
  2436. movdqa -0x60(%r10),%xmm11
  2437. movdqa -0x50(%r10),%xmm12
  2438. movdqa -0x40(%r10),%xmm13
  2439. movdqa -0x30(%r10),%xmm14
  2440. movdqa -0x20(%r10),%xmm15
  2441. lea -8(%r10),%rsp
  2442. .Ldo_avx512_epilogue:
  2443. ___
  2444. $code.=<<___ if (!$win64);
  2445. lea -8(%r10),%rsp
  2446. .cfi_def_cfa_register %rsp
  2447. ___
  2448. $code.=<<___;
  2449. RET
  2450. .cfi_endproc
  2451. ___
  2452. }
  2453. }
  2454. &declare_function("poly1305_blocks_avx2", 32, 4);
  2455. poly1305_blocks_avxN(0);
  2456. &end_function("poly1305_blocks_avx2");
  2457. #######################################################################
  2458. if ($avx>2) {
# On entry we have input length divisible by 64. But since the inner loop
# processes 128 bytes per iteration, cases where the length is not divisible
# by 128 are handled by passing the tail 64 bytes to .Ltail_avx2. For this
# reason the stack layout is kept identical to poly1305_blocks_avx2. If not
# for this tail, we wouldn't even have to allocate a stack frame...
  2464. if($kernel) {
  2465. $code .= "#ifdef CONFIG_AS_AVX512\n";
  2466. }
  2467. &declare_function("poly1305_blocks_avx512", 32, 4);
  2468. poly1305_blocks_avxN(1);
  2469. &end_function("poly1305_blocks_avx512");
  2470. if ($kernel) {
  2471. $code .= "#endif\n";
  2472. }
  2473. if (!$kernel && $avx>3) {
  2474. ########################################################################
  2475. # VPMADD52 version using 2^44 radix.
  2476. #
# One can argue that base 2^52 would be more natural. Even though some
# operations would indeed be more natural, one has to recognize a couple of
# things. First, base 2^52 provides no advantage over base 2^44 if you look
# at the amount of multiply-and-accumulate operations. Second, it makes it
# impossible to pre-compute multiples of 5 [referred to as s[]/sN in
# reference implementations], which means that more such operations would
# have to be performed in the inner loop, which in turn makes the critical
# path longer. In other words, even though base 2^44 reduction might look
# less elegant, the overall critical path is actually shorter...
  2486. ########################################################################
# The layout of the opaque area is as follows.
  2488. #
  2489. # unsigned __int64 h[3]; # current hash value base 2^44
  2490. # unsigned __int64 s[2]; # key value*20 base 2^44
  2491. # unsigned __int64 r[3]; # key value base 2^44
  2492. # struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
  2493. # # r^n positions reflect
  2494. # # placement in register, not
  2495. # # memory, R[3] is R[1]*20
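# Informally: with limbs of 44+44+42 bits, h = h0 + h1*2^44 + h2*2^88,
# and the product terms that land at weight 2^132 (h1*r2, h2*r1) or at
# weight 2^176 (h2*r2) are folded back using 2^132 == 4*5 == 20 modulo
# 2^130-5, which is why the s[] entries hold the key limbs pre-multiplied
# by 20 (the *5 and the "magic <<2" in the init code below).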
  2496. $code.=<<___;
  2497. .type poly1305_init_base2_44,\@function,3
  2498. .align 32
  2499. poly1305_init_base2_44:
  2500. xor %eax,%eax
  2501. mov %rax,0($ctx) # initialize hash value
  2502. mov %rax,8($ctx)
  2503. mov %rax,16($ctx)
  2504. .Linit_base2_44:
  2505. lea poly1305_blocks_vpmadd52(%rip),%r10
  2506. lea poly1305_emit_base2_44(%rip),%r11
  2507. mov \$0x0ffffffc0fffffff,%rax
  2508. mov \$0x0ffffffc0ffffffc,%rcx
  2509. and 0($inp),%rax
  2510. mov \$0x00000fffffffffff,%r8
  2511. and 8($inp),%rcx
  2512. mov \$0x00000fffffffffff,%r9
  2513. and %rax,%r8
  2514. shrd \$44,%rcx,%rax
  2515. mov %r8,40($ctx) # r0
  2516. and %r9,%rax
  2517. shr \$24,%rcx
  2518. mov %rax,48($ctx) # r1
  2519. lea (%rax,%rax,4),%rax # *5
  2520. mov %rcx,56($ctx) # r2
  2521. shl \$2,%rax # magic <<2
  2522. lea (%rcx,%rcx,4),%rcx # *5
  2523. shl \$2,%rcx # magic <<2
  2524. mov %rax,24($ctx) # s1
  2525. mov %rcx,32($ctx) # s2
  2526. movq \$-1,64($ctx) # write impossible value
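# (impossible because a genuine key-power limb stored at this offset is
# at most 44 bits wide and so never has its sign bit set; the blocks
# routines simply test the sign of this word to decide whether the key
# powers still need to be computed, see the cmovns in
# poly1305_blocks_vpmadd52 and the js .Linit_vpmadd52 in the 4x path.)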
  2527. ___
  2528. $code.=<<___ if ($flavour !~ /elf32/);
  2529. mov %r10,0(%rdx)
  2530. mov %r11,8(%rdx)
  2531. ___
  2532. $code.=<<___ if ($flavour =~ /elf32/);
  2533. mov %r10d,0(%rdx)
  2534. mov %r11d,4(%rdx)
  2535. ___
  2536. $code.=<<___;
  2537. mov \$1,%eax
  2538. RET
  2539. .size poly1305_init_base2_44,.-poly1305_init_base2_44
  2540. ___
  2541. {
  2542. my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
  2543. my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
  2544. my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
  2545. $code.=<<___;
  2546. .type poly1305_blocks_vpmadd52,\@function,4
  2547. .align 32
  2548. poly1305_blocks_vpmadd52:
  2549. shr \$4,$len
  2550. jz .Lno_data_vpmadd52 # too short
  2551. shl \$40,$padbit
  2552. mov 64($ctx),%r8 # peek on power of the key
# if the powers of the key are not calculated yet, process up to 3
# blocks with this single-block subroutine, otherwise ensure that the
# length is divisible by 2 blocks and pass the rest down to the next
# subroutine...
  2557. mov \$3,%rax
  2558. mov \$1,%r10
  2559. cmp \$4,$len # is input long
  2560. cmovae %r10,%rax
  2561. test %r8,%r8 # is power value impossible?
  2562. cmovns %r10,%rax
  2563. and $len,%rax # is input of favourable length?
  2564. jz .Lblocks_vpmadd52_4x
  2565. sub %rax,$len
  2566. mov \$7,%r10d
  2567. mov \$1,%r11d
  2568. kmovw %r10d,%k7
  2569. lea .L2_44_inp_permd(%rip),%r10
  2570. kmovw %r11d,%k1
  2571. vmovq $padbit,%x#$PAD
  2572. vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd
  2573. vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift
  2574. vpermq \$0xcf,$PAD,$PAD
  2575. vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask
  2576. vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value
  2577. vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys
  2578. vmovdqu64 32($ctx),${r1r0s2}{%k7}{z}
  2579. vmovdqu64 24($ctx),${r0s2s1}{%k7}{z}
  2580. vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt
  2581. vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft
  2582. jmp .Loop_vpmadd52
  2583. .align 32
  2584. .Loop_vpmadd52:
  2585. vmovdqu32 0($inp),%x#$T0 # load input as ----3210
  2586. lea 16($inp),$inp
  2587. vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110
  2588. vpsrlvq $inp_shift,$T0,$T0
  2589. vpandq $reduc_mask,$T0,$T0
  2590. vporq $PAD,$T0,$T0
  2591. vpaddq $T0,$Dlo,$Dlo # accumulate input
  2592. vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value
  2593. vpermq \$0b01010101,$Dlo,${H1}{%k7}{z}
  2594. vpermq \$0b10101010,$Dlo,${H2}{%k7}{z}
  2595. vpxord $Dlo,$Dlo,$Dlo
  2596. vpxord $Dhi,$Dhi,$Dhi
  2597. vpmadd52luq $r2r1r0,$H0,$Dlo
  2598. vpmadd52huq $r2r1r0,$H0,$Dhi
  2599. vpmadd52luq $r1r0s2,$H1,$Dlo
  2600. vpmadd52huq $r1r0s2,$H1,$Dhi
  2601. vpmadd52luq $r0s2s1,$H2,$Dlo
  2602. vpmadd52huq $r0s2s1,$H2,$Dhi
  2603. vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword
  2604. vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword
  2605. vpandq $reduc_mask,$Dlo,$Dlo
  2606. vpaddq $T0,$Dhi,$Dhi
  2607. vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword
  2608. vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-)
  2609. vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word
  2610. vpandq $reduc_mask,$Dlo,$Dlo
  2611. vpermq \$0b10010011,$T0,$T0
  2612. vpaddq $T0,$Dlo,$Dlo
  2613. vpermq \$0b10010011,$Dlo,${T0}{%k1}{z}
  2614. vpaddq $T0,$Dlo,$Dlo
  2615. vpsllq \$2,$T0,$T0
  2616. vpaddq $T0,$Dlo,$Dlo
  2617. dec %rax # len-=16
  2618. jnz .Loop_vpmadd52
  2619. vmovdqu64 $Dlo,0($ctx){%k7} # store hash value
  2620. test $len,$len
  2621. jnz .Lblocks_vpmadd52_4x
  2622. .Lno_data_vpmadd52:
  2623. RET
  2624. .size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
  2625. ___
  2626. }
  2627. {
  2628. ########################################################################
# As implied by its name, the 4x subroutine processes 4 blocks in parallel
# (though it also handles lengths of 4*n+2 blocks). It takes up to the 4th
# key power and is handled in 256-bit %ymm registers.
  2632. my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
  2633. my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
  2634. my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
  2635. $code.=<<___;
  2636. .type poly1305_blocks_vpmadd52_4x,\@function,4
  2637. .align 32
  2638. poly1305_blocks_vpmadd52_4x:
  2639. shr \$4,$len
  2640. jz .Lno_data_vpmadd52_4x # too short
  2641. shl \$40,$padbit
  2642. mov 64($ctx),%r8 # peek on power of the key
  2643. .Lblocks_vpmadd52_4x:
  2644. vpbroadcastq $padbit,$PAD
  2645. vmovdqa64 .Lx_mask44(%rip),$mask44
  2646. mov \$5,%eax
  2647. vmovdqa64 .Lx_mask42(%rip),$mask42
  2648. kmovw %eax,%k1 # used in 2x path
  2649. test %r8,%r8 # is power value impossible?
  2650. js .Linit_vpmadd52 # if it is, then init R[4]
  2651. vmovq 0($ctx),%x#$H0 # load current hash value
  2652. vmovq 8($ctx),%x#$H1
  2653. vmovq 16($ctx),%x#$H2
  2654. test \$3,$len # is length 4*n+2?
  2655. jnz .Lblocks_vpmadd52_2x_do
  2656. .Lblocks_vpmadd52_4x_do:
  2657. vpbroadcastq 64($ctx),$R0 # load 4th power of the key
  2658. vpbroadcastq 96($ctx),$R1
  2659. vpbroadcastq 128($ctx),$R2
  2660. vpbroadcastq 160($ctx),$S1
  2661. .Lblocks_vpmadd52_4x_key_loaded:
  2662. vpsllq \$2,$R2,$S2 # S2 = R2*5*4
  2663. vpaddq $R2,$S2,$S2
  2664. vpsllq \$2,$S2,$S2
  2665. test \$7,$len # is len 8*n?
  2666. jz .Lblocks_vpmadd52_8x
  2667. vmovdqu64 16*0($inp),$T2 # load data
  2668. vmovdqu64 16*2($inp),$T3
  2669. lea 16*4($inp),$inp
  2670. vpunpcklqdq $T3,$T2,$T1 # transpose data
  2671. vpunpckhqdq $T3,$T2,$T3
  2672. # at this point 64-bit lanes are ordered as 3-1-2-0
  2673. vpsrlq \$24,$T3,$T2 # splat the data
  2674. vporq $PAD,$T2,$T2
  2675. vpaddq $T2,$H2,$H2 # accumulate input
  2676. vpandq $mask44,$T1,$T0
  2677. vpsrlq \$44,$T1,$T1
  2678. vpsllq \$20,$T3,$T3
  2679. vporq $T3,$T1,$T1
  2680. vpandq $mask44,$T1,$T1
  2681. sub \$4,$len
  2682. jz .Ltail_vpmadd52_4x
  2683. jmp .Loop_vpmadd52_4x
  2684. ud2
  2685. .align 32
  2686. .Linit_vpmadd52:
  2687. vmovq 24($ctx),%x#$S1 # load key
  2688. vmovq 56($ctx),%x#$H2
  2689. vmovq 32($ctx),%x#$S2
  2690. vmovq 40($ctx),%x#$R0
  2691. vmovq 48($ctx),%x#$R1
  2692. vmovdqa $R0,$H0
  2693. vmovdqa $R1,$H1
  2694. vmovdqa $H2,$R2
  2695. mov \$2,%eax
  2696. .Lmul_init_vpmadd52:
  2697. vpxorq $D0lo,$D0lo,$D0lo
  2698. vpmadd52luq $H2,$S1,$D0lo
  2699. vpxorq $D0hi,$D0hi,$D0hi
  2700. vpmadd52huq $H2,$S1,$D0hi
  2701. vpxorq $D1lo,$D1lo,$D1lo
  2702. vpmadd52luq $H2,$S2,$D1lo
  2703. vpxorq $D1hi,$D1hi,$D1hi
  2704. vpmadd52huq $H2,$S2,$D1hi
  2705. vpxorq $D2lo,$D2lo,$D2lo
  2706. vpmadd52luq $H2,$R0,$D2lo
  2707. vpxorq $D2hi,$D2hi,$D2hi
  2708. vpmadd52huq $H2,$R0,$D2hi
  2709. vpmadd52luq $H0,$R0,$D0lo
  2710. vpmadd52huq $H0,$R0,$D0hi
  2711. vpmadd52luq $H0,$R1,$D1lo
  2712. vpmadd52huq $H0,$R1,$D1hi
  2713. vpmadd52luq $H0,$R2,$D2lo
  2714. vpmadd52huq $H0,$R2,$D2hi
  2715. vpmadd52luq $H1,$S2,$D0lo
  2716. vpmadd52huq $H1,$S2,$D0hi
  2717. vpmadd52luq $H1,$R0,$D1lo
  2718. vpmadd52huq $H1,$R0,$D1hi
  2719. vpmadd52luq $H1,$R1,$D2lo
  2720. vpmadd52huq $H1,$R1,$D2hi
  2721. ################################################################
  2722. # partial reduction
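# (roughly: vpmadd52luq left the low 52 bits of each product sum in the
# lo half and vpmadd52huq the upper bits in the hi half, so hi is shifted
# left by 8 (or 10 for the 42-bit top limb) to line it up with the next
# limb; carries then flow d0 -> d1 -> d2, and the carry out of the top
# limb is added into h0 once as is and once shifted left by 2, i.e.
# multiplied by 5, since 2^130 == 5 mod 2^130-5.)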
  2723. vpsrlq \$44,$D0lo,$tmp
  2724. vpsllq \$8,$D0hi,$D0hi
  2725. vpandq $mask44,$D0lo,$H0
  2726. vpaddq $tmp,$D0hi,$D0hi
  2727. vpaddq $D0hi,$D1lo,$D1lo
  2728. vpsrlq \$44,$D1lo,$tmp
  2729. vpsllq \$8,$D1hi,$D1hi
  2730. vpandq $mask44,$D1lo,$H1
  2731. vpaddq $tmp,$D1hi,$D1hi
  2732. vpaddq $D1hi,$D2lo,$D2lo
  2733. vpsrlq \$42,$D2lo,$tmp
  2734. vpsllq \$10,$D2hi,$D2hi
  2735. vpandq $mask42,$D2lo,$H2
  2736. vpaddq $tmp,$D2hi,$D2hi
  2737. vpaddq $D2hi,$H0,$H0
  2738. vpsllq \$2,$D2hi,$D2hi
  2739. vpaddq $D2hi,$H0,$H0
  2740. vpsrlq \$44,$H0,$tmp # additional step
  2741. vpandq $mask44,$H0,$H0
  2742. vpaddq $tmp,$H1,$H1
  2743. dec %eax
  2744. jz .Ldone_init_vpmadd52
  2745. vpunpcklqdq $R1,$H1,$R1 # 1,2
  2746. vpbroadcastq %x#$H1,%x#$H1 # 2,2
  2747. vpunpcklqdq $R2,$H2,$R2
  2748. vpbroadcastq %x#$H2,%x#$H2
  2749. vpunpcklqdq $R0,$H0,$R0
  2750. vpbroadcastq %x#$H0,%x#$H0
  2751. vpsllq \$2,$R1,$S1 # S1 = R1*5*4
  2752. vpsllq \$2,$R2,$S2 # S2 = R2*5*4
  2753. vpaddq $R1,$S1,$S1
  2754. vpaddq $R2,$S2,$S2
  2755. vpsllq \$2,$S1,$S1
  2756. vpsllq \$2,$S2,$S2
  2757. jmp .Lmul_init_vpmadd52
  2758. ud2
  2759. .align 32
  2760. .Ldone_init_vpmadd52:
  2761. vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4
  2762. vinserti128 \$1,%x#$R2,$H2,$R2
  2763. vinserti128 \$1,%x#$R0,$H0,$R0
  2764. vpermq \$0b11011000,$R1,$R1 # 1,3,2,4
  2765. vpermq \$0b11011000,$R2,$R2
  2766. vpermq \$0b11011000,$R0,$R0
  2767. vpsllq \$2,$R1,$S1 # S1 = R1*5*4
  2768. vpaddq $R1,$S1,$S1
  2769. vpsllq \$2,$S1,$S1
  2770. vmovq 0($ctx),%x#$H0 # load current hash value
  2771. vmovq 8($ctx),%x#$H1
  2772. vmovq 16($ctx),%x#$H2
  2773. test \$3,$len # is length 4*n+2?
  2774. jnz .Ldone_init_vpmadd52_2x
  2775. vmovdqu64 $R0,64($ctx) # save key powers
  2776. vpbroadcastq %x#$R0,$R0 # broadcast 4th power
  2777. vmovdqu64 $R1,96($ctx)
  2778. vpbroadcastq %x#$R1,$R1
  2779. vmovdqu64 $R2,128($ctx)
  2780. vpbroadcastq %x#$R2,$R2
  2781. vmovdqu64 $S1,160($ctx)
  2782. vpbroadcastq %x#$S1,$S1
  2783. jmp .Lblocks_vpmadd52_4x_key_loaded
  2784. ud2
  2785. .align 32
  2786. .Ldone_init_vpmadd52_2x:
  2787. vmovdqu64 $R0,64($ctx) # save key powers
  2788. vpsrldq \$8,$R0,$R0 # 0-1-0-2
  2789. vmovdqu64 $R1,96($ctx)
  2790. vpsrldq \$8,$R1,$R1
  2791. vmovdqu64 $R2,128($ctx)
  2792. vpsrldq \$8,$R2,$R2
  2793. vmovdqu64 $S1,160($ctx)
  2794. vpsrldq \$8,$S1,$S1
  2795. jmp .Lblocks_vpmadd52_2x_key_loaded
  2796. ud2
  2797. .align 32
  2798. .Lblocks_vpmadd52_2x_do:
  2799. vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
  2800. vmovdqu64 160+8($ctx),${S1}{%k1}{z}
  2801. vmovdqu64 64+8($ctx),${R0}{%k1}{z}
  2802. vmovdqu64 96+8($ctx),${R1}{%k1}{z}
  2803. .Lblocks_vpmadd52_2x_key_loaded:
  2804. vmovdqu64 16*0($inp),$T2 # load data
  2805. vpxorq $T3,$T3,$T3
  2806. lea 16*2($inp),$inp
  2807. vpunpcklqdq $T3,$T2,$T1 # transpose data
  2808. vpunpckhqdq $T3,$T2,$T3
  2809. # at this point 64-bit lanes are ordered as x-1-x-0
  2810. vpsrlq \$24,$T3,$T2 # splat the data
  2811. vporq $PAD,$T2,$T2
  2812. vpaddq $T2,$H2,$H2 # accumulate input
  2813. vpandq $mask44,$T1,$T0
  2814. vpsrlq \$44,$T1,$T1
  2815. vpsllq \$20,$T3,$T3
  2816. vporq $T3,$T1,$T1
  2817. vpandq $mask44,$T1,$T1
  2818. jmp .Ltail_vpmadd52_2x
  2819. ud2
  2820. .align 32
  2821. .Loop_vpmadd52_4x:
  2822. #vpaddq $T2,$H2,$H2 # accumulate input
  2823. vpaddq $T0,$H0,$H0
  2824. vpaddq $T1,$H1,$H1
  2825. vpxorq $D0lo,$D0lo,$D0lo
  2826. vpmadd52luq $H2,$S1,$D0lo
  2827. vpxorq $D0hi,$D0hi,$D0hi
  2828. vpmadd52huq $H2,$S1,$D0hi
  2829. vpxorq $D1lo,$D1lo,$D1lo
  2830. vpmadd52luq $H2,$S2,$D1lo
  2831. vpxorq $D1hi,$D1hi,$D1hi
  2832. vpmadd52huq $H2,$S2,$D1hi
  2833. vpxorq $D2lo,$D2lo,$D2lo
  2834. vpmadd52luq $H2,$R0,$D2lo
  2835. vpxorq $D2hi,$D2hi,$D2hi
  2836. vpmadd52huq $H2,$R0,$D2hi
  2837. vmovdqu64 16*0($inp),$T2 # load data
  2838. vmovdqu64 16*2($inp),$T3
  2839. lea 16*4($inp),$inp
  2840. vpmadd52luq $H0,$R0,$D0lo
  2841. vpmadd52huq $H0,$R0,$D0hi
  2842. vpmadd52luq $H0,$R1,$D1lo
  2843. vpmadd52huq $H0,$R1,$D1hi
  2844. vpmadd52luq $H0,$R2,$D2lo
  2845. vpmadd52huq $H0,$R2,$D2hi
  2846. vpunpcklqdq $T3,$T2,$T1 # transpose data
  2847. vpunpckhqdq $T3,$T2,$T3
  2848. vpmadd52luq $H1,$S2,$D0lo
  2849. vpmadd52huq $H1,$S2,$D0hi
  2850. vpmadd52luq $H1,$R0,$D1lo
  2851. vpmadd52huq $H1,$R0,$D1hi
  2852. vpmadd52luq $H1,$R1,$D2lo
  2853. vpmadd52huq $H1,$R1,$D2hi
  2854. ################################################################
  2855. # partial reduction (interleaved with data splat)
	vpsrlq \$44,$D0lo,$tmp
	vpsllq \$8,$D0hi,$D0hi
	vpandq $mask44,$D0lo,$H0
	vpaddq $tmp,$D0hi,$D0hi
	vpsrlq \$24,$T3,$T2
	vporq $PAD,$T2,$T2
	vpaddq $D0hi,$D1lo,$D1lo
	vpsrlq \$44,$D1lo,$tmp
	vpsllq \$8,$D1hi,$D1hi
	vpandq $mask44,$D1lo,$H1
	vpaddq $tmp,$D1hi,$D1hi
	vpandq $mask44,$T1,$T0
	vpsrlq \$44,$T1,$T1
	vpsllq \$20,$T3,$T3
	vpaddq $D1hi,$D2lo,$D2lo
	vpsrlq \$42,$D2lo,$tmp
	vpsllq \$10,$D2hi,$D2hi
	vpandq $mask42,$D2lo,$H2
	vpaddq $tmp,$D2hi,$D2hi
	vpaddq $T2,$H2,$H2 # accumulate input
	vpaddq $D2hi,$H0,$H0
	vpsllq \$2,$D2hi,$D2hi
	vpaddq $D2hi,$H0,$H0
	vporq $T3,$T1,$T1
	vpandq $mask44,$T1,$T1
	vpsrlq \$44,$H0,$tmp # additional step
	vpandq $mask44,$H0,$H0
	vpaddq $tmp,$H1,$H1
	sub \$4,$len # len-=64
	jnz .Loop_vpmadd52_4x
.Ltail_vpmadd52_4x:
	vmovdqu64 128($ctx),$R2 # load all key powers
	vmovdqu64 160($ctx),$S1
	vmovdqu64 64($ctx),$R0
	vmovdqu64 96($ctx),$R1
.Ltail_vpmadd52_2x:
	vpsllq \$2,$R2,$S2 # S2 = R2*5*4
	vpaddq $R2,$S2,$S2
	vpsllq \$2,$S2,$S2
	#vpaddq $T2,$H2,$H2 # accumulate input
	vpaddq $T0,$H0,$H0
	vpaddq $T1,$H1,$H1
	vpxorq $D0lo,$D0lo,$D0lo
	vpmadd52luq $H2,$S1,$D0lo
	vpxorq $D0hi,$D0hi,$D0hi
	vpmadd52huq $H2,$S1,$D0hi
	vpxorq $D1lo,$D1lo,$D1lo
	vpmadd52luq $H2,$S2,$D1lo
	vpxorq $D1hi,$D1hi,$D1hi
	vpmadd52huq $H2,$S2,$D1hi
	vpxorq $D2lo,$D2lo,$D2lo
	vpmadd52luq $H2,$R0,$D2lo
	vpxorq $D2hi,$D2hi,$D2hi
	vpmadd52huq $H2,$R0,$D2hi
	vpmadd52luq $H0,$R0,$D0lo
	vpmadd52huq $H0,$R0,$D0hi
	vpmadd52luq $H0,$R1,$D1lo
	vpmadd52huq $H0,$R1,$D1hi
	vpmadd52luq $H0,$R2,$D2lo
	vpmadd52huq $H0,$R2,$D2hi
	vpmadd52luq $H1,$S2,$D0lo
	vpmadd52huq $H1,$S2,$D0hi
	vpmadd52luq $H1,$R0,$D1lo
	vpmadd52huq $H1,$R0,$D1hi
	vpmadd52luq $H1,$R1,$D2lo
	vpmadd52huq $H1,$R1,$D2hi
################################################################
# horizontal addition
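# fold the per-lane sums of every column into lane 0: vpsrldq adds the odd
# 64-bit lane of each 128-bit half, vpermq with immediate 0x2 pulls the
# upper half down, and the write-masked vpaddq keeps only lane 0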
	mov \$1,%eax
	kmovw %eax,%k1
	vpsrldq \$8,$D0lo,$T0
	vpsrldq \$8,$D0hi,$H0
	vpsrldq \$8,$D1lo,$T1
	vpsrldq \$8,$D1hi,$H1
	vpaddq $T0,$D0lo,$D0lo
	vpaddq $H0,$D0hi,$D0hi
	vpsrldq \$8,$D2lo,$T2
	vpsrldq \$8,$D2hi,$H2
	vpaddq $T1,$D1lo,$D1lo
	vpaddq $H1,$D1hi,$D1hi
	vpermq \$0x2,$D0lo,$T0
	vpermq \$0x2,$D0hi,$H0
	vpaddq $T2,$D2lo,$D2lo
	vpaddq $H2,$D2hi,$D2hi
	vpermq \$0x2,$D1lo,$T1
	vpermq \$0x2,$D1hi,$H1
	vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
	vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
	vpermq \$0x2,$D2lo,$T2
	vpermq \$0x2,$D2hi,$H2
	vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
	vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
	vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
	vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
################################################################
# partial reduction
	vpsrlq \$44,$D0lo,$tmp
	vpsllq \$8,$D0hi,$D0hi
	vpandq $mask44,$D0lo,$H0
	vpaddq $tmp,$D0hi,$D0hi
	vpaddq $D0hi,$D1lo,$D1lo
	vpsrlq \$44,$D1lo,$tmp
	vpsllq \$8,$D1hi,$D1hi
	vpandq $mask44,$D1lo,$H1
	vpaddq $tmp,$D1hi,$D1hi
	vpaddq $D1hi,$D2lo,$D2lo
	vpsrlq \$42,$D2lo,$tmp
	vpsllq \$10,$D2hi,$D2hi
	vpandq $mask42,$D2lo,$H2
	vpaddq $tmp,$D2hi,$D2hi
	vpaddq $D2hi,$H0,$H0
	vpsllq \$2,$D2hi,$D2hi
	vpaddq $D2hi,$H0,$H0
	vpsrlq \$44,$H0,$tmp # additional step
	vpandq $mask44,$H0,$H0
	vpaddq $tmp,$H1,$H1
# at this point $len is
# either 4*n+2 or 0...
	sub \$2,$len # len-=32
	ja .Lblocks_vpmadd52_4x_do
	vmovq %x#$H0,0($ctx)
	vmovq %x#$H1,8($ctx)
	vmovq %x#$H2,16($ctx)
	vzeroall
.Lno_data_vpmadd52_4x:
	RET
.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
___
}
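########################################################################
# Illustrative reference model (not wired into the generator, and the
# *_ref name below is ours): what one multiply-and-reduce step of the
# vpmadd52 code above computes per 64-bit lane, with hash and key kept as
# 44/44/42-bit limbs and S1/S2 pre-scaled by 5*4 exactly like the vector
# code does.  Math::BigInt stands in for the 52x52-bit vpmadd52 products;
# the carry schedule mirrors the "partial reduction" sequences.
sub poly1305_vpmadd52_ref {
    require Math::BigInt;
    my ($h0,$h1,$h2,$r0,$r1,$r2) = map { Math::BigInt->new($_) } @_;
    my $mask44 = (Math::BigInt->new(1) << 44) - 1;
    my $mask42 = (Math::BigInt->new(1) << 42) - 1;
    my $s1 = $r1 * 20;				# R1*5*4, like $S1
    my $s2 = $r2 * 20;				# R2*5*4, like $S2

    # column products, same operand pairing as the vpmadd52luq/huq chains
    my $d0 = $h0*$r0 + $h1*$s2 + $h2*$s1;
    my $d1 = $h0*$r1 + $h1*$r0 + $h2*$s2;
    my $d2 = $h0*$r2 + $h1*$r1 + $h2*$r0;

    # partial reduction: 44/44/42-bit carries, top carry times 5 (2^130 = 5 mod p)
    $d1 += $d0 >> 44;	$h0 = $d0 & $mask44;
    $d2 += $d1 >> 44;	$h1 = $d1 & $mask44;
    my $c = $d2 >> 42;	$h2 = $d2 & $mask42;
    $h0 += $c * 5;
    $h1 += $h0 >> 44;	$h0 = $h0 & $mask44;	# the "additional step"
    return ($h0,$h1,$h2);
}
# Pre-scaling S1/S2 by 20 is what lets the inner loops above get away with
# plain multiply-adds instead of an extra multiplication by 5*4 per block.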
{
########################################################################
# As implied by its name, the 8x subroutine processes 8 blocks in
# parallel. This is an intermediate version, as it's used only in cases
# when the input length is either 8*n, 8*n+1 or 8*n+2...
my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
$code.=<<___;
.type poly1305_blocks_vpmadd52_8x,\@function,4
.align 32
poly1305_blocks_vpmadd52_8x:
	shr \$4,$len
	jz .Lno_data_vpmadd52_8x # too short
	shl \$40,$padbit
	mov 64($ctx),%r8 # peek on power of the key
	vmovdqa64 .Lx_mask44(%rip),$mask44
	vmovdqa64 .Lx_mask42(%rip),$mask42
	test %r8,%r8 # is power value impossible?
	js .Linit_vpmadd52 # if it is, then init R[4]
	vmovq 0($ctx),%x#$H0 # load current hash value
	vmovq 8($ctx),%x#$H1
	vmovq 16($ctx),%x#$H2
.Lblocks_vpmadd52_8x:
################################################################
# first we calculate more key powers
	vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers
	vmovdqu64 160($ctx),$S1
	vmovdqu64 64($ctx),$R0
	vmovdqu64 96($ctx),$R1
	vpsllq \$2,$R2,$S2 # S2 = R2*5*4
	vpaddq $R2,$S2,$S2
	vpsllq \$2,$S2,$S2
	vpbroadcastq %x#$R2,$RR2 # broadcast 4th power
	vpbroadcastq %x#$R0,$RR0
	vpbroadcastq %x#$R1,$RR1
	vpxorq $D0lo,$D0lo,$D0lo
	vpmadd52luq $RR2,$S1,$D0lo
	vpxorq $D0hi,$D0hi,$D0hi
	vpmadd52huq $RR2,$S1,$D0hi
	vpxorq $D1lo,$D1lo,$D1lo
	vpmadd52luq $RR2,$S2,$D1lo
	vpxorq $D1hi,$D1hi,$D1hi
	vpmadd52huq $RR2,$S2,$D1hi
	vpxorq $D2lo,$D2lo,$D2lo
	vpmadd52luq $RR2,$R0,$D2lo
	vpxorq $D2hi,$D2hi,$D2hi
	vpmadd52huq $RR2,$R0,$D2hi
	vpmadd52luq $RR0,$R0,$D0lo
	vpmadd52huq $RR0,$R0,$D0hi
	vpmadd52luq $RR0,$R1,$D1lo
	vpmadd52huq $RR0,$R1,$D1hi
	vpmadd52luq $RR0,$R2,$D2lo
	vpmadd52huq $RR0,$R2,$D2hi
	vpmadd52luq $RR1,$S2,$D0lo
	vpmadd52huq $RR1,$S2,$D0hi
	vpmadd52luq $RR1,$R0,$D1lo
	vpmadd52huq $RR1,$R0,$D1hi
	vpmadd52luq $RR1,$R1,$D2lo
	vpmadd52huq $RR1,$R1,$D2hi
################################################################
# partial reduction
	vpsrlq \$44,$D0lo,$tmp
	vpsllq \$8,$D0hi,$D0hi
	vpandq $mask44,$D0lo,$RR0
	vpaddq $tmp,$D0hi,$D0hi
	vpaddq $D0hi,$D1lo,$D1lo
	vpsrlq \$44,$D1lo,$tmp
	vpsllq \$8,$D1hi,$D1hi
	vpandq $mask44,$D1lo,$RR1
	vpaddq $tmp,$D1hi,$D1hi
	vpaddq $D1hi,$D2lo,$D2lo
	vpsrlq \$42,$D2lo,$tmp
	vpsllq \$10,$D2hi,$D2hi
	vpandq $mask42,$D2lo,$RR2
	vpaddq $tmp,$D2hi,$D2hi
	vpaddq $D2hi,$RR0,$RR0
	vpsllq \$2,$D2hi,$D2hi
	vpaddq $D2hi,$RR0,$RR0
	vpsrlq \$44,$RR0,$tmp # additional step
	vpandq $mask44,$RR0,$RR0
	vpaddq $tmp,$RR1,$RR1
################################################################
# At this point Rx holds 1324 powers, RRx - 5768, and the goal
# is 15263748, which reflects how data is loaded...
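# (after the vshufi64x2 merge below, lane k of the RRx vectors holds the
# power that matches data lane k, i.e. block i ends up multiplied by
# r^(8-i) when the mixed powers are applied in .Ltail_vpmadd52_8x)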
	vpunpcklqdq $R2,$RR2,$T2 # 3748
	vpunpckhqdq $R2,$RR2,$R2 # 1526
	vpunpcklqdq $R0,$RR0,$T0
	vpunpckhqdq $R0,$RR0,$R0
	vpunpcklqdq $R1,$RR1,$T1
	vpunpckhqdq $R1,$RR1,$R1
___
######## switch to %zmm
map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
$code.=<<___;
	vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748
	vshufi64x2 \$0x44,$R0,$T0,$RR0
	vshufi64x2 \$0x44,$R1,$T1,$RR1
	vmovdqu64 16*0($inp),$T2 # load data
	vmovdqu64 16*4($inp),$T3
	lea 16*8($inp),$inp
	vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4
	vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4
	vpaddq $RR2,$SS2,$SS2
	vpaddq $RR1,$SS1,$SS1
	vpsllq \$2,$SS2,$SS2
	vpsllq \$2,$SS1,$SS1
	vpbroadcastq $padbit,$PAD
	vpbroadcastq %x#$mask44,$mask44
	vpbroadcastq %x#$mask42,$mask42
	vpbroadcastq %x#$SS1,$S1 # broadcast 8th power
	vpbroadcastq %x#$SS2,$S2
	vpbroadcastq %x#$RR0,$R0
	vpbroadcastq %x#$RR1,$R1
	vpbroadcastq %x#$RR2,$R2
	vpunpcklqdq $T3,$T2,$T1 # transpose data
	vpunpckhqdq $T3,$T2,$T3
# at this point 64-bit lanes are ordered as 73625140
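# i.e. data lane k lines up with lane k of the RRx/SSx power vectors above;
# the main loop below multiplies every lane by the broadcast 8th power,
# and the mixed powers are applied once in .Ltail_vpmadd52_8x to finish
# the Horner evaluation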
	vpsrlq \$24,$T3,$T2 # splat the data
	vporq $PAD,$T2,$T2
	vpaddq $T2,$H2,$H2 # accumulate input
	vpandq $mask44,$T1,$T0
	vpsrlq \$44,$T1,$T1
	vpsllq \$20,$T3,$T3
	vporq $T3,$T1,$T1
	vpandq $mask44,$T1,$T1
	sub \$8,$len
	jz .Ltail_vpmadd52_8x
	jmp .Loop_vpmadd52_8x
.align 32
.Loop_vpmadd52_8x:
	#vpaddq $T2,$H2,$H2 # accumulate input
	vpaddq $T0,$H0,$H0
	vpaddq $T1,$H1,$H1
	vpxorq $D0lo,$D0lo,$D0lo
	vpmadd52luq $H2,$S1,$D0lo
	vpxorq $D0hi,$D0hi,$D0hi
	vpmadd52huq $H2,$S1,$D0hi
	vpxorq $D1lo,$D1lo,$D1lo
	vpmadd52luq $H2,$S2,$D1lo
	vpxorq $D1hi,$D1hi,$D1hi
	vpmadd52huq $H2,$S2,$D1hi
	vpxorq $D2lo,$D2lo,$D2lo
	vpmadd52luq $H2,$R0,$D2lo
	vpxorq $D2hi,$D2hi,$D2hi
	vpmadd52huq $H2,$R0,$D2hi
	vmovdqu64 16*0($inp),$T2 # load data
	vmovdqu64 16*4($inp),$T3
	lea 16*8($inp),$inp
	vpmadd52luq $H0,$R0,$D0lo
	vpmadd52huq $H0,$R0,$D0hi
	vpmadd52luq $H0,$R1,$D1lo
	vpmadd52huq $H0,$R1,$D1hi
	vpmadd52luq $H0,$R2,$D2lo
	vpmadd52huq $H0,$R2,$D2hi
	vpunpcklqdq $T3,$T2,$T1 # transpose data
	vpunpckhqdq $T3,$T2,$T3
	vpmadd52luq $H1,$S2,$D0lo
	vpmadd52huq $H1,$S2,$D0hi
	vpmadd52luq $H1,$R0,$D1lo
	vpmadd52huq $H1,$R0,$D1hi
	vpmadd52luq $H1,$R1,$D2lo
	vpmadd52huq $H1,$R1,$D2hi
################################################################
# partial reduction (interleaved with data splat)
	vpsrlq \$44,$D0lo,$tmp
	vpsllq \$8,$D0hi,$D0hi
	vpandq $mask44,$D0lo,$H0
	vpaddq $tmp,$D0hi,$D0hi
	vpsrlq \$24,$T3,$T2
	vporq $PAD,$T2,$T2
	vpaddq $D0hi,$D1lo,$D1lo
	vpsrlq \$44,$D1lo,$tmp
	vpsllq \$8,$D1hi,$D1hi
	vpandq $mask44,$D1lo,$H1
	vpaddq $tmp,$D1hi,$D1hi
	vpandq $mask44,$T1,$T0
	vpsrlq \$44,$T1,$T1
	vpsllq \$20,$T3,$T3
	vpaddq $D1hi,$D2lo,$D2lo
	vpsrlq \$42,$D2lo,$tmp
	vpsllq \$10,$D2hi,$D2hi
	vpandq $mask42,$D2lo,$H2
	vpaddq $tmp,$D2hi,$D2hi
	vpaddq $T2,$H2,$H2 # accumulate input
	vpaddq $D2hi,$H0,$H0
	vpsllq \$2,$D2hi,$D2hi
	vpaddq $D2hi,$H0,$H0
	vporq $T3,$T1,$T1
	vpandq $mask44,$T1,$T1
	vpsrlq \$44,$H0,$tmp # additional step
	vpandq $mask44,$H0,$H0
	vpaddq $tmp,$H1,$H1
	sub \$8,$len # len-=128
	jnz .Loop_vpmadd52_8x
.Ltail_vpmadd52_8x:
	#vpaddq $T2,$H2,$H2 # accumulate input
	vpaddq $T0,$H0,$H0
	vpaddq $T1,$H1,$H1
	vpxorq $D0lo,$D0lo,$D0lo
	vpmadd52luq $H2,$SS1,$D0lo
	vpxorq $D0hi,$D0hi,$D0hi
	vpmadd52huq $H2,$SS1,$D0hi
	vpxorq $D1lo,$D1lo,$D1lo
	vpmadd52luq $H2,$SS2,$D1lo
	vpxorq $D1hi,$D1hi,$D1hi
	vpmadd52huq $H2,$SS2,$D1hi
	vpxorq $D2lo,$D2lo,$D2lo
	vpmadd52luq $H2,$RR0,$D2lo
	vpxorq $D2hi,$D2hi,$D2hi
	vpmadd52huq $H2,$RR0,$D2hi
	vpmadd52luq $H0,$RR0,$D0lo
	vpmadd52huq $H0,$RR0,$D0hi
	vpmadd52luq $H0,$RR1,$D1lo
	vpmadd52huq $H0,$RR1,$D1hi
	vpmadd52luq $H0,$RR2,$D2lo
	vpmadd52huq $H0,$RR2,$D2hi
	vpmadd52luq $H1,$SS2,$D0lo
	vpmadd52huq $H1,$SS2,$D0hi
	vpmadd52luq $H1,$RR0,$D1lo
	vpmadd52huq $H1,$RR0,$D1hi
	vpmadd52luq $H1,$RR1,$D2lo
	vpmadd52huq $H1,$RR1,$D2hi
################################################################
# horizontal addition
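# same idea as in the 4x tail, but over eight lanes: vpsrldq and vpermq
# fold within each 256-bit half, vextracti64x4 brings the upper 256 bits
# down, and the write-masked vpaddq after the switch back to ymm keeps
# only lane 0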
	mov \$1,%eax
	kmovw %eax,%k1
	vpsrldq \$8,$D0lo,$T0
	vpsrldq \$8,$D0hi,$H0
	vpsrldq \$8,$D1lo,$T1
	vpsrldq \$8,$D1hi,$H1
	vpaddq $T0,$D0lo,$D0lo
	vpaddq $H0,$D0hi,$D0hi
	vpsrldq \$8,$D2lo,$T2
	vpsrldq \$8,$D2hi,$H2
	vpaddq $T1,$D1lo,$D1lo
	vpaddq $H1,$D1hi,$D1hi
	vpermq \$0x2,$D0lo,$T0
	vpermq \$0x2,$D0hi,$H0
	vpaddq $T2,$D2lo,$D2lo
	vpaddq $H2,$D2hi,$D2hi
	vpermq \$0x2,$D1lo,$T1
	vpermq \$0x2,$D1hi,$H1
	vpaddq $T0,$D0lo,$D0lo
	vpaddq $H0,$D0hi,$D0hi
	vpermq \$0x2,$D2lo,$T2
	vpermq \$0x2,$D2hi,$H2
	vpaddq $T1,$D1lo,$D1lo
	vpaddq $H1,$D1hi,$D1hi
	vextracti64x4 \$1,$D0lo,%y#$T0
	vextracti64x4 \$1,$D0hi,%y#$H0
	vpaddq $T2,$D2lo,$D2lo
	vpaddq $H2,$D2hi,$D2hi
	vextracti64x4 \$1,$D1lo,%y#$T1
	vextracti64x4 \$1,$D1hi,%y#$H1
	vextracti64x4 \$1,$D2lo,%y#$T2
	vextracti64x4 \$1,$D2hi,%y#$H2
___
######## switch back to %ymm
map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
$code.=<<___;
	vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
	vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
	vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
	vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
	vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
	vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
################################################################
# partial reduction
	vpsrlq \$44,$D0lo,$tmp
	vpsllq \$8,$D0hi,$D0hi
	vpandq $mask44,$D0lo,$H0
	vpaddq $tmp,$D0hi,$D0hi
	vpaddq $D0hi,$D1lo,$D1lo
	vpsrlq \$44,$D1lo,$tmp
	vpsllq \$8,$D1hi,$D1hi
	vpandq $mask44,$D1lo,$H1
	vpaddq $tmp,$D1hi,$D1hi
	vpaddq $D1hi,$D2lo,$D2lo
	vpsrlq \$42,$D2lo,$tmp
	vpsllq \$10,$D2hi,$D2hi
	vpandq $mask42,$D2lo,$H2
	vpaddq $tmp,$D2hi,$D2hi
	vpaddq $D2hi,$H0,$H0
	vpsllq \$2,$D2hi,$D2hi
	vpaddq $D2hi,$H0,$H0
	vpsrlq \$44,$H0,$tmp # additional step
	vpandq $mask44,$H0,$H0
	vpaddq $tmp,$H1,$H1
################################################################
	vmovq %x#$H0,0($ctx)
	vmovq %x#$H1,8($ctx)
	vmovq %x#$H2,16($ctx)
	vzeroall
.Lno_data_vpmadd52_8x:
	RET
.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
___
}
$code.=<<___;
.type poly1305_emit_base2_44,\@function,3
.align 32
poly1305_emit_base2_44:
	mov 0($ctx),%r8 # load hash value
	mov 8($ctx),%r9
	mov 16($ctx),%r10
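# combine the 44/44/42-bit limbs into a 130-bit value: r8 gets the low
# 64 bits, r9 the next 64, r10 the top two bits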
	mov %r9,%rax
	shr \$20,%r9
	shl \$44,%rax
	mov %r10,%rcx
	shr \$40,%r10
	shl \$24,%rcx
	add %rax,%r8
	adc %rcx,%r9
	adc \$0,%r10
	mov %r8,%rax
	add \$5,%r8 # compare to modulus
	mov %r9,%rcx
	adc \$0,%r9
	adc \$0,%r10
	shr \$2,%r10 # did 130-bit value overflow?
	cmovnz %r8,%rax
	cmovnz %r9,%rcx
	add 0($nonce),%rax # accumulate nonce
	adc 8($nonce),%rcx
	mov %rax,0($mac) # write result
	mov %rcx,8($mac)
	RET
.size poly1305_emit_base2_44,.-poly1305_emit_base2_44
___
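# Illustrative sketch (not used by the generator; the *_ref name is ours)
# of what poly1305_emit_base2_44 above does: fold the three 44/44/42-bit
# limbs into a 130-bit value, conditionally subtract 2^130-5, add the
# 128-bit nonce and keep the low 128 bits as the tag.
sub poly1305_emit_base2_44_ref {
    require Math::BigInt;
    my ($h0,$h1,$h2,$nonce_lo,$nonce_hi) = map { Math::BigInt->new($_) } @_;
    my $p = (Math::BigInt->new(1) << 130) - 5;
    my $h = $h0 + ($h1 << 44) + ($h2 << 88);	# base 2^44 -> integer
    $h = $h - $p if $h >= $p;			# the add-5/cmovnz dance above
    $h = $h + $nonce_lo + ($nonce_hi << 64);	# accumulate nonce
    return $h % (Math::BigInt->new(1) << 128);	# tag = low 128 bits
}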
} } }
}
if (!$kernel)
{ # chacha20-poly1305 helpers
my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
	("%rdi","%rsi","%rdx","%rcx"); # Unix order
$code.=<<___;
.globl xor128_encrypt_n_pad
.type xor128_encrypt_n_pad,\@abi-omnipotent
.align 16
xor128_encrypt_n_pad:
	sub $otp,$inp
	sub $otp,$out
	mov $len,%r10 # put len aside
	shr \$4,$len # len / 16
	jz .Ltail_enc
	nop
.Loop_enc_xmm:
	movdqu ($inp,$otp),%xmm0
	pxor ($otp),%xmm0
	movdqu %xmm0,($out,$otp)
	movdqa %xmm0,($otp)
	lea 16($otp),$otp
	dec $len
	jnz .Loop_enc_xmm
	and \$15,%r10 # len % 16
	jz .Ldone_enc
.Ltail_enc:
	mov \$16,$len
	sub %r10,$len
	xor %eax,%eax
.Loop_enc_byte:
	mov ($inp,$otp),%al
	xor ($otp),%al
	mov %al,($out,$otp)
	mov %al,($otp)
	lea 1($otp),$otp
	dec %r10
	jnz .Loop_enc_byte
	xor %eax,%eax
.Loop_enc_pad:
	mov %al,($otp)
	lea 1($otp),$otp
	dec $len
	jnz .Loop_enc_pad
.Ldone_enc:
	mov $otp,%rax
	RET
.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
.globl xor128_decrypt_n_pad
.type xor128_decrypt_n_pad,\@abi-omnipotent
.align 16
xor128_decrypt_n_pad:
	sub $otp,$inp
	sub $otp,$out
	mov $len,%r10 # put len aside
	shr \$4,$len # len / 16
	jz .Ltail_dec
	nop
.Loop_dec_xmm:
	movdqu ($inp,$otp),%xmm0
	movdqa ($otp),%xmm1
	pxor %xmm0,%xmm1
	movdqu %xmm1,($out,$otp)
	movdqa %xmm0,($otp)
	lea 16($otp),$otp
	dec $len
	jnz .Loop_dec_xmm
	pxor %xmm1,%xmm1
	and \$15,%r10 # len % 16
	jz .Ldone_dec
.Ltail_dec:
	mov \$16,$len
	sub %r10,$len
	xor %eax,%eax
	xor %r11d,%r11d
.Loop_dec_byte:
	mov ($inp,$otp),%r11b
	mov ($otp),%al
	xor %r11b,%al
	mov %al,($out,$otp)
	mov %r11b,($otp)
	lea 1($otp),$otp
	dec %r10
	jnz .Loop_dec_byte
	xor %eax,%eax
.Loop_dec_pad:
	mov %al,($otp)
	lea 1($otp),$otp
	dec $len
	jnz .Loop_dec_pad
.Ldone_dec:
	mov $otp,%rax
	RET
.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
___
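# Illustrative sketch (not emitted and not used by the generator; names
# are ours) of the xor128_encrypt_n_pad contract above: the output is
# input XOR one-time pad, the ciphertext is also written back over the
# pad buffer (presumably so it can be fed straight to Poly1305), and the
# buffer is zero-padded up to the next 16-byte boundary; the assembly
# additionally returns a pointer just past that padding.
sub xor128_encrypt_n_pad_ref {
    my ($plaintext, $pad) = @_;		# byte strings, length($pad) >= length($plaintext)
    my $len  = length($plaintext);
    my $ct   = $plaintext ^ substr($pad, 0, $len);	# bitwise string XOR
    my $fill = (-$len) % 16;				# zero bytes up to a 16-byte boundary
    return ($ct, $ct . ("\0" x $fill));			# (output, new pad-buffer contents)
}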
}
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
	push %rsi
	push %rdi
	push %rbx
	push %rbp
	push %r12
	push %r13
	push %r14
	push %r15
	pushfq
	sub \$64,%rsp
	mov 120($context),%rax # pull context->Rax
	mov 248($context),%rbx # pull context->Rip
	mov 8($disp),%rsi # disp->ImageBase
	mov 56($disp),%r11 # disp->HandlerData
	mov 0(%r11),%r10d # HandlerData[0]
	lea (%rsi,%r10),%r10 # prologue label
	cmp %r10,%rbx # context->Rip<.Lprologue
	jb .Lcommon_seh_tail
	mov 152($context),%rax # pull context->Rsp
	mov 4(%r11),%r10d # HandlerData[1]
	lea (%rsi,%r10),%r10 # epilogue label
	cmp %r10,%rbx # context->Rip>=.Lepilogue
	jae .Lcommon_seh_tail
	lea 48(%rax),%rax
	mov -8(%rax),%rbx
	mov -16(%rax),%rbp
	mov -24(%rax),%r12
	mov -32(%rax),%r13
	mov -40(%rax),%r14
	mov -48(%rax),%r15
	mov %rbx,144($context) # restore context->Rbx
	mov %rbp,160($context) # restore context->Rbp
	mov %r12,216($context) # restore context->R12
	mov %r13,224($context) # restore context->R13
	mov %r14,232($context) # restore context->R14
	mov %r15,240($context) # restore context->R15
	jmp .Lcommon_seh_tail
.size se_handler,.-se_handler
.type avx_handler,\@abi-omnipotent
.align 16
avx_handler:
	push %rsi
	push %rdi
	push %rbx
	push %rbp
	push %r12
	push %r13
	push %r14
	push %r15
	pushfq
	sub \$64,%rsp
	mov 120($context),%rax # pull context->Rax
	mov 248($context),%rbx # pull context->Rip
	mov 8($disp),%rsi # disp->ImageBase
	mov 56($disp),%r11 # disp->HandlerData
	mov 0(%r11),%r10d # HandlerData[0]
	lea (%rsi,%r10),%r10 # prologue label
	cmp %r10,%rbx # context->Rip<prologue label
	jb .Lcommon_seh_tail
	mov 152($context),%rax # pull context->Rsp
	mov 4(%r11),%r10d # HandlerData[1]
	lea (%rsi,%r10),%r10 # epilogue label
	cmp %r10,%rbx # context->Rip>=epilogue label
	jae .Lcommon_seh_tail
	mov 208($context),%rax # pull context->R11
	lea 0x50(%rax),%rsi
	lea 0xf8(%rax),%rax
	lea 512($context),%rdi # &context.Xmm6
	mov \$20,%ecx
	.long 0xa548f3fc # cld; rep movsq
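# (the rep movsq above copies the 20 stashed qwords, i.e. xmm6-xmm15,
# from the frame into the context Xmm6 area)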
.Lcommon_seh_tail:
	mov 8(%rax),%rdi
	mov 16(%rax),%rsi
	mov %rax,152($context) # restore context->Rsp
	mov %rsi,168($context) # restore context->Rsi
	mov %rdi,176($context) # restore context->Rdi
	mov 40($disp),%rdi # disp->ContextRecord
	mov $context,%rsi # context
	mov \$154,%ecx # sizeof(CONTEXT)
	.long 0xa548f3fc # cld; rep movsq
	mov $disp,%rsi
	xor %ecx,%ecx # arg1, UNW_FLAG_NHANDLER
	mov 8(%rsi),%rdx # arg2, disp->ImageBase
	mov 0(%rsi),%r8 # arg3, disp->ControlPc
	mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
	mov 40(%rsi),%r10 # disp->ContextRecord
	lea 56(%rsi),%r11 # &disp->HandlerData
	lea 24(%rsi),%r12 # &disp->EstablisherFrame
	mov %r10,32(%rsp) # arg5
	mov %r11,40(%rsp) # arg6
	mov %r12,48(%rsp) # arg7
	mov %rcx,56(%rsp) # arg8, (NULL)
	call *__imp_RtlVirtualUnwind(%rip)
	mov \$1,%eax # ExceptionContinueSearch
	add \$64,%rsp
	popfq
	pop %r15
	pop %r14
	pop %r13
	pop %r12
	pop %rbp
	pop %rbx
	pop %rdi
	pop %rsi
	RET
.size avx_handler,.-avx_handler
.section .pdata
.align 4
.rva .LSEH_begin_poly1305_init_x86_64
.rva .LSEH_end_poly1305_init_x86_64
.rva .LSEH_info_poly1305_init_x86_64
.rva .LSEH_begin_poly1305_blocks_x86_64
.rva .LSEH_end_poly1305_blocks_x86_64
.rva .LSEH_info_poly1305_blocks_x86_64
.rva .LSEH_begin_poly1305_emit_x86_64
.rva .LSEH_end_poly1305_emit_x86_64
.rva .LSEH_info_poly1305_emit_x86_64
___
$code.=<<___ if ($avx);
.rva .LSEH_begin_poly1305_blocks_avx
.rva .Lbase2_64_avx
.rva .LSEH_info_poly1305_blocks_avx_1
.rva .Lbase2_64_avx
.rva .Leven_avx
.rva .LSEH_info_poly1305_blocks_avx_2
.rva .Leven_avx
.rva .LSEH_end_poly1305_blocks_avx
.rva .LSEH_info_poly1305_blocks_avx_3
.rva .LSEH_begin_poly1305_emit_avx
.rva .LSEH_end_poly1305_emit_avx
.rva .LSEH_info_poly1305_emit_avx
___
$code.=<<___ if ($avx>1);
.rva .LSEH_begin_poly1305_blocks_avx2
.rva .Lbase2_64_avx2
.rva .LSEH_info_poly1305_blocks_avx2_1
.rva .Lbase2_64_avx2
.rva .Leven_avx2
.rva .LSEH_info_poly1305_blocks_avx2_2
.rva .Leven_avx2
.rva .LSEH_end_poly1305_blocks_avx2
.rva .LSEH_info_poly1305_blocks_avx2_3
___
$code.=<<___ if ($avx>2);
.rva .LSEH_begin_poly1305_blocks_avx512
.rva .LSEH_end_poly1305_blocks_avx512
.rva .LSEH_info_poly1305_blocks_avx512
___
$code.=<<___;
.section .xdata
.align 8
.LSEH_info_poly1305_init_x86_64:
.byte 9,0,0,0
.rva se_handler
.rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64
.LSEH_info_poly1305_blocks_x86_64:
.byte 9,0,0,0
.rva se_handler
.rva .Lblocks_body,.Lblocks_epilogue
.LSEH_info_poly1305_emit_x86_64:
.byte 9,0,0,0
.rva se_handler
.rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
___
$code.=<<___ if ($avx);
.LSEH_info_poly1305_blocks_avx_1:
.byte 9,0,0,0
.rva se_handler
.rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[]
.LSEH_info_poly1305_blocks_avx_2:
.byte 9,0,0,0
.rva se_handler
.rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[]
.LSEH_info_poly1305_blocks_avx_3:
.byte 9,0,0,0
.rva avx_handler
.rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[]
.LSEH_info_poly1305_emit_avx:
.byte 9,0,0,0
.rva se_handler
.rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
___
$code.=<<___ if ($avx>1);
.LSEH_info_poly1305_blocks_avx2_1:
.byte 9,0,0,0
.rva se_handler
.rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[]
.LSEH_info_poly1305_blocks_avx2_2:
.byte 9,0,0,0
.rva se_handler
.rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[]
.LSEH_info_poly1305_blocks_avx2_3:
.byte 9,0,0,0
.rva avx_handler
.rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[]
___
$code.=<<___ if ($avx>2);
.LSEH_info_poly1305_blocks_avx512:
.byte 9,0,0,0
.rva avx_handler
.rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[]
___
}
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/\/\// and !/^$/);
	print;
}
close SELF;
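# Post-process the generated code: expand any `...` expressions via eval,
# rewrite the size-override notation (%reg#d, %x#, %y#, %z#) into real
# register names and, when generating for the kernel, drop the argument
# count from .type, turn abi-omnipotent into function and strip .cfi
# directives.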
foreach (split('\n',$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;
	s/%r([a-z]+)#d/%e$1/g;
	s/%r([0-9]+)#d/%r$1d/g;
	s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
	if ($kernel) {
		s/(^\.type.*),[0-9]+$/\1/;
		s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/;
		next if /^\.cfi.*/;
	}
	print $_,"\n";
}
close STDOUT;