// SPDX-License-Identifier: GPL-2.0
#include <linux/objtool.h>
#include <linux/percpu.h>

#include <asm/debugreg.h>
#include <asm/mmu_context.h>

#include "cpuid.h"
#include "evmcs.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
#include "sgx.h"
#include "trace.h"
#include "vmx.h"
#include "x86.h"

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);

static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);

#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK

/*
 * Hyper-V requires all of these, so mark them as supported even though
 * they are just treated the same as all-context.
 */
#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

enum {
	VMX_VMREAD_BITMAP,
	VMX_VMWRITE_BITMAP,
	VMX_BITMAP_NR
};
static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

#define vmx_vmread_bitmap	(vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap	(vmx_bitmap[VMX_VMWRITE_BITMAP])

struct shadow_vmcs_field {
	u16	encoding;
	u16	offset;
};
static struct shadow_vmcs_field shadow_read_only_fields[] = {
#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_only_fields =
	ARRAY_SIZE(shadow_read_only_fields);

static struct shadow_vmcs_field shadow_read_write_fields[] = {
#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);
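
/*
 * Initialize the VMREAD/VMWRITE bitmaps from the shadow field tables and
 * compact those tables in place: the high halves of 64-bit fields are dropped
 * on 64-bit hosts (or folded in via an offset adjustment on 32-bit hosts),
 * and read/write fields that the underlying CPU cannot shadow (PML index,
 * preemption timer value, guest interrupt status) are removed.
 */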
static void init_vmcs_shadow_fields(void)
{
	int i, j;

	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);

	for (i = j = 0; i < max_shadow_read_only_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_only_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_only_fields ||
		     shadow_read_only_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_only_field %x\n",
			       field + 1);

		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_only_fields[j++] = entry;
	}
	max_shadow_read_only_fields = j;

	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_write_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_write_fields ||
		     shadow_read_write_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_write_field %x\n",
			       field + 1);

		WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
			  field <= GUEST_TR_AR_BYTES,
			  "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");

		/*
		 * PML and the preemption timer can be emulated, but the
		 * processor cannot vmwrite to fields that don't exist
		 * on bare metal.
		 */
		switch (field) {
		case GUEST_PML_INDEX:
			if (!cpu_has_vmx_pml())
				continue;
			break;
		case VMX_PREEMPTION_TIMER_VALUE:
			if (!cpu_has_vmx_preemption_timer())
				continue;
			break;
		case GUEST_INTR_STATUS:
			if (!cpu_has_vmx_apicv())
				continue;
			break;
		default:
			break;
		}

		clear_bit(field, vmx_vmwrite_bitmap);
		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_write_fields[j++] = entry;
	}
	max_shadow_read_write_fields = j;
}

/*
 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 * set the success or error code of an emulated VMX instruction (as specified
 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
 * instruction.
 */
static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_CF);
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
				u32 vm_instruction_error)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_ZF);
	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
	/*
	 * We don't need to force sync to shadow VMCS because
	 * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all
	 * fields and thus must be synced.
	 */
	if (to_vmx(vcpu)->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
		to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;

	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * failValid writes the error number to the current VMCS, which
	 * can't be done if there isn't a current VMCS.
	 */
	if (vmx->nested.current_vmptr == INVALID_GPA &&
	    !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
		return nested_vmx_failInvalid(vcpu);

	return nested_vmx_failValid(vcpu, vm_instruction_error);
}

static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
	/* TODO: not to reset guest simply here. */
	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
}

static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
	return fixed_bits_valid(control, low, high);
}

static inline u64 vmx_control_msr(u32 low, u32 high)
{
	return low | ((u64)high << 32);
}

static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
	secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
	vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
	vmx->nested.need_vmcs12_to_shadow_sync = false;
}
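
/*
 * Drop the mapping of the currently in-use Enlightened VMCS (if any) and mark
 * the eVMCS pointer as invalid so it is not used again until L1 provides a
 * new one.
 */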
static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
		kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
		vmx->nested.hv_evmcs = NULL;
	}

	vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
}

static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
				     struct loaded_vmcs *prev)
{
	struct vmcs_host_state *dest, *src;

	if (unlikely(!vmx->guest_state_loaded))
		return;

	src = &prev->host_state;
	dest = &vmx->loaded_vmcs->host_state;

	vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
	dest->ldt_sel = src->ldt_sel;
#ifdef CONFIG_X86_64
	dest->ds_sel = src->ds_sel;
	dest->es_sel = src->es_sel;
#endif
}
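
/*
 * Switch the vCPU's active VMCS (e.g. between vmcs01 and vmcs02): load the
 * new VMCS on the current CPU, carry over the cached host state, and
 * invalidate the lazily loaded register cache so registers are re-read from
 * the new VMCS.
 */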
static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct loaded_vmcs *prev;
	int cpu;

	if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
		return;

	cpu = get_cpu();
	prev = vmx->loaded_vmcs;
	vmx->loaded_vmcs = vmcs;
	vmx_vcpu_load_vmcs(vcpu, cpu, prev);
	vmx_sync_vmcs_host_state(vmx, prev);
	put_cpu();

	vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET;

	/*
	 * All lazily updated registers will be reloaded from VMCS12 on both
	 * vmentry and vmexit.
	 */
	vcpu->arch.regs_dirty = 0;
}

/*
 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 * just stops using VMX.
 */
static void free_nested(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
		vmx_switch_vmcs(vcpu, &vmx->vmcs01);

	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
		return;

	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);

	vmx->nested.vmxon = false;
	vmx->nested.smm.vmxon = false;
	vmx->nested.vmxon_ptr = INVALID_GPA;
	free_vpid(vmx->nested.vpid02);
	vmx->nested.posted_intr_nv = -1;
	vmx->nested.current_vmptr = INVALID_GPA;
	if (enable_shadow_vmcs) {
		vmx_disable_shadow_vmcs(vmx);
		vmcs_clear(vmx->vmcs01.shadow_vmcs);
		free_vmcs(vmx->vmcs01.shadow_vmcs);
		vmx->vmcs01.shadow_vmcs = NULL;
	}
	kfree(vmx->nested.cached_vmcs12);
	vmx->nested.cached_vmcs12 = NULL;
	kfree(vmx->nested.cached_shadow_vmcs12);
	vmx->nested.cached_shadow_vmcs12 = NULL;
	/*
	 * Unpin physical memory we referred to in the vmcs02. The APIC access
	 * page's backing page (yeah, confusing) shouldn't actually be accessed,
	 * and if it is written, the contents are irrelevant.
	 */
	kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false);
	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
	vmx->nested.pi_desc = NULL;

	kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);

	nested_release_evmcs(vcpu);

	free_loaded_vmcs(&vmx->nested.vmcs02);
}

/*
 * Ensure that the current vmcs of the logical processor is the
 * vmcs01 of the vcpu before calling free_nested().
 */
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	vmx_leave_nested(vcpu);
	vcpu_put(vcpu);
}
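
/*
 * Helpers for EPT faults taken while L2 is running: a cached shadow EPT root
 * matches an EPTP if both reference the same EPT PML4/PML5 physical address
 * (bits 51:12), ignoring the EPTP's attribute bits.
 */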
#define EPTP_PA_MASK	GENMASK_ULL(51, 12)

static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
{
	return VALID_PAGE(root_hpa) &&
		((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
}

static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
				       gpa_t addr)
{
	uint i;
	struct kvm_mmu_root_info *cached_root;

	WARN_ON_ONCE(!mmu_is_nested(vcpu));

	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
		cached_root = &vcpu->arch.mmu->prev_roots[i];

		if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd,
					    eptp))
			vcpu->arch.mmu->invlpg(vcpu, addr, cached_root->hpa);
	}
}

static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
					 struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 vm_exit_reason;
	unsigned long exit_qualification = vcpu->arch.exit_qualification;

	if (vmx->nested.pml_full) {
		vm_exit_reason = EXIT_REASON_PML_FULL;
		vmx->nested.pml_full = false;
		exit_qualification &= INTR_INFO_UNBLOCK_NMI;
	} else {
		if (fault->error_code & PFERR_RSVD_MASK)
			vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
		else
			vm_exit_reason = EXIT_REASON_EPT_VIOLATION;

		/*
		 * Although the caller (kvm_inject_emulated_page_fault) would
		 * have already synced the faulting address in the shadow EPT
		 * tables for the current EPTP12, we also need to sync it for
		 * any other cached EPTP02s based on the same EP4TA, since the
		 * TLB associates mappings to the EP4TA rather than the full EPTP.
		 */
		nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer,
					   fault->address);
	}

	nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
	vmcs12->guest_physical_address = fault->address;
}
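
/*
 * (Re)build the shadow EPT MMU for the EPTP that L1 programmed in vmcs12,
 * using the execute-only and large-page capabilities that are exposed to L1.
 */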
static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT;
	int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps);

	kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level,
				nested_ept_ad_enabled(vcpu),
				nested_ept_get_eptp(vcpu));
}

static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
	WARN_ON(mmu_is_nested(vcpu));

	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
	nested_ept_new_eptp(vcpu);
	vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp;
	vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
	vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;

	vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}

static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
	vcpu->arch.mmu = &vcpu->arch.root_mmu;
	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}

static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
					    u16 error_code)
{
	bool inequality, bit;

	bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
	inequality =
		(error_code & vmcs12->page_fault_error_code_mask) !=
		 vmcs12->page_fault_error_code_match;

	return inequality ^ bit;
}

static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
					   u32 error_code)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	/*
	 * Drop bits 31:16 of the error code when performing the #PF mask+match
	 * check. All VMCS fields involved are 32 bits, but Intel CPUs never
	 * set bits 31:16 and VMX disallows setting bits 31:16 in the injected
	 * error code. Including the to-be-dropped bits in the check might
	 * result in an "impossible" or missed exit from L1's perspective.
	 */
	if (vector == PF_VECTOR)
		return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code);

	return (vmcs12->exception_bitmap & (1u << vector));
}
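
/*
 * Consistency checks on individual vmcs12 control fields, performed as part
 * of emulating VM-entry. Each helper returns 0 if the relevant controls are
 * valid and -EINVAL (reported via CC()) otherwise.
 */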
static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
					       struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
	    CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
		return -EINVAL;

	return 0;
}

/*
 * For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1
 * itself utilizing x2APIC. All MSRs were previously set to be intercepted,
 * only the "disable intercept" case needs to be handled.
 */
static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1,
							unsigned long *msr_bitmap_l0,
							u32 msr, int type)
{
	if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr))
		vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr);

	if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr))
		vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr);
}

static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
{
	int msr;

	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
		unsigned word = msr / BITS_PER_LONG;

		msr_bitmap[word] = ~0;
		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
	}
}

#define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw)					\
static inline									\
void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx,			\
					 unsigned long *msr_bitmap_l1,		\
					 unsigned long *msr_bitmap_l0, u32 msr)	\
{										\
	if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) ||		\
	    vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr))			\
		vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr);			\
	else									\
		vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr);			\
}
BUILD_NVMX_MSR_INTERCEPT_HELPER(read)
BUILD_NVMX_MSR_INTERCEPT_HELPER(write)

static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx,
						    unsigned long *msr_bitmap_l1,
						    unsigned long *msr_bitmap_l0,
						    u32 msr, int types)
{
	if (types & MSR_TYPE_R)
		nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1,
						  msr_bitmap_l0, msr);
	if (types & MSR_TYPE_W)
		nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1,
						   msr_bitmap_l0, msr);
}

/*
 * Merge L0's and L1's MSR bitmap, return false to indicate that
 * we do not use the hardware.
 */
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int msr;
	unsigned long *msr_bitmap_l1;
	unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
	struct kvm_host_map *map = &vmx->nested.msr_bitmap_map;

	/* Nothing to do if the MSR bitmap is not in use. */
	if (!cpu_has_vmx_msr_bitmap() ||
	    !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return false;

	/*
	 * MSR bitmap update can be skipped when:
	 * - MSR bitmap for L1 hasn't changed.
	 * - Nested hypervisor (L1) is attempting to launch the same L2 as
	 *   before.
	 * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature
	 *   and tells KVM (L0) there were no changes in MSR bitmap for L2.
	 */
	if (!vmx->nested.force_msr_bitmap_recalc && evmcs &&
	    evmcs->hv_enlightenments_control.msr_bitmap &&
	    evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
		return true;

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
		return false;

	msr_bitmap_l1 = (unsigned long *)map->hva;

	/*
	 * To keep the control flow simple, pay eight 8-byte writes (sixteen
	 * 4-byte writes on 32-bit systems) up front to enable intercepts for
	 * the x2APIC MSR range and selectively toggle those relevant to L2.
	 */
	enable_x2apic_msr_intercepts(msr_bitmap_l0);

	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
		if (nested_cpu_has_apic_reg_virt(vmcs12)) {
			/*
			 * L0 need not intercept reads for MSRs between 0x800
			 * and 0x8ff, it just lets the processor take the value
			 * from the virtual-APIC page; take those 256 bits
			 * directly from the L1 bitmap.
			 */
			for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
				unsigned word = msr / BITS_PER_LONG;

				msr_bitmap_l0[word] = msr_bitmap_l1[word];
			}
		}

		nested_vmx_disable_intercept_for_x2apic_msr(
			msr_bitmap_l1, msr_bitmap_l0,
			X2APIC_MSR(APIC_TASKPRI),
			MSR_TYPE_R | MSR_TYPE_W);

		if (nested_cpu_has_vid(vmcs12)) {
			nested_vmx_disable_intercept_for_x2apic_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_EOI),
				MSR_TYPE_W);
			nested_vmx_disable_intercept_for_x2apic_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_SELF_IPI),
				MSR_TYPE_W);
		}
	}

	/*
	 * Always check vmcs01's bitmap to honor userspace MSR filters and any
	 * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through.
	 */
#ifdef CONFIG_X86_64
	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_FS_BASE, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_GS_BASE, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
#endif
	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_PRED_CMD, MSR_TYPE_W);

	kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);

	vmx->nested.force_msr_bitmap_recalc = false;

	return true;
}
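
/*
 * Copy the shadow vmcs12 referenced by vmcs12->vmcs_link_pointer into (or,
 * for the flush variant, out of) KVM's cache. Both copies are skipped when
 * shadow VMCS is not enabled for L2 or the link pointer is invalid.
 */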
static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == INVALID_GPA)
		return;

	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
	    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
		return;

	kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
			      VMCS12_SIZE);
}

static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
					      struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == INVALID_GPA)
		return;

	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
	    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
		return;

	kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
			       VMCS12_SIZE);
}

/*
 * In nested virtualization, check if L1 has set
 * VM_EXIT_ACK_INTR_ON_EXIT
 */
static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
{
	return get_vmcs12(vcpu)->vm_exit_controls &
		VM_EXIT_ACK_INTR_ON_EXIT;
}

static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
	    CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
		return -EINVAL;
	else
		return 0;
}

static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
					   struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	    !nested_cpu_has_apic_reg_virt(vmcs12) &&
	    !nested_cpu_has_vid(vmcs12) &&
	    !nested_cpu_has_posted_intr(vmcs12))
		return 0;

	/*
	 * If virtualize x2apic mode is enabled,
	 * virtualize apic access must be disabled.
	 */
	if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	       nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
		return -EINVAL;

	/*
	 * If virtual interrupt delivery is enabled,
	 * we must exit on external interrupts.
	 */
	if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
		return -EINVAL;

	/*
	 * bits 15:8 should be zero in posted_intr_nv,
	 * the descriptor address has been already checked
	 * in nested_get_vmcs12_pages.
	 *
	 * bits 5:0 of posted_intr_desc_addr should be zero.
	 */
	if (nested_cpu_has_posted_intr(vmcs12) &&
	   (CC(!nested_cpu_has_vid(vmcs12)) ||
	    CC(!nested_exit_intr_ack_set(vcpu)) ||
	    CC((vmcs12->posted_intr_nv & 0xff00)) ||
	    CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64))))
		return -EINVAL;

	/* tpr shadow is needed by all apicv features. */
	if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
		return -EINVAL;

	return 0;
}
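
/*
 * Validate a VM-entry/VM-exit MSR switch list: the base address must be
 * 16-byte aligned and the whole array of vmx_msr_entry structures must lie
 * within the guest's legal physical address space.
 */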
static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
				       u32 count, u64 addr)
{
	if (count == 0)
		return 0;

	if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
	    !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
						     struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_load_count,
					   vmcs12->vm_exit_msr_load_addr)) ||
	    CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_store_count,
					   vmcs12->vm_exit_msr_store_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
						      struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_entry_msr_load_count,
					   vmcs12->vm_entry_msr_load_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
					 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_pml(vmcs12))
		return 0;

	if (CC(!nested_cpu_has_ept(vmcs12)) ||
	    CC(!page_address_valid(vcpu, vmcs12->pml_address)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
							struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
							  struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_shadow_vmcs(vmcs12))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
	    CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
		return -EINVAL;

	return 0;
}
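
/*
 * Sanity checks on individual entries of the VM-entry/VM-exit MSR load and
 * store lists. Entries that target the x2APIC MSR range, the microcode MSRs,
 * SMM-related MSRs, or FS/GS base, or that have non-zero reserved bits, are
 * rejected, matching the SDM's restrictions on atomic MSR switching.
 */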
static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
				       struct vmx_msr_entry *e)
{
	/* x2APIC MSR accesses are not allowed */
	if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
		return -EINVAL;
	if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
	    CC(e->index == MSR_IA32_UCODE_REV))
		return -EINVAL;
	if (CC(e->reserved != 0))
		return -EINVAL;
	return 0;
}

static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
				     struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_FS_BASE) ||
	    CC(e->index == MSR_GS_BASE) ||
	    CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
				      struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				       vmx->nested.msrs.misc_high);

	return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
}

/*
 * Load guest's/host's msr at nested entry/exit.
 * return 0 for success, entry index for failure.
 *
 * One of the failure modes for MSR load/store is when a list exceeds the
 * virtual hardware's capacity. To maintain compatibility with hardware inasmuch
 * as possible, process all valid entries before failing rather than precheck
 * for a capacity violation.
 */
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			goto fail;

		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
					&e, sizeof(e))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			goto fail;
		}
		if (nested_vmx_load_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			goto fail;
		}
		if (kvm_set_msr(vcpu, e.index, e.value)) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, e.value);
			goto fail;
		}
	}
	return 0;
fail:
	/* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
	return i + 1;
}
  781. static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
  782. u32 msr_index,
  783. u64 *data)
  784. {
  785. struct vcpu_vmx *vmx = to_vmx(vcpu);
  786. /*
  787. * If the L0 hypervisor stored a more accurate value for the TSC that
  788. * does not include the time taken for emulation of the L2->L1
  789. * VM-exit in L0, use the more accurate value.
  790. */
  791. if (msr_index == MSR_IA32_TSC) {
  792. int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
  793. MSR_IA32_TSC);
  794. if (i >= 0) {
  795. u64 val = vmx->msr_autostore.guest.val[i].value;
  796. *data = kvm_read_l1_tsc(vcpu, val);
  797. return true;
  798. }
  799. }
  800. if (kvm_get_msr(vcpu, msr_index, data)) {
  801. pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
  802. msr_index);
  803. return false;
  804. }
  805. return true;
  806. }
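/*
 * Read a single vmx_msr_entry from the guest's VM-exit MSR-store area and
 * validate it. Only the 'index' and 'reserved' fields (the first two u32s)
 * are read from guest memory; the 'value' field is left untouched.
 */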
  807. static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
  808. struct vmx_msr_entry *e)
  809. {
  810. if (kvm_vcpu_read_guest(vcpu,
  811. gpa + i * sizeof(*e),
  812. e, 2 * sizeof(u32))) {
  813. pr_debug_ratelimited(
  814. "%s cannot read MSR entry (%u, 0x%08llx)\n",
  815. __func__, i, gpa + i * sizeof(*e));
  816. return false;
  817. }
  818. if (nested_vmx_store_msr_check(vcpu, e)) {
  819. pr_debug_ratelimited(
  820. "%s check failed (%u, 0x%x, 0x%x)\n",
  821. __func__, i, e->index, e->reserved);
  822. return false;
  823. }
  824. return true;
  825. }
  826. static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
  827. {
  828. u64 data;
  829. u32 i;
  830. struct vmx_msr_entry e;
  831. u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
  832. for (i = 0; i < count; i++) {
  833. if (unlikely(i >= max_msr_list_size))
  834. return -EINVAL;
  835. if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
  836. return -EINVAL;
  837. if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
  838. return -EINVAL;
  839. if (kvm_vcpu_write_guest(vcpu,
  840. gpa + i * sizeof(e) +
  841. offsetof(struct vmx_msr_entry, value),
  842. &data, sizeof(data))) {
  843. pr_debug_ratelimited(
  844. "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
  845. __func__, i, e.index, data);
  846. return -EINVAL;
  847. }
  848. }
  849. return 0;
  850. }
  851. static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
  852. {
  853. struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
  854. u32 count = vmcs12->vm_exit_msr_store_count;
  855. u64 gpa = vmcs12->vm_exit_msr_store_addr;
  856. struct vmx_msr_entry e;
  857. u32 i;
  858. for (i = 0; i < count; i++) {
  859. if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
  860. return false;
  861. if (e.index == msr_index)
  862. return true;
  863. }
  864. return false;
  865. }
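/*
 * Keep vmx->msr_autostore.guest in sync with vmcs12's VM-exit MSR-store
 * list for @msr_index: add the MSR if L1 wants it stored on VM-exit, so
 * that hardware captures an up-to-date value in the vmcs02 store area,
 * and remove it when L1 no longer lists it.
 */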
  866. static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
  867. u32 msr_index)
  868. {
  869. struct vcpu_vmx *vmx = to_vmx(vcpu);
  870. struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
  871. bool in_vmcs12_store_list;
  872. int msr_autostore_slot;
  873. bool in_autostore_list;
  874. int last;
  875. msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
  876. in_autostore_list = msr_autostore_slot >= 0;
  877. in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);
  878. if (in_vmcs12_store_list && !in_autostore_list) {
  879. if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
  880. /*
881. * Emulated VMEntry does not fail here. Instead, a less
  882. * accurate value will be returned by
  883. * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
  884. * instead of reading the value from the vmcs02 VMExit
  885. * MSR-store area.
  886. */
  887. pr_warn_ratelimited(
  888. "Not enough msr entries in msr_autostore. Can't add msr %x\n",
  889. msr_index);
  890. return;
  891. }
  892. last = autostore->nr++;
  893. autostore->val[last].index = msr_index;
  894. } else if (!in_vmcs12_store_list && in_autostore_list) {
  895. last = --autostore->nr;
  896. autostore->val[msr_autostore_slot] = autostore->val[last];
  897. }
  898. }
  899. /*
  900. * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are
  901. * emulating VM-Entry into a guest with EPT enabled. On failure, the expected
  902. * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
  903. * @entry_failure_code.
  904. */
  905. static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
  906. bool nested_ept, bool reload_pdptrs,
  907. enum vm_entry_failure_code *entry_failure_code)
  908. {
  909. if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) {
  910. *entry_failure_code = ENTRY_FAIL_DEFAULT;
  911. return -EINVAL;
  912. }
  913. /*
  914. * If PAE paging and EPT are both on, CR3 is not used by the CPU and
  915. * must not be dereferenced.
  916. */
  917. if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
  918. CC(!load_pdptrs(vcpu, cr3))) {
  919. *entry_failure_code = ENTRY_FAIL_PDPTE;
  920. return -EINVAL;
  921. }
  922. vcpu->arch.cr3 = cr3;
  923. kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
  924. /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
  925. kvm_init_mmu(vcpu);
  926. if (!nested_ept)
  927. kvm_mmu_new_pgd(vcpu, cr3);
  928. return 0;
  929. }
930. /*
931. * Returns true if KVM is able to configure the CPU to tag TLB entries
932. * populated by L2 differently from TLB entries populated
933. * by L1.
934. *
935. * If L0 uses EPT, L1 and L2 run with different EPTPs because
936. * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
937. * are tagged with different EPTPs.
938. *
939. * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
940. * with different VPIDs (L1 entries are tagged with vmx->vpid
941. * while L2 entries are tagged with vmx->nested.vpid02).
942. */
  943. static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
  944. {
  945. struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
  946. return enable_ept ||
  947. (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
  948. }
  949. static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
  950. struct vmcs12 *vmcs12,
  951. bool is_vmenter)
  952. {
  953. struct vcpu_vmx *vmx = to_vmx(vcpu);
  954. /*
  955. * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
  956. * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a
  957. * full TLB flush from the guest's perspective. This is required even
  958. * if VPID is disabled in the host as KVM may need to synchronize the
  959. * MMU in response to the guest TLB flush.
  960. *
  961. * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
  962. * EPT is a special snowflake, as guest-physical mappings aren't
  963. * flushed on VPID invalidations, including VM-Enter or VM-Exit with
  964. * VPID disabled. As a result, KVM _never_ needs to sync nEPT
  965. * entries on VM-Enter because L1 can't rely on VM-Enter to flush
  966. * those mappings.
  967. */
  968. if (!nested_cpu_has_vpid(vmcs12)) {
  969. kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
  970. return;
  971. }
  972. /* L2 should never have a VPID if VPID is disabled. */
  973. WARN_ON(!enable_vpid);
  974. /*
  975. * VPID is enabled and in use by vmcs12. If vpid12 is changing, then
  976. * emulate a guest TLB flush as KVM does not track vpid12 history nor
  977. * is the VPID incorporated into the MMU context. I.e. KVM must assume
  978. * that the new vpid12 has never been used and thus represents a new
  979. * guest ASID that cannot have entries in the TLB.
  980. */
  981. if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
  982. vmx->nested.last_vpid = vmcs12->virtual_processor_id;
  983. kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
  984. return;
  985. }
986. /*
987. * If VPID is enabled and in use by vmcs12, and vpid12 is not changing
988. * but L2 does not have a unique TLB tag (ASID), i.e. EPT is disabled
989. * and KVM was unable to allocate a VPID for L2, flush the current
990. * context as the effective ASID is common to both L1 and L2.
991. */
  992. if (!nested_has_guest_tlb_tag(vcpu))
  993. kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
  994. }
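/*
 * Return true iff every bit of @subset that lies within @mask is also set
 * in @superset. Used below to validate userspace-restored VMX capability
 * MSRs against the configuration KVM itself supports.
 */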
  995. static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
  996. {
  997. superset &= mask;
  998. subset &= mask;
  999. return (superset | subset) == superset;
  1000. }
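/*
 * Per the SDM's IA32_VMX_BASIC layout: bit 48 reports whether VMX
 * structures are limited to 32-bit physical addresses, bit 49 dual-monitor
 * SMM support, bit 54 INS/OUTS information reporting, and bit 55 the
 * availability of the "true" control MSRs.
 */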
  1001. static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
  1002. {
  1003. const u64 feature_and_reserved =
  1004. /* feature (except bit 48; see below) */
  1005. BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
  1006. /* reserved */
  1007. BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
  1008. u64 vmx_basic = vmcs_config.nested.basic;
  1009. if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
  1010. return -EINVAL;
  1011. /*
  1012. * KVM does not emulate a version of VMX that constrains physical
  1013. * addresses of VMX structures (e.g. VMCS) to 32-bits.
  1014. */
  1015. if (data & BIT_ULL(48))
  1016. return -EINVAL;
  1017. if (vmx_basic_vmcs_revision_id(vmx_basic) !=
  1018. vmx_basic_vmcs_revision_id(data))
  1019. return -EINVAL;
  1020. if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
  1021. return -EINVAL;
  1022. vmx->nested.msrs.basic = data;
  1023. return 0;
  1024. }
  1025. static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index,
  1026. u32 **low, u32 **high)
  1027. {
  1028. switch (msr_index) {
  1029. case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
  1030. *low = &msrs->pinbased_ctls_low;
  1031. *high = &msrs->pinbased_ctls_high;
  1032. break;
  1033. case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
  1034. *low = &msrs->procbased_ctls_low;
  1035. *high = &msrs->procbased_ctls_high;
  1036. break;
  1037. case MSR_IA32_VMX_TRUE_EXIT_CTLS:
  1038. *low = &msrs->exit_ctls_low;
  1039. *high = &msrs->exit_ctls_high;
  1040. break;
  1041. case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
  1042. *low = &msrs->entry_ctls_low;
  1043. *high = &msrs->entry_ctls_high;
  1044. break;
  1045. case MSR_IA32_VMX_PROCBASED_CTLS2:
  1046. *low = &msrs->secondary_ctls_low;
  1047. *high = &msrs->secondary_ctls_high;
  1048. break;
  1049. default:
  1050. BUG();
  1051. }
  1052. }
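/*
 * For the VMX control capability MSRs, the low 32 bits report the
 * allowed-0 ("must be 1") settings and the high 32 bits the allowed-1
 * settings. A restored value is rejected if it clears a must-be-1 bit or
 * sets a bit that KVM's supported configuration treats as must-be-0.
 */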
  1053. static int
  1054. vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
  1055. {
  1056. u32 *lowp, *highp;
  1057. u64 supported;
  1058. vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp);
  1059. supported = vmx_control_msr(*lowp, *highp);
  1060. /* Check must-be-1 bits are still 1. */
  1061. if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
  1062. return -EINVAL;
  1063. /* Check must-be-0 bits are still 0. */
  1064. if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
  1065. return -EINVAL;
  1066. vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp);
  1067. *lowp = data;
  1068. *highp = data >> 32;
  1069. return 0;
  1070. }
  1071. static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
  1072. {
  1073. const u64 feature_and_reserved_bits =
  1074. /* feature */
  1075. BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
  1076. BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
  1077. /* reserved */
  1078. GENMASK_ULL(13, 9) | BIT_ULL(31);
  1079. u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
  1080. vmcs_config.nested.misc_high);
  1081. if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
  1082. return -EINVAL;
  1083. if ((vmx->nested.msrs.pinbased_ctls_high &
  1084. PIN_BASED_VMX_PREEMPTION_TIMER) &&
  1085. vmx_misc_preemption_timer_rate(data) !=
  1086. vmx_misc_preemption_timer_rate(vmx_misc))
  1087. return -EINVAL;
  1088. if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
  1089. return -EINVAL;
  1090. if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
  1091. return -EINVAL;
  1092. if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
  1093. return -EINVAL;
  1094. vmx->nested.msrs.misc_low = data;
  1095. vmx->nested.msrs.misc_high = data >> 32;
  1096. return 0;
  1097. }
  1098. static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
  1099. {
  1100. u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps,
  1101. vmcs_config.nested.vpid_caps);
  1102. /* Every bit is either reserved or a feature bit. */
  1103. if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
  1104. return -EINVAL;
  1105. vmx->nested.msrs.ept_caps = data;
  1106. vmx->nested.msrs.vpid_caps = data >> 32;
  1107. return 0;
  1108. }
  1109. static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index)
  1110. {
  1111. switch (msr_index) {
  1112. case MSR_IA32_VMX_CR0_FIXED0:
  1113. return &msrs->cr0_fixed0;
  1114. case MSR_IA32_VMX_CR4_FIXED0:
  1115. return &msrs->cr4_fixed0;
  1116. default:
  1117. BUG();
  1118. }
  1119. }
  1120. static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
  1121. {
  1122. const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index);
1123. /*
1124. * 1 bits (which indicate bits that "must be 1" during VMX operation)
1125. * must also be 1 in the restored value.
1126. */
  1127. if (!is_bitwise_subset(data, *msr, -1ULL))
  1128. return -EINVAL;
  1129. *vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data;
  1130. return 0;
  1131. }
  1132. /*
  1133. * Called when userspace is restoring VMX MSRs.
  1134. *
  1135. * Returns 0 on success, non-0 otherwise.
  1136. */
  1137. int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
  1138. {
  1139. struct vcpu_vmx *vmx = to_vmx(vcpu);
  1140. /*
  1141. * Don't allow changes to the VMX capability MSRs while the vCPU
  1142. * is in VMX operation.
  1143. */
  1144. if (vmx->nested.vmxon)
  1145. return -EBUSY;
  1146. switch (msr_index) {
  1147. case MSR_IA32_VMX_BASIC:
  1148. return vmx_restore_vmx_basic(vmx, data);
  1149. case MSR_IA32_VMX_PINBASED_CTLS:
  1150. case MSR_IA32_VMX_PROCBASED_CTLS:
  1151. case MSR_IA32_VMX_EXIT_CTLS:
  1152. case MSR_IA32_VMX_ENTRY_CTLS:
  1153. /*
  1154. * The "non-true" VMX capability MSRs are generated from the
  1155. * "true" MSRs, so we do not support restoring them directly.
  1156. *
  1157. * If userspace wants to emulate VMX_BASIC[55]=0, userspace
  1158. * should restore the "true" MSRs with the must-be-1 bits
1159. * set according to SDM Vol. 3, Appendix A.2, "RESERVED CONTROLS AND
1160. * DEFAULT SETTINGS".
  1161. */
  1162. return -EINVAL;
  1163. case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
  1164. case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
  1165. case MSR_IA32_VMX_TRUE_EXIT_CTLS:
  1166. case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
  1167. case MSR_IA32_VMX_PROCBASED_CTLS2:
  1168. return vmx_restore_control_msr(vmx, msr_index, data);
  1169. case MSR_IA32_VMX_MISC:
  1170. return vmx_restore_vmx_misc(vmx, data);
  1171. case MSR_IA32_VMX_CR0_FIXED0:
  1172. case MSR_IA32_VMX_CR4_FIXED0:
  1173. return vmx_restore_fixed0_msr(vmx, msr_index, data);
  1174. case MSR_IA32_VMX_CR0_FIXED1:
  1175. case MSR_IA32_VMX_CR4_FIXED1:
  1176. /*
  1177. * These MSRs are generated based on the vCPU's CPUID, so we
  1178. * do not support restoring them directly.
  1179. */
  1180. return -EINVAL;
  1181. case MSR_IA32_VMX_EPT_VPID_CAP:
  1182. return vmx_restore_vmx_ept_vpid_cap(vmx, data);
  1183. case MSR_IA32_VMX_VMCS_ENUM:
  1184. vmx->nested.msrs.vmcs_enum = data;
  1185. return 0;
  1186. case MSR_IA32_VMX_VMFUNC:
  1187. if (data & ~vmcs_config.nested.vmfunc_controls)
  1188. return -EINVAL;
  1189. vmx->nested.msrs.vmfunc_controls = data;
  1190. return 0;
  1191. default:
  1192. /*
  1193. * The rest of the VMX capability MSRs do not support restore.
  1194. */
  1195. return -EINVAL;
  1196. }
  1197. }
  1198. /* Returns 0 on success, non-0 otherwise. */
  1199. int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
  1200. {
  1201. switch (msr_index) {
  1202. case MSR_IA32_VMX_BASIC:
  1203. *pdata = msrs->basic;
  1204. break;
  1205. case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
  1206. case MSR_IA32_VMX_PINBASED_CTLS:
  1207. *pdata = vmx_control_msr(
  1208. msrs->pinbased_ctls_low,
  1209. msrs->pinbased_ctls_high);
  1210. if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
  1211. *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
  1212. break;
  1213. case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
  1214. case MSR_IA32_VMX_PROCBASED_CTLS:
  1215. *pdata = vmx_control_msr(
  1216. msrs->procbased_ctls_low,
  1217. msrs->procbased_ctls_high);
  1218. if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
  1219. *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
  1220. break;
  1221. case MSR_IA32_VMX_TRUE_EXIT_CTLS:
  1222. case MSR_IA32_VMX_EXIT_CTLS:
  1223. *pdata = vmx_control_msr(
  1224. msrs->exit_ctls_low,
  1225. msrs->exit_ctls_high);
  1226. if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
  1227. *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
  1228. break;
  1229. case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
  1230. case MSR_IA32_VMX_ENTRY_CTLS:
  1231. *pdata = vmx_control_msr(
  1232. msrs->entry_ctls_low,
  1233. msrs->entry_ctls_high);
  1234. if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
  1235. *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
  1236. break;
  1237. case MSR_IA32_VMX_MISC:
  1238. *pdata = vmx_control_msr(
  1239. msrs->misc_low,
  1240. msrs->misc_high);
  1241. break;
  1242. case MSR_IA32_VMX_CR0_FIXED0:
  1243. *pdata = msrs->cr0_fixed0;
  1244. break;
  1245. case MSR_IA32_VMX_CR0_FIXED1:
  1246. *pdata = msrs->cr0_fixed1;
  1247. break;
  1248. case MSR_IA32_VMX_CR4_FIXED0:
  1249. *pdata = msrs->cr4_fixed0;
  1250. break;
  1251. case MSR_IA32_VMX_CR4_FIXED1:
  1252. *pdata = msrs->cr4_fixed1;
  1253. break;
  1254. case MSR_IA32_VMX_VMCS_ENUM:
  1255. *pdata = msrs->vmcs_enum;
  1256. break;
  1257. case MSR_IA32_VMX_PROCBASED_CTLS2:
  1258. *pdata = vmx_control_msr(
  1259. msrs->secondary_ctls_low,
  1260. msrs->secondary_ctls_high);
  1261. break;
  1262. case MSR_IA32_VMX_EPT_VPID_CAP:
  1263. *pdata = msrs->ept_caps |
  1264. ((u64)msrs->vpid_caps << 32);
  1265. break;
  1266. case MSR_IA32_VMX_VMFUNC:
  1267. *pdata = msrs->vmfunc_controls;
  1268. break;
  1269. default:
  1270. return 1;
  1271. }
  1272. return 0;
  1273. }
  1274. /*
  1275. * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
  1276. * been modified by the L1 guest. Note, "writable" in this context means
  1277. * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
  1278. * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
  1279. * VM-exit information fields (which are actually writable if the vCPU is
  1280. * configured to support "VMWRITE to any supported field in the VMCS").
  1281. */
  1282. static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
  1283. {
  1284. struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
  1285. struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
  1286. struct shadow_vmcs_field field;
  1287. unsigned long val;
  1288. int i;
  1289. if (WARN_ON(!shadow_vmcs))
  1290. return;
  1291. preempt_disable();
  1292. vmcs_load(shadow_vmcs);
  1293. for (i = 0; i < max_shadow_read_write_fields; i++) {
  1294. field = shadow_read_write_fields[i];
  1295. val = __vmcs_readl(field.encoding);
  1296. vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
  1297. }
  1298. vmcs_clear(shadow_vmcs);
  1299. vmcs_load(vmx->loaded_vmcs->vmcs);
  1300. preempt_enable();
  1301. }
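/*
 * Mirror of copy_shadow_to_vmcs12(): propagate both the read/write and the
 * read-only shadow fields from the cached vmcs12 into the shadow VMCS so
 * that L1's VMREADs (and permitted VMWRITEs) are handled by hardware
 * without a VM-exit.
 */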
  1302. static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
  1303. {
  1304. const struct shadow_vmcs_field *fields[] = {
  1305. shadow_read_write_fields,
  1306. shadow_read_only_fields
  1307. };
  1308. const int max_fields[] = {
  1309. max_shadow_read_write_fields,
  1310. max_shadow_read_only_fields
  1311. };
  1312. struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
  1313. struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
  1314. struct shadow_vmcs_field field;
  1315. unsigned long val;
  1316. int i, q;
  1317. if (WARN_ON(!shadow_vmcs))
  1318. return;
  1319. vmcs_load(shadow_vmcs);
  1320. for (q = 0; q < ARRAY_SIZE(fields); q++) {
  1321. for (i = 0; i < max_fields[q]; i++) {
  1322. field = fields[q][i];
  1323. val = vmcs12_read_any(vmcs12, field.encoding,
  1324. field.offset);
  1325. __vmcs_writel(field.encoding, val);
  1326. }
  1327. }
  1328. vmcs_clear(shadow_vmcs);
  1329. vmcs_load(vmx->loaded_vmcs->vmcs);
  1330. }
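/*
 * Sync fields from the Hyper-V enlightened VMCS into the cached vmcs12.
 * @hv_clean_fields indicates which field groups the guest has left
 * untouched since the last sync; those groups are skipped.
 */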
  1331. static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
  1332. {
  1333. struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
  1334. struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
  1335. /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
  1336. vmcs12->tpr_threshold = evmcs->tpr_threshold;
  1337. vmcs12->guest_rip = evmcs->guest_rip;
  1338. if (unlikely(!(hv_clean_fields &
  1339. HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
  1340. vmcs12->guest_rsp = evmcs->guest_rsp;
  1341. vmcs12->guest_rflags = evmcs->guest_rflags;
  1342. vmcs12->guest_interruptibility_info =
  1343. evmcs->guest_interruptibility_info;
  1344. /*
  1345. * Not present in struct vmcs12:
  1346. * vmcs12->guest_ssp = evmcs->guest_ssp;
  1347. */
  1348. }
  1349. if (unlikely(!(hv_clean_fields &
  1350. HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
  1351. vmcs12->cpu_based_vm_exec_control =
  1352. evmcs->cpu_based_vm_exec_control;
  1353. }
  1354. if (unlikely(!(hv_clean_fields &
  1355. HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
  1356. vmcs12->exception_bitmap = evmcs->exception_bitmap;
  1357. }
  1358. if (unlikely(!(hv_clean_fields &
  1359. HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
  1360. vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
  1361. }
  1362. if (unlikely(!(hv_clean_fields &
  1363. HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
  1364. vmcs12->vm_entry_intr_info_field =
  1365. evmcs->vm_entry_intr_info_field;
  1366. vmcs12->vm_entry_exception_error_code =
  1367. evmcs->vm_entry_exception_error_code;
  1368. vmcs12->vm_entry_instruction_len =
  1369. evmcs->vm_entry_instruction_len;
  1370. }
  1371. if (unlikely(!(hv_clean_fields &
  1372. HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
  1373. vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
  1374. vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
  1375. vmcs12->host_cr0 = evmcs->host_cr0;
  1376. vmcs12->host_cr3 = evmcs->host_cr3;
  1377. vmcs12->host_cr4 = evmcs->host_cr4;
  1378. vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
  1379. vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
  1380. vmcs12->host_rip = evmcs->host_rip;
  1381. vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
  1382. vmcs12->host_es_selector = evmcs->host_es_selector;
  1383. vmcs12->host_cs_selector = evmcs->host_cs_selector;
  1384. vmcs12->host_ss_selector = evmcs->host_ss_selector;
  1385. vmcs12->host_ds_selector = evmcs->host_ds_selector;
  1386. vmcs12->host_fs_selector = evmcs->host_fs_selector;
  1387. vmcs12->host_gs_selector = evmcs->host_gs_selector;
  1388. vmcs12->host_tr_selector = evmcs->host_tr_selector;
  1389. vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl;
  1390. /*
  1391. * Not present in struct vmcs12:
  1392. * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet;
  1393. * vmcs12->host_ssp = evmcs->host_ssp;
  1394. * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr;
  1395. */
  1396. }
  1397. if (unlikely(!(hv_clean_fields &
  1398. HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
  1399. vmcs12->pin_based_vm_exec_control =
  1400. evmcs->pin_based_vm_exec_control;
  1401. vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
  1402. vmcs12->secondary_vm_exec_control =
  1403. evmcs->secondary_vm_exec_control;
  1404. }
  1405. if (unlikely(!(hv_clean_fields &
  1406. HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
  1407. vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
  1408. vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
  1409. }
  1410. if (unlikely(!(hv_clean_fields &
  1411. HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
  1412. vmcs12->msr_bitmap = evmcs->msr_bitmap;
  1413. }
  1414. if (unlikely(!(hv_clean_fields &
  1415. HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
  1416. vmcs12->guest_es_base = evmcs->guest_es_base;
  1417. vmcs12->guest_cs_base = evmcs->guest_cs_base;
  1418. vmcs12->guest_ss_base = evmcs->guest_ss_base;
  1419. vmcs12->guest_ds_base = evmcs->guest_ds_base;
  1420. vmcs12->guest_fs_base = evmcs->guest_fs_base;
  1421. vmcs12->guest_gs_base = evmcs->guest_gs_base;
  1422. vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
  1423. vmcs12->guest_tr_base = evmcs->guest_tr_base;
  1424. vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
  1425. vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
  1426. vmcs12->guest_es_limit = evmcs->guest_es_limit;
  1427. vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
  1428. vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
  1429. vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
  1430. vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
  1431. vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
  1432. vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
  1433. vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
  1434. vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
  1435. vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
  1436. vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
  1437. vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
  1438. vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
  1439. vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
  1440. vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
  1441. vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
  1442. vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
  1443. vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
  1444. vmcs12->guest_es_selector = evmcs->guest_es_selector;
  1445. vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
  1446. vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
  1447. vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
  1448. vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
  1449. vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
  1450. vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
  1451. vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
  1452. }
  1453. if (unlikely(!(hv_clean_fields &
  1454. HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
  1455. vmcs12->tsc_offset = evmcs->tsc_offset;
  1456. vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
  1457. vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
  1458. vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap;
  1459. vmcs12->tsc_multiplier = evmcs->tsc_multiplier;
  1460. }
  1461. if (unlikely(!(hv_clean_fields &
  1462. HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
  1463. vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
  1464. vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
  1465. vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
  1466. vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
  1467. vmcs12->guest_cr0 = evmcs->guest_cr0;
  1468. vmcs12->guest_cr3 = evmcs->guest_cr3;
  1469. vmcs12->guest_cr4 = evmcs->guest_cr4;
  1470. vmcs12->guest_dr7 = evmcs->guest_dr7;
  1471. }
  1472. if (unlikely(!(hv_clean_fields &
  1473. HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
  1474. vmcs12->host_fs_base = evmcs->host_fs_base;
  1475. vmcs12->host_gs_base = evmcs->host_gs_base;
  1476. vmcs12->host_tr_base = evmcs->host_tr_base;
  1477. vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
  1478. vmcs12->host_idtr_base = evmcs->host_idtr_base;
  1479. vmcs12->host_rsp = evmcs->host_rsp;
  1480. }
  1481. if (unlikely(!(hv_clean_fields &
  1482. HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
  1483. vmcs12->ept_pointer = evmcs->ept_pointer;
  1484. vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
  1485. }
  1486. if (unlikely(!(hv_clean_fields &
  1487. HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
  1488. vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
  1489. vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
  1490. vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
  1491. vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
  1492. vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
  1493. vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
  1494. vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
  1495. vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
  1496. vmcs12->guest_pending_dbg_exceptions =
  1497. evmcs->guest_pending_dbg_exceptions;
  1498. vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
  1499. vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
  1500. vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
  1501. vmcs12->guest_activity_state = evmcs->guest_activity_state;
  1502. vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
  1503. vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl;
  1504. /*
  1505. * Not present in struct vmcs12:
  1506. * vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet;
  1507. * vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl;
  1508. * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr;
  1509. */
  1510. }
  1511. /*
  1512. * Not used?
  1513. * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
  1514. * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
  1515. * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
  1516. * vmcs12->page_fault_error_code_mask =
  1517. * evmcs->page_fault_error_code_mask;
  1518. * vmcs12->page_fault_error_code_match =
  1519. * evmcs->page_fault_error_code_match;
  1520. * vmcs12->cr3_target_count = evmcs->cr3_target_count;
  1521. * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
  1522. * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
  1523. * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
  1524. */
  1525. /*
  1526. * Read only fields:
  1527. * vmcs12->guest_physical_address = evmcs->guest_physical_address;
  1528. * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
  1529. * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
  1530. * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
  1531. * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
  1532. * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
  1533. * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
  1534. * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
  1535. * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
  1536. * vmcs12->exit_qualification = evmcs->exit_qualification;
  1537. * vmcs12->guest_linear_address = evmcs->guest_linear_address;
  1538. *
  1539. * Not present in struct vmcs12:
  1540. * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
  1541. * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
  1542. * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
  1543. * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
  1544. */
  1545. return;
  1546. }
  1547. static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
  1548. {
  1549. struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
  1550. struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
  1551. /*
  1552. * Should not be changed by KVM:
  1553. *
  1554. * evmcs->host_es_selector = vmcs12->host_es_selector;
  1555. * evmcs->host_cs_selector = vmcs12->host_cs_selector;
  1556. * evmcs->host_ss_selector = vmcs12->host_ss_selector;
  1557. * evmcs->host_ds_selector = vmcs12->host_ds_selector;
  1558. * evmcs->host_fs_selector = vmcs12->host_fs_selector;
  1559. * evmcs->host_gs_selector = vmcs12->host_gs_selector;
  1560. * evmcs->host_tr_selector = vmcs12->host_tr_selector;
  1561. * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
  1562. * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
  1563. * evmcs->host_cr0 = vmcs12->host_cr0;
  1564. * evmcs->host_cr3 = vmcs12->host_cr3;
  1565. * evmcs->host_cr4 = vmcs12->host_cr4;
  1566. * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
  1567. * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
  1568. * evmcs->host_rip = vmcs12->host_rip;
  1569. * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
  1570. * evmcs->host_fs_base = vmcs12->host_fs_base;
  1571. * evmcs->host_gs_base = vmcs12->host_gs_base;
  1572. * evmcs->host_tr_base = vmcs12->host_tr_base;
  1573. * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
  1574. * evmcs->host_idtr_base = vmcs12->host_idtr_base;
  1575. * evmcs->host_rsp = vmcs12->host_rsp;
  1576. * sync_vmcs02_to_vmcs12() doesn't read these:
  1577. * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
  1578. * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
  1579. * evmcs->msr_bitmap = vmcs12->msr_bitmap;
  1580. * evmcs->ept_pointer = vmcs12->ept_pointer;
  1581. * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
  1582. * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
  1583. * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
  1584. * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
  1585. * evmcs->tpr_threshold = vmcs12->tpr_threshold;
  1586. * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
  1587. * evmcs->exception_bitmap = vmcs12->exception_bitmap;
  1588. * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
  1589. * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
  1590. * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
  1591. * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
  1592. * evmcs->page_fault_error_code_mask =
  1593. * vmcs12->page_fault_error_code_mask;
  1594. * evmcs->page_fault_error_code_match =
  1595. * vmcs12->page_fault_error_code_match;
  1596. * evmcs->cr3_target_count = vmcs12->cr3_target_count;
  1597. * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
  1598. * evmcs->tsc_offset = vmcs12->tsc_offset;
  1599. * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
  1600. * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
  1601. * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
  1602. * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
  1603. * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
  1604. * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
  1605. * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
  1606. * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
  1607. * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl;
  1608. * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl;
  1609. * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap;
  1610. * evmcs->tsc_multiplier = vmcs12->tsc_multiplier;
  1611. *
  1612. * Not present in struct vmcs12:
  1613. * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
  1614. * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
  1615. * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
  1616. * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
  1617. * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet;
  1618. * evmcs->host_ssp = vmcs12->host_ssp;
  1619. * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr;
  1620. * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet;
  1621. * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl;
  1622. * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr;
  1623. * evmcs->guest_ssp = vmcs12->guest_ssp;
  1624. */
  1625. evmcs->guest_es_selector = vmcs12->guest_es_selector;
  1626. evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
  1627. evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
  1628. evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
  1629. evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
  1630. evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
  1631. evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
  1632. evmcs->guest_tr_selector = vmcs12->guest_tr_selector;
  1633. evmcs->guest_es_limit = vmcs12->guest_es_limit;
  1634. evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
  1635. evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
  1636. evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
  1637. evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
  1638. evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
  1639. evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
  1640. evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
  1641. evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
  1642. evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;
  1643. evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
  1644. evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
  1645. evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
  1646. evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
  1647. evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
  1648. evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
  1649. evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
  1650. evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;
  1651. evmcs->guest_es_base = vmcs12->guest_es_base;
  1652. evmcs->guest_cs_base = vmcs12->guest_cs_base;
  1653. evmcs->guest_ss_base = vmcs12->guest_ss_base;
  1654. evmcs->guest_ds_base = vmcs12->guest_ds_base;
  1655. evmcs->guest_fs_base = vmcs12->guest_fs_base;
  1656. evmcs->guest_gs_base = vmcs12->guest_gs_base;
  1657. evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
  1658. evmcs->guest_tr_base = vmcs12->guest_tr_base;
  1659. evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
  1660. evmcs->guest_idtr_base = vmcs12->guest_idtr_base;
  1661. evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
  1662. evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;
  1663. evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
  1664. evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
  1665. evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
  1666. evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;
  1667. evmcs->guest_pending_dbg_exceptions =
  1668. vmcs12->guest_pending_dbg_exceptions;
  1669. evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
  1670. evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;
  1671. evmcs->guest_activity_state = vmcs12->guest_activity_state;
  1672. evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;
  1673. evmcs->guest_cr0 = vmcs12->guest_cr0;
  1674. evmcs->guest_cr3 = vmcs12->guest_cr3;
  1675. evmcs->guest_cr4 = vmcs12->guest_cr4;
  1676. evmcs->guest_dr7 = vmcs12->guest_dr7;
  1677. evmcs->guest_physical_address = vmcs12->guest_physical_address;
  1678. evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
  1679. evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
  1680. evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
  1681. evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
  1682. evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
  1683. evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
  1684. evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
  1685. evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;
  1686. evmcs->exit_qualification = vmcs12->exit_qualification;
  1687. evmcs->guest_linear_address = vmcs12->guest_linear_address;
  1688. evmcs->guest_rsp = vmcs12->guest_rsp;
  1689. evmcs->guest_rflags = vmcs12->guest_rflags;
  1690. evmcs->guest_interruptibility_info =
  1691. vmcs12->guest_interruptibility_info;
  1692. evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
  1693. evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
  1694. evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
  1695. evmcs->vm_entry_exception_error_code =
  1696. vmcs12->vm_entry_exception_error_code;
  1697. evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
  1698. evmcs->guest_rip = vmcs12->guest_rip;
  1699. evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
  1700. return;
  1701. }
  1702. /*
1703. * This is the equivalent of the nested hypervisor executing the VMPTRLD
  1704. * instruction.
  1705. */
  1706. static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
  1707. struct kvm_vcpu *vcpu, bool from_launch)
  1708. {
  1709. struct vcpu_vmx *vmx = to_vmx(vcpu);
  1710. bool evmcs_gpa_changed = false;
  1711. u64 evmcs_gpa;
  1712. if (likely(!guest_cpuid_has_evmcs(vcpu)))
  1713. return EVMPTRLD_DISABLED;
  1714. if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) {
  1715. nested_release_evmcs(vcpu);
  1716. return EVMPTRLD_DISABLED;
  1717. }
  1718. if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
  1719. vmx->nested.current_vmptr = INVALID_GPA;
  1720. nested_release_evmcs(vcpu);
  1721. if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
  1722. &vmx->nested.hv_evmcs_map))
  1723. return EVMPTRLD_ERROR;
  1724. vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
1725. /*
1726. * Currently, KVM only supports eVMCS version 1
1727. * (== KVM_EVMCS_VERSION) and thus we expect the guest to set this
1728. * value in the first u32 field of the eVMCS, which should specify
1729. * the eVMCS VersionNumber.
1730. *
1731. * The guest should learn the supported eVMCS versions from the host
1732. * by examining CPUID.0x4000000A.EAX[0:15]. The host userspace VMM is
1733. * expected to set this CPUID leaf according to the value
1734. * returned in vmcs_version from nested_enable_evmcs().
1735. *
1736. * However, it turns out that Microsoft Hyper-V fails to comply
1737. * with its own invented interface: when Hyper-V uses eVMCS, it
1738. * simply sets the first u32 field of the eVMCS to the revision_id
1739. * specified in MSR_IA32_VMX_BASIC, instead of the eVMCS version
1740. * number, which is one of the supported versions specified in
1741. * CPUID.0x4000000A.EAX[0:15].
1742. *
1743. * To work around this Hyper-V bug, accept either a supported
1744. * eVMCS version or the VMCS12 revision_id as valid values for the
1745. * first u32 field of the eVMCS.
1746. */
  1747. if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
  1748. (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
  1749. nested_release_evmcs(vcpu);
  1750. return EVMPTRLD_VMFAIL;
  1751. }
  1752. vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
  1753. evmcs_gpa_changed = true;
  1754. /*
1755. * Unlike a normal vmcs12, an enlightened vmcs12 is not fully
1756. * reloaded from the guest's memory (read-only fields, fields not
1757. * present in struct hv_enlightened_vmcs, ...). Make sure there
1758. * are no leftovers.
  1759. */
  1760. if (from_launch) {
  1761. struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
  1762. memset(vmcs12, 0, sizeof(*vmcs12));
  1763. vmcs12->hdr.revision_id = VMCS12_REVISION;
  1764. }
  1765. }
  1766. /*
1767. * Clean fields data can't be used on VMLAUNCH, nor when we switch
  1768. * between different L2 guests as KVM keeps a single VMCS12 per L1.
  1769. */
  1770. if (from_launch || evmcs_gpa_changed) {
  1771. vmx->nested.hv_evmcs->hv_clean_fields &=
  1772. ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
  1773. vmx->nested.force_msr_bitmap_recalc = true;
  1774. }
  1775. return EVMPTRLD_SUCCEEDED;
  1776. }
  1777. void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
  1778. {
  1779. struct vcpu_vmx *vmx = to_vmx(vcpu);
  1780. if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
  1781. copy_vmcs12_to_enlightened(vmx);
  1782. else
  1783. copy_vmcs12_to_shadow(vmx);
  1784. vmx->nested.need_vmcs12_to_shadow_sync = false;
  1785. }
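/*
 * hrtimer callback backing the emulated VMX-preemption timer: record that
 * the timer expired and kick the vCPU so the pending expiration is
 * processed on the next (re)entry.
 */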
  1786. static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
  1787. {
  1788. struct vcpu_vmx *vmx =
  1789. container_of(timer, struct vcpu_vmx, nested.preemption_timer);
  1790. vmx->nested.preemption_timer_expired = true;
  1791. kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
  1792. kvm_vcpu_kick(&vmx->vcpu);
  1793. return HRTIMER_NORESTART;
  1794. }
  1795. static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu)
  1796. {
  1797. struct vcpu_vmx *vmx = to_vmx(vcpu);
  1798. struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
  1799. u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >>
  1800. VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
  1801. if (!vmx->nested.has_preemption_timer_deadline) {
  1802. vmx->nested.preemption_timer_deadline =
  1803. vmcs12->vmx_preemption_timer_value + l1_scaled_tsc;
  1804. vmx->nested.has_preemption_timer_deadline = true;
  1805. }
  1806. return vmx->nested.preemption_timer_deadline - l1_scaled_tsc;
  1807. }
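/*
 * Arm the hrtimer used to emulate the VMX-preemption timer. The timer
 * value is in units of the guest TSC shifted right by the emulated rate
 * (VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE), so it is scaled back up and
 * converted to nanoseconds via the vCPU's virtual TSC frequency.
 */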
  1808. static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu,
  1809. u64 preemption_timeout)
  1810. {
  1811. struct vcpu_vmx *vmx = to_vmx(vcpu);
  1812. /*
  1813. * A timer value of zero is architecturally guaranteed to cause
  1814. * a VMExit prior to executing any instructions in the guest.
  1815. */
  1816. if (preemption_timeout == 0) {
  1817. vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
  1818. return;
  1819. }
  1820. if (vcpu->arch.virtual_tsc_khz == 0)
  1821. return;
  1822. preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
  1823. preemption_timeout *= 1000000;
  1824. do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
  1825. hrtimer_start(&vmx->nested.preemption_timer,
  1826. ktime_add_ns(ktime_get(), preemption_timeout),
  1827. HRTIMER_MODE_ABS_PINNED);
  1828. }
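/*
 * Compute the EFER value L2 will run with: take vmcs12's GUEST_IA32_EFER
 * when an actual VM-Enter is pending and it loads EFER, otherwise derive
 * LMA/LME from the "IA-32e mode guest" entry control and keep the
 * remaining bits from the current EFER.
 */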
  1829. static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
  1830. {
  1831. if (vmx->nested.nested_run_pending &&
  1832. (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
  1833. return vmcs12->guest_ia32_efer;
  1834. else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
  1835. return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
  1836. else
  1837. return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
  1838. }
  1839. static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
  1840. {
  1841. struct kvm *kvm = vmx->vcpu.kvm;
  1842. /*
  1843. * If vmcs02 hasn't been initialized, set the constant vmcs02 state
  1844. * according to L0's settings (vmcs12 is irrelevant here). Host
  1845. * fields that come from L0 and are not constant, e.g. HOST_CR3,
  1846. * will be set as needed prior to VMLAUNCH/VMRESUME.
  1847. */
  1848. if (vmx->nested.vmcs02_initialized)
  1849. return;
  1850. vmx->nested.vmcs02_initialized = true;
1851. /*
1852. * We don't care what the EPTP value is; we just need to guarantee
1853. * it's valid so we don't get a false positive when doing early
1854. * consistency checks.
1855. */
  1856. if (enable_ept && nested_early_check)
  1857. vmcs_write64(EPT_POINTER,
  1858. construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL));
  1859. /* All VMFUNCs are currently emulated through L0 vmexits. */
  1860. if (cpu_has_vmx_vmfunc())
  1861. vmcs_write64(VM_FUNCTION_CONTROL, 0);
  1862. if (cpu_has_vmx_posted_intr())
  1863. vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
  1864. if (cpu_has_vmx_msr_bitmap())
  1865. vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
  1866. /*
  1867. * PML is emulated for L2, but never enabled in hardware as the MMU
  1868. * handles A/D emulation. Disabling PML for L2 also avoids having to
  1869. * deal with filtering out L2 GPAs from the buffer.
  1870. */
  1871. if (enable_pml) {
  1872. vmcs_write64(PML_ADDRESS, 0);
  1873. vmcs_write16(GUEST_PML_INDEX, -1);
  1874. }
  1875. if (cpu_has_vmx_encls_vmexit())
  1876. vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA);
  1877. if (kvm_notify_vmexit_enabled(kvm))
  1878. vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);
  1879. /*
  1880. * Set the MSR load/store lists to match L0's settings. Only the
1881. * addresses are constant (for vmcs02); the counts can change based
  1882. * on L2's behavior, e.g. switching to/from long mode.
  1883. */
  1884. vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val));
  1885. vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
  1886. vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
  1887. vmx_set_constant_host_state(vmx);
  1888. }
  1889. static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
  1890. struct vmcs12 *vmcs12)
  1891. {
  1892. prepare_vmcs02_constant_state(vmx);
  1893. vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
  1894. if (enable_vpid) {
  1895. if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
  1896. vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
  1897. else
  1898. vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
  1899. }
  1900. }
  1901. static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01,
  1902. struct vmcs12 *vmcs12)
  1903. {
  1904. u32 exec_control;
  1905. u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
  1906. if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
  1907. prepare_vmcs02_early_rare(vmx, vmcs12);
  1908. /*
  1909. * PIN CONTROLS
  1910. */
  1911. exec_control = __pin_controls_get(vmcs01);
  1912. exec_control |= (vmcs12->pin_based_vm_exec_control &
  1913. ~PIN_BASED_VMX_PREEMPTION_TIMER);
  1914. /* Posted interrupts setting is only taken from vmcs12. */
  1915. vmx->nested.pi_pending = false;
  1916. if (nested_cpu_has_posted_intr(vmcs12))
  1917. vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
  1918. else
  1919. exec_control &= ~PIN_BASED_POSTED_INTR;
  1920. pin_controls_set(vmx, exec_control);
  1921. /*
  1922. * EXEC CONTROLS
  1923. */
  1924. exec_control = __exec_controls_get(vmcs01); /* L0's desires */
  1925. exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
  1926. exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
  1927. exec_control &= ~CPU_BASED_TPR_SHADOW;
  1928. exec_control |= vmcs12->cpu_based_vm_exec_control;
  1929. vmx->nested.l1_tpr_threshold = -1;
  1930. if (exec_control & CPU_BASED_TPR_SHADOW)
  1931. vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
  1932. #ifdef CONFIG_X86_64
  1933. else
  1934. exec_control |= CPU_BASED_CR8_LOAD_EXITING |
  1935. CPU_BASED_CR8_STORE_EXITING;
  1936. #endif
  1937. /*
1938. * A vmexit (to either the L1 hypervisor or L0 userspace) is always needed
  1939. * for I/O port accesses.
  1940. */
  1941. exec_control |= CPU_BASED_UNCOND_IO_EXITING;
  1942. exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
  1943. /*
  1944. * This bit will be computed in nested_get_vmcs12_pages, because
  1945. * we do not have access to L1's MSR bitmap yet. For now, keep
  1946. * the same bit as before, hoping to avoid multiple VMWRITEs that
  1947. * only set/clear this bit.
  1948. */
  1949. exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
  1950. exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;
  1951. exec_controls_set(vmx, exec_control);
  1952. /*
  1953. * SECONDARY EXEC CONTROLS
  1954. */
  1955. if (cpu_has_secondary_exec_ctrls()) {
  1956. exec_control = __secondary_exec_controls_get(vmcs01);
  1957. /* Take the following fields only from vmcs12 */
  1958. exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
  1959. SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
  1960. SECONDARY_EXEC_ENABLE_INVPCID |
  1961. SECONDARY_EXEC_ENABLE_RDTSCP |
  1962. SECONDARY_EXEC_XSAVES |
  1963. SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
  1964. SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
  1965. SECONDARY_EXEC_APIC_REGISTER_VIRT |
  1966. SECONDARY_EXEC_ENABLE_VMFUNC |
  1967. SECONDARY_EXEC_DESC);
  1968. if (nested_cpu_has(vmcs12,
  1969. CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
  1970. exec_control |= vmcs12->secondary_vm_exec_control;
  1971. /* PML is emulated and never enabled in hardware for L2. */
  1972. exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
  1973. /* VMCS shadowing for L2 is emulated for now */
  1974. exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
  1975. /*
  1976. * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
  1977. * will not have to rewrite the controls just for this bit.
  1978. */
  1979. if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() &&
  1980. (vmcs12->guest_cr4 & X86_CR4_UMIP))
  1981. exec_control |= SECONDARY_EXEC_DESC;
  1982. if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
  1983. vmcs_write16(GUEST_INTR_STATUS,
  1984. vmcs12->guest_intr_status);
  1985. if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
  1986. exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
  1987. if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
  1988. vmx_write_encls_bitmap(&vmx->vcpu, vmcs12);
  1989. secondary_exec_controls_set(vmx, exec_control);
  1990. }
  1991. /*
  1992. * ENTRY CONTROLS
  1993. *
  1994. * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
  1995. * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
  1996. * on the related bits (if supported by the CPU) in the hope that
  1997. * we can avoid VMWrites during vmx_set_efer().
  1998. *
  1999. * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is
  2000. * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to
  2001. * do the same for L2.
  2002. */
  2003. exec_control = __vm_entry_controls_get(vmcs01);
  2004. exec_control |= (vmcs12->vm_entry_controls &
  2005. ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);
  2006. exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER);
  2007. if (cpu_has_load_ia32_efer()) {
  2008. if (guest_efer & EFER_LMA)
  2009. exec_control |= VM_ENTRY_IA32E_MODE;
  2010. if (guest_efer != host_efer)
  2011. exec_control |= VM_ENTRY_LOAD_IA32_EFER;
  2012. }
  2013. vm_entry_controls_set(vmx, exec_control);
  2014. /*
  2015. * EXIT CONTROLS
  2016. *
  2017. * L2->L1 exit controls are emulated - the hardware exit is to L0 so
  2018. * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
  2019. * bits may be modified by vmx_set_efer() in prepare_vmcs02().
  2020. */
  2021. exec_control = __vm_exit_controls_get(vmcs01);
  2022. if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
  2023. exec_control |= VM_EXIT_LOAD_IA32_EFER;
  2024. else
  2025. exec_control &= ~VM_EXIT_LOAD_IA32_EFER;
  2026. vm_exit_controls_set(vmx, exec_control);
  2027. /*
  2028. * Interrupt/Exception Fields
  2029. */
  2030. if (vmx->nested.nested_run_pending) {
  2031. vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
  2032. vmcs12->vm_entry_intr_info_field);
  2033. vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
  2034. vmcs12->vm_entry_exception_error_code);
  2035. vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
  2036. vmcs12->vm_entry_instruction_len);
  2037. vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
  2038. vmcs12->guest_interruptibility_info);
  2039. vmx->loaded_vmcs->nmi_known_unmasked =
  2040. !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
  2041. } else {
  2042. vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
  2043. }
  2044. }
  2045. static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
  2046. {
  2047. struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
  2048. if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
  2049. HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
  2050. vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
  2051. vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
  2052. vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
  2053. vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
  2054. vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
  2055. vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
  2056. vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
  2057. vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
  2058. vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
  2059. vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
  2060. vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
  2061. vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
  2062. vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
  2063. vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
  2064. vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
  2065. vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
  2066. vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
  2067. vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
  2068. vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
  2069. vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
  2070. vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
  2071. vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
  2072. vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
  2073. vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
  2074. vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
  2075. vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
  2076. vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
  2077. vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
  2078. vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
  2079. vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
  2080. vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
  2081. vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
  2082. vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
  2083. vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
  2084. vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
  2085. vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
  2086. vmx->segment_cache.bitmask = 0;
  2087. }
  2088. if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
  2089. HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
  2090. vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
  2091. vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
  2092. vmcs12->guest_pending_dbg_exceptions);
  2093. vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
  2094. vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
		/*
		 * L1 may access L2's PDPTRs, so save them in order to
		 * construct vmcs12 on nested VM-Exit.
		 */
  2099. if (enable_ept) {
  2100. vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
  2101. vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
  2102. vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
  2103. vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
  2104. }
  2105. if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
  2106. (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
  2107. vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
  2108. }
  2109. if (nested_cpu_has_xsaves(vmcs12))
  2110. vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
	/*
	 * Whether page faults are trapped is determined by a combination of
	 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0
	 * doesn't care about page faults then we should set all of these to
	 * L1's desires. However, if L0 does care about (some) page faults, it
	 * is not easy (if at all possible?) to merge L0's and L1's desires,
	 * so we simply ask to exit on each and every L2 page fault. This is
	 * done by setting MASK=MATCH=0 and (see below) EB.PF=1.
	 * Note that below we don't need special code to set EB.PF beyond the
	 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept is
	 * set, vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and
	 * when !enable_ept, EB.PF is 1, so the "or" will always be 1.
	 */
  2124. if (vmx_need_pf_intercept(&vmx->vcpu)) {
  2125. /*
  2126. * TODO: if both L0 and L1 need the same MASK and MATCH,
  2127. * go ahead and use it?
  2128. */
  2129. vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
  2130. vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
  2131. } else {
  2132. vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
  2133. vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
  2134. }
  2135. if (cpu_has_vmx_apicv()) {
  2136. vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
  2137. vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
  2138. vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
  2139. vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
  2140. }
  2141. /*
  2142. * Make sure the msr_autostore list is up to date before we set the
  2143. * count in the vmcs02.
  2144. */
  2145. prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);
  2146. vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
  2147. vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
  2148. vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
  2149. set_cr4_guest_host_mask(vmx);
  2150. }
/*
 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
 * with L0's requirements for its guest (a.k.a. vmcs01), so that L2 can be
 * run in a way that satisfies both L1's requests and L0's needs. In
 * addition to modifying the active vmcs (which is vmcs02), this function
 * also has necessary side effects, such as setting various vcpu->arch
 * fields.
 *
 * Returns 0 on success and -EINVAL on failure; on failure, the VM-entry
 * failure code is assigned to *entry_failure_code.
 */
static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
			  bool from_vmentry,
			  enum vm_entry_failure_code *entry_failure_code)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool load_guest_pdptrs_vmcs12 = false;

  2168. if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
  2169. prepare_vmcs02_rare(vmx, vmcs12);
  2170. vmx->nested.dirty_vmcs12 = false;
  2171. load_guest_pdptrs_vmcs12 = !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) ||
  2172. !(vmx->nested.hv_evmcs->hv_clean_fields &
  2173. HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
  2174. }
  2175. if (vmx->nested.nested_run_pending &&
  2176. (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
  2177. kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
  2178. vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
  2179. } else {
  2180. kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
  2181. vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
  2182. }
  2183. if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
  2184. !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
  2185. vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs);
  2186. vmx_set_rflags(vcpu, vmcs12->guest_rflags);
  2187. /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
  2188. * bitwise-or of what L1 wants to trap for L2, and what we want to
  2189. * trap. Note that CR0.TS also needs updating - we do this later.
  2190. */
  2191. vmx_update_exception_bitmap(vcpu);
  2192. vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
  2193. vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
  2194. if (vmx->nested.nested_run_pending &&
  2195. (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
  2196. vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
  2197. vcpu->arch.pat = vmcs12->guest_ia32_pat;
  2198. } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
  2199. vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
  2200. }
  2201. vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
  2202. vcpu->arch.l1_tsc_offset,
  2203. vmx_get_l2_tsc_offset(vcpu),
  2204. vmx_get_l2_tsc_multiplier(vcpu));
  2205. vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
  2206. vcpu->arch.l1_tsc_scaling_ratio,
  2207. vmx_get_l2_tsc_multiplier(vcpu));
  2208. vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
  2209. if (kvm_caps.has_tsc_control)
  2210. vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
  2211. nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);
  2212. if (nested_cpu_has_ept(vmcs12))
  2213. nested_ept_init_mmu_context(vcpu);
  2214. /*
  2215. * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
  2216. * bits which we consider mandatory enabled.
  2217. * The CR0_READ_SHADOW is what L2 should have expected to read given
  2218. * the specifications by L1; It's not enough to take
  2219. * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we
  2220. * have more bits than L1 expected.
  2221. */
  2222. vmx_set_cr0(vcpu, vmcs12->guest_cr0);
  2223. vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
  2224. vmx_set_cr4(vcpu, vmcs12->guest_cr4);
  2225. vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
  2226. vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
  2227. /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
  2228. vmx_set_efer(vcpu, vcpu->arch.efer);
	/*
	 * Guest state is invalid and unrestricted guest is disabled,
	 * which means L1 attempted VMEntry to L2 with invalid state.
	 * Fail the VMEntry.
	 *
	 * However, when force loading the guest state (SMM exit or
	 * loading nested state after migration), it is possible to
	 * have invalid guest state now, which will be fixed later by
	 * restoring the L2 register state.
	 */
	if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	/* Load vmcs12's CR3, using either nested EPT or shadow page tables. */
	if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
				from_vmentry, entry_failure_code))
		return -EINVAL;

	/*
	 * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12
	 * on nested VM-Exit, which can occur without actually running L2 and
	 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
	 * vmcs12.GUEST_ACTIVITY_STATE=HLT, in which case KVM will intercept the
	 * transition to HLT instead of running L2.
	 */
	if (enable_ept)
		vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);
  2256. /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
  2257. if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
  2258. is_pae_paging(vcpu)) {
  2259. vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
  2260. vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
  2261. vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
  2262. vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
  2263. }
  2264. if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
  2265. intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) &&
  2266. WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
  2267. vmcs12->guest_ia32_perf_global_ctrl))) {
  2268. *entry_failure_code = ENTRY_FAIL_DEFAULT;
  2269. return -EINVAL;
  2270. }
  2271. kvm_rsp_write(vcpu, vmcs12->guest_rsp);
  2272. kvm_rip_write(vcpu, vmcs12->guest_rip);
	/*
	 * It was observed that genuine Hyper-V running in L1 doesn't reset
	 * 'hv_clean_fields' by itself; it only sets the corresponding dirty
	 * bits when it changes a field in the eVMCS. Mark all fields as
	 * clean here.
	 */
	if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
		vmx->nested.hv_evmcs->hv_clean_fields |=
			HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;

	return 0;
}
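
/*
 * Consistency checks on vmcs12's NMI controls: "virtual NMIs" requires
 * "NMI exiting", and "NMI-window exiting" requires "virtual NMIs", per the
 * SDM's checks on the VM-execution control fields.
 */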
static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
{
	if (CC(!nested_cpu_has_nmi_exiting(vmcs12) &&
	       nested_cpu_has_virtual_nmis(vmcs12)))
		return -EINVAL;

	if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
	       nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING)))
		return -EINVAL;

	return 0;
}
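
/*
 * Validate the EPT pointer L1 wants to use for L2: the memory type,
 * page-walk length, reserved bits, and (if enabled) accessed/dirty flag
 * support must all be consistent with the EPT capabilities exposed to L1.
 */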
  2294. static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
  2295. {
  2296. struct vcpu_vmx *vmx = to_vmx(vcpu);
  2297. /* Check for memory type validity */
  2298. switch (new_eptp & VMX_EPTP_MT_MASK) {
  2299. case VMX_EPTP_MT_UC:
  2300. if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
  2301. return false;
  2302. break;
  2303. case VMX_EPTP_MT_WB:
  2304. if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)))
  2305. return false;
  2306. break;
  2307. default:
  2308. return false;
  2309. }
  2310. /* Page-walk levels validity. */
  2311. switch (new_eptp & VMX_EPTP_PWL_MASK) {
  2312. case VMX_EPTP_PWL_5:
  2313. if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT)))
  2314. return false;
  2315. break;
  2316. case VMX_EPTP_PWL_4:
  2317. if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT)))
  2318. return false;
  2319. break;
  2320. default:
  2321. return false;
  2322. }
  2323. /* Reserved bits should not be set */
  2324. if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f)))
  2325. return false;
  2326. /* AD, if set, should be supported */
  2327. if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) {
  2328. if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)))
  2329. return false;
  2330. }
  2331. return true;
  2332. }
  2333. /*
  2334. * Checks related to VM-Execution Control Fields
  2335. */
  2336. static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
  2337. struct vmcs12 *vmcs12)
  2338. {
  2339. struct vcpu_vmx *vmx = to_vmx(vcpu);
  2340. if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
  2341. vmx->nested.msrs.pinbased_ctls_low,
  2342. vmx->nested.msrs.pinbased_ctls_high)) ||
  2343. CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
  2344. vmx->nested.msrs.procbased_ctls_low,
  2345. vmx->nested.msrs.procbased_ctls_high)))
  2346. return -EINVAL;
  2347. if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
  2348. CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control,
  2349. vmx->nested.msrs.secondary_ctls_low,
  2350. vmx->nested.msrs.secondary_ctls_high)))
  2351. return -EINVAL;
  2352. if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) ||
  2353. nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) ||
  2354. nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) ||
  2355. nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) ||
  2356. nested_vmx_check_apic_access_controls(vcpu, vmcs12) ||
  2357. nested_vmx_check_apicv_controls(vcpu, vmcs12) ||
  2358. nested_vmx_check_nmi_controls(vmcs12) ||
  2359. nested_vmx_check_pml_controls(vcpu, vmcs12) ||
  2360. nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) ||
  2361. nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) ||
  2362. nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) ||
  2363. CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id))
  2364. return -EINVAL;
  2365. if (!nested_cpu_has_preemption_timer(vmcs12) &&
  2366. nested_cpu_has_save_preemption_timer(vmcs12))
  2367. return -EINVAL;
  2368. if (nested_cpu_has_ept(vmcs12) &&
  2369. CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer)))
  2370. return -EINVAL;
  2371. if (nested_cpu_has_vmfunc(vmcs12)) {
  2372. if (CC(vmcs12->vm_function_control &
  2373. ~vmx->nested.msrs.vmfunc_controls))
  2374. return -EINVAL;
  2375. if (nested_cpu_has_eptp_switching(vmcs12)) {
  2376. if (CC(!nested_cpu_has_ept(vmcs12)) ||
  2377. CC(!page_address_valid(vcpu, vmcs12->eptp_list_address)))
  2378. return -EINVAL;
  2379. }
  2380. }
  2381. return 0;
  2382. }
  2383. /*
  2384. * Checks related to VM-Exit Control Fields
  2385. */
  2386. static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu,
  2387. struct vmcs12 *vmcs12)
  2388. {
  2389. struct vcpu_vmx *vmx = to_vmx(vcpu);
  2390. if (CC(!vmx_control_verify(vmcs12->vm_exit_controls,
  2391. vmx->nested.msrs.exit_ctls_low,
  2392. vmx->nested.msrs.exit_ctls_high)) ||
  2393. CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12)))
  2394. return -EINVAL;
  2395. return 0;
  2396. }
  2397. /*
  2398. * Checks related to VM-Entry Control Fields
  2399. */
  2400. static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
  2401. struct vmcs12 *vmcs12)
  2402. {
  2403. struct vcpu_vmx *vmx = to_vmx(vcpu);
  2404. if (CC(!vmx_control_verify(vmcs12->vm_entry_controls,
  2405. vmx->nested.msrs.entry_ctls_low,
  2406. vmx->nested.msrs.entry_ctls_high)))
  2407. return -EINVAL;
  2408. /*
  2409. * From the Intel SDM, volume 3:
  2410. * Fields relevant to VM-entry event injection must be set properly.
  2411. * These fields are the VM-entry interruption-information field, the
  2412. * VM-entry exception error code, and the VM-entry instruction length.
  2413. */
  2414. if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
  2415. u32 intr_info = vmcs12->vm_entry_intr_info_field;
  2416. u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
  2417. u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
  2418. bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
  2419. bool should_have_error_code;
  2420. bool urg = nested_cpu_has2(vmcs12,
  2421. SECONDARY_EXEC_UNRESTRICTED_GUEST);
  2422. bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
  2423. /* VM-entry interruption-info field: interruption type */
  2424. if (CC(intr_type == INTR_TYPE_RESERVED) ||
  2425. CC(intr_type == INTR_TYPE_OTHER_EVENT &&
  2426. !nested_cpu_supports_monitor_trap_flag(vcpu)))
  2427. return -EINVAL;
  2428. /* VM-entry interruption-info field: vector */
  2429. if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
  2430. CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
  2431. CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
  2432. return -EINVAL;
  2433. /* VM-entry interruption-info field: deliver error code */
  2434. should_have_error_code =
  2435. intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode &&
  2436. x86_exception_has_error_code(vector);
  2437. if (CC(has_error_code != should_have_error_code))
  2438. return -EINVAL;
  2439. /* VM-entry exception error code */
  2440. if (CC(has_error_code &&
  2441. vmcs12->vm_entry_exception_error_code & GENMASK(31, 16)))
  2442. return -EINVAL;
  2443. /* VM-entry interruption-info field: reserved bits */
  2444. if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK))
  2445. return -EINVAL;
  2446. /* VM-entry instruction length */
  2447. switch (intr_type) {
  2448. case INTR_TYPE_SOFT_EXCEPTION:
  2449. case INTR_TYPE_SOFT_INTR:
  2450. case INTR_TYPE_PRIV_SW_EXCEPTION:
  2451. if (CC(vmcs12->vm_entry_instruction_len > 15) ||
  2452. CC(vmcs12->vm_entry_instruction_len == 0 &&
  2453. CC(!nested_cpu_has_zero_length_injection(vcpu))))
  2454. return -EINVAL;
  2455. }
  2456. }
  2457. if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12))
  2458. return -EINVAL;
  2459. return 0;
  2460. }
  2461. static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
  2462. struct vmcs12 *vmcs12)
  2463. {
  2464. if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
  2465. nested_check_vm_exit_controls(vcpu, vmcs12) ||
  2466. nested_check_vm_entry_controls(vcpu, vmcs12))
  2467. return -EINVAL;
  2468. if (guest_cpuid_has_evmcs(vcpu))
  2469. return nested_evmcs_check_controls(vmcs12);
  2470. return 0;
  2471. }
  2472. static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
  2473. struct vmcs12 *vmcs12)
  2474. {
  2475. #ifdef CONFIG_X86_64
  2476. if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) !=
  2477. !!(vcpu->arch.efer & EFER_LMA)))
  2478. return -EINVAL;
  2479. #endif
  2480. return 0;
  2481. }
  2482. static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
  2483. struct vmcs12 *vmcs12)
  2484. {
  2485. bool ia32e;
  2486. if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) ||
  2487. CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) ||
  2488. CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3)))
  2489. return -EINVAL;
  2490. if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
  2491. CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
  2492. return -EINVAL;
  2493. if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
  2494. CC(!kvm_pat_valid(vmcs12->host_ia32_pat)))
  2495. return -EINVAL;
  2496. if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
  2497. CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
  2498. vmcs12->host_ia32_perf_global_ctrl)))
  2499. return -EINVAL;
  2500. #ifdef CONFIG_X86_64
  2501. ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE);
  2502. #else
  2503. ia32e = false;
  2504. #endif
  2505. if (ia32e) {
  2506. if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
  2507. return -EINVAL;
  2508. } else {
  2509. if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
  2510. CC(vmcs12->host_cr4 & X86_CR4_PCIDE) ||
  2511. CC((vmcs12->host_rip) >> 32))
  2512. return -EINVAL;
  2513. }
  2514. if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
  2515. CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
  2516. CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
  2517. CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
  2518. CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
  2519. CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
  2520. CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
  2521. CC(vmcs12->host_cs_selector == 0) ||
  2522. CC(vmcs12->host_tr_selector == 0) ||
  2523. CC(vmcs12->host_ss_selector == 0 && !ia32e))
  2524. return -EINVAL;
  2525. if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) ||
  2526. CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) ||
  2527. CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) ||
  2528. CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) ||
  2529. CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) ||
  2530. CC(is_noncanonical_address(vmcs12->host_rip, vcpu)))
  2531. return -EINVAL;
  2532. /*
  2533. * If the load IA32_EFER VM-exit control is 1, bits reserved in the
  2534. * IA32_EFER MSR must be 0 in the field for that register. In addition,
  2535. * the values of the LMA and LME bits in the field must each be that of
  2536. * the host address-space size VM-exit control.
  2537. */
  2538. if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
  2539. if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) ||
  2540. CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) ||
  2541. CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)))
  2542. return -EINVAL;
  2543. }
  2544. return 0;
  2545. }
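
/*
 * Validate vmcs12's VMCS link pointer (the shadow VMCS, if any): the GPA
 * must be page-aligned and addressable, and the referenced VMCS must have
 * the expected revision ID and a shadow-VMCS indicator that matches the
 * "VMCS shadowing" control in vmcs12.
 */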
  2546. static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
  2547. struct vmcs12 *vmcs12)
  2548. {
  2549. struct vcpu_vmx *vmx = to_vmx(vcpu);
  2550. struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
  2551. struct vmcs_hdr hdr;
  2552. if (vmcs12->vmcs_link_pointer == INVALID_GPA)
  2553. return 0;
  2554. if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)))
  2555. return -EINVAL;
  2556. if (ghc->gpa != vmcs12->vmcs_link_pointer &&
  2557. CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
  2558. vmcs12->vmcs_link_pointer, VMCS12_SIZE)))
  2559. return -EINVAL;
  2560. if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr,
  2561. offsetof(struct vmcs12, hdr),
  2562. sizeof(hdr))))
  2563. return -EINVAL;
  2564. if (CC(hdr.revision_id != VMCS12_REVISION) ||
  2565. CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)))
  2566. return -EINVAL;
  2567. return 0;
  2568. }
/*
 * Checks related to Guest Non-register State
 */
static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
{
	if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
	       vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT &&
	       vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI))
		return -EINVAL;

	return 0;
}
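
/*
 * Guest-state checks that KVM performs on behalf of hardware before a
 * nested VM-Entry: CR0/CR4/DR7/PAT/EFER/BNDCFGS validity, PERF_GLOBAL_CTRL,
 * the VMCS link pointer and the activity state. On failure, the reason is
 * reported via *entry_failure_code.
 */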
  2580. static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
  2581. struct vmcs12 *vmcs12,
  2582. enum vm_entry_failure_code *entry_failure_code)
  2583. {
  2584. bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE);
  2585. *entry_failure_code = ENTRY_FAIL_DEFAULT;
  2586. if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) ||
  2587. CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)))
  2588. return -EINVAL;
  2589. if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
  2590. CC(!kvm_dr7_valid(vmcs12->guest_dr7)))
  2591. return -EINVAL;
  2592. if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
  2593. CC(!kvm_pat_valid(vmcs12->guest_ia32_pat)))
  2594. return -EINVAL;
  2595. if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
  2596. *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR;
  2597. return -EINVAL;
  2598. }
  2599. if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
  2600. CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
  2601. vmcs12->guest_ia32_perf_global_ctrl)))
  2602. return -EINVAL;
  2603. if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG))
  2604. return -EINVAL;
  2605. if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) ||
  2606. CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG)))
  2607. return -EINVAL;
  2608. /*
  2609. * If the load IA32_EFER VM-entry control is 1, the following checks
  2610. * are performed on the field for the IA32_EFER MSR:
  2611. * - Bits reserved in the IA32_EFER MSR must be 0.
  2612. * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
  2613. * the IA-32e mode guest VM-exit control. It must also be identical
  2614. * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
  2615. * CR0.PG) is 1.
  2616. */
  2617. if (to_vmx(vcpu)->nested.nested_run_pending &&
  2618. (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
  2619. if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
  2620. CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
  2621. CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
  2622. ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))))
  2623. return -EINVAL;
  2624. }
  2625. if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
  2626. (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
  2627. CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
  2628. return -EINVAL;
  2629. if (nested_check_guest_non_reg_state(vmcs12))
  2630. return -EINVAL;
  2631. return 0;
  2632. }
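
/*
 * When nested early checks are enabled, perform a VM-Entry to L2 that is
 * architecturally guaranteed to fail (GUEST_RFLAGS bit 1 is cleared) so
 * that hardware itself vets the control and host-state fields of vmcs02
 * before KVM commits to the nested VM-Entry.
 */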
  2633. static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
  2634. {
  2635. struct vcpu_vmx *vmx = to_vmx(vcpu);
  2636. unsigned long cr3, cr4;
  2637. bool vm_fail;
  2638. if (!nested_early_check)
  2639. return 0;
  2640. if (vmx->msr_autoload.host.nr)
  2641. vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
  2642. if (vmx->msr_autoload.guest.nr)
  2643. vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
  2644. preempt_disable();
  2645. vmx_prepare_switch_to_guest(vcpu);
  2646. /*
  2647. * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
  2648. * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to
  2649. * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
  2650. * there is no need to preserve other bits or save/restore the field.
  2651. */
  2652. vmcs_writel(GUEST_RFLAGS, 0);
  2653. cr3 = __get_current_cr3_fast();
  2654. if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
  2655. vmcs_writel(HOST_CR3, cr3);
  2656. vmx->loaded_vmcs->host_state.cr3 = cr3;
  2657. }
  2658. cr4 = cr4_read_shadow();
  2659. if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
  2660. vmcs_writel(HOST_CR4, cr4);
  2661. vmx->loaded_vmcs->host_state.cr4 = cr4;
  2662. }
  2663. vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
  2664. __vmx_vcpu_run_flags(vmx));
  2665. if (vmx->msr_autoload.host.nr)
  2666. vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
  2667. if (vmx->msr_autoload.guest.nr)
  2668. vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
  2669. if (vm_fail) {
  2670. u32 error = vmcs_read32(VM_INSTRUCTION_ERROR);
  2671. preempt_enable();
  2672. trace_kvm_nested_vmenter_failed(
  2673. "early hardware check VM-instruction error: ", error);
  2674. WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD);
  2675. return 1;
  2676. }
  2677. /*
  2678. * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
  2679. */
  2680. if (hw_breakpoint_active())
  2681. set_debugreg(__this_cpu_read(cpu_dr7), 7);
  2682. local_irq_enable();
  2683. preempt_enable();
  2684. /*
  2685. * A non-failing VMEntry means we somehow entered guest mode with
  2686. * an illegal RIP, and that's just the tip of the iceberg. There
  2687. * is no telling what memory has been modified or what state has
  2688. * been exposed to unknown code. Hitting this all but guarantees
  2689. * a (very critical) hardware issue.
  2690. */
  2691. WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
  2692. VMX_EXIT_REASONS_FAILED_VMENTRY));
  2693. return 0;
  2694. }
static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * hv_evmcs may not be mapped after migration (when L2 was running);
	 * map it here to make sure vmcs12 changes are properly reflected.
	 */
	if (guest_cpuid_has_evmcs(vcpu) &&
	    vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) {
		enum nested_evmptrld_status evmptrld_status =
			nested_vmx_handle_enlightened_vmptrld(vcpu, false);

		if (evmptrld_status == EVMPTRLD_VMFAIL ||
		    evmptrld_status == EVMPTRLD_ERROR)
			return false;

		/*
		 * Post-migration, vmcs12 always provides the most up-to-date
		 * information; copy it to the eVMCS upon entry.
		 */
		vmx->nested.need_vmcs12_to_shadow_sync = true;
	}

	return true;
}
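
/*
 * Map the guest pages that vmcs12 references and that KVM must access while
 * running L2: the APIC-access page, the virtual-APIC page and the posted
 * interrupt descriptor. Also reloads the PDPTRs when needed and decides
 * whether the MSR bitmap can be used for L2.
 */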
static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_host_map *map;

	if (!vcpu->arch.pdptrs_from_userspace &&
	    !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
		/*
		 * Reload the guest's PDPTRs since after a migration the guest
		 * CR3 might be restored prior to setting the nested state,
		 * which can lead to loading the wrong PDPTRs.
		 */
		if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
			return false;
	}
  2733. if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
  2734. map = &vmx->nested.apic_access_page_map;
  2735. if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) {
  2736. vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn));
  2737. } else {
  2738. pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n",
  2739. __func__);
  2740. vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
  2741. vcpu->run->internal.suberror =
  2742. KVM_INTERNAL_ERROR_EMULATION;
  2743. vcpu->run->internal.ndata = 0;
  2744. return false;
  2745. }
  2746. }
  2747. if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
  2748. map = &vmx->nested.virtual_apic_map;
  2749. if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
  2750. vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
  2751. } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
  2752. nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
  2753. !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
  2754. /*
  2755. * The processor will never use the TPR shadow, simply
  2756. * clear the bit from the execution control. Such a
  2757. * configuration is useless, but it happens in tests.
  2758. * For any other configuration, failing the vm entry is
  2759. * _not_ what the processor does but it's basically the
  2760. * only possibility we have.
  2761. */
  2762. exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
  2763. } else {
  2764. /*
  2765. * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
  2766. * force VM-Entry to fail.
  2767. */
  2768. vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA);
  2769. }
  2770. }
  2771. if (nested_cpu_has_posted_intr(vmcs12)) {
  2772. map = &vmx->nested.pi_desc_map;
  2773. if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
  2774. vmx->nested.pi_desc =
  2775. (struct pi_desc *)(((void *)map->hva) +
  2776. offset_in_page(vmcs12->posted_intr_desc_addr));
  2777. vmcs_write64(POSTED_INTR_DESC_ADDR,
  2778. pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
  2779. } else {
  2780. /*
  2781. * Defer the KVM_INTERNAL_EXIT until KVM tries to
  2782. * access the contents of the VMCS12 posted interrupt
  2783. * descriptor. (Note that KVM may do this when it
  2784. * should not, per the architectural specification.)
  2785. */
  2786. vmx->nested.pi_desc = NULL;
  2787. pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR);
  2788. }
  2789. }
  2790. if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
  2791. exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
  2792. else
  2793. exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
  2794. return true;
  2795. }
  2796. static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
  2797. {
  2798. if (!nested_get_evmcs_page(vcpu)) {
  2799. pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
  2800. __func__);
  2801. vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
  2802. vcpu->run->internal.suberror =
  2803. KVM_INTERNAL_ERROR_EMULATION;
  2804. vcpu->run->internal.ndata = 0;
  2805. return false;
  2806. }
  2807. if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
  2808. return false;
  2809. return true;
  2810. }
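
/*
 * Emulate PML for the nested guest: when L1 has enabled PML for L2, log the
 * dirty GPA into L1's PML buffer and decrement L1's PML index, flagging a
 * PML-full condition for L1 once its buffer is exhausted.
 */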
  2811. static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
  2812. {
  2813. struct vmcs12 *vmcs12;
  2814. struct vcpu_vmx *vmx = to_vmx(vcpu);
  2815. gpa_t dst;
  2816. if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
  2817. return 0;
  2818. if (WARN_ON_ONCE(vmx->nested.pml_full))
  2819. return 1;
  2820. /*
  2821. * Check if PML is enabled for the nested guest. Whether eptp bit 6 is
  2822. * set is already checked as part of A/D emulation.
  2823. */
  2824. vmcs12 = get_vmcs12(vcpu);
  2825. if (!nested_cpu_has_pml(vmcs12))
  2826. return 0;
  2827. if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
  2828. vmx->nested.pml_full = true;
  2829. return 1;
  2830. }
  2831. gpa &= ~0xFFFull;
  2832. dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
  2833. if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
  2834. offset_in_page(dst), sizeof(gpa)))
  2835. return 0;
  2836. vmcs12->guest_pml_index--;
  2837. return 0;
  2838. }
/*
 * Intel's VMX Instruction Reference specifies a common set of prerequisites
 * for running VMX instructions (except VMXON, whose prerequisites are
 * slightly different). It also specifies what exception to inject otherwise.
 * Note that many of these exceptions have priority over VM exits, so they
 * don't have to be checked again here.
 */
static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
{
	if (!to_vmx(vcpu)->nested.vmxon) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 0;
	}

	if (vmx_get_cpl(vcpu)) {
		kvm_inject_gp(vcpu, 0);
		return 0;
	}

	return 1;
}
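
/*
 * Returns true if APICv has a virtual interrupt pending delivery, i.e. if
 * the priority class of RVI (the highest-priority requested vector) exceeds
 * that of the virtual PPR. For example, RVI 0x31 vs. VPPR 0x20 indicates a
 * deliverable interrupt, whereas RVI 0x21 vs. VPPR 0x20 does not.
 */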
static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
{
	u8 rvi = vmx_get_rvi();
	u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);

	return ((rvi & 0xf0) > (vppr & 0xf0));
}
  2864. static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
  2865. struct vmcs12 *vmcs12);
  2866. /*
  2867. * If from_vmentry is false, this is being called from state restore (either RSM
  2868. * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume.
  2869. *
  2870. * Returns:
  2871. * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode
  2872. * NVMX_VMENTRY_VMFAIL: Consistency check VMFail
  2873. * NVMX_VMENTRY_VMEXIT: Consistency check VMExit
  2874. * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error
  2875. */
  2876. enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
  2877. bool from_vmentry)
  2878. {
  2879. struct vcpu_vmx *vmx = to_vmx(vcpu);
  2880. struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
  2881. enum vm_entry_failure_code entry_failure_code;
  2882. bool evaluate_pending_interrupts;
  2883. union vmx_exit_reason exit_reason = {
  2884. .basic = EXIT_REASON_INVALID_STATE,
  2885. .failed_vmentry = 1,
  2886. };
  2887. u32 failed_index;
  2888. trace_kvm_nested_vmenter(kvm_rip_read(vcpu),
  2889. vmx->nested.current_vmptr,
  2890. vmcs12->guest_rip,
  2891. vmcs12->guest_intr_status,
  2892. vmcs12->vm_entry_intr_info_field,
  2893. vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT,
  2894. vmcs12->ept_pointer,
  2895. vmcs12->guest_cr3,
  2896. KVM_ISA_VMX);
  2897. kvm_service_local_tlb_flush_requests(vcpu);
  2898. evaluate_pending_interrupts = exec_controls_get(vmx) &
  2899. (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
  2900. if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
  2901. evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
  2902. if (!evaluate_pending_interrupts)
  2903. evaluate_pending_interrupts |= kvm_apic_has_pending_init_or_sipi(vcpu);
  2904. if (!vmx->nested.nested_run_pending ||
  2905. !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
  2906. vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
  2907. if (kvm_mpx_supported() &&
  2908. (!vmx->nested.nested_run_pending ||
  2909. !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
  2910. vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
  2911. /*
  2912. * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
  2913. * nested early checks are disabled. In the event of a "late" VM-Fail,
  2914. * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
  2915. * software model to the pre-VMEntry host state. When EPT is disabled,
  2916. * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
  2917. * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing
  2918. * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
  2919. * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested
  2920. * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
  2921. * guaranteed to be overwritten with a shadow CR3 prior to re-entering
  2922. * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
  2923. * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
  2924. * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
  2925. * path would need to manually save/restore vmcs01.GUEST_CR3.
  2926. */
  2927. if (!enable_ept && !nested_early_check)
  2928. vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
  2929. vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
  2930. prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12);
  2931. if (from_vmentry) {
  2932. if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
  2933. vmx_switch_vmcs(vcpu, &vmx->vmcs01);
  2934. return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
  2935. }
  2936. if (nested_vmx_check_vmentry_hw(vcpu)) {
  2937. vmx_switch_vmcs(vcpu, &vmx->vmcs01);
  2938. return NVMX_VMENTRY_VMFAIL;
  2939. }
  2940. if (nested_vmx_check_guest_state(vcpu, vmcs12,
  2941. &entry_failure_code)) {
  2942. exit_reason.basic = EXIT_REASON_INVALID_STATE;
  2943. vmcs12->exit_qualification = entry_failure_code;
  2944. goto vmentry_fail_vmexit;
  2945. }
  2946. }
  2947. enter_guest_mode(vcpu);
  2948. if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) {
  2949. exit_reason.basic = EXIT_REASON_INVALID_STATE;
  2950. vmcs12->exit_qualification = entry_failure_code;
  2951. goto vmentry_fail_vmexit_guest_mode;
  2952. }
  2953. if (from_vmentry) {
  2954. failed_index = nested_vmx_load_msr(vcpu,
  2955. vmcs12->vm_entry_msr_load_addr,
  2956. vmcs12->vm_entry_msr_load_count);
  2957. if (failed_index) {
  2958. exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL;
  2959. vmcs12->exit_qualification = failed_index;
  2960. goto vmentry_fail_vmexit_guest_mode;
  2961. }
  2962. } else {
  2963. /*
  2964. * The MMU is not initialized to point at the right entities yet and
  2965. * "get pages" would need to read data from the guest (i.e. we will
  2966. * need to perform gpa to hpa translation). Request a call
  2967. * to nested_get_vmcs12_pages before the next VM-entry. The MSRs
  2968. * have already been set at vmentry time and should not be reset.
  2969. */
  2970. kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
  2971. }
  2972. /*
  2973. * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI
  2974. * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can
  2975. * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit
  2976. * unconditionally.
  2977. */
  2978. if (unlikely(evaluate_pending_interrupts))
  2979. kvm_make_request(KVM_REQ_EVENT, vcpu);
  2980. /*
  2981. * Do not start the preemption timer hrtimer until after we know
  2982. * we are successful, so that only nested_vmx_vmexit needs to cancel
  2983. * the timer.
  2984. */
  2985. vmx->nested.preemption_timer_expired = false;
  2986. if (nested_cpu_has_preemption_timer(vmcs12)) {
  2987. u64 timer_value = vmx_calc_preemption_timer_value(vcpu);
  2988. vmx_start_preemption_timer(vcpu, timer_value);
  2989. }
  2990. /*
  2991. * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
  2992. * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
  2993. * returned as far as L1 is concerned. It will only return (and set
  2994. * the success flag) when L2 exits (see nested_vmx_vmexit()).
  2995. */
  2996. return NVMX_VMENTRY_SUCCESS;
  2997. /*
  2998. * A failed consistency check that leads to a VMExit during L1's
  2999. * VMEnter to L2 is a variation of a normal VMexit, as explained in
  3000. * 26.7 "VM-entry failures during or after loading guest state".
  3001. */
  3002. vmentry_fail_vmexit_guest_mode:
  3003. if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
  3004. vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
  3005. leave_guest_mode(vcpu);
  3006. vmentry_fail_vmexit:
  3007. vmx_switch_vmcs(vcpu, &vmx->vmcs01);
  3008. if (!from_vmentry)
  3009. return NVMX_VMENTRY_VMEXIT;
  3010. load_vmcs12_host_state(vcpu, vmcs12);
  3011. vmcs12->vm_exit_reason = exit_reason.full;
  3012. if (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
  3013. vmx->nested.need_vmcs12_to_shadow_sync = true;
  3014. return NVMX_VMENTRY_VMEXIT;
  3015. }
  3016. /*
  3017. * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
  3018. * for running an L2 nested guest.
  3019. */
  3020. static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
  3021. {
  3022. struct vmcs12 *vmcs12;
  3023. enum nvmx_vmentry_status status;
  3024. struct vcpu_vmx *vmx = to_vmx(vcpu);
  3025. u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
  3026. enum nested_evmptrld_status evmptrld_status;
  3027. if (!nested_vmx_check_permission(vcpu))
  3028. return 1;
  3029. evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch);
  3030. if (evmptrld_status == EVMPTRLD_ERROR) {
  3031. kvm_queue_exception(vcpu, UD_VECTOR);
  3032. return 1;
  3033. }
  3034. kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
  3035. if (CC(evmptrld_status == EVMPTRLD_VMFAIL))
  3036. return nested_vmx_failInvalid(vcpu);
  3037. if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) &&
  3038. vmx->nested.current_vmptr == INVALID_GPA))
  3039. return nested_vmx_failInvalid(vcpu);
  3040. vmcs12 = get_vmcs12(vcpu);
  3041. /*
  3042. * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
  3043. * that there *is* a valid VMCS pointer, RFLAGS.CF is set
  3044. * rather than RFLAGS.ZF, and no error number is stored to the
  3045. * VM-instruction error field.
  3046. */
  3047. if (CC(vmcs12->hdr.shadow_vmcs))
  3048. return nested_vmx_failInvalid(vcpu);
  3049. if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
  3050. copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_evmcs->hv_clean_fields);
  3051. /* Enlightened VMCS doesn't have launch state */
  3052. vmcs12->launch_state = !launch;
  3053. } else if (enable_shadow_vmcs) {
  3054. copy_shadow_to_vmcs12(vmx);
  3055. }
	/*
	 * The nested entry process starts with enforcing various
	 * prerequisites on vmcs12 as required by the Intel SDM, and acting
	 * appropriately when they fail: as the SDM explains, some conditions
	 * should cause the instruction to fail, while others will cause the
	 * instruction to seem to succeed but then return an
	 * EXIT_REASON_INVALID_STATE VM-Exit.
	 * To speed up the normal (success) code path, we should avoid
	 * checking for misconfigurations that will be caught anyway by the
	 * processor when using the merged vmcs02.
	 */
  3066. if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS))
  3067. return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
  3068. if (CC(vmcs12->launch_state == launch))
  3069. return nested_vmx_fail(vcpu,
  3070. launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
  3071. : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
  3072. if (nested_vmx_check_controls(vcpu, vmcs12))
  3073. return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
  3074. if (nested_vmx_check_address_space_size(vcpu, vmcs12))
  3075. return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
  3076. if (nested_vmx_check_host_state(vcpu, vmcs12))
  3077. return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
  3078. /*
  3079. * We're finally done with prerequisite checking, and can start with
  3080. * the nested entry.
  3081. */
  3082. vmx->nested.nested_run_pending = 1;
  3083. vmx->nested.has_preemption_timer_deadline = false;
  3084. status = nested_vmx_enter_non_root_mode(vcpu, true);
  3085. if (unlikely(status != NVMX_VMENTRY_SUCCESS))
  3086. goto vmentry_failed;
  3087. /* Emulate processing of posted interrupts on VM-Enter. */
  3088. if (nested_cpu_has_posted_intr(vmcs12) &&
  3089. kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) {
  3090. vmx->nested.pi_pending = true;
  3091. kvm_make_request(KVM_REQ_EVENT, vcpu);
  3092. kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv);
  3093. }
  3094. /* Hide L1D cache contents from the nested guest. */
  3095. vmx->vcpu.arch.l1tf_flush_l1d = true;
  3096. /*
  3097. * Must happen outside of nested_vmx_enter_non_root_mode() as it will
  3098. * also be used as part of restoring nVMX state for
  3099. * snapshot restore (migration).
  3100. *
  3101. * In this flow, it is assumed that vmcs12 cache was
  3102. * transferred as part of captured nVMX state and should
  3103. * therefore not be read from guest memory (which may not
  3104. * exist on destination host yet).
  3105. */
  3106. nested_cache_shadow_vmcs12(vcpu, vmcs12);
  3107. switch (vmcs12->guest_activity_state) {
  3108. case GUEST_ACTIVITY_HLT:
  3109. /*
  3110. * If we're entering a halted L2 vcpu and the L2 vcpu won't be
  3111. * awakened by event injection or by an NMI-window VM-exit or
  3112. * by an interrupt-window VM-exit, halt the vcpu.
  3113. */
  3114. if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
  3115. !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) &&
  3116. !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) &&
  3117. (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
  3118. vmx->nested.nested_run_pending = 0;
  3119. return kvm_emulate_halt_noskip(vcpu);
  3120. }
  3121. break;
  3122. case GUEST_ACTIVITY_WAIT_SIPI:
  3123. vmx->nested.nested_run_pending = 0;
  3124. vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
  3125. break;
  3126. default:
  3127. break;
  3128. }
  3129. return 1;
  3130. vmentry_failed:
  3131. vmx->nested.nested_run_pending = 0;
  3132. if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR)
  3133. return 0;
  3134. if (status == NVMX_VMENTRY_VMEXIT)
  3135. return 1;
  3136. WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
  3137. return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
  3138. }
/*
 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
 * This function returns the new value we should put in vmcs12.guest_cr0.
 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
 *    available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
 *    didn't trap the bit, because if L1 did, so would L0).
 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have
 *    been modified by L2, and L1 knows it. So just leave the old value of
 *    the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
 *    isn't relevant, because if L0 traps this bit it can set it to anything.
 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
 *    changed these bits, and therefore they need to be updated, but L0
 *    didn't necessarily allow them to be changed in GUEST_CR0 - and rather
 *    put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
 */
static inline unsigned long
vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
	return
	/*1*/	(vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
	/*2*/	(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
	/*3*/	(vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
			vcpu->arch.cr0_guest_owned_bits));
}

static inline unsigned long
vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
	return
	/*1*/	(vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
	/*2*/	(vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
	/*3*/	(vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
			vcpu->arch.cr4_guest_owned_bits));
}
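
/*
 * Translate the event KVM has pending for L2 (injected exception, NMI or
 * interrupt) into vmcs12's IDT-vectoring information fields so that L1 can
 * observe and re-inject it after the nested VM-Exit.
 */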
  3174. static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
  3175. struct vmcs12 *vmcs12,
  3176. u32 vm_exit_reason, u32 exit_intr_info)
  3177. {
  3178. u32 idt_vectoring;
  3179. unsigned int nr;
  3180. /*
  3181. * Per the SDM, VM-Exits due to double and triple faults are never
  3182. * considered to occur during event delivery, even if the double/triple
  3183. * fault is the result of an escalating vectoring issue.
  3184. *
  3185. * Note, the SDM qualifies the double fault behavior with "The original
  3186. * event results in a double-fault exception". It's unclear why the
  3187. * qualification exists since exits due to double fault can occur only
  3188. * while vectoring a different exception (injected events are never
  3189. * subject to interception), i.e. there's _always_ an original event.
  3190. *
  3191. * The SDM also uses NMI as a confusing example for the "original event
  3192. * causes the VM exit directly" clause. NMI isn't special in any way,
  3193. * the same rule applies to all events that cause an exit directly.
  3194. * NMI is an odd choice for the example because NMIs can only occur on
  3195. * instruction boundaries, i.e. they _can't_ occur during vectoring.
  3196. */
  3197. if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT ||
  3198. ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI &&
  3199. is_double_fault(exit_intr_info))) {
  3200. vmcs12->idt_vectoring_info_field = 0;
  3201. } else if (vcpu->arch.exception.injected) {
  3202. nr = vcpu->arch.exception.vector;
  3203. idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
  3204. if (kvm_exception_is_soft(nr)) {
  3205. vmcs12->vm_exit_instruction_len =
  3206. vcpu->arch.event_exit_inst_len;
  3207. idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
  3208. } else
  3209. idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
  3210. if (vcpu->arch.exception.has_error_code) {
  3211. idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
  3212. vmcs12->idt_vectoring_error_code =
  3213. vcpu->arch.exception.error_code;
  3214. }
  3215. vmcs12->idt_vectoring_info_field = idt_vectoring;
  3216. } else if (vcpu->arch.nmi_injected) {
  3217. vmcs12->idt_vectoring_info_field =
  3218. INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
  3219. } else if (vcpu->arch.interrupt.injected) {
  3220. nr = vcpu->arch.interrupt.nr;
  3221. idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
  3222. if (vcpu->arch.interrupt.soft) {
  3223. idt_vectoring |= INTR_TYPE_SOFT_INTR;
  3224. vmcs12->vm_entry_instruction_len =
  3225. vcpu->arch.event_exit_inst_len;
  3226. } else
  3227. idt_vectoring |= INTR_TYPE_EXT_INTR;
  3228. vmcs12->idt_vectoring_info_field = idt_vectoring;
  3229. } else {
  3230. vmcs12->idt_vectoring_info_field = 0;
  3231. }
  3232. }
  3233. void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
  3234. {
  3235. struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
  3236. gfn_t gfn;
  3237. /*
  3238. * Don't need to mark the APIC access page dirty; it is never
  3239. * written to by the CPU during APIC virtualization.
  3240. */
  3241. if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
  3242. gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
  3243. kvm_vcpu_mark_page_dirty(vcpu, gfn);
  3244. }
  3245. if (nested_cpu_has_posted_intr(vmcs12)) {
  3246. gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
  3247. kvm_vcpu_mark_page_dirty(vcpu, gfn);
  3248. }
  3249. }
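
/*
 * Process a pending posted interrupt for L2 in software: copy the PIR bits
 * from the vmcs12 posted-interrupt descriptor into L2's virtual-APIC page
 * and raise GUEST_INTR_STATUS.RVI accordingly.
 */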
  3250. static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
  3251. {
  3252. struct vcpu_vmx *vmx = to_vmx(vcpu);
  3253. int max_irr;
  3254. void *vapic_page;
  3255. u16 status;
  3256. if (!vmx->nested.pi_pending)
  3257. return 0;
  3258. if (!vmx->nested.pi_desc)
  3259. goto mmio_needed;
  3260. vmx->nested.pi_pending = false;
  3261. if (!pi_test_and_clear_on(vmx->nested.pi_desc))
  3262. return 0;
  3263. max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
  3264. if (max_irr != 256) {
  3265. vapic_page = vmx->nested.virtual_apic_map.hva;
  3266. if (!vapic_page)
  3267. goto mmio_needed;
  3268. __kvm_apic_update_irr(vmx->nested.pi_desc->pir,
  3269. vapic_page, &max_irr);
  3270. status = vmcs_read16(GUEST_INTR_STATUS);
  3271. if ((u8)max_irr > ((u8)status & 0xff)) {
  3272. status &= ~0xff;
  3273. status |= (u8)max_irr;
  3274. vmcs_write16(GUEST_INTR_STATUS, status);
  3275. }
  3276. }
  3277. nested_mark_vmcs12_pages_dirty(vcpu);
  3278. return 0;
  3279. mmio_needed:
  3280. kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL);
  3281. return -ENXIO;
  3282. }
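
/*
 * Synthesize an exception VM-Exit from L2 to L1: build the VM-Exit
 * interruption information and exit qualification (e.g. CR2 for #PF, DR6
 * for #DB) from the queued exception and deliver the nested VM-Exit.
 */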
  3283. static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu)
  3284. {
  3285. struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
  3286. u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
  3287. struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
  3288. unsigned long exit_qual;
  3289. if (ex->has_payload) {
  3290. exit_qual = ex->payload;
  3291. } else if (ex->vector == PF_VECTOR) {
  3292. exit_qual = vcpu->arch.cr2;
  3293. } else if (ex->vector == DB_VECTOR) {
  3294. exit_qual = vcpu->arch.dr6;
  3295. exit_qual &= ~DR6_BT;
  3296. exit_qual ^= DR6_ACTIVE_LOW;
  3297. } else {
  3298. exit_qual = 0;
  3299. }
  3300. /*
  3301. * Unlike AMD's Paged Real Mode, which reports an error code on #PF
  3302. * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the
  3303. * "has error code" flags on VM-Exit if the CPU is in Real Mode.
  3304. */
  3305. if (ex->has_error_code && is_protmode(vcpu)) {
  3306. /*
  3307. * Intel CPUs do not generate error codes with bits 31:16 set,
  3308. * and more importantly VMX disallows setting bits 31:16 in the
  3309. * injected error code for VM-Entry. Drop the bits to mimic
  3310. * hardware and avoid inducing failure on nested VM-Entry if L1
  3311. * chooses to inject the exception back to L2. AMD CPUs _do_
  3312. * generate "full" 32-bit error codes, so KVM allows userspace
  3313. * to inject exception error codes with bits 31:16 set.
  3314. */
  3315. vmcs12->vm_exit_intr_error_code = (u16)ex->error_code;
  3316. intr_info |= INTR_INFO_DELIVER_CODE_MASK;
  3317. }
  3318. if (kvm_exception_is_soft(ex->vector))
  3319. intr_info |= INTR_TYPE_SOFT_EXCEPTION;
  3320. else
  3321. intr_info |= INTR_TYPE_HARD_EXCEPTION;
  3322. if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
  3323. vmx_get_nmi_mask(vcpu))
  3324. intr_info |= INTR_INFO_UNBLOCK_NMI;
  3325. nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
  3326. }
  3327. /*
3328. * Returns a non-zero payload if a debug trap is (likely) pending delivery, zero otherwise. Infer the class
  3329. * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6).
  3330. * Using the payload is flawed because code breakpoints (fault-like) and data
  3331. * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e.
  3332. * this will return false positives if a to-be-injected code breakpoint #DB is
  3333. * pending (from KVM's perspective, but not "pending" across an instruction
  3334. * boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it
  3335. * too is trap-like.
  3336. *
  3337. * KVM "works" despite these flaws as ICEBP isn't currently supported by the
  3338. * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the
  3339. * #DB has already happened), and MTF isn't marked pending on code breakpoints
  3340. * from the emulator (because such #DBs are fault-like and thus don't trigger
  3341. * actions that fire on instruction retire).
  3342. */
  3343. static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex)
  3344. {
  3345. if (!ex->pending || ex->vector != DB_VECTOR)
  3346. return 0;
  3347. /* General Detect #DBs are always fault-like. */
  3348. return ex->payload & ~DR6_BD;
  3349. }
  3350. /*
  3351. * Returns true if there's a pending #DB exception that is lower priority than
  3352. * a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by
  3353. * KVM, but could theoretically be injected by userspace. Note, this code is
  3354. * imperfect, see above.
  3355. */
  3356. static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex)
  3357. {
  3358. return vmx_get_pending_dbg_trap(ex) & ~DR6_BT;
  3359. }
  3360. /*
  3361. * Certain VM-exits set the 'pending debug exceptions' field to indicate a
  3362. * recognized #DB (data or single-step) that has yet to be delivered. Since KVM
  3363. * represents these debug traps with a payload that is said to be compatible
  3364. * with the 'pending debug exceptions' field, write the payload to the VMCS
  3365. * field if a VM-exit is delivered before the debug trap.
  3366. */
  3367. static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
  3368. {
  3369. unsigned long pending_dbg;
  3370. pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception);
  3371. if (pending_dbg)
  3372. vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg);
  3373. }
  3374. static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
  3375. {
  3376. return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
  3377. to_vmx(vcpu)->nested.preemption_timer_expired;
  3378. }
  3379. static bool vmx_has_nested_events(struct kvm_vcpu *vcpu)
  3380. {
  3381. return nested_vmx_preemption_timer_pending(vcpu) ||
  3382. to_vmx(vcpu)->nested.mtf_pending;
  3383. }
  3384. /*
  3385. * Per the Intel SDM's table "Priority Among Concurrent Events", with minor
  3386. * edits to fill in missing examples, e.g. #DB due to split-lock accesses,
  3387. * and less minor edits to splice in the priority of VMX Non-Root specific
  3388. * events, e.g. MTF and NMI/INTR-window exiting.
  3389. *
  3390. * 1 Hardware Reset and Machine Checks
  3391. * - RESET
  3392. * - Machine Check
  3393. *
  3394. * 2 Trap on Task Switch
  3395. * - T flag in TSS is set (on task switch)
  3396. *
  3397. * 3 External Hardware Interventions
  3398. * - FLUSH
  3399. * - STOPCLK
  3400. * - SMI
  3401. * - INIT
  3402. *
  3403. * 3.5 Monitor Trap Flag (MTF) VM-exit[1]
  3404. *
  3405. * 4 Traps on Previous Instruction
  3406. * - Breakpoints
  3407. * - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O
  3408. * breakpoint, or #DB due to a split-lock access)
  3409. *
3410. * 4.3 VMX-preemption timer expired VM-exit[2]
  3411. *
3412. * 4.6 NMI-window exiting VM-exit[3]
  3413. *
  3414. * 5 Nonmaskable Interrupts (NMI)
  3415. *
3416. * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery[4]
  3417. *
  3418. * 6 Maskable Hardware Interrupts
  3419. *
  3420. * 7 Code Breakpoint Fault
  3421. *
  3422. * 8 Faults from Fetching Next Instruction
  3423. * - Code-Segment Limit Violation
  3424. * - Code Page Fault
  3425. * - Control protection exception (missing ENDBRANCH at target of indirect
  3426. * call or jump)
  3427. *
  3428. * 9 Faults from Decoding Next Instruction
  3429. * - Instruction length > 15 bytes
  3430. * - Invalid Opcode
  3431. * - Coprocessor Not Available
  3432. *
  3433. *10 Faults on Executing Instruction
  3434. * - Overflow
  3435. * - Bound error
  3436. * - Invalid TSS
  3437. * - Segment Not Present
  3438. * - Stack fault
  3439. * - General Protection
  3440. * - Data Page Fault
  3441. * - Alignment Check
  3442. * - x86 FPU Floating-point exception
  3443. * - SIMD floating-point exception
  3444. * - Virtualization exception
  3445. * - Control protection exception
  3446. *
  3447. * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs),
  3448. * INIT signals, and higher priority events take priority over MTF VM exits.
  3449. * MTF VM exits take priority over debug-trap exceptions and lower priority
  3450. * events.
  3451. *
  3452. * [2] Debug-trap exceptions and higher priority events take priority over VM exits
  3453. * caused by the VMX-preemption timer. VM exits caused by the VMX-preemption
  3454. * timer take priority over VM exits caused by the "NMI-window exiting"
  3455. * VM-execution control and lower priority events.
  3456. *
  3457. * [3] Debug-trap exceptions and higher priority events take priority over VM exits
  3458. * caused by "NMI-window exiting". VM exits caused by this control take
  3459. * priority over non-maskable interrupts (NMIs) and lower priority events.
  3460. *
  3461. * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to
  3462. * the 1-setting of the "interrupt-window exiting" VM-execution control. Thus,
  3463. * non-maskable interrupts (NMIs) and higher priority events take priority over
  3464. * delivery of a virtual interrupt; delivery of a virtual interrupt takes
  3465. * priority over external interrupts and lower priority events.
  3466. */
  3467. static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
  3468. {
  3469. struct kvm_lapic *apic = vcpu->arch.apic;
  3470. struct vcpu_vmx *vmx = to_vmx(vcpu);
  3471. /*
  3472. * Only a pending nested run blocks a pending exception. If there is a
  3473. * previously injected event, the pending exception occurred while said
  3474. * event was being delivered and thus needs to be handled.
  3475. */
  3476. bool block_nested_exceptions = vmx->nested.nested_run_pending;
  3477. /*
  3478. * New events (not exceptions) are only recognized at instruction
  3479. * boundaries. If an event needs reinjection, then KVM is handling a
  3480. * VM-Exit that occurred _during_ instruction execution; new events are
  3481. * blocked until the instruction completes.
  3482. */
  3483. bool block_nested_events = block_nested_exceptions ||
  3484. kvm_event_needs_reinjection(vcpu);
  3485. if (lapic_in_kernel(vcpu) &&
  3486. test_bit(KVM_APIC_INIT, &apic->pending_events)) {
  3487. if (block_nested_events)
  3488. return -EBUSY;
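/*
 * Propagate a pending #DB payload to the "pending debug exceptions" field
 * before emulating the INIT VM-Exit, so a recognized-but-undelivered debug
 * trap isn't lost; the field is saved into vmcs12 on the exit.
 */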
  3489. nested_vmx_update_pending_dbg(vcpu);
  3490. clear_bit(KVM_APIC_INIT, &apic->pending_events);
  3491. if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED)
  3492. nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
3493. /* MTF is discarded if the vCPU is in WFS (wait-for-SIPI). */
  3494. vmx->nested.mtf_pending = false;
  3495. return 0;
  3496. }
  3497. if (lapic_in_kernel(vcpu) &&
  3498. test_bit(KVM_APIC_SIPI, &apic->pending_events)) {
  3499. if (block_nested_events)
  3500. return -EBUSY;
  3501. clear_bit(KVM_APIC_SIPI, &apic->pending_events);
  3502. if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
  3503. nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0,
  3504. apic->sipi_vector & 0xFFUL);
  3505. return 0;
  3506. }
  3507. /* Fallthrough, the SIPI is completely ignored. */
  3508. }
  3509. /*
  3510. * Process exceptions that are higher priority than Monitor Trap Flag:
  3511. * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but
  3512. * could theoretically come in from userspace), and ICEBP (INT1).
  3513. *
  3514. * TODO: SMIs have higher priority than MTF and trap-like #DBs (except
  3515. * for TSS T flag #DBs). KVM also doesn't save/restore pending MTF
  3516. * across SMI/RSM as it should; that needs to be addressed in order to
  3517. * prioritize SMI over MTF and trap-like #DBs.
  3518. */
  3519. if (vcpu->arch.exception_vmexit.pending &&
  3520. !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) {
  3521. if (block_nested_exceptions)
  3522. return -EBUSY;
  3523. nested_vmx_inject_exception_vmexit(vcpu);
  3524. return 0;
  3525. }
  3526. if (vcpu->arch.exception.pending &&
  3527. !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) {
  3528. if (block_nested_exceptions)
  3529. return -EBUSY;
  3530. goto no_vmexit;
  3531. }
  3532. if (vmx->nested.mtf_pending) {
  3533. if (block_nested_events)
  3534. return -EBUSY;
  3535. nested_vmx_update_pending_dbg(vcpu);
  3536. nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
  3537. return 0;
  3538. }
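/*
 * Process exceptions and #DB traps that are lower priority than MTF, i.e.
 * whatever wasn't delivered by the higher-priority checks above.
 */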
  3539. if (vcpu->arch.exception_vmexit.pending) {
  3540. if (block_nested_exceptions)
  3541. return -EBUSY;
  3542. nested_vmx_inject_exception_vmexit(vcpu);
  3543. return 0;
  3544. }
  3545. if (vcpu->arch.exception.pending) {
  3546. if (block_nested_exceptions)
  3547. return -EBUSY;
  3548. goto no_vmexit;
  3549. }
  3550. if (nested_vmx_preemption_timer_pending(vcpu)) {
  3551. if (block_nested_events)
  3552. return -EBUSY;
  3553. nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
  3554. return 0;
  3555. }
  3556. if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
  3557. if (block_nested_events)
  3558. return -EBUSY;
  3559. goto no_vmexit;
  3560. }
  3561. if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) {
  3562. if (block_nested_events)
  3563. return -EBUSY;
  3564. if (!nested_exit_on_nmi(vcpu))
  3565. goto no_vmexit;
  3566. nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
  3567. NMI_VECTOR | INTR_TYPE_NMI_INTR |
  3568. INTR_INFO_VALID_MASK, 0);
  3569. /*
  3570. * The NMI-triggered VM exit counts as injection:
  3571. * clear this one and block further NMIs.
  3572. */
  3573. vcpu->arch.nmi_pending = 0;
  3574. vmx_set_nmi_mask(vcpu, true);
  3575. return 0;
  3576. }
  3577. if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) {
  3578. if (block_nested_events)
  3579. return -EBUSY;
  3580. if (!nested_exit_on_intr(vcpu))
  3581. goto no_vmexit;
  3582. nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
  3583. return 0;
  3584. }
  3585. no_vmexit:
  3586. return vmx_complete_nested_posted_interrupt(vcpu);
  3587. }
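/*
 * Convert the time remaining on the emulated preemption timer back into
 * VMX-preemption timer units: nanoseconds to guest TSC cycles (using the
 * guest's TSC frequency in kHz), then scaled down by the architectural
 * preemption-timer rate shift.
 */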
  3588. static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
  3589. {
  3590. ktime_t remaining =
  3591. hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
  3592. u64 value;
  3593. if (ktime_to_ns(remaining) <= 0)
  3594. return 0;
  3595. value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
  3596. do_div(value, 1000000);
  3597. return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
  3598. }
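/*
 * Guest-state fields that are copied from vmcs02 to vmcs12 only on demand
 * (see sync_vmcs02_to_vmcs12_rare()); they are expensive to VMREAD and are
 * rarely consumed by L1.
 */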
  3599. static bool is_vmcs12_ext_field(unsigned long field)
  3600. {
  3601. switch (field) {
  3602. case GUEST_ES_SELECTOR:
  3603. case GUEST_CS_SELECTOR:
  3604. case GUEST_SS_SELECTOR:
  3605. case GUEST_DS_SELECTOR:
  3606. case GUEST_FS_SELECTOR:
  3607. case GUEST_GS_SELECTOR:
  3608. case GUEST_LDTR_SELECTOR:
  3609. case GUEST_TR_SELECTOR:
  3610. case GUEST_ES_LIMIT:
  3611. case GUEST_CS_LIMIT:
  3612. case GUEST_SS_LIMIT:
  3613. case GUEST_DS_LIMIT:
  3614. case GUEST_FS_LIMIT:
  3615. case GUEST_GS_LIMIT:
  3616. case GUEST_LDTR_LIMIT:
  3617. case GUEST_TR_LIMIT:
  3618. case GUEST_GDTR_LIMIT:
  3619. case GUEST_IDTR_LIMIT:
  3620. case GUEST_ES_AR_BYTES:
  3621. case GUEST_DS_AR_BYTES:
  3622. case GUEST_FS_AR_BYTES:
  3623. case GUEST_GS_AR_BYTES:
  3624. case GUEST_LDTR_AR_BYTES:
  3625. case GUEST_TR_AR_BYTES:
  3626. case GUEST_ES_BASE:
  3627. case GUEST_CS_BASE:
  3628. case GUEST_SS_BASE:
  3629. case GUEST_DS_BASE:
  3630. case GUEST_FS_BASE:
  3631. case GUEST_GS_BASE:
  3632. case GUEST_LDTR_BASE:
  3633. case GUEST_TR_BASE:
  3634. case GUEST_GDTR_BASE:
  3635. case GUEST_IDTR_BASE:
  3636. case GUEST_PENDING_DBG_EXCEPTIONS:
  3637. case GUEST_BNDCFGS:
  3638. return true;
  3639. default:
  3640. break;
  3641. }
  3642. return false;
  3643. }
  3644. static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
  3645. struct vmcs12 *vmcs12)
  3646. {
  3647. struct vcpu_vmx *vmx = to_vmx(vcpu);
  3648. vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
  3649. vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
  3650. vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
  3651. vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
  3652. vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
  3653. vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
  3654. vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
  3655. vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
  3656. vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
  3657. vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
  3658. vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
  3659. vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
  3660. vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
  3661. vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
  3662. vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
  3663. vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
  3664. vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
  3665. vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
  3666. vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
  3667. vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
  3668. vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
  3669. vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
  3670. vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
  3671. vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
  3672. vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
  3673. vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
  3674. vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
  3675. vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
  3676. vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
  3677. vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
  3678. vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
  3679. vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
  3680. vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
  3681. vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
  3682. vmcs12->guest_pending_dbg_exceptions =
  3683. vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
  3684. vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
  3685. }
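/*
 * Perform the deferred sync of the "rare" fields when vmcs01 is the loaded
 * VMCS: temporarily make vmcs02 the loaded VMCS so the fields can be VMREAD,
 * then switch back. get_cpu()/put_cpu() keep the vCPU on one physical CPU
 * across the two VMCS loads.
 */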
  3686. static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
  3687. struct vmcs12 *vmcs12)
  3688. {
  3689. struct vcpu_vmx *vmx = to_vmx(vcpu);
  3690. int cpu;
  3691. if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
  3692. return;
  3693. WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);
  3694. cpu = get_cpu();
  3695. vmx->loaded_vmcs = &vmx->nested.vmcs02;
  3696. vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01);
  3697. sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
  3698. vmx->loaded_vmcs = &vmx->vmcs01;
  3699. vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02);
  3700. put_cpu();
  3701. }
  3702. /*
  3703. * Update the guest state fields of vmcs12 to reflect changes that
  3704. * occurred while L2 was running. (The "IA-32e mode guest" bit of the
  3705. * VM-entry controls is also updated, since this is really a guest
  3706. * state bit.)
  3707. */
  3708. static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
  3709. {
  3710. struct vcpu_vmx *vmx = to_vmx(vcpu);
  3711. if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
  3712. sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
  3713. vmx->nested.need_sync_vmcs02_to_vmcs12_rare =
  3714. !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr);
  3715. vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
  3716. vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
  3717. vmcs12->guest_rsp = kvm_rsp_read(vcpu);
  3718. vmcs12->guest_rip = kvm_rip_read(vcpu);
  3719. vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
  3720. vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
  3721. vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
  3722. vmcs12->guest_interruptibility_info =
  3723. vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
  3724. if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
  3725. vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
  3726. else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
  3727. vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI;
  3728. else
  3729. vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
  3730. if (nested_cpu_has_preemption_timer(vmcs12) &&
  3731. vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER &&
  3732. !vmx->nested.nested_run_pending)
  3733. vmcs12->vmx_preemption_timer_value =
  3734. vmx_get_preemption_timer_value(vcpu);
  3735. /*
  3736. * In some cases (usually, nested EPT), L2 is allowed to change its
  3737. * own CR3 without exiting. If it has changed it, we must keep it.
  3738. * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
  3739. * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
  3740. *
  3741. * Additionally, restore L2's PDPTR to vmcs12.
  3742. */
  3743. if (enable_ept) {
  3744. vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
  3745. if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
  3746. vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
  3747. vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
  3748. vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
  3749. vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
  3750. }
  3751. }
  3752. vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
  3753. if (nested_cpu_has_vid(vmcs12))
  3754. vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
  3755. vmcs12->vm_entry_controls =
  3756. (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
  3757. (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
  3758. if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
  3759. kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
  3760. if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
  3761. vmcs12->guest_ia32_efer = vcpu->arch.efer;
  3762. }
  3763. /*
  3764. * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
  3765. * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
  3766. * and this function updates it to reflect the changes to the guest state while
  3767. * L2 was running (and perhaps made some exits which were handled directly by L0
  3768. * without going back to L1), and to reflect the exit reason.
  3769. * Note that we do not have to copy here all VMCS fields, just those that
3770. * could have been changed by the L2 guest or by the exit - i.e., the guest-state and
  3771. * exit-information fields only. Other fields are modified by L1 with VMWRITE,
  3772. * which already writes to vmcs12 directly.
  3773. */
  3774. static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
  3775. u32 vm_exit_reason, u32 exit_intr_info,
  3776. unsigned long exit_qualification)
  3777. {
  3778. /* update exit information fields: */
  3779. vmcs12->vm_exit_reason = vm_exit_reason;
  3780. if (to_vmx(vcpu)->exit_reason.enclave_mode)
  3781. vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
  3782. vmcs12->exit_qualification = exit_qualification;
  3783. /*
  3784. * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched
  3785. * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other
  3786. * exit info fields are unmodified.
  3787. */
  3788. if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
  3789. vmcs12->launch_state = 1;
  3790. /* vm_entry_intr_info_field is cleared on exit. Emulate this
  3791. * instead of reading the real value. */
  3792. vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
  3793. /*
3794. * Transfer the event that L0 or L1 may have wanted to inject into
  3795. * L2 to IDT_VECTORING_INFO_FIELD.
  3796. */
  3797. vmcs12_save_pending_event(vcpu, vmcs12,
  3798. vm_exit_reason, exit_intr_info);
  3799. vmcs12->vm_exit_intr_info = exit_intr_info;
  3800. vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
  3801. vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
  3802. /*
  3803. * According to spec, there's no need to store the guest's
  3804. * MSRs if the exit is due to a VM-entry failure that occurs
  3805. * during or after loading the guest state. Since this exit
  3806. * does not fall in that category, we need to save the MSRs.
  3807. */
  3808. if (nested_vmx_store_msr(vcpu,
  3809. vmcs12->vm_exit_msr_store_addr,
  3810. vmcs12->vm_exit_msr_store_count))
  3811. nested_vmx_abort(vcpu,
  3812. VMX_ABORT_SAVE_GUEST_MSR_FAIL);
  3813. }
  3814. }
  3815. /*
3816. * A part of what we need to do when the nested L2 guest exits and we want to
3817. * run its L1 parent is to reset L1's guest state to the host state specified
  3818. * in vmcs12.
  3819. * This function is to be called not only on normal nested exit, but also on
  3820. * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
  3821. * Failures During or After Loading Guest State").
  3822. * This function should be called when the active VMCS is L1's (vmcs01).
  3823. */
  3824. static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
  3825. struct vmcs12 *vmcs12)
  3826. {
  3827. enum vm_entry_failure_code ignored;
  3828. struct kvm_segment seg;
  3829. if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
  3830. vcpu->arch.efer = vmcs12->host_ia32_efer;
  3831. else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
  3832. vcpu->arch.efer |= (EFER_LMA | EFER_LME);
  3833. else
  3834. vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
  3835. vmx_set_efer(vcpu, vcpu->arch.efer);
  3836. kvm_rsp_write(vcpu, vmcs12->host_rsp);
  3837. kvm_rip_write(vcpu, vmcs12->host_rip);
  3838. vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
  3839. vmx_set_interrupt_shadow(vcpu, 0);
  3840. /*
  3841. * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
  3842. * actually changed, because vmx_set_cr0 refers to efer set above.
  3843. *
  3844. * CR0_GUEST_HOST_MASK is already set in the original vmcs01
  3845. * (KVM doesn't change it);
  3846. */
  3847. vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
  3848. vmx_set_cr0(vcpu, vmcs12->host_cr0);
  3849. /* Same as above - no reason to call set_cr4_guest_host_mask(). */
  3850. vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
  3851. vmx_set_cr4(vcpu, vmcs12->host_cr4);
  3852. nested_ept_uninit_mmu_context(vcpu);
  3853. /*
  3854. * Only PDPTE load can fail as the value of cr3 was checked on entry and
  3855. * couldn't have changed.
  3856. */
  3857. if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored))
  3858. nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
  3859. nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);
  3860. vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
  3861. vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
  3862. vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
  3863. vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
  3864. vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
  3865. vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
  3866. vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
  3867. /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
  3868. if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
  3869. vmcs_write64(GUEST_BNDCFGS, 0);
  3870. if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
  3871. vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
  3872. vcpu->arch.pat = vmcs12->host_ia32_pat;
  3873. }
  3874. if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
  3875. intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)))
  3876. WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
  3877. vmcs12->host_ia32_perf_global_ctrl));
  3878. /* Set L1 segment info according to Intel SDM
  3879. 27.5.2 Loading Host Segment and Descriptor-Table Registers */
  3880. seg = (struct kvm_segment) {
  3881. .base = 0,
  3882. .limit = 0xFFFFFFFF,
  3883. .selector = vmcs12->host_cs_selector,
  3884. .type = 11,
  3885. .present = 1,
  3886. .s = 1,
  3887. .g = 1
  3888. };
  3889. if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
  3890. seg.l = 1;
  3891. else
  3892. seg.db = 1;
  3893. __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
  3894. seg = (struct kvm_segment) {
  3895. .base = 0,
  3896. .limit = 0xFFFFFFFF,
  3897. .type = 3,
  3898. .present = 1,
  3899. .s = 1,
  3900. .db = 1,
  3901. .g = 1
  3902. };
  3903. seg.selector = vmcs12->host_ds_selector;
  3904. __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
  3905. seg.selector = vmcs12->host_es_selector;
  3906. __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
  3907. seg.selector = vmcs12->host_ss_selector;
  3908. __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
  3909. seg.selector = vmcs12->host_fs_selector;
  3910. seg.base = vmcs12->host_fs_base;
  3911. __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
  3912. seg.selector = vmcs12->host_gs_selector;
  3913. seg.base = vmcs12->host_gs_base;
  3914. __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
  3915. seg = (struct kvm_segment) {
  3916. .base = vmcs12->host_tr_base,
  3917. .limit = 0x67,
  3918. .selector = vmcs12->host_tr_selector,
  3919. .type = 11,
  3920. .present = 1
  3921. };
  3922. __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
  3923. memset(&seg, 0, sizeof(seg));
  3924. seg.unusable = 1;
  3925. __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);
  3926. kvm_set_dr(vcpu, 7, 0x400);
  3927. vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
  3928. if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
  3929. vmcs12->vm_exit_msr_load_count))
  3930. nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
  3931. to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
  3932. }
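/*
 * Recover the EFER value vmcs01 establishes for L1, mirroring the ways KVM
 * may have configured EFER switching: the "load IA32_EFER" VM-Entry control,
 * the MSR autoload list, or the user-return MSR slot. If the CPU can load
 * EFER directly but vmcs01 doesn't use that control, L1's EFER matches the
 * host's.
 */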
  3933. static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
  3934. {
  3935. struct vmx_uret_msr *efer_msr;
  3936. unsigned int i;
  3937. if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
  3938. return vmcs_read64(GUEST_IA32_EFER);
  3939. if (cpu_has_load_ia32_efer())
  3940. return host_efer;
  3941. for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
  3942. if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
  3943. return vmx->msr_autoload.guest.val[i].value;
  3944. }
  3945. efer_msr = vmx_find_uret_msr(vmx, MSR_EFER);
  3946. if (efer_msr)
  3947. return efer_msr->data;
  3948. return host_efer;
  3949. }
  3950. static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
  3951. {
  3952. struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
  3953. struct vcpu_vmx *vmx = to_vmx(vcpu);
  3954. struct vmx_msr_entry g, h;
  3955. gpa_t gpa;
  3956. u32 i, j;
  3957. vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
  3958. if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
  3959. /*
  3960. * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
  3961. * as vmcs01.GUEST_DR7 contains a userspace defined value
  3962. * and vcpu->arch.dr7 is not squirreled away before the
  3963. * nested VMENTER (not worth adding a variable in nested_vmx).
  3964. */
  3965. if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
  3966. kvm_set_dr(vcpu, 7, DR7_FIXED_1);
  3967. else
  3968. WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
  3969. }
  3970. /*
  3971. * Note that calling vmx_set_{efer,cr0,cr4} is important as they
  3972. * handle a variety of side effects to KVM's software model.
  3973. */
  3974. vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
  3975. vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
  3976. vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
  3977. vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
  3978. vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
  3979. nested_ept_uninit_mmu_context(vcpu);
  3980. vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
  3981. kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
  3982. /*
  3983. * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
  3984. * from vmcs01 (if necessary). The PDPTRs are not loaded on
  3985. * VMFail, like everything else we just need to ensure our
  3986. * software model is up-to-date.
  3987. */
  3988. if (enable_ept && is_pae_paging(vcpu))
  3989. ept_save_pdptrs(vcpu);
  3990. kvm_mmu_reset_context(vcpu);
  3991. /*
  3992. * This nasty bit of open coding is a compromise between blindly
  3993. * loading L1's MSRs using the exit load lists (incorrect emulation
  3994. * of VMFail), leaving the nested VM's MSRs in the software model
  3995. * (incorrect behavior) and snapshotting the modified MSRs (too
  3996. * expensive since the lists are unbound by hardware). For each
  3997. * MSR that was (prematurely) loaded from the nested VMEntry load
  3998. * list, reload it from the exit load list if it exists and differs
  3999. * from the guest value. The intent is to stuff host state as
  4000. * silently as possible, not to fully process the exit load list.
  4001. */
  4002. for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
  4003. gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
  4004. if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
  4005. pr_debug_ratelimited(
  4006. "%s read MSR index failed (%u, 0x%08llx)\n",
  4007. __func__, i, gpa);
  4008. goto vmabort;
  4009. }
  4010. for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
  4011. gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
  4012. if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
  4013. pr_debug_ratelimited(
  4014. "%s read MSR failed (%u, 0x%08llx)\n",
  4015. __func__, j, gpa);
  4016. goto vmabort;
  4017. }
  4018. if (h.index != g.index)
  4019. continue;
  4020. if (h.value == g.value)
  4021. break;
  4022. if (nested_vmx_load_msr_check(vcpu, &h)) {
  4023. pr_debug_ratelimited(
  4024. "%s check failed (%u, 0x%x, 0x%x)\n",
  4025. __func__, j, h.index, h.reserved);
  4026. goto vmabort;
  4027. }
  4028. if (kvm_set_msr(vcpu, h.index, h.value)) {
  4029. pr_debug_ratelimited(
  4030. "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
  4031. __func__, j, h.index, h.value);
  4032. goto vmabort;
  4033. }
  4034. }
  4035. }
  4036. return;
  4037. vmabort:
  4038. nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
  4039. }
  4040. /*
  4041. * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
  4042. * and modify vmcs12 to make it see what it would expect to see there if
  4043. * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
  4044. */
  4045. void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
  4046. u32 exit_intr_info, unsigned long exit_qualification)
  4047. {
  4048. struct vcpu_vmx *vmx = to_vmx(vcpu);
  4049. struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
  4050. /* Pending MTF traps are discarded on VM-Exit. */
  4051. vmx->nested.mtf_pending = false;
  4052. /* trying to cancel vmlaunch/vmresume is a bug */
  4053. WARN_ON_ONCE(vmx->nested.nested_run_pending);
  4054. if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
  4055. /*
  4056. * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
  4057. * Enlightened VMCS after migration and we still need to
  4058. * do that when something is forcing L2->L1 exit prior to
  4059. * the first L2 run.
  4060. */
  4061. (void)nested_get_evmcs_page(vcpu);
  4062. }
  4063. /* Service pending TLB flush requests for L2 before switching to L1. */
  4064. kvm_service_local_tlb_flush_requests(vcpu);
  4065. /*
  4066. * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
  4067. * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are
  4068. * up-to-date before switching to L1.
  4069. */
  4070. if (enable_ept && is_pae_paging(vcpu))
  4071. vmx_ept_load_pdptrs(vcpu);
  4072. leave_guest_mode(vcpu);
  4073. if (nested_cpu_has_preemption_timer(vmcs12))
  4074. hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
  4075. if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) {
  4076. vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset;
  4077. if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
  4078. vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
  4079. }
  4080. if (likely(!vmx->fail)) {
  4081. sync_vmcs02_to_vmcs12(vcpu, vmcs12);
  4082. if (vm_exit_reason != -1)
  4083. prepare_vmcs12(vcpu, vmcs12, vm_exit_reason,
  4084. exit_intr_info, exit_qualification);
  4085. /*
  4086. * Must happen outside of sync_vmcs02_to_vmcs12() as it will
  4087. * also be used to capture vmcs12 cache as part of
  4088. * capturing nVMX state for snapshot (migration).
  4089. *
  4090. * Otherwise, this flush will dirty guest memory at a
  4091. * point it is already assumed by user-space to be
  4092. * immutable.
  4093. */
  4094. nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
  4095. } else {
  4096. /*
  4097. * The only expected VM-instruction error is "VM entry with
  4098. * invalid control field(s)." Anything else indicates a
  4099. * problem with L0. And we should never get here with a
  4100. * VMFail of any type if early consistency checks are enabled.
  4101. */
  4102. WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
  4103. VMXERR_ENTRY_INVALID_CONTROL_FIELD);
  4104. WARN_ON_ONCE(nested_early_check);
  4105. }
  4106. /*
  4107. * Drop events/exceptions that were queued for re-injection to L2
  4108. * (picked up via vmx_complete_interrupts()), as well as exceptions
  4109. * that were pending for L2. Note, this must NOT be hoisted above
  4110. * prepare_vmcs12(), events/exceptions queued for re-injection need to
  4111. * be captured in vmcs12 (see vmcs12_save_pending_event()).
  4112. */
  4113. vcpu->arch.nmi_injected = false;
  4114. kvm_clear_exception_queue(vcpu);
  4115. kvm_clear_interrupt_queue(vcpu);
  4116. vmx_switch_vmcs(vcpu, &vmx->vmcs01);
  4117. /*
  4118. * If IBRS is advertised to the vCPU, KVM must flush the indirect
  4119. * branch predictors when transitioning from L2 to L1, as L1 expects
  4120. * hardware (KVM in this case) to provide separate predictor modes.
  4121. * Bare metal isolates VMX root (host) from VMX non-root (guest), but
  4122. * doesn't isolate different VMCSs, i.e. in this case, doesn't provide
  4123. * separate modes for L2 vs L1.
  4124. */
  4125. if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
  4126. indirect_branch_prediction_barrier();
  4127. /* Update any VMCS fields that might have changed while L2 ran */
  4128. vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
  4129. vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
  4130. vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
  4131. if (kvm_caps.has_tsc_control)
  4132. vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
  4133. if (vmx->nested.l1_tpr_threshold != -1)
  4134. vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);
  4135. if (vmx->nested.change_vmcs01_virtual_apic_mode) {
  4136. vmx->nested.change_vmcs01_virtual_apic_mode = false;
  4137. vmx_set_virtual_apic_mode(vcpu);
  4138. }
  4139. if (vmx->nested.update_vmcs01_cpu_dirty_logging) {
  4140. vmx->nested.update_vmcs01_cpu_dirty_logging = false;
  4141. vmx_update_cpu_dirty_logging(vcpu);
  4142. }
  4143. /* Unpin physical memory we referred to in vmcs02 */
  4144. kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false);
  4145. kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
  4146. kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
  4147. vmx->nested.pi_desc = NULL;
  4148. if (vmx->nested.reload_vmcs01_apic_access_page) {
  4149. vmx->nested.reload_vmcs01_apic_access_page = false;
  4150. kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
  4151. }
  4152. if (vmx->nested.update_vmcs01_apicv_status) {
  4153. vmx->nested.update_vmcs01_apicv_status = false;
  4154. kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
  4155. }
  4156. if ((vm_exit_reason != -1) &&
  4157. (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)))
  4158. vmx->nested.need_vmcs12_to_shadow_sync = true;
  4159. /* in case we halted in L2 */
  4160. vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
  4161. if (likely(!vmx->fail)) {
  4162. if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
  4163. nested_exit_intr_ack_set(vcpu)) {
  4164. int irq = kvm_cpu_get_interrupt(vcpu);
  4165. WARN_ON(irq < 0);
  4166. vmcs12->vm_exit_intr_info = irq |
  4167. INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
  4168. }
  4169. if (vm_exit_reason != -1)
  4170. trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
  4171. vmcs12->exit_qualification,
  4172. vmcs12->idt_vectoring_info_field,
  4173. vmcs12->vm_exit_intr_info,
  4174. vmcs12->vm_exit_intr_error_code,
  4175. KVM_ISA_VMX);
  4176. load_vmcs12_host_state(vcpu, vmcs12);
  4177. return;
  4178. }
  4179. /*
  4180. * After an early L2 VM-entry failure, we're now back
  4181. * in L1 which thinks it just finished a VMLAUNCH or
  4182. * VMRESUME instruction, so we need to set the failure
  4183. * flag and the VM-instruction error field of the VMCS
  4184. * accordingly, and skip the emulated instruction.
  4185. */
  4186. (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
  4187. /*
  4188. * Restore L1's host state to KVM's software model. We're here
  4189. * because a consistency check was caught by hardware, which
  4190. * means some amount of guest state has been propagated to KVM's
  4191. * model and needs to be unwound to the host's state.
  4192. */
  4193. nested_vmx_restore_host_state(vcpu);
  4194. vmx->fail = 0;
  4195. }
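/* A shutdown (triple fault) in L2 is reflected to L1 as a TRIPLE_FAULT VM-Exit. */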
  4196. static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu)
  4197. {
  4198. kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
  4199. nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
  4200. }
  4201. /*
  4202. * Decode the memory-address operand of a vmx instruction, as recorded on an
  4203. * exit caused by such an instruction (run by a guest hypervisor).
  4204. * On success, returns 0. When the operand is invalid, returns 1 and throws
  4205. * #UD, #GP, or #SS.
  4206. */
  4207. int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
  4208. u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
  4209. {
  4210. gva_t off;
  4211. bool exn;
  4212. struct kvm_segment s;
  4213. /*
  4214. * According to Vol. 3B, "Information for VM Exits Due to Instruction
  4215. * Execution", on an exit, vmx_instruction_info holds most of the
  4216. * addressing components of the operand. Only the displacement part
  4217. * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
  4218. * For how an actual address is calculated from all these components,
  4219. * refer to Vol. 1, "Operand Addressing".
  4220. */
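/*
 * Layout of the instruction-information field consumed below, per the SDM's
 * VM-exit instruction-information tables:
 *   bits 1:0   scaling (index is shifted left by this amount)
 *   bits 9:7   address size (0 = 16-bit, 1 = 32-bit, 2 = 64-bit)
 *   bit  10    operand is a register (1) or memory (0)
 *   bits 17:15 segment register
 *   bits 21:18 index register, bit 22 = index register invalid
 *   bits 26:23 base register,  bit 27 = base register invalid
 */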
  4221. int scaling = vmx_instruction_info & 3;
  4222. int addr_size = (vmx_instruction_info >> 7) & 7;
  4223. bool is_reg = vmx_instruction_info & (1u << 10);
  4224. int seg_reg = (vmx_instruction_info >> 15) & 7;
  4225. int index_reg = (vmx_instruction_info >> 18) & 0xf;
  4226. bool index_is_valid = !(vmx_instruction_info & (1u << 22));
  4227. int base_reg = (vmx_instruction_info >> 23) & 0xf;
  4228. bool base_is_valid = !(vmx_instruction_info & (1u << 27));
  4229. if (is_reg) {
  4230. kvm_queue_exception(vcpu, UD_VECTOR);
  4231. return 1;
  4232. }
  4233. /* Addr = segment_base + offset */
  4234. /* offset = base + [index * scale] + displacement */
  4235. off = exit_qualification; /* holds the displacement */
  4236. if (addr_size == 1)
  4237. off = (gva_t)sign_extend64(off, 31);
  4238. else if (addr_size == 0)
  4239. off = (gva_t)sign_extend64(off, 15);
  4240. if (base_is_valid)
  4241. off += kvm_register_read(vcpu, base_reg);
  4242. if (index_is_valid)
  4243. off += kvm_register_read(vcpu, index_reg) << scaling;
  4244. vmx_get_segment(vcpu, &s, seg_reg);
  4245. /*
  4246. * The effective address, i.e. @off, of a memory operand is truncated
  4247. * based on the address size of the instruction. Note that this is
  4248. * the *effective address*, i.e. the address prior to accounting for
  4249. * the segment's base.
  4250. */
  4251. if (addr_size == 1) /* 32 bit */
  4252. off &= 0xffffffff;
  4253. else if (addr_size == 0) /* 16 bit */
  4254. off &= 0xffff;
  4255. /* Checks for #GP/#SS exceptions. */
  4256. exn = false;
  4257. if (is_long_mode(vcpu)) {
  4258. /*
  4259. * The virtual/linear address is never truncated in 64-bit
  4260. * mode, e.g. a 32-bit address size can yield a 64-bit virtual
  4261. * address when using FS/GS with a non-zero base.
  4262. */
  4263. if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS)
  4264. *ret = s.base + off;
  4265. else
  4266. *ret = off;
  4267. /* Long mode: #GP(0)/#SS(0) if the memory address is in a
  4268. * non-canonical form. This is the only check on the memory
  4269. * destination for long mode!
  4270. */
  4271. exn = is_noncanonical_address(*ret, vcpu);
  4272. } else {
  4273. /*
  4274. * When not in long mode, the virtual/linear address is
  4275. * unconditionally truncated to 32 bits regardless of the
  4276. * address size.
  4277. */
  4278. *ret = (s.base + off) & 0xffffffff;
  4279. /* Protected mode: apply checks for segment validity in the
  4280. * following order:
  4281. * - segment type check (#GP(0) may be thrown)
  4282. * - usability check (#GP(0)/#SS(0))
  4283. * - limit check (#GP(0)/#SS(0))
  4284. */
  4285. if (wr)
  4286. /* #GP(0) if the destination operand is located in a
  4287. * read-only data segment or any code segment.
  4288. */
  4289. exn = ((s.type & 0xa) == 0 || (s.type & 8));
  4290. else
  4291. /* #GP(0) if the source operand is located in an
  4292. * execute-only code segment
  4293. */
  4294. exn = ((s.type & 0xa) == 8);
  4295. if (exn) {
  4296. kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
  4297. return 1;
  4298. }
  4299. /* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
  4300. */
  4301. exn = (s.unusable != 0);
  4302. /*
  4303. * Protected mode: #GP(0)/#SS(0) if the memory operand is
  4304. * outside the segment limit. All CPUs that support VMX ignore
  4305. * limit checks for flat segments, i.e. segments with base==0,
  4306. * limit==0xffffffff and of type expand-up data or code.
  4307. */
  4308. if (!(s.base == 0 && s.limit == 0xffffffff &&
  4309. ((s.type & 8) || !(s.type & 4))))
  4310. exn = exn || ((u64)off + len - 1 > s.limit);
  4311. }
  4312. if (exn) {
  4313. kvm_queue_exception_e(vcpu,
  4314. seg_reg == VCPU_SREG_SS ?
  4315. SS_VECTOR : GP_VECTOR,
  4316. 0);
  4317. return 1;
  4318. }
  4319. return 0;
  4320. }
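/*
 * Fetch the 64-bit VMCS/VMXON pointer operand of VMXON/VMCLEAR/VMPTRLD:
 * decode the memory operand and read the pointer from guest memory. On
 * failure, *ret holds the value the instruction handler should return.
 */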
  4321. static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer,
  4322. int *ret)
  4323. {
  4324. gva_t gva;
  4325. struct x86_exception e;
  4326. int r;
  4327. if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
  4328. vmcs_read32(VMX_INSTRUCTION_INFO), false,
  4329. sizeof(*vmpointer), &gva)) {
  4330. *ret = 1;
  4331. return -EINVAL;
  4332. }
  4333. r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e);
  4334. if (r != X86EMUL_CONTINUE) {
  4335. *ret = kvm_handle_memory_failure(vcpu, r, &e);
  4336. return -EINVAL;
  4337. }
  4338. return 0;
  4339. }
  4340. /*
  4341. * Allocate a shadow VMCS and associate it with the currently loaded
  4342. * VMCS, unless such a shadow VMCS already exists. The newly allocated
  4343. * VMCS is also VMCLEARed, so that it is ready for use.
  4344. */
  4345. static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
  4346. {
  4347. struct vcpu_vmx *vmx = to_vmx(vcpu);
  4348. struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
  4349. /*
  4350. * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it
  4351. * when L1 executes VMXOFF or the vCPU is forced out of nested
4352. * operation. VMXON VM-Fails if the CPU is already post-VMXON, so it
  4353. * should be impossible to already have an allocated shadow VMCS. KVM
  4354. * doesn't support virtualization of VMCS shadowing, so vmcs01 should
  4355. * always be the loaded VMCS.
  4356. */
  4357. if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs))
  4358. return loaded_vmcs->shadow_vmcs;
  4359. loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
  4360. if (loaded_vmcs->shadow_vmcs)
  4361. vmcs_clear(loaded_vmcs->shadow_vmcs);
  4362. return loaded_vmcs->shadow_vmcs;
  4363. }
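/*
 * Transition the vCPU into VMX operation for an emulated VMXON: allocate
 * vmcs02 plus the vmcs12/shadow-vmcs12 caches, optionally a shadow VMCS,
 * and initialize the emulated preemption timer and the nested VPID.
 */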
  4364. static int enter_vmx_operation(struct kvm_vcpu *vcpu)
  4365. {
  4366. struct vcpu_vmx *vmx = to_vmx(vcpu);
  4367. int r;
  4368. r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
  4369. if (r < 0)
  4370. goto out_vmcs02;
  4371. vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
  4372. if (!vmx->nested.cached_vmcs12)
  4373. goto out_cached_vmcs12;
  4374. vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA;
  4375. vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
  4376. if (!vmx->nested.cached_shadow_vmcs12)
  4377. goto out_cached_shadow_vmcs12;
  4378. if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
  4379. goto out_shadow_vmcs;
  4380. hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
  4381. HRTIMER_MODE_ABS_PINNED);
  4382. vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
  4383. vmx->nested.vpid02 = allocate_vpid();
  4384. vmx->nested.vmcs02_initialized = false;
  4385. vmx->nested.vmxon = true;
  4386. if (vmx_pt_mode_is_host_guest()) {
  4387. vmx->pt_desc.guest.ctl = 0;
  4388. pt_update_intercept_for_msr(vcpu);
  4389. }
  4390. return 0;
  4391. out_shadow_vmcs:
  4392. kfree(vmx->nested.cached_shadow_vmcs12);
  4393. out_cached_shadow_vmcs12:
  4394. kfree(vmx->nested.cached_vmcs12);
  4395. out_cached_vmcs12:
  4396. free_loaded_vmcs(&vmx->nested.vmcs02);
  4397. out_vmcs02:
  4398. return -ENOMEM;
  4399. }
  4400. /* Emulate the VMXON instruction. */
  4401. static int handle_vmxon(struct kvm_vcpu *vcpu)
  4402. {
  4403. int ret;
  4404. gpa_t vmptr;
  4405. uint32_t revision;
  4406. struct vcpu_vmx *vmx = to_vmx(vcpu);
  4407. const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED
  4408. | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
  4409. /*
4410. * Manually perform the CR4.VMXE check; KVM must force CR4.VMXE=1 to enter
  4411. * the guest and so cannot rely on hardware to perform the check,
  4412. * which has higher priority than VM-Exit (see Intel SDM's pseudocode
  4413. * for VMXON).
  4414. *
  4415. * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86
  4416. * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't
  4417. * force any of the relevant guest state. For a restricted guest, KVM
  4418. * does force CR0.PE=1, but only to also force VM86 in order to emulate
  4419. * Real Mode, and so there's no need to check CR0.PE manually.
  4420. */
  4421. if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
  4422. kvm_queue_exception(vcpu, UD_VECTOR);
  4423. return 1;
  4424. }
  4425. /*
  4426. * The CPL is checked for "not in VMX operation" and for "in VMX root",
  4427. * and has higher priority than the VM-Fail due to being post-VMXON,
  4428. * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root,
  4429. * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits
  4430. * from L2 to L1, i.e. there's no need to check for the vCPU being in
  4431. * VMX non-root.
  4432. *
  4433. * Forwarding the VM-Exit unconditionally, i.e. without performing the
  4434. * #UD checks (see above), is functionally ok because KVM doesn't allow
4435. * L1 to run L2 with CR4.VMXE=0, and because KVM never modifies L2's
4436. * CR0 or CR4, i.e. it's L1's responsibility to emulate #UDs that are
4437. * missed by hardware due to L1 shadowing L2's CR0 and/or CR4.
  4438. */
  4439. if (vmx_get_cpl(vcpu)) {
  4440. kvm_inject_gp(vcpu, 0);
  4441. return 1;
  4442. }
  4443. if (vmx->nested.vmxon)
  4444. return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
  4445. /*
  4446. * Invalid CR0/CR4 generates #GP. These checks are performed if and
  4447. * only if the vCPU isn't already in VMX operation, i.e. effectively
  4448. * have lower priority than the VM-Fail above.
  4449. */
  4450. if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) ||
  4451. !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) {
  4452. kvm_inject_gp(vcpu, 0);
  4453. return 1;
  4454. }
  4455. if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
  4456. != VMXON_NEEDED_FEATURES) {
  4457. kvm_inject_gp(vcpu, 0);
  4458. return 1;
  4459. }
  4460. if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret))
  4461. return ret;
  4462. /*
  4463. * SDM 3: 24.11.5
  4464. * The first 4 bytes of VMXON region contain the supported
  4465. * VMCS revision identifier
  4466. *
4467. * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case, i.e.
4468. * VMXON/VMCS pointers are never limited to 32-bit physical addresses.
  4469. */
  4470. if (!page_address_valid(vcpu, vmptr))
  4471. return nested_vmx_failInvalid(vcpu);
  4472. if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
  4473. revision != VMCS12_REVISION)
  4474. return nested_vmx_failInvalid(vcpu);
  4475. vmx->nested.vmxon_ptr = vmptr;
  4476. ret = enter_vmx_operation(vcpu);
  4477. if (ret)
  4478. return ret;
  4479. return nested_vmx_succeed(vcpu);
  4480. }
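/*
 * Release the current vmcs12: pull any state still living in vmcs02 or the
 * shadow VMCS back into the cache, flush the cache to guest memory, free
 * the guest MMU roots and invalidate current_vmptr.
 */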
  4481. static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
  4482. {
  4483. struct vcpu_vmx *vmx = to_vmx(vcpu);
  4484. if (vmx->nested.current_vmptr == INVALID_GPA)
  4485. return;
  4486. copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
  4487. if (enable_shadow_vmcs) {
  4488. /* copy to memory all shadowed fields in case
  4489. they were modified */
  4490. copy_shadow_to_vmcs12(vmx);
  4491. vmx_disable_shadow_vmcs(vmx);
  4492. }
  4493. vmx->nested.posted_intr_nv = -1;
  4494. /* Flush VMCS12 to guest memory */
  4495. kvm_vcpu_write_guest_page(vcpu,
  4496. vmx->nested.current_vmptr >> PAGE_SHIFT,
  4497. vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
  4498. kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
  4499. vmx->nested.current_vmptr = INVALID_GPA;
  4500. }
  4501. /* Emulate the VMXOFF instruction */
  4502. static int handle_vmxoff(struct kvm_vcpu *vcpu)
  4503. {
  4504. if (!nested_vmx_check_permission(vcpu))
  4505. return 1;
  4506. free_nested(vcpu);
  4507. if (kvm_apic_has_pending_init_or_sipi(vcpu))
  4508. kvm_make_request(KVM_REQ_EVENT, vcpu);
  4509. return nested_vmx_succeed(vcpu);
  4510. }
  4511. /* Emulate the VMCLEAR instruction */
  4512. static int handle_vmclear(struct kvm_vcpu *vcpu)
  4513. {
  4514. struct vcpu_vmx *vmx = to_vmx(vcpu);
  4515. u32 zero = 0;
  4516. gpa_t vmptr;
  4517. u64 evmcs_gpa;
  4518. int r;
  4519. if (!nested_vmx_check_permission(vcpu))
  4520. return 1;
  4521. if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
  4522. return r;
  4523. if (!page_address_valid(vcpu, vmptr))
  4524. return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
  4525. if (vmptr == vmx->nested.vmxon_ptr)
  4526. return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
  4527. /*
  4528. * When Enlightened VMEntry is enabled on the calling CPU we treat
4529. * the memory area pointed to by vmptr as an Enlightened VMCS (as there's no good
  4530. * way to distinguish it from VMCS12) and we must not corrupt it by
  4531. * writing to the non-existent 'launch_state' field. The area doesn't
  4532. * have to be the currently active EVMCS on the calling CPU and there's
  4533. * nothing KVM has to do to transition it from 'active' to 'non-active'
  4534. * state. It is possible that the area will stay mapped as
  4535. * vmx->nested.hv_evmcs but this shouldn't be a problem.
  4536. */
  4537. if (likely(!guest_cpuid_has_evmcs(vcpu) ||
  4538. !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) {
  4539. if (vmptr == vmx->nested.current_vmptr)
  4540. nested_release_vmcs12(vcpu);
  4541. kvm_vcpu_write_guest(vcpu,
  4542. vmptr + offsetof(struct vmcs12,
  4543. launch_state),
  4544. &zero, sizeof(zero));
  4545. } else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) {
  4546. nested_release_evmcs(vcpu);
  4547. }
  4548. return nested_vmx_succeed(vcpu);
  4549. }
  4550. /* Emulate the VMLAUNCH instruction */
  4551. static int handle_vmlaunch(struct kvm_vcpu *vcpu)
  4552. {
  4553. return nested_vmx_run(vcpu, true);
  4554. }
  4555. /* Emulate the VMRESUME instruction */
  4556. static int handle_vmresume(struct kvm_vcpu *vcpu)
  4557. {
  4558. return nested_vmx_run(vcpu, false);
  4559. }
  4560. static int handle_vmread(struct kvm_vcpu *vcpu)
  4561. {
  4562. struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
  4563. : get_vmcs12(vcpu);
  4564. unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
  4565. u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
  4566. struct vcpu_vmx *vmx = to_vmx(vcpu);
  4567. struct x86_exception e;
  4568. unsigned long field;
  4569. u64 value;
  4570. gva_t gva = 0;
  4571. short offset;
  4572. int len, r;
  4573. if (!nested_vmx_check_permission(vcpu))
  4574. return 1;
  4575. /* Decode instruction info and find the field to read */
  4576. field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
  4577. if (!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
  4578. /*
  4579. * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
  4580. * any VMREAD sets the ALU flags for VMfailInvalid.
  4581. */
  4582. if (vmx->nested.current_vmptr == INVALID_GPA ||
  4583. (is_guest_mode(vcpu) &&
  4584. get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
  4585. return nested_vmx_failInvalid(vcpu);
  4586. offset = get_vmcs12_field_offset(field);
  4587. if (offset < 0)
  4588. return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
  4589. if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
  4590. copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
  4591. /* Read the field, zero-extended to a u64 value */
  4592. value = vmcs12_read_any(vmcs12, field, offset);
  4593. } else {
  4594. /*
4595. * Hyper-V TLFS (as of 6.0b) explicitly states that while an
  4596. * enlightened VMCS is active VMREAD/VMWRITE instructions are
  4597. * unsupported. Unfortunately, certain versions of Windows 11
  4598. * don't comply with this requirement which is not enforced in
  4599. * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a
  4600. * workaround, as misbehaving guests will panic on VM-Fail.
  4601. * Note, enlightened VMCS is incompatible with shadow VMCS so
  4602. * all VMREADs from L2 should go to L1.
  4603. */
  4604. if (WARN_ON_ONCE(is_guest_mode(vcpu)))
  4605. return nested_vmx_failInvalid(vcpu);
  4606. offset = evmcs_field_offset(field, NULL);
  4607. if (offset < 0)
  4608. return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
  4609. /* Read the field, zero-extended to a u64 value */
  4610. value = evmcs_read_any(vmx->nested.hv_evmcs, field, offset);
  4611. }
  4612. /*
  4613. * Now copy part of this value to register or memory, as requested.
  4614. * Note that the number of bits actually copied is 32 or 64 depending
  4615. * on the guest's mode (32 or 64 bit), not on the given field's length.
  4616. */
  4617. if (instr_info & BIT(10)) {
  4618. kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value);
  4619. } else {
  4620. len = is_64_bit_mode(vcpu) ? 8 : 4;
  4621. if (get_vmx_mem_address(vcpu, exit_qualification,
  4622. instr_info, true, len, &gva))
  4623. return 1;
  4624. /* _system ok, nested_vmx_check_permission has verified cpl=0 */
  4625. r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e);
  4626. if (r != X86EMUL_CONTINUE)
  4627. return kvm_handle_memory_failure(vcpu, r, &e);
  4628. }
  4629. return nested_vmx_succeed(vcpu);
  4630. }
  4631. static bool is_shadow_field_rw(unsigned long field)
  4632. {
  4633. switch (field) {
  4634. #define SHADOW_FIELD_RW(x, y) case x:
  4635. #include "vmcs_shadow_fields.h"
  4636. return true;
  4637. default:
  4638. break;
  4639. }
  4640. return false;
  4641. }
  4642. static bool is_shadow_field_ro(unsigned long field)
  4643. {
  4644. switch (field) {
  4645. #define SHADOW_FIELD_RO(x, y) case x:
  4646. #include "vmcs_shadow_fields.h"
  4647. return true;
  4648. default:
  4649. break;
  4650. }
  4651. return false;
  4652. }
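/*
 * Both helpers above rely on the X-macro table in vmcs_shadow_fields.h:
 * each SHADOW_FIELD_RW()/SHADOW_FIELD_RO() entry expands to a bare
 * "case" label (an entry such as SHADOW_FIELD_RW(GUEST_RIP, guest_rip)
 * would become "case GUEST_RIP:"), so the switches cover exactly the
 * fields mirrored in the shadow VMCS.
 */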
  4653. static int handle_vmwrite(struct kvm_vcpu *vcpu)
  4654. {
  4655. struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
  4656. : get_vmcs12(vcpu);
  4657. unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
  4658. u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
  4659. struct vcpu_vmx *vmx = to_vmx(vcpu);
  4660. struct x86_exception e;
  4661. unsigned long field;
  4662. short offset;
  4663. gva_t gva;
  4664. int len, r;
  4665. /*
  4666. * The value to write might be 32 or 64 bits, depending on L1's long
  4667. * mode, and eventually we need to write that into a field of several
  4668. * possible lengths. The code below first zero-extends the value to 64
  4669. * bit (value), and then copies only the appropriate number of
  4670. * bits into the vmcs12 field.
  4671. */
  4672. u64 value = 0;
  4673. if (!nested_vmx_check_permission(vcpu))
  4674. return 1;
  4675. /*
  4676. * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
  4677. * any VMWRITE sets the ALU flags for VMfailInvalid.
  4678. */
  4679. if (vmx->nested.current_vmptr == INVALID_GPA ||
  4680. (is_guest_mode(vcpu) &&
  4681. get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
  4682. return nested_vmx_failInvalid(vcpu);
  4683. if (instr_info & BIT(10))
  4684. value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf));
  4685. else {
  4686. len = is_64_bit_mode(vcpu) ? 8 : 4;
  4687. if (get_vmx_mem_address(vcpu, exit_qualification,
  4688. instr_info, false, len, &gva))
  4689. return 1;
  4690. r = kvm_read_guest_virt(vcpu, gva, &value, len, &e);
  4691. if (r != X86EMUL_CONTINUE)
  4692. return kvm_handle_memory_failure(vcpu, r, &e);
  4693. }
  4694. field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
  4695. offset = get_vmcs12_field_offset(field);
  4696. if (offset < 0)
  4697. return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
  4698. /*
  4699. * If the vCPU supports "VMWRITE to any supported field in the
  4700. * VMCS," then the "read-only" fields are actually read/write.
  4701. */
  4702. if (vmcs_field_readonly(field) &&
  4703. !nested_cpu_has_vmwrite_any_field(vcpu))
  4704. return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
  4705. /*
  4706. * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
4707. * vmcs12, else we may clobber a field or consume a stale value.
  4708. */
  4709. if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field))
  4710. copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
  4711. /*
  4712. * Some Intel CPUs intentionally drop the reserved bits of the AR byte
  4713. * fields on VMWRITE. Emulate this behavior to ensure consistent KVM
  4714. * behavior regardless of the underlying hardware, e.g. if an AR_BYTE
  4715. * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD
  4716. * from L1 will return a different value than VMREAD from L2 (L1 sees
  4717. * the stripped down value, L2 sees the full value as stored by KVM).
  4718. */
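/*
 * The 0x1f0ff mask below keeps type/S/DPL/P (bits 7:0) and
 * AVL/L/D/G/unusable (bits 16:12) while clearing the reserved
 * bits 11:8 of the access-rights field.
 */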
  4719. if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
  4720. value &= 0x1f0ff;
  4721. vmcs12_write_any(vmcs12, field, offset, value);
  4722. /*
  4723. * Do not track vmcs12 dirty-state if in guest-mode as we actually
  4724. * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated
  4725. * by L1 without a vmexit are always updated in the vmcs02, i.e. don't
  4726. * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path.
  4727. */
  4728. if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) {
  4729. /*
  4730. * L1 can read these fields without exiting, ensure the
  4731. * shadow VMCS is up-to-date.
  4732. */
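/*
 * The shadow VMCS hangs off vmcs01, so update it by temporarily
 * making it the current VMCS on this CPU: VMPTRLD it, write the
 * field, VMCLEAR it, then reload the previously current VMCS.
 * Preemption is disabled so the vCPU cannot be scheduled out while
 * the wrong VMCS is loaded.
 */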
  4733. if (enable_shadow_vmcs && is_shadow_field_ro(field)) {
  4734. preempt_disable();
  4735. vmcs_load(vmx->vmcs01.shadow_vmcs);
  4736. __vmcs_writel(field, value);
  4737. vmcs_clear(vmx->vmcs01.shadow_vmcs);
  4738. vmcs_load(vmx->loaded_vmcs->vmcs);
  4739. preempt_enable();
  4740. }
  4741. vmx->nested.dirty_vmcs12 = true;
  4742. }
  4743. return nested_vmx_succeed(vcpu);
  4744. }
  4745. static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
  4746. {
  4747. vmx->nested.current_vmptr = vmptr;
  4748. if (enable_shadow_vmcs) {
  4749. secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
  4750. vmcs_write64(VMCS_LINK_POINTER,
  4751. __pa(vmx->vmcs01.shadow_vmcs));
  4752. vmx->nested.need_vmcs12_to_shadow_sync = true;
  4753. }
  4754. vmx->nested.dirty_vmcs12 = true;
  4755. vmx->nested.force_msr_bitmap_recalc = true;
  4756. }
  4757. /* Emulate the VMPTRLD instruction */
  4758. static int handle_vmptrld(struct kvm_vcpu *vcpu)
  4759. {
  4760. struct vcpu_vmx *vmx = to_vmx(vcpu);
  4761. gpa_t vmptr;
  4762. int r;
  4763. if (!nested_vmx_check_permission(vcpu))
  4764. return 1;
  4765. if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
  4766. return r;
  4767. if (!page_address_valid(vcpu, vmptr))
  4768. return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
  4769. if (vmptr == vmx->nested.vmxon_ptr)
  4770. return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
  4771. /* Forbid normal VMPTRLD if Enlightened version was used */
  4772. if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
  4773. return 1;
  4774. if (vmx->nested.current_vmptr != vmptr) {
  4775. struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache;
  4776. struct vmcs_hdr hdr;
  4777. if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) {
  4778. /*
  4779. * Reads from an unbacked page return all 1s,
  4780. * which means that the 32 bits located at the
  4781. * given physical address won't match the required
  4782. * VMCS12_REVISION identifier.
  4783. */
  4784. return nested_vmx_fail(vcpu,
  4785. VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
  4786. }
  4787. if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr,
  4788. offsetof(struct vmcs12, hdr),
  4789. sizeof(hdr))) {
  4790. return nested_vmx_fail(vcpu,
  4791. VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
  4792. }
  4793. if (hdr.revision_id != VMCS12_REVISION ||
  4794. (hdr.shadow_vmcs &&
  4795. !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
  4796. return nested_vmx_fail(vcpu,
  4797. VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
  4798. }
  4799. nested_release_vmcs12(vcpu);
  4800. /*
  4801. * Load VMCS12 from guest memory since it is not already
  4802. * cached.
  4803. */
  4804. if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12,
  4805. VMCS12_SIZE)) {
  4806. return nested_vmx_fail(vcpu,
  4807. VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
  4808. }
  4809. set_current_vmptr(vmx, vmptr);
  4810. }
  4811. return nested_vmx_succeed(vcpu);
  4812. }
  4813. /* Emulate the VMPTRST instruction */
  4814. static int handle_vmptrst(struct kvm_vcpu *vcpu)
  4815. {
  4816. unsigned long exit_qual = vmx_get_exit_qual(vcpu);
  4817. u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
  4818. gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
  4819. struct x86_exception e;
  4820. gva_t gva;
  4821. int r;
  4822. if (!nested_vmx_check_permission(vcpu))
  4823. return 1;
  4824. if (unlikely(evmptr_is_valid(to_vmx(vcpu)->nested.hv_evmcs_vmptr)))
  4825. return 1;
  4826. if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
  4827. true, sizeof(gpa_t), &gva))
  4828. return 1;
  4829. /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
  4830. r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
  4831. sizeof(gpa_t), &e);
  4832. if (r != X86EMUL_CONTINUE)
  4833. return kvm_handle_memory_failure(vcpu, r, &e);
  4834. return nested_vmx_succeed(vcpu);
  4835. }
  4836. /* Emulate the INVEPT instruction */
  4837. static int handle_invept(struct kvm_vcpu *vcpu)
  4838. {
  4839. struct vcpu_vmx *vmx = to_vmx(vcpu);
  4840. u32 vmx_instruction_info, types;
  4841. unsigned long type, roots_to_free;
  4842. struct kvm_mmu *mmu;
  4843. gva_t gva;
  4844. struct x86_exception e;
  4845. struct {
  4846. u64 eptp, gpa;
  4847. } operand;
  4848. int i, r, gpr_index;
  4849. if (!(vmx->nested.msrs.secondary_ctls_high &
  4850. SECONDARY_EXEC_ENABLE_EPT) ||
  4851. !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
  4852. kvm_queue_exception(vcpu, UD_VECTOR);
  4853. return 1;
  4854. }
  4855. if (!nested_vmx_check_permission(vcpu))
  4856. return 1;
  4857. vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
  4858. gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
  4859. type = kvm_register_read(vcpu, gpr_index);
  4860. types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
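/*
 * After the shift, bit 1 of 'types' corresponds to single-context
 * INVEPT (type VMX_EPT_EXTENT_CONTEXT == 1) and bit 2 to global
 * INVEPT (type VMX_EPT_EXTENT_GLOBAL == 2); the "& 6" discards
 * everything else, and the check below rejects unsupported types.
 */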
  4861. if (type >= 32 || !(types & (1 << type)))
  4862. return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
  4863. /* According to the Intel VMX instruction reference, the memory
  4864. * operand is read even if it isn't needed (e.g., for type==global)
  4865. */
  4866. if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
  4867. vmx_instruction_info, false, sizeof(operand), &gva))
  4868. return 1;
  4869. r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
  4870. if (r != X86EMUL_CONTINUE)
  4871. return kvm_handle_memory_failure(vcpu, r, &e);
  4872. /*
  4873. * Nested EPT roots are always held through guest_mmu,
  4874. * not root_mmu.
  4875. */
  4876. mmu = &vcpu->arch.guest_mmu;
  4877. switch (type) {
  4878. case VMX_EPT_EXTENT_CONTEXT:
  4879. if (!nested_vmx_check_eptp(vcpu, operand.eptp))
  4880. return nested_vmx_fail(vcpu,
  4881. VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
  4882. roots_to_free = 0;
  4883. if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd,
  4884. operand.eptp))
  4885. roots_to_free |= KVM_MMU_ROOT_CURRENT;
  4886. for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
  4887. if (nested_ept_root_matches(mmu->prev_roots[i].hpa,
  4888. mmu->prev_roots[i].pgd,
  4889. operand.eptp))
  4890. roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
  4891. }
  4892. break;
  4893. case VMX_EPT_EXTENT_GLOBAL:
  4894. roots_to_free = KVM_MMU_ROOTS_ALL;
  4895. break;
  4896. default:
  4897. BUG();
  4898. break;
  4899. }
  4900. if (roots_to_free)
  4901. kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
  4902. return nested_vmx_succeed(vcpu);
  4903. }
  4904. static int handle_invvpid(struct kvm_vcpu *vcpu)
  4905. {
  4906. struct vcpu_vmx *vmx = to_vmx(vcpu);
  4907. u32 vmx_instruction_info;
  4908. unsigned long type, types;
  4909. gva_t gva;
  4910. struct x86_exception e;
  4911. struct {
  4912. u64 vpid;
  4913. u64 gla;
  4914. } operand;
  4915. u16 vpid02;
  4916. int r, gpr_index;
  4917. if (!(vmx->nested.msrs.secondary_ctls_high &
  4918. SECONDARY_EXEC_ENABLE_VPID) ||
  4919. !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
  4920. kvm_queue_exception(vcpu, UD_VECTOR);
  4921. return 1;
  4922. }
  4923. if (!nested_vmx_check_permission(vcpu))
  4924. return 1;
  4925. vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
  4926. gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
  4927. type = kvm_register_read(vcpu, gpr_index);
  4928. types = (vmx->nested.msrs.vpid_caps &
  4929. VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
  4930. if (type >= 32 || !(types & (1 << type)))
  4931. return nested_vmx_fail(vcpu,
  4932. VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
4933. /* According to the Intel VMX instruction reference, the memory
  4934. * operand is read even if it isn't needed (e.g., for type==global)
  4935. */
  4936. if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
  4937. vmx_instruction_info, false, sizeof(operand), &gva))
  4938. return 1;
  4939. r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
  4940. if (r != X86EMUL_CONTINUE)
  4941. return kvm_handle_memory_failure(vcpu, r, &e);
  4942. if (operand.vpid >> 16)
  4943. return nested_vmx_fail(vcpu,
  4944. VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
  4945. vpid02 = nested_get_vpid02(vcpu);
  4946. switch (type) {
  4947. case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
  4948. if (!operand.vpid ||
  4949. is_noncanonical_address(operand.gla, vcpu))
  4950. return nested_vmx_fail(vcpu,
  4951. VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
  4952. vpid_sync_vcpu_addr(vpid02, operand.gla);
  4953. break;
  4954. case VMX_VPID_EXTENT_SINGLE_CONTEXT:
  4955. case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
  4956. if (!operand.vpid)
  4957. return nested_vmx_fail(vcpu,
  4958. VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
  4959. vpid_sync_context(vpid02);
  4960. break;
  4961. case VMX_VPID_EXTENT_ALL_CONTEXT:
  4962. vpid_sync_context(vpid02);
  4963. break;
  4964. default:
  4965. WARN_ON_ONCE(1);
  4966. return kvm_skip_emulated_instruction(vcpu);
  4967. }
  4968. /*
  4969. * Sync the shadow page tables if EPT is disabled, L1 is invalidating
  4970. * linear mappings for L2 (tagged with L2's VPID). Free all guest
  4971. * roots as VPIDs are not tracked in the MMU role.
  4972. *
  4973. * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share
  4974. * an MMU when EPT is disabled.
  4975. *
4976. * TODO: sync only the affected SPTEs for INDIVIDUAL_ADDR.
  4977. */
  4978. if (!enable_ept)
  4979. kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu);
  4980. return nested_vmx_succeed(vcpu);
  4981. }
  4982. static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
  4983. struct vmcs12 *vmcs12)
  4984. {
  4985. u32 index = kvm_rcx_read(vcpu);
  4986. u64 new_eptp;
  4987. if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12)))
  4988. return 1;
  4989. if (index >= VMFUNC_EPTP_ENTRIES)
  4990. return 1;
  4991. if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
  4992. &new_eptp, index * 8, 8))
  4993. return 1;
  4994. /*
  4995. * If the (L2) guest does a vmfunc to the currently
  4996. * active ept pointer, we don't have to do anything else
  4997. */
  4998. if (vmcs12->ept_pointer != new_eptp) {
  4999. if (!nested_vmx_check_eptp(vcpu, new_eptp))
  5000. return 1;
  5001. vmcs12->ept_pointer = new_eptp;
  5002. nested_ept_new_eptp(vcpu);
  5003. if (!nested_cpu_has_vpid(vmcs12))
  5004. kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
  5005. }
  5006. return 0;
  5007. }
  5008. static int handle_vmfunc(struct kvm_vcpu *vcpu)
  5009. {
  5010. struct vcpu_vmx *vmx = to_vmx(vcpu);
  5011. struct vmcs12 *vmcs12;
  5012. u32 function = kvm_rax_read(vcpu);
  5013. /*
  5014. * VMFUNC is only supported for nested guests, but we always enable the
  5015. * secondary control for simplicity; for non-nested mode, fake that we
  5016. * didn't by injecting #UD.
  5017. */
  5018. if (!is_guest_mode(vcpu)) {
  5019. kvm_queue_exception(vcpu, UD_VECTOR);
  5020. return 1;
  5021. }
  5022. vmcs12 = get_vmcs12(vcpu);
  5023. /*
  5024. * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC
  5025. * is enabled in vmcs02 if and only if it's enabled in vmcs12.
  5026. */
  5027. if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) {
  5028. kvm_queue_exception(vcpu, UD_VECTOR);
  5029. return 1;
  5030. }
  5031. if (!(vmcs12->vm_function_control & BIT_ULL(function)))
  5032. goto fail;
  5033. switch (function) {
  5034. case 0:
  5035. if (nested_vmx_eptp_switching(vcpu, vmcs12))
  5036. goto fail;
  5037. break;
  5038. default:
  5039. goto fail;
  5040. }
  5041. return kvm_skip_emulated_instruction(vcpu);
  5042. fail:
  5043. /*
  5044. * This is effectively a reflected VM-Exit, as opposed to a synthesized
  5045. * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode
  5046. * EXIT_REASON_VMFUNC as the exit reason.
  5047. */
  5048. nested_vmx_vmexit(vcpu, vmx->exit_reason.full,
  5049. vmx_get_intr_info(vcpu),
  5050. vmx_get_exit_qual(vcpu));
  5051. return 1;
  5052. }
  5053. /*
  5054. * Return true if an IO instruction with the specified port and size should cause
  5055. * a VM-exit into L1.
  5056. */
  5057. bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
  5058. int size)
  5059. {
  5060. struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
  5061. gpa_t bitmap, last_bitmap;
  5062. u8 b;
  5063. last_bitmap = INVALID_GPA;
  5064. b = -1;
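/*
 * Walk the range port by port: e.g. a 2-byte access starting at port
 * 0x7fff consults the last bit of io_bitmap_a (port 0x7fff) and the
 * first bit of io_bitmap_b (port 0x8000), so a single bit test would
 * not suffice.
 */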
  5065. while (size > 0) {
  5066. if (port < 0x8000)
  5067. bitmap = vmcs12->io_bitmap_a;
  5068. else if (port < 0x10000)
  5069. bitmap = vmcs12->io_bitmap_b;
  5070. else
  5071. return true;
  5072. bitmap += (port & 0x7fff) / 8;
  5073. if (last_bitmap != bitmap)
  5074. if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
  5075. return true;
  5076. if (b & (1 << (port & 7)))
  5077. return true;
  5078. port++;
  5079. size--;
  5080. last_bitmap = bitmap;
  5081. }
  5082. return false;
  5083. }
  5084. static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
  5085. struct vmcs12 *vmcs12)
  5086. {
  5087. unsigned long exit_qualification;
  5088. unsigned short port;
  5089. int size;
  5090. if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
  5091. return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
  5092. exit_qualification = vmx_get_exit_qual(vcpu);
  5093. port = exit_qualification >> 16;
  5094. size = (exit_qualification & 7) + 1;
  5095. return nested_vmx_check_io_bitmaps(vcpu, port, size);
  5096. }
  5097. /*
5098. * Return true if we should exit from L2 to L1 to handle an MSR access,
5099. * rather than handle it ourselves in L0, i.e. check whether L1 asked to
5100. * intercept the current event (a read or write of a specific MSR) via its
  5101. * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
  5102. */
  5103. static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
  5104. struct vmcs12 *vmcs12,
  5105. union vmx_exit_reason exit_reason)
  5106. {
  5107. u32 msr_index = kvm_rcx_read(vcpu);
  5108. gpa_t bitmap;
  5109. if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
  5110. return true;
  5111. /*
  5112. * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
  5113. * for the four combinations of read/write and low/high MSR numbers.
  5114. * First we need to figure out which of the four to use:
  5115. */
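/*
 * Layout of the 4 KiB bitmap page: bytes 0-1023 cover reads of MSRs
 * 0x00000000-0x00001fff, bytes 1024-2047 reads of 0xc0000000-0xc0001fff,
 * bytes 2048-3071 writes of the low range and bytes 3072-4095 writes of
 * the high range. E.g. a write of MSR 0xc0000081 tests bit 1 of byte
 * 2048 + 1024 + 0x81 / 8 = 3088.
 */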
  5116. bitmap = vmcs12->msr_bitmap;
  5117. if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
  5118. bitmap += 2048;
  5119. if (msr_index >= 0xc0000000) {
  5120. msr_index -= 0xc0000000;
  5121. bitmap += 1024;
  5122. }
  5123. /* Then read the msr_index'th bit from this bitmap: */
  5124. if (msr_index < 1024*8) {
  5125. unsigned char b;
  5126. if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
  5127. return true;
  5128. return 1 & (b >> (msr_index & 7));
  5129. } else
  5130. return true; /* let L1 handle the wrong parameter */
  5131. }
  5132. /*
5133. * Return true if we should exit from L2 to L1 to handle a CR access exit,
5134. * rather than handle it ourselves in L0, i.e. check whether L1 wanted to
  5135. * intercept (via guest_host_mask etc.) the current event.
  5136. */
  5137. static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
  5138. struct vmcs12 *vmcs12)
  5139. {
  5140. unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
  5141. int cr = exit_qualification & 15;
  5142. int reg;
  5143. unsigned long val;
  5144. switch ((exit_qualification >> 4) & 3) {
  5145. case 0: /* mov to cr */
  5146. reg = (exit_qualification >> 8) & 15;
  5147. val = kvm_register_read(vcpu, reg);
  5148. switch (cr) {
  5149. case 0:
  5150. if (vmcs12->cr0_guest_host_mask &
  5151. (val ^ vmcs12->cr0_read_shadow))
  5152. return true;
  5153. break;
  5154. case 3:
  5155. if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
  5156. return true;
  5157. break;
  5158. case 4:
  5159. if (vmcs12->cr4_guest_host_mask &
  5160. (vmcs12->cr4_read_shadow ^ val))
  5161. return true;
  5162. break;
  5163. case 8:
  5164. if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
  5165. return true;
  5166. break;
  5167. }
  5168. break;
  5169. case 2: /* clts */
  5170. if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
  5171. (vmcs12->cr0_read_shadow & X86_CR0_TS))
  5172. return true;
  5173. break;
  5174. case 1: /* mov from cr */
  5175. switch (cr) {
  5176. case 3:
  5177. if (vmcs12->cpu_based_vm_exec_control &
  5178. CPU_BASED_CR3_STORE_EXITING)
  5179. return true;
  5180. break;
  5181. case 8:
  5182. if (vmcs12->cpu_based_vm_exec_control &
  5183. CPU_BASED_CR8_STORE_EXITING)
  5184. return true;
  5185. break;
  5186. }
  5187. break;
  5188. case 3: /* lmsw */
  5189. /*
5190. * lmsw can change bits 1..3 of cr0 (MP/EM/TS) and can only set, never
5191. * clear, bit 0 (PE). Other attempted changes are ignored, with no exit.
  5192. */
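/*
 * E.g. with cr0_guest_host_mask == X86_CR0_TS and TS clear in the
 * read shadow, an lmsw whose source operand sets bit 3 (TS) must exit
 * to L1; newly setting PE only matters when bit 0 is masked and PE is
 * clear in the read shadow, per the second check below.
 */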
  5193. val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
  5194. if (vmcs12->cr0_guest_host_mask & 0xe &
  5195. (val ^ vmcs12->cr0_read_shadow))
  5196. return true;
  5197. if ((vmcs12->cr0_guest_host_mask & 0x1) &&
  5198. !(vmcs12->cr0_read_shadow & 0x1) &&
  5199. (val & 0x1))
  5200. return true;
  5201. break;
  5202. }
  5203. return false;
  5204. }
  5205. static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu,
  5206. struct vmcs12 *vmcs12)
  5207. {
  5208. u32 encls_leaf;
  5209. if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) ||
  5210. !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING))
  5211. return false;
  5212. encls_leaf = kvm_rax_read(vcpu);
  5213. if (encls_leaf > 62)
  5214. encls_leaf = 63;
  5215. return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf);
  5216. }
  5217. static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
  5218. struct vmcs12 *vmcs12, gpa_t bitmap)
  5219. {
  5220. u32 vmx_instruction_info;
  5221. unsigned long field;
  5222. u8 b;
  5223. if (!nested_cpu_has_shadow_vmcs(vmcs12))
  5224. return true;
  5225. /* Decode instruction info and find the field to access */
  5226. vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
  5227. field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
  5228. /* Out-of-range fields always cause a VM exit from L2 to L1 */
  5229. if (field >> 15)
  5230. return true;
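/*
 * The bitmap holds one bit per field encoding, indexed by the low 15
 * bits: e.g. GUEST_RIP (encoding 0x681e) maps to bit 6 of byte 0xd03
 * of the supplied bitmap.
 */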
  5231. if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
  5232. return true;
  5233. return 1 & (b >> (field & 7));
  5234. }
  5235. static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12)
  5236. {
  5237. u32 entry_intr_info = vmcs12->vm_entry_intr_info_field;
  5238. if (nested_cpu_has_mtf(vmcs12))
  5239. return true;
  5240. /*
  5241. * An MTF VM-exit may be injected into the guest by setting the
  5242. * interruption-type to 7 (other event) and the vector field to 0. Such
  5243. * is the case regardless of the 'monitor trap flag' VM-execution
  5244. * control.
  5245. */
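/* I.e. entry_intr_info == 0x80000700: valid (bit 31), type 7 (other event), vector 0. */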
  5246. return entry_intr_info == (INTR_INFO_VALID_MASK
  5247. | INTR_TYPE_OTHER_EVENT);
  5248. }
  5249. /*
  5250. * Return true if L0 wants to handle an exit from L2 regardless of whether or not
  5251. * L1 wants the exit. Only call this when in is_guest_mode (L2).
  5252. */
  5253. static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
  5254. union vmx_exit_reason exit_reason)
  5255. {
  5256. u32 intr_info;
  5257. switch ((u16)exit_reason.basic) {
  5258. case EXIT_REASON_EXCEPTION_NMI:
  5259. intr_info = vmx_get_intr_info(vcpu);
  5260. if (is_nmi(intr_info))
  5261. return true;
  5262. else if (is_page_fault(intr_info))
  5263. return vcpu->arch.apf.host_apf_flags ||
  5264. vmx_need_pf_intercept(vcpu);
  5265. else if (is_debug(intr_info) &&
  5266. vcpu->guest_debug &
  5267. (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
  5268. return true;
  5269. else if (is_breakpoint(intr_info) &&
  5270. vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
  5271. return true;
  5272. else if (is_alignment_check(intr_info) &&
  5273. !vmx_guest_inject_ac(vcpu))
  5274. return true;
  5275. return false;
  5276. case EXIT_REASON_EXTERNAL_INTERRUPT:
  5277. return true;
  5278. case EXIT_REASON_MCE_DURING_VMENTRY:
  5279. return true;
  5280. case EXIT_REASON_EPT_VIOLATION:
  5281. /*
  5282. * L0 always deals with the EPT violation. If nested EPT is
  5283. * used, and the nested mmu code discovers that the address is
  5284. * missing in the guest EPT table (EPT12), the EPT violation
  5285. * will be injected with nested_ept_inject_page_fault()
  5286. */
  5287. return true;
  5288. case EXIT_REASON_EPT_MISCONFIG:
  5289. /*
5290. * L2 never directly uses L1's EPT, but rather L0's own EPT
  5291. * table (shadow on EPT) or a merged EPT table that L0 built
  5292. * (EPT on EPT). So any problems with the structure of the
  5293. * table is L0's fault.
  5294. */
  5295. return true;
  5296. case EXIT_REASON_PREEMPTION_TIMER:
  5297. return true;
  5298. case EXIT_REASON_PML_FULL:
  5299. /*
  5300. * PML is emulated for an L1 VMM and should never be enabled in
  5301. * vmcs02, always "handle" PML_FULL by exiting to userspace.
  5302. */
  5303. return true;
  5304. case EXIT_REASON_VMFUNC:
  5305. /* VM functions are emulated through L2->L0 vmexits. */
  5306. return true;
  5307. case EXIT_REASON_BUS_LOCK:
  5308. /*
  5309. * At present, bus lock VM exit is never exposed to L1.
  5310. * Handle L2's bus locks in L0 directly.
  5311. */
  5312. return true;
  5313. default:
  5314. break;
  5315. }
  5316. return false;
  5317. }
  5318. /*
5319. * Return true if L1 wants to intercept an exit from L2. Only call this when in
  5320. * is_guest_mode (L2).
  5321. */
  5322. static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
  5323. union vmx_exit_reason exit_reason)
  5324. {
  5325. struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
  5326. u32 intr_info;
  5327. switch ((u16)exit_reason.basic) {
  5328. case EXIT_REASON_EXCEPTION_NMI:
  5329. intr_info = vmx_get_intr_info(vcpu);
  5330. if (is_nmi(intr_info))
  5331. return true;
  5332. else if (is_page_fault(intr_info))
  5333. return true;
  5334. return vmcs12->exception_bitmap &
  5335. (1u << (intr_info & INTR_INFO_VECTOR_MASK));
  5336. case EXIT_REASON_EXTERNAL_INTERRUPT:
  5337. return nested_exit_on_intr(vcpu);
  5338. case EXIT_REASON_TRIPLE_FAULT:
  5339. return true;
  5340. case EXIT_REASON_INTERRUPT_WINDOW:
  5341. return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING);
  5342. case EXIT_REASON_NMI_WINDOW:
  5343. return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING);
  5344. case EXIT_REASON_TASK_SWITCH:
  5345. return true;
  5346. case EXIT_REASON_CPUID:
  5347. return true;
  5348. case EXIT_REASON_HLT:
  5349. return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
  5350. case EXIT_REASON_INVD:
  5351. return true;
  5352. case EXIT_REASON_INVLPG:
  5353. return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
  5354. case EXIT_REASON_RDPMC:
  5355. return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
  5356. case EXIT_REASON_RDRAND:
  5357. return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
  5358. case EXIT_REASON_RDSEED:
  5359. return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
  5360. case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
  5361. return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
  5362. case EXIT_REASON_VMREAD:
  5363. return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
  5364. vmcs12->vmread_bitmap);
  5365. case EXIT_REASON_VMWRITE:
  5366. return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
  5367. vmcs12->vmwrite_bitmap);
  5368. case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
  5369. case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
  5370. case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
  5371. case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
  5372. case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
  5373. /*
  5374. * VMX instructions trap unconditionally. This allows L1 to
  5375. * emulate them for its L2 guest, i.e., allows 3-level nesting!
  5376. */
  5377. return true;
  5378. case EXIT_REASON_CR_ACCESS:
  5379. return nested_vmx_exit_handled_cr(vcpu, vmcs12);
  5380. case EXIT_REASON_DR_ACCESS:
  5381. return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
  5382. case EXIT_REASON_IO_INSTRUCTION:
  5383. return nested_vmx_exit_handled_io(vcpu, vmcs12);
  5384. case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
  5385. return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
  5386. case EXIT_REASON_MSR_READ:
  5387. case EXIT_REASON_MSR_WRITE:
  5388. return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
  5389. case EXIT_REASON_INVALID_STATE:
  5390. return true;
  5391. case EXIT_REASON_MWAIT_INSTRUCTION:
  5392. return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
  5393. case EXIT_REASON_MONITOR_TRAP_FLAG:
  5394. return nested_vmx_exit_handled_mtf(vmcs12);
  5395. case EXIT_REASON_MONITOR_INSTRUCTION:
  5396. return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
  5397. case EXIT_REASON_PAUSE_INSTRUCTION:
  5398. return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
  5399. nested_cpu_has2(vmcs12,
  5400. SECONDARY_EXEC_PAUSE_LOOP_EXITING);
  5401. case EXIT_REASON_MCE_DURING_VMENTRY:
  5402. return true;
  5403. case EXIT_REASON_TPR_BELOW_THRESHOLD:
  5404. return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
  5405. case EXIT_REASON_APIC_ACCESS:
  5406. case EXIT_REASON_APIC_WRITE:
  5407. case EXIT_REASON_EOI_INDUCED:
  5408. /*
  5409. * The controls for "virtualize APIC accesses," "APIC-
  5410. * register virtualization," and "virtual-interrupt
  5411. * delivery" only come from vmcs12.
  5412. */
  5413. return true;
  5414. case EXIT_REASON_INVPCID:
  5415. return
  5416. nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
  5417. nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
  5418. case EXIT_REASON_WBINVD:
  5419. return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
  5420. case EXIT_REASON_XSETBV:
  5421. return true;
  5422. case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
  5423. /*
  5424. * This should never happen, since it is not possible to
5425. * set XSS to a non-zero value, neither in L1 nor in L2.
5426. * If it were possible, XSS would have to be checked against
  5427. * the XSS exit bitmap in vmcs12.
  5428. */
  5429. return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
  5430. case EXIT_REASON_UMWAIT:
  5431. case EXIT_REASON_TPAUSE:
  5432. return nested_cpu_has2(vmcs12,
  5433. SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
  5434. case EXIT_REASON_ENCLS:
  5435. return nested_vmx_exit_handled_encls(vcpu, vmcs12);
  5436. case EXIT_REASON_NOTIFY:
  5437. /* Notify VM exit is not exposed to L1 */
  5438. return false;
  5439. default:
  5440. return true;
  5441. }
  5442. }
  5443. /*
  5444. * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was
  5445. * reflected into L1.
  5446. */
  5447. bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
  5448. {
  5449. struct vcpu_vmx *vmx = to_vmx(vcpu);
  5450. union vmx_exit_reason exit_reason = vmx->exit_reason;
  5451. unsigned long exit_qual;
  5452. u32 exit_intr_info;
  5453. WARN_ON_ONCE(vmx->nested.nested_run_pending);
  5454. /*
  5455. * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM
  5456. * has already loaded L2's state.
  5457. */
  5458. if (unlikely(vmx->fail)) {
  5459. trace_kvm_nested_vmenter_failed(
  5460. "hardware VM-instruction error: ",
  5461. vmcs_read32(VM_INSTRUCTION_ERROR));
  5462. exit_intr_info = 0;
  5463. exit_qual = 0;
  5464. goto reflect_vmexit;
  5465. }
  5466. trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX);
  5467. /* If L0 (KVM) wants the exit, it trumps L1's desires. */
  5468. if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
  5469. return false;
  5470. /* If L1 doesn't want the exit, handle it in L0. */
  5471. if (!nested_vmx_l1_wants_exit(vcpu, exit_reason))
  5472. return false;
  5473. /*
  5474. * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For
  5475. * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would
  5476. * need to be synthesized by querying the in-kernel LAPIC, but external
  5477. * interrupts are never reflected to L1 so it's a non-issue.
  5478. */
  5479. exit_intr_info = vmx_get_intr_info(vcpu);
  5480. if (is_exception_with_error_code(exit_intr_info)) {
  5481. struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
  5482. vmcs12->vm_exit_intr_error_code =
  5483. vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
  5484. }
  5485. exit_qual = vmx_get_exit_qual(vcpu);
  5486. reflect_vmexit:
  5487. nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual);
  5488. return true;
  5489. }
  5490. static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
  5491. struct kvm_nested_state __user *user_kvm_nested_state,
  5492. u32 user_data_size)
  5493. {
  5494. struct vcpu_vmx *vmx;
  5495. struct vmcs12 *vmcs12;
  5496. struct kvm_nested_state kvm_state = {
  5497. .flags = 0,
  5498. .format = KVM_STATE_NESTED_FORMAT_VMX,
  5499. .size = sizeof(kvm_state),
  5500. .hdr.vmx.flags = 0,
  5501. .hdr.vmx.vmxon_pa = INVALID_GPA,
  5502. .hdr.vmx.vmcs12_pa = INVALID_GPA,
  5503. .hdr.vmx.preemption_timer_deadline = 0,
  5504. };
  5505. struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
  5506. &user_kvm_nested_state->data.vmx[0];
  5507. if (!vcpu)
  5508. return kvm_state.size + sizeof(*user_vmx_nested_state);
  5509. vmx = to_vmx(vcpu);
  5510. vmcs12 = get_vmcs12(vcpu);
  5511. if (nested_vmx_allowed(vcpu) &&
  5512. (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
  5513. kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
  5514. kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;
  5515. if (vmx_has_valid_vmcs12(vcpu)) {
  5516. kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);
  5517. /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */
  5518. if (vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
  5519. kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
  5520. if (is_guest_mode(vcpu) &&
  5521. nested_cpu_has_shadow_vmcs(vmcs12) &&
  5522. vmcs12->vmcs_link_pointer != INVALID_GPA)
  5523. kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
  5524. }
  5525. if (vmx->nested.smm.vmxon)
  5526. kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
  5527. if (vmx->nested.smm.guest_mode)
  5528. kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
  5529. if (is_guest_mode(vcpu)) {
  5530. kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
  5531. if (vmx->nested.nested_run_pending)
  5532. kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
  5533. if (vmx->nested.mtf_pending)
  5534. kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING;
  5535. if (nested_cpu_has_preemption_timer(vmcs12) &&
  5536. vmx->nested.has_preemption_timer_deadline) {
  5537. kvm_state.hdr.vmx.flags |=
  5538. KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE;
  5539. kvm_state.hdr.vmx.preemption_timer_deadline =
  5540. vmx->nested.preemption_timer_deadline;
  5541. }
  5542. }
  5543. }
  5544. if (user_data_size < kvm_state.size)
  5545. goto out;
  5546. if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
  5547. return -EFAULT;
  5548. if (!vmx_has_valid_vmcs12(vcpu))
  5549. goto out;
  5550. /*
  5551. * When running L2, the authoritative vmcs12 state is in the
  5552. * vmcs02. When running L1, the authoritative vmcs12 state is
  5553. * in the shadow or enlightened vmcs linked to vmcs01, unless
  5554. * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
  5555. * vmcs12 state is in the vmcs12 already.
  5556. */
  5557. if (is_guest_mode(vcpu)) {
  5558. sync_vmcs02_to_vmcs12(vcpu, vmcs12);
  5559. sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
  5560. } else {
  5561. copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
  5562. if (!vmx->nested.need_vmcs12_to_shadow_sync) {
  5563. if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
  5564. /*
  5565. * L1 hypervisor is not obliged to keep eVMCS
  5566. * clean fields data always up-to-date while
5567. * not in guest mode; 'hv_clean_fields' is only
5568. * guaranteed to be accurate at VM-entry, so we need
5569. * to ignore it here and do a full copy.
  5570. */
  5571. copy_enlightened_to_vmcs12(vmx, 0);
  5572. else if (enable_shadow_vmcs)
  5573. copy_shadow_to_vmcs12(vmx);
  5574. }
  5575. }
  5576. BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
  5577. BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);
  5578. /*
  5579. * Copy over the full allocated size of vmcs12 rather than just the size
  5580. * of the struct.
  5581. */
  5582. if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
  5583. return -EFAULT;
  5584. if (nested_cpu_has_shadow_vmcs(vmcs12) &&
  5585. vmcs12->vmcs_link_pointer != INVALID_GPA) {
  5586. if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
  5587. get_shadow_vmcs12(vcpu), VMCS12_SIZE))
  5588. return -EFAULT;
  5589. }
  5590. out:
  5591. return kvm_state.size;
  5592. }
  5593. void vmx_leave_nested(struct kvm_vcpu *vcpu)
  5594. {
  5595. if (is_guest_mode(vcpu)) {
  5596. to_vmx(vcpu)->nested.nested_run_pending = 0;
  5597. nested_vmx_vmexit(vcpu, -1, 0, 0);
  5598. }
  5599. free_nested(vcpu);
  5600. }
  5601. static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
  5602. struct kvm_nested_state __user *user_kvm_nested_state,
  5603. struct kvm_nested_state *kvm_state)
  5604. {
  5605. struct vcpu_vmx *vmx = to_vmx(vcpu);
  5606. struct vmcs12 *vmcs12;
  5607. enum vm_entry_failure_code ignored;
  5608. struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
  5609. &user_kvm_nested_state->data.vmx[0];
  5610. int ret;
  5611. if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
  5612. return -EINVAL;
  5613. if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) {
  5614. if (kvm_state->hdr.vmx.smm.flags)
  5615. return -EINVAL;
  5616. if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)
  5617. return -EINVAL;
  5618. /*
5619. * KVM_STATE_NESTED_EVMCS used to signal that KVM should
5620. * enable the eVMCS capability on the vCPU. However, the code
5621. * has since been changed such that the flag signals that vmcs12
5622. * should be copied into the eVMCS in guest memory.
5623. *
5624. * To preserve backwards compatibility, allow userspace
5625. * to set this flag even when there is no VMXON region.
  5626. */
  5627. if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
  5628. return -EINVAL;
  5629. } else {
  5630. if (!nested_vmx_allowed(vcpu))
  5631. return -EINVAL;
  5632. if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
  5633. return -EINVAL;
  5634. }
  5635. if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
  5636. (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
  5637. return -EINVAL;
  5638. if (kvm_state->hdr.vmx.smm.flags &
  5639. ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
  5640. return -EINVAL;
  5641. if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE)
  5642. return -EINVAL;
  5643. /*
  5644. * SMM temporarily disables VMX, so we cannot be in guest mode,
  5645. * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags
  5646. * must be zero.
  5647. */
  5648. if (is_smm(vcpu) ?
  5649. (kvm_state->flags &
  5650. (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
  5651. : kvm_state->hdr.vmx.smm.flags)
  5652. return -EINVAL;
  5653. if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
  5654. !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
  5655. return -EINVAL;
  5656. if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
  5657. (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled))
  5658. return -EINVAL;
  5659. vmx_leave_nested(vcpu);
  5660. if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA)
  5661. return 0;
  5662. vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
  5663. ret = enter_vmx_operation(vcpu);
  5664. if (ret)
  5665. return ret;
  5666. /* Empty 'VMXON' state is permitted if no VMCS loaded */
  5667. if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) {
  5668. /* See vmx_has_valid_vmcs12. */
  5669. if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) ||
  5670. (kvm_state->flags & KVM_STATE_NESTED_EVMCS) ||
  5671. (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA))
  5672. return -EINVAL;
  5673. else
  5674. return 0;
  5675. }
  5676. if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) {
  5677. if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
  5678. !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
  5679. return -EINVAL;
  5680. set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
  5681. } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
  5682. /*
  5683. * nested_vmx_handle_enlightened_vmptrld() cannot be called
  5684. * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be
  5685. * restored yet. EVMCS will be mapped from
  5686. * nested_get_vmcs12_pages().
  5687. */
  5688. vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING;
  5689. kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
  5690. } else {
  5691. return -EINVAL;
  5692. }
  5693. if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
  5694. vmx->nested.smm.vmxon = true;
  5695. vmx->nested.vmxon = false;
  5696. if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
  5697. vmx->nested.smm.guest_mode = true;
  5698. }
  5699. vmcs12 = get_vmcs12(vcpu);
  5700. if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
  5701. return -EFAULT;
  5702. if (vmcs12->hdr.revision_id != VMCS12_REVISION)
  5703. return -EINVAL;
  5704. if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
  5705. return 0;
  5706. vmx->nested.nested_run_pending =
  5707. !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
  5708. vmx->nested.mtf_pending =
  5709. !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);
  5710. ret = -EINVAL;
  5711. if (nested_cpu_has_shadow_vmcs(vmcs12) &&
  5712. vmcs12->vmcs_link_pointer != INVALID_GPA) {
  5713. struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
  5714. if (kvm_state->size <
  5715. sizeof(*kvm_state) +
  5716. sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
  5717. goto error_guest_mode;
  5718. if (copy_from_user(shadow_vmcs12,
  5719. user_vmx_nested_state->shadow_vmcs12,
  5720. sizeof(*shadow_vmcs12))) {
  5721. ret = -EFAULT;
  5722. goto error_guest_mode;
  5723. }
  5724. if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
  5725. !shadow_vmcs12->hdr.shadow_vmcs)
  5726. goto error_guest_mode;
  5727. }
  5728. vmx->nested.has_preemption_timer_deadline = false;
  5729. if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
  5730. vmx->nested.has_preemption_timer_deadline = true;
  5731. vmx->nested.preemption_timer_deadline =
  5732. kvm_state->hdr.vmx.preemption_timer_deadline;
  5733. }
  5734. if (nested_vmx_check_controls(vcpu, vmcs12) ||
  5735. nested_vmx_check_host_state(vcpu, vmcs12) ||
  5736. nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
  5737. goto error_guest_mode;
  5738. vmx->nested.dirty_vmcs12 = true;
  5739. vmx->nested.force_msr_bitmap_recalc = true;
  5740. ret = nested_vmx_enter_non_root_mode(vcpu, false);
  5741. if (ret)
  5742. goto error_guest_mode;
  5743. if (vmx->nested.mtf_pending)
  5744. kvm_make_request(KVM_REQ_EVENT, vcpu);
  5745. return 0;
  5746. error_guest_mode:
  5747. vmx->nested.nested_run_pending = 0;
  5748. return ret;
  5749. }
  5750. void nested_vmx_set_vmcs_shadowing_bitmap(void)
  5751. {
  5752. if (enable_shadow_vmcs) {
  5753. vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
  5754. vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
  5755. }
  5756. }
  5757. /*
  5758. * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo
  5759. * that madness to get the encoding for comparison.
  5760. */
  5761. #define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10)))
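/*
 * E.g. the field with encoding 0x2000 (IO_BITMAP_A) sits at table
 * index 0x8, and VMCS12_IDX_TO_ENC(0x8) == (0x8 >> 6) | (0x8 << 10) ==
 * 0x2000 recovers the architectural encoding.
 */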
  5762. static u64 nested_vmx_calc_vmcs_enum_msr(void)
  5763. {
  5764. /*
  5765. * Note these are the so called "index" of the VMCS field encoding, not
  5766. * the index into vmcs12.
  5767. */
  5768. unsigned int max_idx, idx;
  5769. int i;
  5770. /*
  5771. * For better or worse, KVM allows VMREAD/VMWRITE to all fields in
  5772. * vmcs12, regardless of whether or not the associated feature is
  5773. * exposed to L1. Simply find the field with the highest index.
  5774. */
  5775. max_idx = 0;
  5776. for (i = 0; i < nr_vmcs12_fields; i++) {
  5777. /* The vmcs12 table is very, very sparsely populated. */
  5778. if (!vmcs12_field_offsets[i])
  5779. continue;
  5780. idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i));
  5781. if (idx > max_idx)
  5782. max_idx = idx;
  5783. }
  5784. return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT;
  5785. }
  5786. /*
  5787. * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
  5788. * returned for the various VMX controls MSRs when nested VMX is enabled.
  5789. * The same values should also be used to verify that vmcs12 control fields are
  5790. * valid during nested entry from L1 to L2.
  5791. * Each of these control msrs has a low and high 32-bit half: A low bit is on
  5792. * if the corresponding bit in the (32-bit) control field *must* be on, and a
  5793. * bit in the high half is on if the corresponding bit in the control field
  5794. * may be on. See also vmx_control_verify().
  5795. */
  5796. void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
  5797. {
  5798. struct nested_vmx_msrs *msrs = &vmcs_conf->nested;
  5799. /*
  5800. * Note that as a general rule, the high half of the MSRs (bits in
  5801. * the control fields which may be 1) should be initialized by the
  5802. * intersection of the underlying hardware's MSR (i.e., features which
  5803. * can be supported) and the list of features we want to expose -
  5804. * because they are known to be properly supported in our code.
  5805. * Also, usually, the low half of the MSRs (bits which must be 1) can
  5806. * be set to 0, meaning that L1 may turn off any of these bits. The
5807. * reason is that if one of these bits is needed by L0, it will be set
5808. * in vmcs01, and prepare_vmcs02(), which bitwise-or's the control
5809. * fields of vmcs01 and vmcs12, will keep it set in vmcs02, and
5810. * nested_vmx_l1_wants_exit() will not pass the related exits to L1.
  5811. * These rules have exceptions below.
  5812. */
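/*
 * Concretely, a vmcs12 control value passes verification only if every
 * bit set in the low half is also set in the value and every bit set
 * in the value is also set in the high half: set in both halves means
 * mandatory, set only in the high half means optional, and clear in
 * both halves means the bit must stay clear.
 */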
  5813. /* pin-based controls */
  5814. msrs->pinbased_ctls_low =
  5815. PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
  5816. msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl;
  5817. msrs->pinbased_ctls_high &=
  5818. PIN_BASED_EXT_INTR_MASK |
  5819. PIN_BASED_NMI_EXITING |
  5820. PIN_BASED_VIRTUAL_NMIS |
  5821. (enable_apicv ? PIN_BASED_POSTED_INTR : 0);
  5822. msrs->pinbased_ctls_high |=
  5823. PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
  5824. PIN_BASED_VMX_PREEMPTION_TIMER;
  5825. /* exit controls */
  5826. msrs->exit_ctls_low =
  5827. VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
  5828. msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl;
  5829. msrs->exit_ctls_high &=
  5830. #ifdef CONFIG_X86_64
  5831. VM_EXIT_HOST_ADDR_SPACE_SIZE |
  5832. #endif
  5833. VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
  5834. VM_EXIT_CLEAR_BNDCFGS;
  5835. msrs->exit_ctls_high |=
  5836. VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
  5837. VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
  5838. VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT |
  5839. VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
  5840. /* We support free control of debug control saving. */
  5841. msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
  5842. /* entry controls */
  5843. msrs->entry_ctls_low =
  5844. VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
  5845. msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl;
  5846. msrs->entry_ctls_high &=
  5847. #ifdef CONFIG_X86_64
  5848. VM_ENTRY_IA32E_MODE |
  5849. #endif
  5850. VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
  5851. msrs->entry_ctls_high |=
  5852. (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER |
  5853. VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);
  5854. /* We support free control of debug control loading. */
  5855. msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
  5856. /* cpu-based controls */
  5857. msrs->procbased_ctls_low =
  5858. CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
  5859. msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl;
  5860. msrs->procbased_ctls_high &=
  5861. CPU_BASED_INTR_WINDOW_EXITING |
  5862. CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
  5863. CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
  5864. CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
  5865. CPU_BASED_CR3_STORE_EXITING |
  5866. #ifdef CONFIG_X86_64
  5867. CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
  5868. #endif
  5869. CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
  5870. CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
  5871. CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
  5872. CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
  5873. CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
  5874. /*
  5875. * We can allow some features even when not supported by the
  5876. * hardware. For example, L1 can specify an MSR bitmap - and we
  5877. * can use it to avoid exits to L1 - even when L0 runs L2
  5878. * without MSR bitmaps.
  5879. */
  5880. msrs->procbased_ctls_high |=
  5881. CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
  5882. CPU_BASED_USE_MSR_BITMAPS;
  5883. /* We support free control of CR3 access interception. */
  5884. msrs->procbased_ctls_low &=
  5885. ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
  5886. /*
  5887. * secondary cpu-based controls. Do not include those that
  5888. * depend on CPUID bits, they are added later by
  5889. * vmx_vcpu_after_set_cpuid.
  5890. */
  5891. msrs->secondary_ctls_low = 0;
  5892. msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl;
  5893. msrs->secondary_ctls_high &=
  5894. SECONDARY_EXEC_DESC |
  5895. SECONDARY_EXEC_ENABLE_RDTSCP |
  5896. SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
  5897. SECONDARY_EXEC_WBINVD_EXITING |
  5898. SECONDARY_EXEC_APIC_REGISTER_VIRT |
  5899. SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
  5900. SECONDARY_EXEC_RDRAND_EXITING |
  5901. SECONDARY_EXEC_ENABLE_INVPCID |
  5902. SECONDARY_EXEC_RDSEED_EXITING |
  5903. SECONDARY_EXEC_XSAVES |
  5904. SECONDARY_EXEC_TSC_SCALING |
  5905. SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
  5906. /*
  5907. * We can emulate "VMCS shadowing," even if the hardware
  5908. * doesn't support it.
  5909. */
  5910. msrs->secondary_ctls_high |=
  5911. SECONDARY_EXEC_SHADOW_VMCS;
  5912. if (enable_ept) {
  5913. /* nested EPT: emulate EPT also to L1 */
  5914. msrs->secondary_ctls_high |=
  5915. SECONDARY_EXEC_ENABLE_EPT;
  5916. msrs->ept_caps =
  5917. VMX_EPT_PAGE_WALK_4_BIT |
  5918. VMX_EPT_PAGE_WALK_5_BIT |
  5919. VMX_EPTP_WB_BIT |
  5920. VMX_EPT_INVEPT_BIT |
  5921. VMX_EPT_EXECUTE_ONLY_BIT;
  5922. msrs->ept_caps &= ept_caps;
  5923. msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
  5924. VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
  5925. VMX_EPT_1GB_PAGE_BIT;
  5926. if (enable_ept_ad_bits) {
  5927. msrs->secondary_ctls_high |=
  5928. SECONDARY_EXEC_ENABLE_PML;
  5929. msrs->ept_caps |= VMX_EPT_AD_BIT;
  5930. }
  5931. }
  5932. if (cpu_has_vmx_vmfunc()) {
  5933. msrs->secondary_ctls_high |=
  5934. SECONDARY_EXEC_ENABLE_VMFUNC;
  5935. /*
  5936. * Advertise EPTP switching unconditionally
  5937. * since we emulate it
  5938. */
  5939. if (enable_ept)
  5940. msrs->vmfunc_controls =
  5941. VMX_VMFUNC_EPTP_SWITCHING;
  5942. }
  5943. /*
  5944. * Old versions of KVM use the single-context version without
  5945. * checking for support, so declare that it is supported even
  5946. * though it is treated as global context. The alternative is
  5947. * not failing the single-context invvpid, and it is worse.
  5948. */
  5949. if (enable_vpid) {
  5950. msrs->secondary_ctls_high |=
  5951. SECONDARY_EXEC_ENABLE_VPID;
  5952. msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
  5953. VMX_VPID_EXTENT_SUPPORTED_MASK;
  5954. }
  5955. if (enable_unrestricted_guest)
  5956. msrs->secondary_ctls_high |=
  5957. SECONDARY_EXEC_UNRESTRICTED_GUEST;
  5958. if (flexpriority_enabled)
  5959. msrs->secondary_ctls_high |=
  5960. SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
  5961. if (enable_sgx)
  5962. msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
  5963. /* miscellaneous data */
  5964. msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA;
  5965. msrs->misc_low |=
  5966. MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
  5967. VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
  5968. VMX_MISC_ACTIVITY_HLT |
  5969. VMX_MISC_ACTIVITY_WAIT_SIPI;
  5970. msrs->misc_high = 0;
  5971. /*
  5972. * This MSR reports some information about VMX support. We
  5973. * should return information about the VMX we emulate for the
  5974. * guest, and the VMCS structure we give it - not about the
  5975. * VMX support of the underlying hardware.
  5976. */
  5977. msrs->basic =
  5978. VMCS12_REVISION |
  5979. VMX_BASIC_TRUE_CTLS |
  5980. ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
  5981. (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
  5982. if (cpu_has_vmx_basic_inout())
  5983. msrs->basic |= VMX_BASIC_INOUT;
  5984. /*
  5985. * These MSRs specify bits which the guest must keep fixed on
  5986. * while L1 is in VMXON mode (in L1's root mode, or running an L2).
  5987. * We picked the standard core2 setting.
  5988. */
  5989. #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
  5990. #define VMXON_CR4_ALWAYSON X86_CR4_VMXE
  5991. msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
  5992. msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
  5993. /* These MSRs specify bits which the guest must keep fixed off. */
  5994. rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
  5995. rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
  5996. if (vmx_umip_emulated())
  5997. msrs->cr4_fixed1 |= X86_CR4_UMIP;
  5998. msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr();
  5999. }
  6000. void nested_vmx_hardware_unsetup(void)
  6001. {
  6002. int i;
  6003. if (enable_shadow_vmcs) {
  6004. for (i = 0; i < VMX_BITMAP_NR; i++)
  6005. free_page((unsigned long)vmx_bitmap[i]);
  6006. }
  6007. }
  6008. __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
  6009. {
  6010. int i;
  6011. if (!cpu_has_vmx_shadow_vmcs())
  6012. enable_shadow_vmcs = 0;
  6013. if (enable_shadow_vmcs) {
  6014. for (i = 0; i < VMX_BITMAP_NR; i++) {
  6015. /*
  6016. * The vmx_bitmap is not tied to a VM and so should
  6017. * not be charged to a memcg.
  6018. */
  6019. vmx_bitmap[i] = (unsigned long *)
  6020. __get_free_page(GFP_KERNEL);
  6021. if (!vmx_bitmap[i]) {
  6022. nested_vmx_hardware_unsetup();
  6023. return -ENOMEM;
  6024. }
  6025. }
  6026. init_vmcs_shadow_fields();
  6027. }
  6028. exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear;
  6029. exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch;
  6030. exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld;
  6031. exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst;
  6032. exit_handlers[EXIT_REASON_VMREAD] = handle_vmread;
  6033. exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume;
  6034. exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite;
  6035. exit_handlers[EXIT_REASON_VMOFF] = handle_vmxoff;
  6036. exit_handlers[EXIT_REASON_VMON] = handle_vmxon;
  6037. exit_handlers[EXIT_REASON_INVEPT] = handle_invept;
  6038. exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid;
  6039. exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc;
  6040. return 0;
  6041. }
  6042. struct kvm_x86_nested_ops vmx_nested_ops = {
  6043. .leave_nested = vmx_leave_nested,
  6044. .is_exception_vmexit = nested_vmx_is_exception_vmexit,
  6045. .check_events = vmx_check_nested_events,
  6046. .has_events = vmx_has_nested_events,
  6047. .triple_fault = nested_vmx_triple_fault,
  6048. .get_state = vmx_get_nested_state,
  6049. .set_state = vmx_set_nested_state,
  6050. .get_nested_state_pages = vmx_get_nested_state_pages,
  6051. .write_log_dirty = nested_vmx_write_pml_buffer,
  6052. .enable_evmcs = nested_enable_evmcs,
  6053. .get_evmcs_version = nested_get_evmcs_version,
  6054. };