
  1. // SPDX-License-Identifier: GPL-2.0-or-later
  2. /*
  3. * pSeries_lpar.c
  4. * Copyright (C) 2001 Todd Inglett, IBM Corporation
  5. *
  6. * pSeries LPAR support.
  7. */
  8. /* Enables debugging of low-level hash table routines - careful! */
  9. #undef DEBUG
  10. #define pr_fmt(fmt) "lpar: " fmt
  11. #include <linux/kernel.h>
  12. #include <linux/dma-mapping.h>
  13. #include <linux/console.h>
  14. #include <linux/export.h>
  15. #include <linux/jump_label.h>
  16. #include <linux/delay.h>
  17. #include <linux/stop_machine.h>
  18. #include <linux/spinlock.h>
  19. #include <linux/cpuhotplug.h>
  20. #include <linux/workqueue.h>
  21. #include <linux/proc_fs.h>
  22. #include <linux/pgtable.h>
  23. #include <linux/debugfs.h>
  24. #include <asm/processor.h>
  25. #include <asm/mmu.h>
  26. #include <asm/page.h>
  27. #include <asm/setup.h>
  28. #include <asm/mmu_context.h>
  29. #include <asm/iommu.h>
  30. #include <asm/tlb.h>
  31. #include <asm/cputable.h>
  32. #include <asm/udbg.h>
  33. #include <asm/smp.h>
  34. #include <asm/trace.h>
  35. #include <asm/firmware.h>
  36. #include <asm/plpar_wrappers.h>
  37. #include <asm/kexec.h>
  38. #include <asm/fadump.h>
  39. #include <asm/dtl.h>
  40. #include "pseries.h"
  41. /* Flag bits for H_BULK_REMOVE */
  42. #define HBR_REQUEST 0x4000000000000000UL
  43. #define HBR_RESPONSE 0x8000000000000000UL
  44. #define HBR_END 0xc000000000000000UL
  45. #define HBR_AVPN 0x0200000000000000UL
  46. #define HBR_ANDCOND 0x0100000000000000UL
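/*
 * Illustrative use of these flag bits, matching how the H_BULK_REMOVE
 * callers later in this file build each two-word parameter pair (a sketch
 * of existing usage, not additional API):
 *
 *   param[pix]     = HBR_REQUEST | HBR_AVPN | slot;
 *   param[pix + 1] = hpte_encode_avpn(vpn, psize, ssize);
 *
 * When fewer than the maximum number of pairs is passed, the entry after
 * the last pair is set to HBR_END to terminate the list.
 */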
  47. /* in hvCall.S */
  48. EXPORT_SYMBOL(plpar_hcall);
  49. EXPORT_SYMBOL(plpar_hcall9);
  50. EXPORT_SYMBOL(plpar_hcall_norets);
  51. #ifdef CONFIG_PPC_64S_HASH_MMU
  52. /*
  53. * H_BLOCK_REMOVE supported block size for this page size in a segment whose base
  54. * page size is that page size.
  55. *
  56. * The first index is the segment base page size, the second one is the actual
  57. * page size.
  58. */
  59. static int hblkrm_size[MMU_PAGE_COUNT][MMU_PAGE_COUNT] __ro_after_init;
  60. #endif
  61. /*
  62. * Due to the complexity involved, and because the current hypervisor only
  63. * returns this value or 0, we limit the supported H_BLOCK_REMOVE buffer
  64. * size to blocks of 8 entries.
  65. */
  66. #define HBLKRM_SUPPORTED_BLOCK_SIZE 8
  67. #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
  68. static u8 dtl_mask = DTL_LOG_PREEMPT;
  69. #else
  70. static u8 dtl_mask;
  71. #endif
  72. void alloc_dtl_buffers(unsigned long *time_limit)
  73. {
  74. int cpu;
  75. struct paca_struct *pp;
  76. struct dtl_entry *dtl;
  77. for_each_possible_cpu(cpu) {
  78. pp = paca_ptrs[cpu];
  79. if (pp->dispatch_log)
  80. continue;
  81. dtl = kmem_cache_alloc(dtl_cache, GFP_KERNEL);
  82. if (!dtl) {
  83. pr_warn("Failed to allocate dispatch trace log for cpu %d\n",
  84. cpu);
  85. #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
  86. pr_warn("Stolen time statistics will be unreliable\n");
  87. #endif
  88. break;
  89. }
  90. pp->dtl_ridx = 0;
  91. pp->dispatch_log = dtl;
  92. pp->dispatch_log_end = dtl + N_DISPATCH_LOG;
  93. pp->dtl_curr = dtl;
  94. if (time_limit && time_after(jiffies, *time_limit)) {
  95. cond_resched();
  96. *time_limit = jiffies + HZ;
  97. }
  98. }
  99. }
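/*
 * Each buffer allocated above holds N_DISPATCH_LOG entries; the optional
 * time_limit lets long walks over all possible CPUs periodically yield
 * via cond_resched() rather than running unbounded.
 */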
  100. void register_dtl_buffer(int cpu)
  101. {
  102. long ret;
  103. struct paca_struct *pp;
  104. struct dtl_entry *dtl;
  105. int hwcpu = get_hard_smp_processor_id(cpu);
  106. pp = paca_ptrs[cpu];
  107. dtl = pp->dispatch_log;
  108. if (dtl && dtl_mask) {
  109. pp->dtl_ridx = 0;
  110. pp->dtl_curr = dtl;
  111. lppaca_of(cpu).dtl_idx = 0;
  112. /* hypervisor reads buffer length from this field */
  113. dtl->enqueue_to_dispatch_time = cpu_to_be32(DISPATCH_LOG_BYTES);
  114. ret = register_dtl(hwcpu, __pa(dtl));
  115. if (ret)
  116. pr_err("WARNING: DTL registration of cpu %d (hw %d) failed with %ld\n",
  117. cpu, hwcpu, ret);
  118. lppaca_of(cpu).dtl_enable_mask = dtl_mask;
  119. }
  120. }
  121. #ifdef CONFIG_PPC_SPLPAR
  122. struct dtl_worker {
  123. struct delayed_work work;
  124. int cpu;
  125. };
  126. struct vcpu_dispatch_data {
  127. int last_disp_cpu;
  128. int total_disp;
  129. int same_cpu_disp;
  130. int same_chip_disp;
  131. int diff_chip_disp;
  132. int far_chip_disp;
  133. int numa_home_disp;
  134. int numa_remote_disp;
  135. int numa_far_disp;
  136. };
  137. /*
  138. * This represents the number of cpus in the hypervisor. Since there is no
  139. * architected way to discover the number of processors in the host, we
  140. * provision for dealing with NR_CPUS. This is currently 2048 by default, and
  141. * is sufficient for our purposes. This will need to be tweaked if
  142. * CONFIG_NR_CPUS is changed.
  143. */
  144. #define NR_CPUS_H NR_CPUS
  145. DEFINE_RWLOCK(dtl_access_lock);
  146. static DEFINE_PER_CPU(struct vcpu_dispatch_data, vcpu_disp_data);
  147. static DEFINE_PER_CPU(u64, dtl_entry_ridx);
  148. static DEFINE_PER_CPU(struct dtl_worker, dtl_workers);
  149. static enum cpuhp_state dtl_worker_state;
  150. static DEFINE_MUTEX(dtl_enable_mutex);
  151. static int vcpudispatch_stats_on __read_mostly;
  152. static int vcpudispatch_stats_freq = 50;
  153. static __be32 *vcpu_associativity, *pcpu_associativity;
  154. static void free_dtl_buffers(unsigned long *time_limit)
  155. {
  156. #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
  157. int cpu;
  158. struct paca_struct *pp;
  159. for_each_possible_cpu(cpu) {
  160. pp = paca_ptrs[cpu];
  161. if (!pp->dispatch_log)
  162. continue;
  163. kmem_cache_free(dtl_cache, pp->dispatch_log);
  164. pp->dtl_ridx = 0;
  165. pp->dispatch_log = 0;
  166. pp->dispatch_log_end = 0;
  167. pp->dtl_curr = 0;
  168. if (time_limit && time_after(jiffies, *time_limit)) {
  169. cond_resched();
  170. *time_limit = jiffies + HZ;
  171. }
  172. }
  173. #endif
  174. }
  175. static int init_cpu_associativity(void)
  176. {
  177. vcpu_associativity = kcalloc(num_possible_cpus() / threads_per_core,
  178. VPHN_ASSOC_BUFSIZE * sizeof(__be32), GFP_KERNEL);
  179. pcpu_associativity = kcalloc(NR_CPUS_H / threads_per_core,
  180. VPHN_ASSOC_BUFSIZE * sizeof(__be32), GFP_KERNEL);
  181. if (!vcpu_associativity || !pcpu_associativity) {
  182. pr_err("error allocating memory for associativity information\n");
  183. return -ENOMEM;
  184. }
  185. return 0;
  186. }
  187. static void destroy_cpu_associativity(void)
  188. {
  189. kfree(vcpu_associativity);
  190. kfree(pcpu_associativity);
  191. vcpu_associativity = pcpu_associativity = 0;
  192. }
  193. static __be32 *__get_cpu_associativity(int cpu, __be32 *cpu_assoc, int flag)
  194. {
  195. __be32 *assoc;
  196. int rc = 0;
  197. assoc = &cpu_assoc[(int)(cpu / threads_per_core) * VPHN_ASSOC_BUFSIZE];
  198. if (!assoc[0]) {
  199. rc = hcall_vphn(cpu, flag, &assoc[0]);
  200. if (rc)
  201. return NULL;
  202. }
  203. return assoc;
  204. }
  205. static __be32 *get_pcpu_associativity(int cpu)
  206. {
  207. return __get_cpu_associativity(cpu, pcpu_associativity, VPHN_FLAG_PCPU);
  208. }
  209. static __be32 *get_vcpu_associativity(int cpu)
  210. {
  211. return __get_cpu_associativity(cpu, vcpu_associativity, VPHN_FLAG_VCPU);
  212. }
  213. static int cpu_relative_dispatch_distance(int last_disp_cpu, int cur_disp_cpu)
  214. {
  215. __be32 *last_disp_cpu_assoc, *cur_disp_cpu_assoc;
  216. if (last_disp_cpu >= NR_CPUS_H || cur_disp_cpu >= NR_CPUS_H)
  217. return -EINVAL;
  218. last_disp_cpu_assoc = get_pcpu_associativity(last_disp_cpu);
  219. cur_disp_cpu_assoc = get_pcpu_associativity(cur_disp_cpu);
  220. if (!last_disp_cpu_assoc || !cur_disp_cpu_assoc)
  221. return -EIO;
  222. return cpu_relative_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc);
  223. }
  224. static int cpu_home_node_dispatch_distance(int disp_cpu)
  225. {
  226. __be32 *disp_cpu_assoc, *vcpu_assoc;
  227. int vcpu_id = smp_processor_id();
  228. if (disp_cpu >= NR_CPUS_H) {
  229. pr_debug_ratelimited("vcpu dispatch cpu %d > %d\n",
  230. disp_cpu, NR_CPUS_H);
  231. return -EINVAL;
  232. }
  233. disp_cpu_assoc = get_pcpu_associativity(disp_cpu);
  234. vcpu_assoc = get_vcpu_associativity(vcpu_id);
  235. if (!disp_cpu_assoc || !vcpu_assoc)
  236. return -EIO;
  237. return cpu_relative_distance(disp_cpu_assoc, vcpu_assoc);
  238. }
  239. static void update_vcpu_disp_stat(int disp_cpu)
  240. {
  241. struct vcpu_dispatch_data *disp;
  242. int distance;
  243. disp = this_cpu_ptr(&vcpu_disp_data);
  244. if (disp->last_disp_cpu == -1) {
  245. disp->last_disp_cpu = disp_cpu;
  246. return;
  247. }
  248. disp->total_disp++;
  249. if (disp->last_disp_cpu == disp_cpu ||
  250. (cpu_first_thread_sibling(disp->last_disp_cpu) ==
  251. cpu_first_thread_sibling(disp_cpu)))
  252. disp->same_cpu_disp++;
  253. else {
  254. distance = cpu_relative_dispatch_distance(disp->last_disp_cpu,
  255. disp_cpu);
  256. if (distance < 0)
  257. pr_debug_ratelimited("vcpudispatch_stats: cpu %d: error determining associativity\n",
  258. smp_processor_id());
  259. else {
  260. switch (distance) {
  261. case 0:
  262. disp->same_chip_disp++;
  263. break;
  264. case 1:
  265. disp->diff_chip_disp++;
  266. break;
  267. case 2:
  268. disp->far_chip_disp++;
  269. break;
  270. default:
  271. pr_debug_ratelimited("vcpudispatch_stats: cpu %d (%d -> %d): unexpected relative dispatch distance %d\n",
  272. smp_processor_id(),
  273. disp->last_disp_cpu,
  274. disp_cpu,
  275. distance);
  276. }
  277. }
  278. }
  279. distance = cpu_home_node_dispatch_distance(disp_cpu);
  280. if (distance < 0)
  281. pr_debug_ratelimited("vcpudispatch_stats: cpu %d: error determining associativity\n",
  282. smp_processor_id());
  283. else {
  284. switch (distance) {
  285. case 0:
  286. disp->numa_home_disp++;
  287. break;
  288. case 1:
  289. disp->numa_remote_disp++;
  290. break;
  291. case 2:
  292. disp->numa_far_disp++;
  293. break;
  294. default:
  295. pr_debug_ratelimited("vcpudispatch_stats: cpu %d on %d: unexpected numa dispatch distance %d\n",
  296. smp_processor_id(),
  297. disp_cpu,
  298. distance);
  299. }
  300. }
  301. disp->last_disp_cpu = disp_cpu;
  302. }
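/*
 * Summary of the classification done above: a relative dispatch distance
 * of 0/1/2 is counted as same-chip/diff-chip/far-chip, and a home-node
 * distance of 0/1/2 as numa-home/numa-remote/numa-far. Dispatches on the
 * same core (same first thread sibling) count as same_cpu_disp.
 */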
  303. static void process_dtl_buffer(struct work_struct *work)
  304. {
  305. struct dtl_entry dtle;
  306. u64 i = __this_cpu_read(dtl_entry_ridx);
  307. struct dtl_entry *dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG);
  308. struct dtl_entry *dtl_end = local_paca->dispatch_log_end;
  309. struct lppaca *vpa = local_paca->lppaca_ptr;
  310. struct dtl_worker *d = container_of(work, struct dtl_worker, work.work);
  311. if (!local_paca->dispatch_log)
  312. return;
  313. /* if we have been migrated away, we cancel ourselves */
  314. if (d->cpu != smp_processor_id()) {
  315. pr_debug("vcpudispatch_stats: cpu %d worker migrated -- canceling worker\n",
  316. smp_processor_id());
  317. return;
  318. }
  319. if (i == be64_to_cpu(vpa->dtl_idx))
  320. goto out;
  321. while (i < be64_to_cpu(vpa->dtl_idx)) {
  322. dtle = *dtl;
  323. barrier();
  324. if (i + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx)) {
  325. /* buffer has overflowed */
  326. pr_debug_ratelimited("vcpudispatch_stats: cpu %d lost %lld DTL samples\n",
  327. d->cpu,
  328. be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG - i);
  329. i = be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG;
  330. dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG);
  331. continue;
  332. }
  333. update_vcpu_disp_stat(be16_to_cpu(dtle.processor_id));
  334. ++i;
  335. ++dtl;
  336. if (dtl == dtl_end)
  337. dtl = local_paca->dispatch_log;
  338. }
  339. __this_cpu_write(dtl_entry_ridx, i);
  340. out:
  341. schedule_delayed_work_on(d->cpu, to_delayed_work(work),
  342. HZ / vcpudispatch_stats_freq);
  343. }
  344. static int dtl_worker_online(unsigned int cpu)
  345. {
  346. struct dtl_worker *d = &per_cpu(dtl_workers, cpu);
  347. memset(d, 0, sizeof(*d));
  348. INIT_DELAYED_WORK(&d->work, process_dtl_buffer);
  349. d->cpu = cpu;
  350. #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
  351. per_cpu(dtl_entry_ridx, cpu) = 0;
  352. register_dtl_buffer(cpu);
  353. #else
  354. per_cpu(dtl_entry_ridx, cpu) = be64_to_cpu(lppaca_of(cpu).dtl_idx);
  355. #endif
  356. schedule_delayed_work_on(cpu, &d->work, HZ / vcpudispatch_stats_freq);
  357. return 0;
  358. }
  359. static int dtl_worker_offline(unsigned int cpu)
  360. {
  361. struct dtl_worker *d = &per_cpu(dtl_workers, cpu);
  362. cancel_delayed_work_sync(&d->work);
  363. #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
  364. unregister_dtl(get_hard_smp_processor_id(cpu));
  365. #endif
  366. return 0;
  367. }
  368. static void set_global_dtl_mask(u8 mask)
  369. {
  370. int cpu;
  371. dtl_mask = mask;
  372. for_each_present_cpu(cpu)
  373. lppaca_of(cpu).dtl_enable_mask = dtl_mask;
  374. }
  375. static void reset_global_dtl_mask(void)
  376. {
  377. int cpu;
  378. #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
  379. dtl_mask = DTL_LOG_PREEMPT;
  380. #else
  381. dtl_mask = 0;
  382. #endif
  383. for_each_present_cpu(cpu)
  384. lppaca_of(cpu).dtl_enable_mask = dtl_mask;
  385. }
  386. static int dtl_worker_enable(unsigned long *time_limit)
  387. {
  388. int rc = 0, state;
  389. if (!write_trylock(&dtl_access_lock)) {
  390. rc = -EBUSY;
  391. goto out;
  392. }
  393. set_global_dtl_mask(DTL_LOG_ALL);
  394. /* Setup dtl buffers and register those */
  395. alloc_dtl_buffers(time_limit);
  396. state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/dtl:online",
  397. dtl_worker_online, dtl_worker_offline);
  398. if (state < 0) {
  399. pr_err("vcpudispatch_stats: unable to setup workqueue for DTL processing\n");
  400. free_dtl_buffers(time_limit);
  401. reset_global_dtl_mask();
  402. write_unlock(&dtl_access_lock);
  403. rc = -EINVAL;
  404. goto out;
  405. }
  406. dtl_worker_state = state;
  407. out:
  408. return rc;
  409. }
  410. static void dtl_worker_disable(unsigned long *time_limit)
  411. {
  412. cpuhp_remove_state(dtl_worker_state);
  413. free_dtl_buffers(time_limit);
  414. reset_global_dtl_mask();
  415. write_unlock(&dtl_access_lock);
  416. }
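/*
 * Note the lock pairing: dtl_worker_enable() takes dtl_access_lock for
 * write (via write_trylock) and keeps holding it on success; it is
 * released here in dtl_worker_disable(), or on the enable failure path.
 */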
  417. static ssize_t vcpudispatch_stats_write(struct file *file, const char __user *p,
  418. size_t count, loff_t *ppos)
  419. {
  420. unsigned long time_limit = jiffies + HZ;
  421. struct vcpu_dispatch_data *disp;
  422. int rc, cmd, cpu;
  423. char buf[16];
  424. if (count > 15)
  425. return -EINVAL;
  426. if (copy_from_user(buf, p, count))
  427. return -EFAULT;
  428. buf[count] = 0;
  429. rc = kstrtoint(buf, 0, &cmd);
  430. if (rc || cmd < 0 || cmd > 1) {
  431. pr_err("vcpudispatch_stats: please use 0 to disable or 1 to enable dispatch statistics\n");
  432. return rc ? rc : -EINVAL;
  433. }
  434. mutex_lock(&dtl_enable_mutex);
  435. if ((cmd == 0 && !vcpudispatch_stats_on) ||
  436. (cmd == 1 && vcpudispatch_stats_on))
  437. goto out;
  438. if (cmd) {
  439. rc = init_cpu_associativity();
  440. if (rc) {
  441. destroy_cpu_associativity();
  442. goto out;
  443. }
  444. for_each_possible_cpu(cpu) {
  445. disp = per_cpu_ptr(&vcpu_disp_data, cpu);
  446. memset(disp, 0, sizeof(*disp));
  447. disp->last_disp_cpu = -1;
  448. }
  449. rc = dtl_worker_enable(&time_limit);
  450. if (rc) {
  451. destroy_cpu_associativity();
  452. goto out;
  453. }
  454. } else {
  455. dtl_worker_disable(&time_limit);
  456. destroy_cpu_associativity();
  457. }
  458. vcpudispatch_stats_on = cmd;
  459. out:
  460. mutex_unlock(&dtl_enable_mutex);
  461. if (rc)
  462. return rc;
  463. return count;
  464. }
  465. static int vcpudispatch_stats_display(struct seq_file *p, void *v)
  466. {
  467. int cpu;
  468. struct vcpu_dispatch_data *disp;
  469. if (!vcpudispatch_stats_on) {
  470. seq_puts(p, "off\n");
  471. return 0;
  472. }
  473. for_each_online_cpu(cpu) {
  474. disp = per_cpu_ptr(&vcpu_disp_data, cpu);
  475. seq_printf(p, "cpu%d", cpu);
  476. seq_put_decimal_ull(p, " ", disp->total_disp);
  477. seq_put_decimal_ull(p, " ", disp->same_cpu_disp);
  478. seq_put_decimal_ull(p, " ", disp->same_chip_disp);
  479. seq_put_decimal_ull(p, " ", disp->diff_chip_disp);
  480. seq_put_decimal_ull(p, " ", disp->far_chip_disp);
  481. seq_put_decimal_ull(p, " ", disp->numa_home_disp);
  482. seq_put_decimal_ull(p, " ", disp->numa_remote_disp);
  483. seq_put_decimal_ull(p, " ", disp->numa_far_disp);
  484. seq_puts(p, "\n");
  485. }
  486. return 0;
  487. }
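/*
 * Each line emitted above therefore has the form:
 *
 *   cpu<N> <total_disp> <same_cpu_disp> <same_chip_disp> <diff_chip_disp>
 *          <far_chip_disp> <numa_home_disp> <numa_remote_disp> <numa_far_disp>
 *
 * where all counts are accumulated since statistics were last enabled.
 */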
  488. static int vcpudispatch_stats_open(struct inode *inode, struct file *file)
  489. {
  490. return single_open(file, vcpudispatch_stats_display, NULL);
  491. }
  492. static const struct proc_ops vcpudispatch_stats_proc_ops = {
  493. .proc_open = vcpudispatch_stats_open,
  494. .proc_read = seq_read,
  495. .proc_write = vcpudispatch_stats_write,
  496. .proc_lseek = seq_lseek,
  497. .proc_release = single_release,
  498. };
  499. static ssize_t vcpudispatch_stats_freq_write(struct file *file,
  500. const char __user *p, size_t count, loff_t *ppos)
  501. {
  502. int rc, freq;
  503. char buf[16];
  504. if (count > 15)
  505. return -EINVAL;
  506. if (copy_from_user(buf, p, count))
  507. return -EFAULT;
  508. buf[count] = 0;
  509. rc = kstrtoint(buf, 0, &freq);
  510. if (rc || freq < 1 || freq > HZ) {
  511. pr_err("vcpudispatch_stats_freq: please specify a frequency between 1 and %d\n",
  512. HZ);
  513. return rc ? rc : -EINVAL;
  514. }
  515. vcpudispatch_stats_freq = freq;
  516. return count;
  517. }
  518. static int vcpudispatch_stats_freq_display(struct seq_file *p, void *v)
  519. {
  520. seq_printf(p, "%d\n", vcpudispatch_stats_freq);
  521. return 0;
  522. }
  523. static int vcpudispatch_stats_freq_open(struct inode *inode, struct file *file)
  524. {
  525. return single_open(file, vcpudispatch_stats_freq_display, NULL);
  526. }
  527. static const struct proc_ops vcpudispatch_stats_freq_proc_ops = {
  528. .proc_open = vcpudispatch_stats_freq_open,
  529. .proc_read = seq_read,
  530. .proc_write = vcpudispatch_stats_freq_write,
  531. .proc_lseek = seq_lseek,
  532. .proc_release = single_release,
  533. };
  534. static int __init vcpudispatch_stats_procfs_init(void)
  535. {
  536. if (!lppaca_shared_proc())
  537. return 0;
  538. if (!proc_create("powerpc/vcpudispatch_stats", 0600, NULL,
  539. &vcpudispatch_stats_proc_ops))
  540. pr_err("vcpudispatch_stats: error creating procfs file\n");
  541. else if (!proc_create("powerpc/vcpudispatch_stats_freq", 0600, NULL,
  542. &vcpudispatch_stats_freq_proc_ops))
  543. pr_err("vcpudispatch_stats_freq: error creating procfs file\n");
  544. return 0;
  545. }
  546. machine_device_initcall(pseries, vcpudispatch_stats_procfs_init);
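/*
 * Example usage of the procfs files created above (shared-processor LPARs
 * only):
 *
 *   # echo 1 > /proc/powerpc/vcpudispatch_stats        - enable collection
 *   # cat /proc/powerpc/vcpudispatch_stats             - one line per online cpu
 *   # echo 10 > /proc/powerpc/vcpudispatch_stats_freq  - process the DTL ~10 times/sec
 *   # echo 0 > /proc/powerpc/vcpudispatch_stats        - disable and free buffers
 */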
  547. #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
  548. u64 pseries_paravirt_steal_clock(int cpu)
  549. {
  550. struct lppaca *lppaca = &lppaca_of(cpu);
  551. return be64_to_cpu(READ_ONCE(lppaca->enqueue_dispatch_tb)) +
  552. be64_to_cpu(READ_ONCE(lppaca->ready_enqueue_tb));
  553. }
  554. #endif
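/*
 * The two lppaca fields summed above are hypervisor-maintained cumulative
 * timebase counts: time spent between becoming ready and being enqueued,
 * and between enqueue and actual dispatch. Their sum is what the scheduler
 * accounts as steal time on pseries.
 */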
  555. #endif /* CONFIG_PPC_SPLPAR */
  556. void vpa_init(int cpu)
  557. {
  558. int hwcpu = get_hard_smp_processor_id(cpu);
  559. unsigned long addr;
  560. long ret;
  561. /*
  562. * The spec says it "may be problematic" if CPU x registers the VPA of
  563. * CPU y. We should never do that, but wail if we ever do.
  564. */
  565. WARN_ON(cpu != smp_processor_id());
  566. if (cpu_has_feature(CPU_FTR_ALTIVEC))
  567. lppaca_of(cpu).vmxregs_in_use = 1;
  568. if (cpu_has_feature(CPU_FTR_ARCH_207S))
  569. lppaca_of(cpu).ebb_regs_in_use = 1;
  570. addr = __pa(&lppaca_of(cpu));
  571. ret = register_vpa(hwcpu, addr);
  572. if (ret) {
  573. pr_err("WARNING: VPA registration for cpu %d (hw %d) of area "
  574. "%lx failed with %ld\n", cpu, hwcpu, addr, ret);
  575. return;
  576. }
  577. #ifdef CONFIG_PPC_64S_HASH_MMU
  578. /*
  579. * PAPR says this feature is SLB-Buffer but firmware never
  580. * reports that. All SPLPARs support the SLB shadow buffer.
  581. */
  582. if (!radix_enabled() && firmware_has_feature(FW_FEATURE_SPLPAR)) {
  583. addr = __pa(paca_ptrs[cpu]->slb_shadow_ptr);
  584. ret = register_slb_shadow(hwcpu, addr);
  585. if (ret)
  586. pr_err("WARNING: SLB shadow buffer registration for "
  587. "cpu %d (hw %d) of area %lx failed with %ld\n",
  588. cpu, hwcpu, addr, ret);
  589. }
  590. #endif /* CONFIG_PPC_64S_HASH_MMU */
  591. /*
  592. * Register dispatch trace log, if one has been allocated.
  593. */
  594. register_dtl_buffer(cpu);
  595. }
  596. #ifdef CONFIG_PPC_BOOK3S_64
  597. static int __init pseries_lpar_register_process_table(unsigned long base,
  598. unsigned long page_size, unsigned long table_size)
  599. {
  600. long rc;
  601. unsigned long flags = 0;
  602. if (table_size)
  603. flags |= PROC_TABLE_NEW;
  604. if (radix_enabled()) {
  605. flags |= PROC_TABLE_RADIX;
  606. if (mmu_has_feature(MMU_FTR_GTSE))
  607. flags |= PROC_TABLE_GTSE;
  608. } else
  609. flags |= PROC_TABLE_HPT_SLB;
  610. for (;;) {
  611. rc = plpar_hcall_norets(H_REGISTER_PROC_TBL, flags, base,
  612. page_size, table_size);
  613. if (!H_IS_LONG_BUSY(rc))
  614. break;
  615. mdelay(get_longbusy_msecs(rc));
  616. }
  617. if (rc != H_SUCCESS) {
  618. pr_err("Failed to register process table (rc=%ld)\n", rc);
  619. BUG();
  620. }
  621. return rc;
  622. }
  623. #ifdef CONFIG_PPC_64S_HASH_MMU
  624. static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
  625. unsigned long vpn, unsigned long pa,
  626. unsigned long rflags, unsigned long vflags,
  627. int psize, int apsize, int ssize)
  628. {
  629. unsigned long lpar_rc;
  630. unsigned long flags;
  631. unsigned long slot;
  632. unsigned long hpte_v, hpte_r;
  633. if (!(vflags & HPTE_V_BOLTED))
  634. pr_devel("hpte_insert(group=%lx, vpn=%016lx, "
  635. "pa=%016lx, rflags=%lx, vflags=%lx, psize=%d)\n",
  636. hpte_group, vpn, pa, rflags, vflags, psize);
  637. hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
  638. hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
  639. if (!(vflags & HPTE_V_BOLTED))
  640. pr_devel(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r);
  641. /* Now fill in the actual HPTE */
  642. /* Set CEC cookie to 0 */
  643. /* Zero page = 0 */
  644. /* I-cache Invalidate = 0 */
  645. /* I-cache synchronize = 0 */
  646. /* Exact = 0 */
  647. flags = 0;
  648. if (firmware_has_feature(FW_FEATURE_XCMO) && !(hpte_r & HPTE_R_N))
  649. flags |= H_COALESCE_CAND;
  650. lpar_rc = plpar_pte_enter(flags, hpte_group, hpte_v, hpte_r, &slot);
  651. if (unlikely(lpar_rc == H_PTEG_FULL)) {
  652. pr_devel("Hash table group is full\n");
  653. return -1;
  654. }
  655. /*
  656. * Since we try and ioremap PHBs we don't own, the pte insert
  657. * will fail. However we must catch the failure in hash_page
  658. * or we will loop forever, so return -2 in this case.
  659. */
  660. if (unlikely(lpar_rc != H_SUCCESS)) {
  661. pr_err("Failed hash pte insert with error %ld\n", lpar_rc);
  662. return -2;
  663. }
  664. if (!(vflags & HPTE_V_BOLTED))
  665. pr_devel(" -> slot: %lu\n", slot & 7);
  666. /* Because of iSeries, we have to pass down the secondary
  667. * bucket bit here as well
  668. */
  669. return (slot & 7) | (!!(vflags & HPTE_V_SECONDARY) << 3);
  670. }
  671. static DEFINE_SPINLOCK(pSeries_lpar_tlbie_lock);
  672. static long pSeries_lpar_hpte_remove(unsigned long hpte_group)
  673. {
  674. unsigned long slot_offset;
  675. unsigned long lpar_rc;
  676. int i;
  677. unsigned long dummy1, dummy2;
  678. /* pick a random slot to start at */
  679. slot_offset = mftb() & 0x7;
  680. for (i = 0; i < HPTES_PER_GROUP; i++) {
  681. /* don't remove a bolted entry */
  682. lpar_rc = plpar_pte_remove(H_ANDCOND, hpte_group + slot_offset,
  683. HPTE_V_BOLTED, &dummy1, &dummy2);
  684. if (lpar_rc == H_SUCCESS)
  685. return i;
  686. /*
  687. * The test for adjunct partition is performed before the
  688. * ANDCOND test. H_RESOURCE may be returned, so we need to
  689. * check for that as well.
  690. */
  691. BUG_ON(lpar_rc != H_NOT_FOUND && lpar_rc != H_RESOURCE);
  692. slot_offset++;
  693. slot_offset &= 0x7;
  694. }
  695. return -1;
  696. }
  697. /* Called during kexec sequence with MMU off */
  698. static notrace void manual_hpte_clear_all(void)
  699. {
  700. unsigned long size_bytes = 1UL << ppc64_pft_size;
  701. unsigned long hpte_count = size_bytes >> 4;
  702. struct {
  703. unsigned long pteh;
  704. unsigned long ptel;
  705. } ptes[4];
  706. long lpar_rc;
  707. unsigned long i, j;
  708. /* Read in batches of 4;
  709. * invalidate only valid entries that are not in the VRMA.
  710. * hpte_count will be a multiple of 4.
  711. */
  712. for (i = 0; i < hpte_count; i += 4) {
  713. lpar_rc = plpar_pte_read_4_raw(0, i, (void *)ptes);
  714. if (lpar_rc != H_SUCCESS) {
  715. pr_info("Failed to read hash page table at %ld err %ld\n",
  716. i, lpar_rc);
  717. continue;
  718. }
  719. for (j = 0; j < 4; j++){
  720. if ((ptes[j].pteh & HPTE_V_VRMA_MASK) ==
  721. HPTE_V_VRMA_MASK)
  722. continue;
  723. if (ptes[j].pteh & HPTE_V_VALID)
  724. plpar_pte_remove_raw(0, i + j, 0,
  725. &(ptes[j].pteh), &(ptes[j].ptel));
  726. }
  727. }
  728. }
  729. /* Called during kexec sequence with MMU off */
  730. static notrace int hcall_hpte_clear_all(void)
  731. {
  732. int rc;
  733. do {
  734. rc = plpar_hcall_norets(H_CLEAR_HPT);
  735. } while (rc == H_CONTINUE);
  736. return rc;
  737. }
  738. /* Called during kexec sequence with MMU off */
  739. static notrace void pseries_hpte_clear_all(void)
  740. {
  741. int rc;
  742. rc = hcall_hpte_clear_all();
  743. if (rc != H_SUCCESS)
  744. manual_hpte_clear_all();
  745. #ifdef __LITTLE_ENDIAN__
  746. /*
  747. * Reset exceptions to big endian.
  748. *
  749. * FIXME this is a hack for kexec, we need to reset the exception
  750. * endian before starting the new kernel and this is a convenient place
  751. * to do it.
  752. *
  753. * This is also called on boot when a fadump happens. In that case we
  754. * must not change the exception endian mode.
  755. */
  756. if (firmware_has_feature(FW_FEATURE_SET_MODE) && !is_fadump_active())
  757. pseries_big_endian_exceptions();
  758. #endif
  759. }
  760. /*
  761. * NOTE: for updatepp ops we are fortunate that the linux "newpp" bits and
  762. * the low 3 bits of flags happen to line up. So no transform is needed.
  763. * We can probably optimize here and assume the high bits of newpp are
  764. * already zero. For now I am paranoid.
  765. */
  766. static long pSeries_lpar_hpte_updatepp(unsigned long slot,
  767. unsigned long newpp,
  768. unsigned long vpn,
  769. int psize, int apsize,
  770. int ssize, unsigned long inv_flags)
  771. {
  772. unsigned long lpar_rc;
  773. unsigned long flags;
  774. unsigned long want_v;
  775. want_v = hpte_encode_avpn(vpn, psize, ssize);
  776. flags = (newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO)) | H_AVPN;
  777. flags |= (newpp & HPTE_R_KEY_HI) >> 48;
  778. if (mmu_has_feature(MMU_FTR_KERNEL_RO))
  779. /* Move pp0 into bit 8 (IBM 55) */
  780. flags |= (newpp & HPTE_R_PP0) >> 55;
  781. pr_devel(" update: avpnv=%016lx, hash=%016lx, f=%lx, psize: %d ...",
  782. want_v, slot, flags, psize);
  783. lpar_rc = plpar_pte_protect(flags, slot, want_v);
  784. if (lpar_rc == H_NOT_FOUND) {
  785. pr_devel("not found !\n");
  786. return -1;
  787. }
  788. pr_devel("ok\n");
  789. BUG_ON(lpar_rc != H_SUCCESS);
  790. return 0;
  791. }
  792. static long __pSeries_lpar_hpte_find(unsigned long want_v, unsigned long hpte_group)
  793. {
  794. long lpar_rc;
  795. unsigned long i, j;
  796. struct {
  797. unsigned long pteh;
  798. unsigned long ptel;
  799. } ptes[4];
  800. for (i = 0; i < HPTES_PER_GROUP; i += 4, hpte_group += 4) {
  801. lpar_rc = plpar_pte_read_4(0, hpte_group, (void *)ptes);
  802. if (lpar_rc != H_SUCCESS) {
  803. pr_info("Failed to read hash page table at %ld err %ld\n",
  804. hpte_group, lpar_rc);
  805. continue;
  806. }
  807. for (j = 0; j < 4; j++) {
  808. if (HPTE_V_COMPARE(ptes[j].pteh, want_v) &&
  809. (ptes[j].pteh & HPTE_V_VALID))
  810. return i + j;
  811. }
  812. }
  813. return -1;
  814. }
  815. static long pSeries_lpar_hpte_find(unsigned long vpn, int psize, int ssize)
  816. {
  817. long slot;
  818. unsigned long hash;
  819. unsigned long want_v;
  820. unsigned long hpte_group;
  821. hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
  822. want_v = hpte_encode_avpn(vpn, psize, ssize);
  823. /*
  824. * We try to keep bolted entries always in the primary hash,
  825. * but in some cases we can find them in the secondary too.
  826. */
  827. hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
  828. slot = __pSeries_lpar_hpte_find(want_v, hpte_group);
  829. if (slot < 0) {
  830. /* Try in secondary */
  831. hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
  832. slot = __pSeries_lpar_hpte_find(want_v, hpte_group);
  833. if (slot < 0)
  834. return -1;
  835. }
  836. return hpte_group + slot;
  837. }
  838. static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp,
  839. unsigned long ea,
  840. int psize, int ssize)
  841. {
  842. unsigned long vpn;
  843. unsigned long lpar_rc, slot, vsid, flags;
  844. vsid = get_kernel_vsid(ea, ssize);
  845. vpn = hpt_vpn(ea, vsid, ssize);
  846. slot = pSeries_lpar_hpte_find(vpn, psize, ssize);
  847. BUG_ON(slot == -1);
  848. flags = newpp & (HPTE_R_PP | HPTE_R_N);
  849. if (mmu_has_feature(MMU_FTR_KERNEL_RO))
  850. /* Move pp0 into bit 8 (IBM 55) */
  851. flags |= (newpp & HPTE_R_PP0) >> 55;
  852. flags |= ((newpp & HPTE_R_KEY_HI) >> 48) | (newpp & HPTE_R_KEY_LO);
  853. lpar_rc = plpar_pte_protect(flags, slot, 0);
  854. BUG_ON(lpar_rc != H_SUCCESS);
  855. }
  856. static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn,
  857. int psize, int apsize,
  858. int ssize, int local)
  859. {
  860. unsigned long want_v;
  861. unsigned long lpar_rc;
  862. unsigned long dummy1, dummy2;
  863. pr_devel(" inval : slot=%lx, vpn=%016lx, psize: %d, local: %d\n",
  864. slot, vpn, psize, local);
  865. want_v = hpte_encode_avpn(vpn, psize, ssize);
  866. lpar_rc = plpar_pte_remove(H_AVPN, slot, want_v, &dummy1, &dummy2);
  867. if (lpar_rc == H_NOT_FOUND)
  868. return;
  869. BUG_ON(lpar_rc != H_SUCCESS);
  870. }
  871. /*
  872. * As defined in the PAPR's section 14.5.4.1.8
  873. * The control mask doesn't include the returned reference and change bit from
  874. * the processed PTE.
  875. */
  876. #define HBLKR_AVPN 0x0100000000000000UL
  877. #define HBLKR_CTRL_MASK 0xf800000000000000UL
  878. #define HBLKR_CTRL_SUCCESS 0x8000000000000000UL
  879. #define HBLKR_CTRL_ERRNOTFOUND 0x8800000000000000UL
  880. #define HBLKR_CTRL_ERRBUSY 0xa000000000000000UL
  881. /*
  882. * Return true if this block size is supported for the specified segment
  883. * base page size and actual page size.
  884. *
  885. * Currently, only blocks of 8 entries are supported.
  886. */
  887. static inline bool is_supported_hlbkrm(int bpsize, int psize)
  888. {
  889. return (hblkrm_size[bpsize][psize] == HBLKRM_SUPPORTED_BLOCK_SIZE);
  890. }
  891. /**
  892. * H_BLOCK_REMOVE caller.
  893. * @idx should point to the latest @param entry set with a PTEX.
  894. * If a PTE cannot be processed because another CPU has already locked its
  895. * group, those entries are put back in @param starting at index 1.
  896. * If entries have to be retried and @retry_busy is set to true, these entries
  897. * are retried until success. If @retry_busy is set to false, the return value
  898. * is the number of entries yet to be processed.
  899. */
  900. static unsigned long call_block_remove(unsigned long idx, unsigned long *param,
  901. bool retry_busy)
  902. {
  903. unsigned long i, rc, new_idx;
  904. unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
  905. if (idx < 2) {
  906. pr_warn("Unexpected empty call to H_BLOCK_REMOVE");
  907. return 0;
  908. }
  909. again:
  910. new_idx = 0;
  911. if (idx > PLPAR_HCALL9_BUFSIZE) {
  912. pr_err("Too many PTEs (%lu) for H_BLOCK_REMOVE", idx);
  913. idx = PLPAR_HCALL9_BUFSIZE;
  914. } else if (idx < PLPAR_HCALL9_BUFSIZE)
  915. param[idx] = HBR_END;
  916. rc = plpar_hcall9(H_BLOCK_REMOVE, retbuf,
  917. param[0], /* AVA */
  918. param[1], param[2], param[3], param[4], /* TS0-7 */
  919. param[5], param[6], param[7], param[8]);
  920. if (rc == H_SUCCESS)
  921. return 0;
  922. BUG_ON(rc != H_PARTIAL);
  923. /* Check that the unprocessed entries were 'not found' or 'busy' */
  924. for (i = 0; i < idx-1; i++) {
  925. unsigned long ctrl = retbuf[i] & HBLKR_CTRL_MASK;
  926. if (ctrl == HBLKR_CTRL_ERRBUSY) {
  927. param[++new_idx] = param[i+1];
  928. continue;
  929. }
  930. BUG_ON(ctrl != HBLKR_CTRL_SUCCESS
  931. && ctrl != HBLKR_CTRL_ERRNOTFOUND);
  932. }
  933. /*
  934. * If entries were found busy, retry them if requested,
  935. * or if all the entries have to be retried.
  936. */
  937. if (new_idx && (retry_busy || new_idx == (PLPAR_HCALL9_BUFSIZE-1))) {
  938. idx = new_idx + 1;
  939. goto again;
  940. }
  941. return new_idx;
  942. }
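/*
 * A sketch of the @param layout expected by call_block_remove(), as built
 * by do_block_remove() and hugepage_block_invalidate() below:
 *
 *   param[0]          = hpte_encode_avpn(vpn, psize, ssize);  - AVA
 *   param[1..idx - 1] = HBR_REQUEST | HBLKR_AVPN | slot;      - TS0-TS7
 *   param[idx]        = HBR_END;   - only when idx < PLPAR_HCALL9_BUFSIZE
 */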
  943. #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  944. /*
  945. * Limit iterations holding pSeries_lpar_tlbie_lock to 3. We also need
  946. * to make sure that we avoid bouncing the hypervisor tlbie lock.
  947. */
  948. #define PPC64_HUGE_HPTE_BATCH 12
  949. static void hugepage_block_invalidate(unsigned long *slot, unsigned long *vpn,
  950. int count, int psize, int ssize)
  951. {
  952. unsigned long param[PLPAR_HCALL9_BUFSIZE];
  953. unsigned long shift, current_vpgb, vpgb;
  954. int i, pix = 0;
  955. shift = mmu_psize_defs[psize].shift;
  956. for (i = 0; i < count; i++) {
  957. /*
  958. * Shift right by 3 more bits to get an
  959. * 8-page aligned virtual address.
  960. */
  961. vpgb = (vpn[i] >> (shift - VPN_SHIFT + 3));
  962. if (!pix || vpgb != current_vpgb) {
  963. /*
  964. * Need to start a new 8 pages block, flush
  965. * the current one if needed.
  966. */
  967. if (pix)
  968. (void)call_block_remove(pix, param, true);
  969. current_vpgb = vpgb;
  970. param[0] = hpte_encode_avpn(vpn[i], psize, ssize);
  971. pix = 1;
  972. }
  973. param[pix++] = HBR_REQUEST | HBLKR_AVPN | slot[i];
  974. if (pix == PLPAR_HCALL9_BUFSIZE) {
  975. pix = call_block_remove(pix, param, false);
  976. /*
  977. * pix = 0 means that all the entries were
  978. * removed, we can start a new block.
  979. * Otherwise, this means that there are entries
  980. * to retry, and pix points to latest one, so
  981. * we should increment it and try to continue
  982. * the same block.
  983. */
  984. if (pix)
  985. pix++;
  986. }
  987. }
  988. if (pix)
  989. (void)call_block_remove(pix, param, true);
  990. }
  991. static void hugepage_bulk_invalidate(unsigned long *slot, unsigned long *vpn,
  992. int count, int psize, int ssize)
  993. {
  994. unsigned long param[PLPAR_HCALL9_BUFSIZE];
  995. int i = 0, pix = 0, rc;
  996. for (i = 0; i < count; i++) {
  997. if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
  998. pSeries_lpar_hpte_invalidate(slot[i], vpn[i], psize, 0,
  999. ssize, 0);
  1000. } else {
  1001. param[pix] = HBR_REQUEST | HBR_AVPN | slot[i];
  1002. param[pix+1] = hpte_encode_avpn(vpn[i], psize, ssize);
  1003. pix += 2;
  1004. if (pix == 8) {
  1005. rc = plpar_hcall9(H_BULK_REMOVE, param,
  1006. param[0], param[1], param[2],
  1007. param[3], param[4], param[5],
  1008. param[6], param[7]);
  1009. BUG_ON(rc != H_SUCCESS);
  1010. pix = 0;
  1011. }
  1012. }
  1013. }
  1014. if (pix) {
  1015. param[pix] = HBR_END;
  1016. rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1],
  1017. param[2], param[3], param[4], param[5],
  1018. param[6], param[7]);
  1019. BUG_ON(rc != H_SUCCESS);
  1020. }
  1021. }
  1022. static inline void __pSeries_lpar_hugepage_invalidate(unsigned long *slot,
  1023. unsigned long *vpn,
  1024. int count, int psize,
  1025. int ssize)
  1026. {
  1027. unsigned long flags = 0;
  1028. int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
  1029. if (lock_tlbie)
  1030. spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
  1031. /* Assuming THP size is 16M */
  1032. if (is_supported_hlbkrm(psize, MMU_PAGE_16M))
  1033. hugepage_block_invalidate(slot, vpn, count, psize, ssize);
  1034. else
  1035. hugepage_bulk_invalidate(slot, vpn, count, psize, ssize);
  1036. if (lock_tlbie)
  1037. spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
  1038. }
  1039. static void pSeries_lpar_hugepage_invalidate(unsigned long vsid,
  1040. unsigned long addr,
  1041. unsigned char *hpte_slot_array,
  1042. int psize, int ssize, int local)
  1043. {
  1044. int i, index = 0;
  1045. unsigned long s_addr = addr;
  1046. unsigned int max_hpte_count, valid;
  1047. unsigned long vpn_array[PPC64_HUGE_HPTE_BATCH];
  1048. unsigned long slot_array[PPC64_HUGE_HPTE_BATCH];
  1049. unsigned long shift, hidx, vpn = 0, hash, slot;
  1050. shift = mmu_psize_defs[psize].shift;
  1051. max_hpte_count = 1U << (PMD_SHIFT - shift);
  1052. for (i = 0; i < max_hpte_count; i++) {
  1053. valid = hpte_valid(hpte_slot_array, i);
  1054. if (!valid)
  1055. continue;
  1056. hidx = hpte_hash_index(hpte_slot_array, i);
  1057. /* get the vpn */
  1058. addr = s_addr + (i * (1ul << shift));
  1059. vpn = hpt_vpn(addr, vsid, ssize);
  1060. hash = hpt_hash(vpn, shift, ssize);
  1061. if (hidx & _PTEIDX_SECONDARY)
  1062. hash = ~hash;
  1063. slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
  1064. slot += hidx & _PTEIDX_GROUP_IX;
  1065. slot_array[index] = slot;
  1066. vpn_array[index] = vpn;
  1067. if (index == PPC64_HUGE_HPTE_BATCH - 1) {
  1068. /*
  1069. * Now do a bulk invalidate
  1070. */
  1071. __pSeries_lpar_hugepage_invalidate(slot_array,
  1072. vpn_array,
  1073. PPC64_HUGE_HPTE_BATCH,
  1074. psize, ssize);
  1075. index = 0;
  1076. } else
  1077. index++;
  1078. }
  1079. if (index)
  1080. __pSeries_lpar_hugepage_invalidate(slot_array, vpn_array,
  1081. index, psize, ssize);
  1082. }
  1083. #else
  1084. static void pSeries_lpar_hugepage_invalidate(unsigned long vsid,
  1085. unsigned long addr,
  1086. unsigned char *hpte_slot_array,
  1087. int psize, int ssize, int local)
  1088. {
  1089. WARN(1, "%s called without THP support\n", __func__);
  1090. }
  1091. #endif
  1092. static int pSeries_lpar_hpte_removebolted(unsigned long ea,
  1093. int psize, int ssize)
  1094. {
  1095. unsigned long vpn;
  1096. unsigned long slot, vsid;
  1097. vsid = get_kernel_vsid(ea, ssize);
  1098. vpn = hpt_vpn(ea, vsid, ssize);
  1099. slot = pSeries_lpar_hpte_find(vpn, psize, ssize);
  1100. if (slot == -1)
  1101. return -ENOENT;
  1102. /*
  1103. * lpar doesn't use the passed actual page size
  1104. */
  1105. pSeries_lpar_hpte_invalidate(slot, vpn, psize, 0, ssize, 0);
  1106. return 0;
  1107. }
  1108. static inline unsigned long compute_slot(real_pte_t pte,
  1109. unsigned long vpn,
  1110. unsigned long index,
  1111. unsigned long shift,
  1112. int ssize)
  1113. {
  1114. unsigned long slot, hash, hidx;
  1115. hash = hpt_hash(vpn, shift, ssize);
  1116. hidx = __rpte_to_hidx(pte, index);
  1117. if (hidx & _PTEIDX_SECONDARY)
  1118. hash = ~hash;
  1119. slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
  1120. slot += hidx & _PTEIDX_GROUP_IX;
  1121. return slot;
  1122. }
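/*
 * In other words (a restatement of the above, assuming the usual hash MMU
 * geometry of HPTES_PER_GROUP slots per group):
 *
 *   group = (hash, or ~hash for the secondary) & htab_hash_mask;
 *   slot  = group * HPTES_PER_GROUP + (hidx & _PTEIDX_GROUP_IX);
 */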
  1123. /**
  1124. * The hcall H_BLOCK_REMOVE implies that the virtual pages to be processed are
  1125. * "all within the same naturally aligned 8 page virtual address block".
  1126. */
  1127. static void do_block_remove(unsigned long number, struct ppc64_tlb_batch *batch,
  1128. unsigned long *param)
  1129. {
  1130. unsigned long vpn;
  1131. unsigned long i, pix = 0;
  1132. unsigned long index, shift, slot, current_vpgb, vpgb;
  1133. real_pte_t pte;
  1134. int psize, ssize;
  1135. psize = batch->psize;
  1136. ssize = batch->ssize;
  1137. for (i = 0; i < number; i++) {
  1138. vpn = batch->vpn[i];
  1139. pte = batch->pte[i];
  1140. pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
  1141. /*
  1142. * Shift right by 3 more bits to get an
  1143. * 8-page aligned virtual address.
  1144. */
  1145. vpgb = (vpn >> (shift - VPN_SHIFT + 3));
  1146. if (!pix || vpgb != current_vpgb) {
  1147. /*
  1148. * Need to start a new 8 pages block, flush
  1149. * the current one if needed.
  1150. */
  1151. if (pix)
  1152. (void)call_block_remove(pix, param,
  1153. true);
  1154. current_vpgb = vpgb;
  1155. param[0] = hpte_encode_avpn(vpn, psize,
  1156. ssize);
  1157. pix = 1;
  1158. }
  1159. slot = compute_slot(pte, vpn, index, shift, ssize);
  1160. param[pix++] = HBR_REQUEST | HBLKR_AVPN | slot;
  1161. if (pix == PLPAR_HCALL9_BUFSIZE) {
  1162. pix = call_block_remove(pix, param, false);
  1163. /*
  1164. * pix = 0 means that all the entries were
  1165. * removed, we can start a new block.
  1166. * Otherwise, this means that there are entries
  1167. * to retry, and pix points to latest one, so
  1168. * we should increment it and try to continue
  1169. * the same block.
  1170. */
  1171. if (pix)
  1172. pix++;
  1173. }
  1174. } pte_iterate_hashed_end();
  1175. }
  1176. if (pix)
  1177. (void)call_block_remove(pix, param, true);
  1178. }
  1179. /*
  1180. * TLB Block Invalidate Characteristics
  1181. *
  1182. * These characteristics define the size of the block the hcall H_BLOCK_REMOVE
  1183. * is able to process for each pair (segment base page size, actual page size).
  1184. *
  1185. * The ibm,get-system-parameter property returns a buffer with the
  1186. * following layout:
  1187. *
  1188. * [ 2 bytes size of the RTAS buffer (excluding these 2 bytes) ]
  1189. * -----------------
  1190. * TLB Block Invalidate Specifiers:
  1191. * [ 1 byte LOG base 2 of the TLB invalidate block size being specified ]
  1192. * [ 1 byte Number of page sizes (N) that are supported for the specified
  1193. * TLB invalidate block size ]
  1194. * [ 1 byte Encoded segment base page size and actual page size
  1195. * MSB=0 means 4k segment base page size and actual page size
  1196. * MSB=1 the penc value in mmu_psize_def ]
  1197. * ...
  1198. * -----------------
  1199. * Next TLB Block Invalidate Specifiers...
  1200. * -----------------
  1201. * [ 0 ]
  1202. */
  1203. static inline void set_hblkrm_bloc_size(int bpsize, int psize,
  1204. unsigned int block_size)
  1205. {
  1206. if (block_size > hblkrm_size[bpsize][psize])
  1207. hblkrm_size[bpsize][psize] = block_size;
  1208. }
  1209. /*
  1210. * Decode the Encoded segment base page size and actual page size.
  1211. * PAPR specifies:
  1212. * - bit 7 is the L bit
  1213. * - bits 0-5 are the penc value
  1214. * If the L bit is 0, this means 4K segment base page size and actual page size
  1215. * otherwise the penc value should be read.
  1216. */
  1217. #define HBLKRM_L_MASK 0x80
  1218. #define HBLKRM_PENC_MASK 0x3f
  1219. static inline void __init check_lp_set_hblkrm(unsigned int lp,
  1220. unsigned int block_size)
  1221. {
  1222. unsigned int bpsize, psize;
  1223. /* First, check the L bit, if not set, this means 4K */
  1224. if ((lp & HBLKRM_L_MASK) == 0) {
  1225. set_hblkrm_bloc_size(MMU_PAGE_4K, MMU_PAGE_4K, block_size);
  1226. return;
  1227. }
  1228. lp &= HBLKRM_PENC_MASK;
  1229. for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) {
  1230. struct mmu_psize_def *def = &mmu_psize_defs[bpsize];
  1231. for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
  1232. if (def->penc[psize] == lp) {
  1233. set_hblkrm_bloc_size(bpsize, psize, block_size);
  1234. return;
  1235. }
  1236. }
  1237. }
  1238. }
  1239. #define SPLPAR_TLB_BIC_TOKEN 50
  1240. /*
  1241. * The size of the TLB Block Invalidate Characteristics is variable. But at the
  1242. * maximum it will be the number of possible page sizes *2 + 10 bytes.
  1243. * Currently MMU_PAGE_COUNT is 16, which means 42 bytes. Use a cache line size
  1244. * (128 bytes) for the buffer to get plenty of space.
  1245. */
  1246. #define SPLPAR_TLB_BIC_MAXLENGTH 128
  1247. void __init pseries_lpar_read_hblkrm_characteristics(void)
  1248. {
  1249. unsigned char local_buffer[SPLPAR_TLB_BIC_MAXLENGTH];
  1250. int call_status, len, idx, bpsize;
  1251. if (!firmware_has_feature(FW_FEATURE_BLOCK_REMOVE))
  1252. return;
  1253. spin_lock(&rtas_data_buf_lock);
  1254. memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE);
  1255. call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
  1256. NULL,
  1257. SPLPAR_TLB_BIC_TOKEN,
  1258. __pa(rtas_data_buf),
  1259. RTAS_DATA_BUF_SIZE);
  1260. memcpy(local_buffer, rtas_data_buf, SPLPAR_TLB_BIC_MAXLENGTH);
  1261. local_buffer[SPLPAR_TLB_BIC_MAXLENGTH - 1] = '\0';
  1262. spin_unlock(&rtas_data_buf_lock);
  1263. if (call_status != 0) {
  1264. pr_warn("%s %s Error calling get-system-parameter (0x%x)\n",
  1265. __FILE__, __func__, call_status);
  1266. return;
  1267. }
  1268. /*
  1269. * The first two (2) bytes of the data in the buffer are the length of
  1270. * the returned data, not counting these first two (2) bytes.
  1271. */
  1272. len = be16_to_cpu(*((u16 *)local_buffer)) + 2;
  1273. if (len > SPLPAR_TLB_BIC_MAXLENGTH) {
  1274. pr_warn("%s too large returned buffer %d", __func__, len);
  1275. return;
  1276. }
  1277. idx = 2;
  1278. while (idx < len) {
  1279. u8 block_shift = local_buffer[idx++];
  1280. u32 block_size;
  1281. unsigned int npsize;
  1282. if (!block_shift)
  1283. break;
  1284. block_size = 1 << block_shift;
  1285. for (npsize = local_buffer[idx++];
  1286. npsize > 0 && idx < len; npsize--)
  1287. check_lp_set_hblkrm((unsigned int) local_buffer[idx++],
  1288. block_size);
  1289. }
  1290. for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++)
  1291. for (idx = 0; idx < MMU_PAGE_COUNT; idx++)
  1292. if (hblkrm_size[bpsize][idx])
  1293. pr_info("H_BLOCK_REMOVE supports base psize:%d psize:%d block size:%d",
  1294. bpsize, idx, hblkrm_size[bpsize][idx]);
  1295. }
  1296. /*
  1297. * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie
  1298. * lock.
  1299. */
  1300. static void pSeries_lpar_flush_hash_range(unsigned long number, int local)
  1301. {
  1302. unsigned long vpn;
  1303. unsigned long i, pix, rc;
  1304. unsigned long flags = 0;
  1305. struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch);
  1306. int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
  1307. unsigned long param[PLPAR_HCALL9_BUFSIZE];
  1308. unsigned long index, shift, slot;
  1309. real_pte_t pte;
  1310. int psize, ssize;
  1311. if (lock_tlbie)
  1312. spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
  1313. if (is_supported_hlbkrm(batch->psize, batch->psize)) {
  1314. do_block_remove(number, batch, param);
  1315. goto out;
  1316. }
  1317. psize = batch->psize;
  1318. ssize = batch->ssize;
  1319. pix = 0;
  1320. for (i = 0; i < number; i++) {
  1321. vpn = batch->vpn[i];
  1322. pte = batch->pte[i];
  1323. pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
  1324. slot = compute_slot(pte, vpn, index, shift, ssize);
  1325. if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
  1326. /*
  1327. * lpar doesn't use the passed actual page size
  1328. */
  1329. pSeries_lpar_hpte_invalidate(slot, vpn, psize,
  1330. 0, ssize, local);
  1331. } else {
  1332. param[pix] = HBR_REQUEST | HBR_AVPN | slot;
  1333. param[pix+1] = hpte_encode_avpn(vpn, psize,
  1334. ssize);
  1335. pix += 2;
  1336. if (pix == 8) {
  1337. rc = plpar_hcall9(H_BULK_REMOVE, param,
  1338. param[0], param[1], param[2],
  1339. param[3], param[4], param[5],
  1340. param[6], param[7]);
  1341. BUG_ON(rc != H_SUCCESS);
  1342. pix = 0;
  1343. }
  1344. }
  1345. } pte_iterate_hashed_end();
  1346. }
  1347. if (pix) {
  1348. param[pix] = HBR_END;
  1349. rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1],
  1350. param[2], param[3], param[4], param[5],
  1351. param[6], param[7]);
  1352. BUG_ON(rc != H_SUCCESS);
  1353. }
  1354. out:
  1355. if (lock_tlbie)
  1356. spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
  1357. }
  1358. static int __init disable_bulk_remove(char *str)
  1359. {
  1360. if (strcmp(str, "off") == 0 &&
  1361. firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
  1362. pr_info("Disabling BULK_REMOVE firmware feature");
  1363. powerpc_firmware_features &= ~FW_FEATURE_BULK_REMOVE;
  1364. }
  1365. return 1;
  1366. }
  1367. __setup("bulk_remove=", disable_bulk_remove);
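/*
 * Passing "bulk_remove=off" on the kernel command line therefore clears
 * FW_FEATURE_BULK_REMOVE, making the flush paths above fall back to
 * one-by-one invalidations via pSeries_lpar_hpte_invalidate().
 */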
  1368. #define HPT_RESIZE_TIMEOUT 10000 /* ms */
  1369. struct hpt_resize_state {
  1370. unsigned long shift;
  1371. int commit_rc;
  1372. };
  1373. static int pseries_lpar_resize_hpt_commit(void *data)
  1374. {
  1375. struct hpt_resize_state *state = data;
  1376. state->commit_rc = plpar_resize_hpt_commit(0, state->shift);
  1377. if (state->commit_rc != H_SUCCESS)
  1378. return -EIO;
  1379. /* Hypervisor has transitioned the HTAB, update our globals */
  1380. ppc64_pft_size = state->shift;
  1381. htab_size_bytes = 1UL << ppc64_pft_size;
  1382. htab_hash_mask = (htab_size_bytes >> 7) - 1;
  1383. return 0;
  1384. }
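/*
 * The recomputation above assumes the usual HPT geometry: each HPTE is
 * 16 bytes and there are 8 per group, so a group is 128 bytes and
 * htab_hash_mask becomes the number of groups (htab_size_bytes >> 7)
 * minus one.
 */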

/*
 * Must be called in process context. The caller must hold the
 * cpus_lock.
 */
static int pseries_lpar_resize_hpt(unsigned long shift)
{
        struct hpt_resize_state state = {
                .shift = shift,
                .commit_rc = H_FUNCTION,
        };
        unsigned int delay, total_delay = 0;
        int rc;
        ktime_t t0, t1, t2;

        might_sleep();

        if (!firmware_has_feature(FW_FEATURE_HPT_RESIZE))
                return -ENODEV;

        pr_info("Attempting to resize HPT to shift %lu\n", shift);

        t0 = ktime_get();

        rc = plpar_resize_hpt_prepare(0, shift);
        while (H_IS_LONG_BUSY(rc)) {
                delay = get_longbusy_msecs(rc);
                total_delay += delay;
                if (total_delay > HPT_RESIZE_TIMEOUT) {
                        /* prepare with shift==0 cancels an in-progress resize */
                        rc = plpar_resize_hpt_prepare(0, 0);
                        if (rc != H_SUCCESS)
                                pr_warn("Unexpected error %d cancelling timed out HPT resize\n",
                                        rc);
                        return -ETIMEDOUT;
                }
                msleep(delay);
                rc = plpar_resize_hpt_prepare(0, shift);
        }

        switch (rc) {
        case H_SUCCESS:
                /* Continue on */
                break;

        case H_PARAMETER:
                pr_warn("Invalid argument from H_RESIZE_HPT_PREPARE\n");
                return -EINVAL;
        case H_RESOURCE:
                pr_warn("Operation not permitted from H_RESIZE_HPT_PREPARE\n");
                return -EPERM;
        default:
                pr_warn("Unexpected error %d from H_RESIZE_HPT_PREPARE\n", rc);
                return -EIO;
        }

        t1 = ktime_get();

        rc = stop_machine_cpuslocked(pseries_lpar_resize_hpt_commit,
                                     &state, NULL);

        t2 = ktime_get();

        if (rc != 0) {
                switch (state.commit_rc) {
                case H_PTEG_FULL:
                        return -ENOSPC;

                default:
                        pr_warn("Unexpected error %d from H_RESIZE_HPT_COMMIT\n",
                                state.commit_rc);
                        return -EIO;
                }
        }

        pr_info("HPT resize to shift %lu complete (%lld ms / %lld ms)\n",
                shift, (long long) ktime_ms_delta(t1, t0),
                (long long) ktime_ms_delta(t2, t1));

        return 0;
}
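
/*
 * Install the hcall-backed hash MMU operations used when running as an
 * LPAR guest; resize_hpt is only wired up if the firmware advertises
 * FW_FEATURE_HPT_RESIZE.
 */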
void __init hpte_init_pseries(void)
{
        mmu_hash_ops.hpte_invalidate = pSeries_lpar_hpte_invalidate;
        mmu_hash_ops.hpte_updatepp = pSeries_lpar_hpte_updatepp;
        mmu_hash_ops.hpte_updateboltedpp = pSeries_lpar_hpte_updateboltedpp;
        mmu_hash_ops.hpte_insert = pSeries_lpar_hpte_insert;
        mmu_hash_ops.hpte_remove = pSeries_lpar_hpte_remove;
        mmu_hash_ops.hpte_removebolted = pSeries_lpar_hpte_removebolted;
        mmu_hash_ops.flush_hash_range = pSeries_lpar_flush_hash_range;
        mmu_hash_ops.hpte_clear_all = pseries_hpte_clear_all;
        mmu_hash_ops.hugepage_invalidate = pSeries_lpar_hugepage_invalidate;

        if (firmware_has_feature(FW_FEATURE_HPT_RESIZE))
                mmu_hash_ops.resize_hpt = pseries_lpar_resize_hpt;

        /*
         * On POWER9, we need to do a H_REGISTER_PROC_TBL hcall
         * to inform the hypervisor that we wish to use the HPT.
         */
        if (cpu_has_feature(CPU_FTR_ARCH_300))
                pseries_lpar_register_process_table(0, 0, 0);
}
#endif /* CONFIG_PPC_64S_HASH_MMU */

#ifdef CONFIG_PPC_RADIX_MMU
void __init radix_init_pseries(void)
{
        pr_info("Using radix MMU under hypervisor\n");

        pseries_lpar_register_process_table(__pa(process_tb),
                                            0, PRTB_SIZE_SHIFT - 12);
}
#endif
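
/*
 * Cooperative Memory Overcommit (CMO) free page hinting: when enabled, pages
 * freed by the kernel are reported to the hypervisor as unused via
 * H_PAGE_INIT so their backing memory can be reclaimed.
 */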
#ifdef CONFIG_PPC_SMLPAR
#define CMO_FREE_HINT_DEFAULT 1
static int cmo_free_hint_flag = CMO_FREE_HINT_DEFAULT;

static int __init cmo_free_hint(char *str)
{
        char *parm;

        parm = strstrip(str);

        if (strcasecmp(parm, "no") == 0 || strcasecmp(parm, "off") == 0) {
                pr_info("%s: CMO free page hinting is not active.\n", __func__);
                cmo_free_hint_flag = 0;
                return 1;
        }

        cmo_free_hint_flag = 1;
        pr_info("%s: CMO free page hinting is active.\n", __func__);

        if (strcasecmp(parm, "yes") == 0 || strcasecmp(parm, "on") == 0)
                return 1;

        return 0;
}

__setup("cmo_free_hint=", cmo_free_hint);
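
/*
 * Mark every firmware-sized (cmo_get_page_size()) chunk backing the given
 * 2^order kernel pages with @state via the H_PAGE_INIT hcall.
 */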
static void pSeries_set_page_state(struct page *page, int order,
                                   unsigned long state)
{
        int i, j;
        unsigned long cmo_page_sz, addr;

        cmo_page_sz = cmo_get_page_size();
        addr = __pa((unsigned long)page_address(page));

        for (i = 0; i < (1 << order); i++, addr += PAGE_SIZE) {
                for (j = 0; j < PAGE_SIZE; j += cmo_page_sz)
                        plpar_hcall_norets(H_PAGE_INIT, state, addr + j, 0);
        }
}
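
/*
 * Hint freed pages back to the hypervisor. Only relevant on hash MMU guests
 * with CMO active and hinting not disabled on the command line.
 */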
void arch_free_page(struct page *page, int order)
{
        if (radix_enabled())
                return;
        if (!cmo_free_hint_flag || !firmware_has_feature(FW_FEATURE_CMO))
                return;

        pSeries_set_page_state(page, order, H_PAGE_SET_UNUSED);
}
EXPORT_SYMBOL(arch_free_page);

#endif /* CONFIG_PPC_SMLPAR */
#endif /* CONFIG_PPC_BOOK3S_64 */

#ifdef CONFIG_TRACEPOINTS

#ifdef CONFIG_JUMP_LABEL
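/*
 * With jump labels, the hcall entry/exit tracepoint checks compile down to a
 * patched branch: registering a tracepoint increments the static key (turning
 * tracing on), unregistering decrements it.
 */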
struct static_key hcall_tracepoint_key = STATIC_KEY_INIT;

int hcall_tracepoint_regfunc(void)
{
        static_key_slow_inc(&hcall_tracepoint_key);
        return 0;
}

void hcall_tracepoint_unregfunc(void)
{
        static_key_slow_dec(&hcall_tracepoint_key);
}
#else

/*
 * We optimise our hcall path by placing hcall_tracepoint_refcount
 * directly in the TOC so we can check if the hcall tracepoints are
 * enabled via a single load.
 */

/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
extern long hcall_tracepoint_refcount;

int hcall_tracepoint_regfunc(void)
{
        hcall_tracepoint_refcount++;
        return 0;
}

void hcall_tracepoint_unregfunc(void)
{
        hcall_tracepoint_refcount--;
}
#endif

/*
 * Keep track of hcall tracing depth and prevent recursion. Warn if any is
 * detected because it may indicate a problem. This will not catch all
 * problems with tracing code making hcalls, because the tracing might have
 * been invoked from a non-hcall, so the first hcall could recurse into it
 * without warning here, but this is better than nothing.
 *
 * Hcalls with specific problems being traced should use the _notrace
 * plpar_hcall variants.
 */
static DEFINE_PER_CPU(unsigned int, hcall_trace_depth);
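
/*
 * Note: preemption is disabled in __trace_hcall_entry() and only re-enabled
 * in __trace_hcall_exit(), so the entry and exit events of a traced hcall
 * are accounted on the same CPU's depth counter.
 */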
notrace void __trace_hcall_entry(unsigned long opcode, unsigned long *args)
{
        unsigned long flags;
        unsigned int *depth;

        local_irq_save(flags);

        depth = this_cpu_ptr(&hcall_trace_depth);

        if (WARN_ON_ONCE(*depth))
                goto out;

        (*depth)++;
        preempt_disable();
        trace_hcall_entry(opcode, args);
        (*depth)--;

out:
        local_irq_restore(flags);
}

notrace void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf)
{
        unsigned long flags;
        unsigned int *depth;

        local_irq_save(flags);

        depth = this_cpu_ptr(&hcall_trace_depth);

        if (*depth) /* Don't warn again on the way out */
                goto out;

        (*depth)++;
        trace_hcall_exit(opcode, retval, retbuf);
        preempt_enable();
        (*depth)--;

out:
        local_irq_restore(flags);
}
#endif

/**
 * h_get_mpp
 * H_GET_MPP hcall returns info in 7 parms
 */
int h_get_mpp(struct hvcall_mpp_data *mpp_data)
{
        int rc;
        unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];

        rc = plpar_hcall9(H_GET_MPP, retbuf);

        mpp_data->entitled_mem = retbuf[0];
        mpp_data->mapped_mem = retbuf[1];

        mpp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
        mpp_data->pool_num = retbuf[2] & 0xffff;

        mpp_data->mem_weight = (retbuf[3] >> 7 * 8) & 0xff;
        mpp_data->unallocated_mem_weight = (retbuf[3] >> 6 * 8) & 0xff;
        mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffffUL;

        mpp_data->pool_size = retbuf[4];
        mpp_data->loan_request = retbuf[5];
        mpp_data->backing_mem = retbuf[6];

        return rc;
}
EXPORT_SYMBOL(h_get_mpp);
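
/*
 * H_GET_MPP_X returns extended memory performance counters: coalesced bytes
 * for this partition and its pool, plus pool PURR/SPURR cycles.
 */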
int h_get_mpp_x(struct hvcall_mpp_x_data *mpp_x_data)
{
        int rc;
        unsigned long retbuf[PLPAR_HCALL9_BUFSIZE] = { 0 };

        rc = plpar_hcall9(H_GET_MPP_X, retbuf);

        mpp_x_data->coalesced_bytes = retbuf[0];
        mpp_x_data->pool_coalesced_bytes = retbuf[1];
        mpp_x_data->pool_purr_cycles = retbuf[2];
        mpp_x_data->pool_spurr_cycles = retbuf[3];

        return rc;
}

#ifdef CONFIG_PPC_64S_HASH_MMU
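/*
 * Invert the VSID scramble: recover the protovsid from a VSID by multiplying
 * with the precomputed modular inverse of the scramble multiplier, splitting
 * the multiplication where a straight 64-bit product would overflow.
 */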
static unsigned long __init vsid_unscramble(unsigned long vsid, int ssize)
{
        unsigned long protovsid;
        unsigned long va_bits = VA_BITS;
        unsigned long modinv, vsid_modulus;
        unsigned long max_mod_inv, tmp_modinv;

        if (!mmu_has_feature(MMU_FTR_68_BIT_VA))
                va_bits = 65;

        if (ssize == MMU_SEGSIZE_256M) {
                modinv = VSID_MULINV_256M;
                vsid_modulus = ((1UL << (va_bits - SID_SHIFT)) - 1);
        } else {
                modinv = VSID_MULINV_1T;
                vsid_modulus = ((1UL << (va_bits - SID_SHIFT_1T)) - 1);
        }

        /*
         * vsid outside our range.
         */
        if (vsid >= vsid_modulus)
                return 0;

        /*
         * If modinv is the modular multiplicative inverse of (x % vsid_modulus)
         * and vsid = (protovsid * x) % vsid_modulus, then we say:
         *      protovsid = (vsid * modinv) % vsid_modulus
         */

        /* Check if (vsid * modinv) overflow (63 bits) */
        max_mod_inv = 0x7fffffffffffffffull / vsid;
        if (modinv < max_mod_inv)
                return (vsid * modinv) % vsid_modulus;

        tmp_modinv = modinv / max_mod_inv;
        modinv %= max_mod_inv;

        protovsid = (((vsid * max_mod_inv) % vsid_modulus) * tmp_modinv) % vsid_modulus;
        protovsid = (protovsid + vsid * modinv) % vsid_modulus;

        return protovsid;
}

static int __init reserve_vrma_context_id(void)
{
        unsigned long protovsid;

        /*
         * Reserve context ids which map to reserved virtual addresses. For now
         * we only reserve the context id which maps to the VRMA VSID. We ignore
         * the addresses in "ibm,adjunct-virtual-addresses" because we don't
         * enable adjunct support via the "ibm,client-architecture-support"
         * interface.
         */
        protovsid = vsid_unscramble(VRMA_VSID, MMU_SEGSIZE_1T);
        hash__reserve_context_id(protovsid >> ESID_BITS_1T);
        return 0;
}
machine_device_initcall(pseries, reserve_vrma_context_id);
#endif

#ifdef CONFIG_DEBUG_FS

/* debugfs file interface for vpa data */
static ssize_t vpa_file_read(struct file *filp, char __user *buf, size_t len,
                             loff_t *pos)
{
        int cpu = (long)filp->private_data;
        struct lppaca *lppaca = &lppaca_of(cpu);

        return simple_read_from_buffer(buf, len, pos, lppaca,
                                       sizeof(struct lppaca));
}

static const struct file_operations vpa_fops = {
        .open           = simple_open,
        .read           = vpa_file_read,
        .llseek         = default_llseek,
};
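
/*
 * On shared-processor LPARs, expose each possible CPU's lppaca (VPA) as a
 * read-only debugfs file named "vpa/cpu-<n>" under the powerpc debugfs
 * directory.
 */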
static int __init vpa_debugfs_init(void)
{
        char name[16];
        long i;
        struct dentry *vpa_dir;

        if (!firmware_has_feature(FW_FEATURE_SPLPAR))
                return 0;

        vpa_dir = debugfs_create_dir("vpa", arch_debugfs_dir);

        /* set up the per-cpu vpa file */
        for_each_possible_cpu(i) {
                sprintf(name, "cpu-%ld", i);
                debugfs_create_file(name, 0400, vpa_dir, (void *)i, &vpa_fops);
        }

        return 0;
}
machine_arch_initcall(pseries, vpa_debugfs_init);
#endif /* CONFIG_DEBUG_FS */