// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86 APERF/MPERF KHz calculation for
 * /sys/.../cpufreq/scaling_cur_freq
 *
 * Copyright (C) 2017 Intel Corp.
 * Author: Len Brown <[email protected]>
 */
#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/sched/isolation.h>
#include <linux/sched/topology.h>
#include <linux/smp.h>
#include <linux/syscore_ops.h>

#include <asm/cpu.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>

#include "cpu.h"
struct aperfmperf {
        seqcount_t      seq;            /* protects last_update/acnt/mcnt */
        unsigned long   last_update;    /* jiffies of the last tick sample */
        u64             acnt;           /* APERF delta over the last tick */
        u64             mcnt;           /* MPERF delta over the last tick */
        u64             aperf;          /* last raw APERF snapshot */
        u64             mperf;          /* last raw MPERF snapshot */
};
static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = {
        .seq = SEQCNT_ZERO(cpu_samples.seq)
};

static void init_counter_refs(void)
{
        u64 aperf, mperf;

        rdmsrl(MSR_IA32_APERF, aperf);
        rdmsrl(MSR_IA32_MPERF, mperf);

        this_cpu_write(cpu_samples.aperf, aperf);
        this_cpu_write(cpu_samples.mperf, mperf);
}
#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
/*
 * APERF/MPERF frequency ratio computation.
 *
 * The scheduler wants to do frequency invariant accounting and needs a <1
 * ratio to account for the 'current' frequency, corresponding to
 * freq_curr / freq_max.
 *
 * Since the frequency freq_curr on x86 is controlled by a micro-controller
 * and our P-state setting is little more than a request/hint, we need to
 * observe the effective frequency 'BusyMHz', i.e. the average frequency over
 * a time interval after discarding idle time. This is given by:
 *
 *            BusyMHz = delta_APERF / delta_MPERF * freq_base
 *
 * where freq_base is the max non-turbo P-state.
 *
 * The freq_max term has to be set to a somewhat arbitrary value, because we
 * can't know which turbo states will be available at a given point in time:
 * it all depends on the thermal headroom of the entire package. We set it to
 * the turbo level with 4 cores active.
 *
 * Benchmarks show that's a good compromise between the 1C turbo ratio
 * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
 * which would ignore the entire turbo range (a conspicuous part, making
 * freq_curr/freq_max always maxed out).
 *
 * An exception to the heuristic above is the Atom uarch, where we choose the
 * highest turbo level for freq_max since Atoms are generally oriented towards
 * power efficiency.
 *
 * Setting freq_max to anything less than the 1C turbo ratio causes the ratio
 * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1.
 */
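/*
 * Worked example with made-up numbers (no particular SKU): freq_base =
 * 2000 MHz and a 4C turbo of 3000 MHz give
 *
 *   arch_turbo_freq_ratio = 3000 * SCHED_CAPACITY_SCALE / 2000 = 1536
 *
 * A tick interval during which the core averaged 2400 MHz while busy has
 * delta_APERF / delta_MPERF = 1.2, so scale_freq_tick() below computes
 *
 *   freq_scale = 1.2 * SCHED_CAPACITY_SCALE^2 / 1536 = ~819
 *
 * i.e. 2400/3000 of full capacity.
 */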
DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);

static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;

void arch_set_max_freq_ratio(bool turbo_disabled)
{
        arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
                                               arch_turbo_freq_ratio;
}
EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);

static bool __init turbo_disabled(void)
{
        u64 misc_en;
        int err;

        err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
        if (err)
                return false;

        return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
}

static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
        int err;

        err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq);
        if (err)
                return false;

        err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
        if (err)
                return false;

        *base_freq = (*base_freq >> 16) & 0x3F;         /* max P state */
        *turbo_freq = *turbo_freq & 0x3F;               /* 1C turbo    */

        return true;
}

#define X86_MATCH(model)                                        \
        X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6,            \
                INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)

static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
        X86_MATCH(XEON_PHI_KNL),
        X86_MATCH(XEON_PHI_KNM),
        {}
};

static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
        X86_MATCH(SKYLAKE_X),
        {}
};

static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
        X86_MATCH(ATOM_GOLDMONT),
        X86_MATCH(ATOM_GOLDMONT_D),
        X86_MATCH(ATOM_GOLDMONT_PLUS),
        {}
};
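/*
 * Xeon Phi (KNL/KNM) encodes turbo ratios in MSR_TURBO_RATIO_LIMIT as a
 * starting ratio in bits 15:8 followed by 8-bit groups whose upper three
 * bits hold a ratio delta: walk the groups, subtracting each non-zero
 * delta, until num_delta_fratio deltas have been applied.
 */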
static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
                                          int num_delta_fratio)
{
        int fratio, delta_fratio, found;
        int err, i;
        u64 msr;

        err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
        if (err)
                return false;

        *base_freq = (*base_freq >> 8) & 0xFF;          /* max P state */

        err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
        if (err)
                return false;

        fratio = (msr >> 8) & 0xFF;
        i = 16;
        found = 0;
        do {
                if (found >= num_delta_fratio) {
                        *turbo_freq = fratio;
                        return true;
                }

                delta_fratio = (msr >> (i + 5)) & 0x7;

                if (delta_fratio) {
                        found += 1;
                        fratio -= delta_fratio;
                }

                i += 8;
        } while (i < 64);

        return true;
}
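/*
 * Skylake-X style decoding: MSR_TURBO_RATIO_LIMIT and MSR_TURBO_RATIO_LIMIT1
 * are treated as parallel byte arrays of {turbo ratio, group core count};
 * return the ratio of the first group covering at least 'size' active cores.
 */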
static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
{
        u64 ratios, counts;
        u32 group_size;
        int err, i;

        err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
        if (err)
                return false;

        *base_freq = (*base_freq >> 8) & 0xFF;          /* max P state */

        err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
        if (err)
                return false;

        err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
        if (err)
                return false;

        for (i = 0; i < 64; i += 8) {
                group_size = (counts >> i) & 0xFF;
                if (group_size >= size) {
                        *turbo_freq = (ratios >> i) & 0xFF;
                        return true;
                }
        }

        return false;
}
static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
        u64 msr;
        int err;

        err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
        if (err)
                return false;

        err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
        if (err)
                return false;

        *base_freq = (*base_freq >> 8) & 0xFF;          /* max P state */
        *turbo_freq = (msr >> 24) & 0xFF;               /* 4C turbo    */

        /* The CPU may have fewer than 4 cores */
        if (!*turbo_freq)
                *turbo_freq = msr & 0xFF;               /* 1C turbo    */

        return true;
}
static bool __init intel_set_max_freq_ratio(void)
{
        u64 base_freq, turbo_freq;
        u64 turbo_ratio;

        if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
                goto out;

        if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
            skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
                goto out;

        if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
            knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
                goto out;

        if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
            skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
                goto out;

        if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
                goto out;

        return false;

out:
        /*
         * Some hypervisors advertise X86_FEATURE_APERFMPERF
         * but then fill all MSRs with zeroes.
         * Some CPUs have turbo boost but don't declare any turbo ratio
         * in MSR_TURBO_RATIO_LIMIT.
         */
        if (!base_freq || !turbo_freq) {
                pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
                return false;
        }

        turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
        if (!turbo_ratio) {
                pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
                return false;
        }

        arch_turbo_freq_ratio = turbo_ratio;
        arch_set_max_freq_ratio(turbo_disabled());

        return true;
}
#ifdef CONFIG_PM_SLEEP
static struct syscore_ops freq_invariance_syscore_ops = {
        .resume = init_counter_refs,
};

static void register_freq_invariance_syscore_ops(void)
{
        register_syscore_ops(&freq_invariance_syscore_ops);
}
#else
static inline void register_freq_invariance_syscore_ops(void) {}
#endif

static void freq_invariance_enable(void)
{
        if (static_branch_unlikely(&arch_scale_freq_key)) {
                WARN_ON_ONCE(1);
                return;
        }
        static_branch_enable(&arch_scale_freq_key);
        register_freq_invariance_syscore_ops();
        pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
}

void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled)
{
        arch_turbo_freq_ratio = ratio;
        arch_set_max_freq_ratio(turbo_disabled);
        freq_invariance_enable();
}
static void __init bp_init_freq_invariance(void)
{
        if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
                return;

        if (intel_set_max_freq_ratio())
                freq_invariance_enable();
}

static void disable_freq_invariance_workfn(struct work_struct *work)
{
        int cpu;

        static_branch_disable(&arch_scale_freq_key);

        /*
         * Set arch_freq_scale to a default value on all CPUs.
         * This negates the effect of scaling.
         */
        for_each_possible_cpu(cpu)
                per_cpu(arch_freq_scale, cpu) = SCHED_CAPACITY_SCALE;
}

static DECLARE_WORK(disable_freq_invariance_work,
                    disable_freq_invariance_workfn);

DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
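/*
 * Fold the APERF/MPERF deltas of the last tick into arch_freq_scale:
 *
 *   freq_scale = (acnt / mcnt) * SCHED_CAPACITY_SCALE^2 / arch_max_freq_ratio
 *              ~ (freq_curr / freq_max) * SCHED_CAPACITY_SCALE
 *
 * clamped to SCHED_CAPACITY_SCALE.  On overflow or a zero result the counters
 * can no longer be trusted and frequency invariance is torn down via
 * disable_freq_invariance_work.
 */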
static void scale_freq_tick(u64 acnt, u64 mcnt)
{
        u64 freq_scale;

        if (!arch_scale_freq_invariant())
                return;

        if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
                goto error;

        if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
                goto error;

        freq_scale = div64_u64(acnt, mcnt);
        if (!freq_scale)
                goto error;

        if (freq_scale > SCHED_CAPACITY_SCALE)
                freq_scale = SCHED_CAPACITY_SCALE;

        this_cpu_write(arch_freq_scale, freq_scale);
        return;

error:
        pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
        schedule_work(&disable_freq_invariance_work);
}
#else
static inline void bp_init_freq_invariance(void) { }
static inline void scale_freq_tick(u64 acnt, u64 mcnt) { }
#endif /* CONFIG_X86_64 && CONFIG_SMP */
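/*
 * Called from the scheduler tick: sample APERF/MPERF, publish the deltas
 * under the seqcount for lockless consumption by arch_freq_get_on_cpu(),
 * and feed them into the frequency invariance machinery above.
 */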
void arch_scale_freq_tick(void)
{
        struct aperfmperf *s = this_cpu_ptr(&cpu_samples);
        u64 acnt, mcnt, aperf, mperf;

        if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
                return;

        rdmsrl(MSR_IA32_APERF, aperf);
        rdmsrl(MSR_IA32_MPERF, mperf);
        acnt = aperf - s->aperf;
        mcnt = mperf - s->mperf;

        s->aperf = aperf;
        s->mperf = mperf;

        raw_write_seqcount_begin(&s->seq);
        s->last_update = jiffies;
        s->acnt = acnt;
        s->mcnt = mcnt;
        raw_write_seqcount_end(&s->seq);

        scale_freq_tick(acnt, mcnt);
}
/*
 * Discard samples older than the defined maximum sample age of 20ms. There
 * is no point in sending IPIs in such a case. If the scheduler tick was
 * not running then the CPU is either idle or isolated.
 */
#define MAX_SAMPLE_AGE  ((unsigned long)HZ / 50)
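/*
 * Report the effective frequency of @cpu in kHz for scaling_cur_freq:
 * cpu_khz scaled by the APERF/MPERF delta of the most recent tick.  Fall
 * back to cpufreq_quick_get() (and ultimately cpu_khz) when the counters
 * are unavailable or the sample is stale.
 */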
unsigned int arch_freq_get_on_cpu(int cpu)
{
        struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
        unsigned int seq, freq;
        unsigned long last;
        u64 acnt, mcnt;

        if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
                goto fallback;

        do {
                seq = raw_read_seqcount_begin(&s->seq);
                last = s->last_update;
                acnt = s->acnt;
                mcnt = s->mcnt;
        } while (read_seqcount_retry(&s->seq, seq));

        /*
         * Bail on invalid count and when the last update was too long ago,
         * which covers idle and NOHZ full CPUs.
         */
        if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE)
                goto fallback;

        return div64_u64((cpu_khz * acnt), mcnt);

fallback:
        freq = cpufreq_quick_get(cpu);
        return freq ? freq : cpu_khz;
}
static int __init bp_init_aperfmperf(void)
{
        if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
                return 0;

        init_counter_refs();
        bp_init_freq_invariance();
        return 0;
}
early_initcall(bp_init_aperfmperf);
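/*
 * AP counterpart of bp_init_aperfmperf(): seed the APERF/MPERF reference
 * counters when a secondary CPU is brought up.
 */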
void ap_init_aperfmperf(void)
{
        if (cpu_feature_enabled(X86_FEATURE_APERFMPERF))
                init_counter_refs();
}