energy_model.c 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Energy Model of devices
  4. *
  5. * Copyright (c) 2018-2021, Arm ltd.
  6. * Written by: Quentin Perret, Arm ltd.
  7. * Improvements provided by: Lukasz Luba, Arm ltd.
  8. */
  9. #define pr_fmt(fmt) "energy_model: " fmt
  10. #include <linux/cpu.h>
  11. #include <linux/cpufreq.h>
  12. #include <linux/cpumask.h>
  13. #include <linux/debugfs.h>
  14. #include <linux/energy_model.h>
  15. #include <linux/sched/topology.h>
  16. #include <linux/slab.h>
  17. /*
  18. * Mutex serializing the registrations of performance domains and letting
  19. * callbacks defined by drivers sleep.
  20. */
  21. static DEFINE_MUTEX(em_pd_mutex);
  22. static bool _is_cpu_device(struct device *dev)
  23. {
  24. return (dev->bus == &cpu_subsys);
  25. }
  26. #ifdef CONFIG_DEBUG_FS
  27. static struct dentry *rootdir;
  28. static void em_debug_create_ps(struct em_perf_state *ps, struct dentry *pd)
  29. {
  30. struct dentry *d;
  31. char name[24];
  32. snprintf(name, sizeof(name), "ps:%lu", ps->frequency);
  33. /* Create per-ps directory */
  34. d = debugfs_create_dir(name, pd);
  35. debugfs_create_ulong("frequency", 0444, d, &ps->frequency);
  36. debugfs_create_ulong("power", 0444, d, &ps->power);
  37. debugfs_create_ulong("cost", 0444, d, &ps->cost);
  38. debugfs_create_ulong("inefficient", 0444, d, &ps->flags);
  39. }
/* debugfs "cpus" file: print the CPU list spanned by this perf domain. */
static int em_debug_cpus_show(struct seq_file *s, void *unused)
{
	seq_printf(s, "%*pbl\n", cpumask_pr_args(to_cpumask(s->private)));

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);
/* debugfs "flags" file: print the domain-wide EM flags in hex. */
static int em_debug_flags_show(struct seq_file *s, void *unused)
{
	struct em_perf_domain *pd = s->private;

	seq_printf(s, "%#lx\n", pd->flags);

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(em_debug_flags);
  53. static void em_debug_create_pd(struct device *dev)
  54. {
  55. struct dentry *d;
  56. int i;
  57. /* Create the directory of the performance domain */
  58. d = debugfs_create_dir(dev_name(dev), rootdir);
  59. if (_is_cpu_device(dev))
  60. debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus,
  61. &em_debug_cpus_fops);
  62. debugfs_create_file("flags", 0444, d, dev->em_pd,
  63. &em_debug_flags_fops);
  64. /* Create a sub-directory for each performance state */
  65. for (i = 0; i < dev->em_pd->nr_perf_states; i++)
  66. em_debug_create_ps(&dev->em_pd->table[i], d);
  67. }
/*
 * Tear down the debugfs directory of a perf domain. The directory name is
 * the device name, matching what em_debug_create_pd() created.
 */
static void em_debug_remove_pd(struct device *dev)
{
	debugfs_lookup_and_remove(dev_name(dev), rootdir);
}
/* Boot-time setup of the debugfs root; always succeeds from the caller's view. */
static int __init em_debug_init(void)
{
	/* Create /sys/kernel/debug/energy_model directory */
	rootdir = debugfs_create_dir("energy_model", NULL);

	return 0;
}
fs_initcall(em_debug_init);
#else /* CONFIG_DEBUG_FS */
/* No-op stubs when debugfs support is compiled out. */
static void em_debug_create_pd(struct device *dev) {}
static void em_debug_remove_pd(struct device *dev) {}
#endif
/*
 * Build and validate the table of performance states for @pd by querying
 * the driver callbacks in @cb, then pre-compute each state's 'cost' and
 * flag inefficient states. Returns 0 on success, -ENOMEM on allocation
 * failure or -EINVAL on invalid driver data; on failure the partially
 * built table is freed.
 */
static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
				int nr_states, struct em_data_callback *cb,
				unsigned long flags)
{
	unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX;
	struct em_perf_state *table;
	int i, ret;
	u64 fmax;

	table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL);
	if (!table)
		return -ENOMEM;

	/* Build the list of performance states for this performance domain */
	for (i = 0, freq = 0; i < nr_states; i++, freq++) {
		/*
		 * active_power() is a driver callback which ceils 'freq' to
		 * lowest performance state of 'dev' above 'freq' and updates
		 * 'power' and 'freq' accordingly.
		 */
		ret = cb->active_power(dev, &power, &freq);
		if (ret) {
			dev_err(dev, "EM: invalid perf. state: %d\n",
				ret);
			goto free_ps_table;
		}

		/*
		 * We expect the driver callback to increase the frequency for
		 * higher performance states.
		 */
		if (freq <= prev_freq) {
			dev_err(dev, "EM: non-increasing freq: %lu\n",
				freq);
			goto free_ps_table;
		}

		/*
		 * The power returned by active_state() is expected to be
		 * positive and be in range.
		 */
		if (!power || power > EM_MAX_POWER) {
			dev_err(dev, "EM: invalid power: %lu\n",
				power);
			goto free_ps_table;
		}

		table[i].power = power;
		table[i].frequency = prev_freq = freq;
	}

	/*
	 * Compute the cost of each performance state. Walking from the
	 * highest state downwards lets prev_cost track the lowest cost seen
	 * so far, so any state at least as costly as a faster one gets
	 * flagged as inefficient.
	 */
	fmax = (u64) table[nr_states - 1].frequency;
	for (i = nr_states - 1; i >= 0; i--) {
		unsigned long power_res, cost;

		if (flags & EM_PERF_DOMAIN_ARTIFICIAL) {
			/* Abstract power scale: the driver supplies the cost */
			ret = cb->get_cost(dev, table[i].frequency, &cost);
			if (ret || !cost || cost > EM_MAX_POWER) {
				dev_err(dev, "EM: invalid cost %lu %d\n",
					cost, ret);
				goto free_ps_table;
			}
		} else {
			power_res = table[i].power;
			cost = div64_u64(fmax * power_res, table[i].frequency);
		}

		table[i].cost = cost;

		if (table[i].cost >= prev_cost) {
			table[i].flags = EM_PERF_STATE_INEFFICIENT;
			dev_dbg(dev, "EM: OPP:%lu is inefficient\n",
				table[i].frequency);
		} else {
			prev_cost = table[i].cost;
		}
	}

	pd->table = table;
	pd->nr_perf_states = nr_states;

	return 0;

free_ps_table:
	kfree(table);
	return -EINVAL;
}
  159. static int em_create_pd(struct device *dev, int nr_states,
  160. struct em_data_callback *cb, cpumask_t *cpus,
  161. unsigned long flags)
  162. {
  163. struct em_perf_domain *pd;
  164. struct device *cpu_dev;
  165. int cpu, ret, num_cpus;
  166. if (_is_cpu_device(dev)) {
  167. num_cpus = cpumask_weight(cpus);
  168. /* Prevent max possible energy calculation to not overflow */
  169. if (num_cpus > EM_MAX_NUM_CPUS) {
  170. dev_err(dev, "EM: too many CPUs, overflow possible\n");
  171. return -EINVAL;
  172. }
  173. pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
  174. if (!pd)
  175. return -ENOMEM;
  176. cpumask_copy(em_span_cpus(pd), cpus);
  177. } else {
  178. pd = kzalloc(sizeof(*pd), GFP_KERNEL);
  179. if (!pd)
  180. return -ENOMEM;
  181. }
  182. ret = em_create_perf_table(dev, pd, nr_states, cb, flags);
  183. if (ret) {
  184. kfree(pd);
  185. return ret;
  186. }
  187. if (_is_cpu_device(dev))
  188. for_each_cpu(cpu, cpus) {
  189. cpu_dev = get_cpu_device(cpu);
  190. cpu_dev->em_pd = pd;
  191. }
  192. dev->em_pd = pd;
  193. return 0;
  194. }
  195. static void em_cpufreq_update_efficiencies(struct device *dev)
  196. {
  197. struct em_perf_domain *pd = dev->em_pd;
  198. struct em_perf_state *table;
  199. struct cpufreq_policy *policy;
  200. int found = 0;
  201. int i;
  202. if (!_is_cpu_device(dev) || !pd)
  203. return;
  204. policy = cpufreq_cpu_get(cpumask_first(em_span_cpus(pd)));
  205. if (!policy) {
  206. dev_warn(dev, "EM: Access to CPUFreq policy failed");
  207. return;
  208. }
  209. table = pd->table;
  210. for (i = 0; i < pd->nr_perf_states; i++) {
  211. if (!(table[i].flags & EM_PERF_STATE_INEFFICIENT))
  212. continue;
  213. if (!cpufreq_table_set_inefficient(policy, table[i].frequency))
  214. found++;
  215. }
  216. cpufreq_cpu_put(policy);
  217. if (!found)
  218. return;
  219. /*
  220. * Efficiencies have been installed in CPUFreq, inefficient frequencies
  221. * will be skipped. The EM can do the same.
  222. */
  223. pd->flags |= EM_PERF_DOMAIN_SKIP_INEFFICIENCIES;
  224. }
  225. /**
  226. * em_pd_get() - Return the performance domain for a device
  227. * @dev : Device to find the performance domain for
  228. *
  229. * Returns the performance domain to which @dev belongs, or NULL if it doesn't
  230. * exist.
  231. */
  232. struct em_perf_domain *em_pd_get(struct device *dev)
  233. {
  234. if (IS_ERR_OR_NULL(dev))
  235. return NULL;
  236. return dev->em_pd;
  237. }
  238. EXPORT_SYMBOL_GPL(em_pd_get);
  239. /**
  240. * em_cpu_get() - Return the performance domain for a CPU
  241. * @cpu : CPU to find the performance domain for
  242. *
  243. * Returns the performance domain to which @cpu belongs, or NULL if it doesn't
  244. * exist.
  245. */
  246. struct em_perf_domain *em_cpu_get(int cpu)
  247. {
  248. struct device *cpu_dev;
  249. cpu_dev = get_cpu_device(cpu);
  250. if (!cpu_dev)
  251. return NULL;
  252. return em_pd_get(cpu_dev);
  253. }
  254. EXPORT_SYMBOL_GPL(em_cpu_get);
/**
 * em_dev_register_perf_domain() - Register the Energy Model (EM) for a device
 * @dev : Device for which the EM is to register
 * @nr_states : Number of performance states to register
 * @cb : Callback functions providing the data of the Energy Model
 * @cpus : Pointer to cpumask_t, which in case of a CPU device is
 * obligatory. It can be taken from i.e. 'policy->cpus'. For other
 * type of devices this should be set to NULL.
 * @microwatts : Flag indicating that the power values are in micro-Watts or
 * in some other scale. It must be set properly.
 *
 * Create Energy Model tables for a performance domain using the callbacks
 * defined in cb.
 *
 * The @microwatts is important to set with correct value. Some kernel
 * sub-systems might rely on this flag and check if all devices in the EM are
 * using the same scale.
 *
 * If multiple clients register the same performance domain, all but the first
 * registration will be ignored.
 *
 * Return 0 on success
 */
int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
				struct em_data_callback *cb, cpumask_t *cpus,
				bool microwatts)
{
	unsigned long cap, prev_cap = 0;
	unsigned long flags = 0;
	int cpu, ret;

	if (!dev || !nr_states || !cb)
		return -EINVAL;

	/*
	 * Use a mutex to serialize the registration of performance domains and
	 * let the driver-defined callback functions sleep.
	 */
	mutex_lock(&em_pd_mutex);

	/* Only the first registration for a device wins */
	if (dev->em_pd) {
		ret = -EEXIST;
		goto unlock;
	}

	if (_is_cpu_device(dev)) {
		/* A cpumask is mandatory for CPU devices */
		if (!cpus) {
			dev_err(dev, "EM: invalid CPU mask\n");
			ret = -EINVAL;
			goto unlock;
		}

		for_each_cpu(cpu, cpus) {
			/* A CPU may belong to at most one perf domain */
			if (em_cpu_get(cpu)) {
				dev_err(dev, "EM: exists for CPU%d\n", cpu);
				ret = -EEXIST;
				goto unlock;
			}
			/*
			 * All CPUs of a domain must have the same
			 * micro-architecture since they all share the same
			 * table.
			 */
			cap = arch_scale_cpu_capacity(cpu);
			if (prev_cap && prev_cap != cap) {
				dev_err(dev, "EM: CPUs of %*pbl must have the same capacity\n",
					cpumask_pr_args(cpus));
				ret = -EINVAL;
				goto unlock;
			}
			prev_cap = cap;
		}
	}

	/*
	 * Without microwatts, a get_cost() callback means the power values
	 * are on an abstract scale: mark the domain as artificial.
	 */
	if (microwatts)
		flags |= EM_PERF_DOMAIN_MICROWATTS;
	else if (cb->get_cost)
		flags |= EM_PERF_DOMAIN_ARTIFICIAL;

	ret = em_create_pd(dev, nr_states, cb, cpus, flags);
	if (ret)
		goto unlock;

	dev->em_pd->flags |= flags;

	em_cpufreq_update_efficiencies(dev);

	em_debug_create_pd(dev);
	dev_info(dev, "EM: created perf domain\n");

unlock:
	mutex_unlock(&em_pd_mutex);

	return ret;
}
EXPORT_SYMBOL_GPL(em_dev_register_perf_domain);
  339. /**
  340. * em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device
  341. * @dev : Device for which the EM is registered
  342. *
  343. * Unregister the EM for the specified @dev (but not a CPU device).
  344. */
  345. void em_dev_unregister_perf_domain(struct device *dev)
  346. {
  347. if (IS_ERR_OR_NULL(dev) || !dev->em_pd)
  348. return;
  349. if (_is_cpu_device(dev))
  350. return;
  351. /*
  352. * The mutex separates all register/unregister requests and protects
  353. * from potential clean-up/setup issues in the debugfs directories.
  354. * The debugfs directory name is the same as device's name.
  355. */
  356. mutex_lock(&em_pd_mutex);
  357. em_debug_remove_pd(dev);
  358. kfree(dev->em_pd->table);
  359. kfree(dev->em_pd);
  360. dev->em_pd = NULL;
  361. mutex_unlock(&em_pd_mutex);
  362. }
  363. EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain);