kmem.c

// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2016-2019 Intel Corporation. All rights reserved. */
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/memory.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/memory-tiers.h>
#include "dax-private.h"
#include "bus.h"

/*
 * Default abstract distance assigned to the NUMA node onlined
 * by DAX/kmem if the low-level platform driver didn't initialize
 * one for this NUMA node.
 */
#define MEMTIER_DEFAULT_DAX_ADISTANCE	(MEMTIER_ADISTANCE_DRAM * 5)

/* Memory resource name used for add_memory_driver_managed(). */
static const char *kmem_name;
/* Set if any memory will remain added when the driver is unloaded. */
static bool any_hotremove_failed;

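/*
 * Trim the i'th device-dax range to memory-block alignment so it can be
 * hotplugged; fail with -ENOSPC if nothing is left after alignment.
 */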
static int dax_kmem_range(struct dev_dax *dev_dax, int i, struct range *r)
{
	struct dev_dax_range *dax_range = &dev_dax->ranges[i];
	struct range *range = &dax_range->range;

	/* memory-block align the hotplug range */
	r->start = ALIGN(range->start, memory_block_size_bytes());
	r->end = ALIGN_DOWN(range->end + 1, memory_block_size_bytes()) - 1;
	if (r->start >= r->end) {
		r->start = range->start;
		r->end = range->end;
		return -ENOSPC;
	}
	return 0;
}

struct dax_kmem_data {
	const char *res_name;
	int mgid;
	struct resource *res[];
};

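/* Default memory type for NUMA nodes onlined by dax/kmem. */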
static struct memory_dev_type *dax_slowmem_type;

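/*
 * Hotplug each memory-block-aligned device-dax range as driver-managed
 * System RAM on the device's target node.
 */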
static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
{
	struct device *dev = &dev_dax->dev;
	unsigned long total_len = 0;
	struct dax_kmem_data *data;
	int i, rc, mapped = 0;
	int numa_node;

	/*
	 * Ensure good NUMA information for the persistent memory.
	 * Without this check, there is a risk that slow memory
	 * could be mixed in a node with faster memory, causing
	 * unavoidable performance issues.
	 */
	numa_node = dev_dax->target_node;
	if (numa_node < 0) {
		dev_warn(dev, "rejecting DAX region with invalid node: %d\n",
				numa_node);
		return -EINVAL;
	}

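	/* First pass: total the aligned ranges to size the memory group. */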
	for (i = 0; i < dev_dax->nr_range; i++) {
		struct range range;

		rc = dax_kmem_range(dev_dax, i, &range);
		if (rc) {
			dev_info(dev, "mapping%d: %#llx-%#llx too small after alignment\n",
					i, range.start, range.end);
			continue;
		}
		total_len += range_len(&range);
	}

	if (!total_len) {
		dev_warn(dev, "rejecting DAX region without any memory after alignment\n");
		return -EINVAL;
	}

	init_node_memory_type(numa_node, dax_slowmem_type);

	rc = -ENOMEM;
	data = kzalloc(struct_size(data, res, dev_dax->nr_range), GFP_KERNEL);
	if (!data)
		goto err_dax_kmem_data;

	data->res_name = kstrdup(dev_name(dev), GFP_KERNEL);
	if (!data->res_name)
		goto err_res_name;

	rc = memory_group_register_static(numa_node, PFN_UP(total_len));
	if (rc < 0)
		goto err_reg_mgid;
	data->mgid = rc;

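	/* Second pass: reserve each aligned range and hot-add it as RAM. */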
	for (i = 0; i < dev_dax->nr_range; i++) {
		struct resource *res;
		struct range range;

		rc = dax_kmem_range(dev_dax, i, &range);
		if (rc)
			continue;

		/* Region is permanently reserved if hotremove fails. */
		res = request_mem_region(range.start, range_len(&range), data->res_name);
		if (!res) {
			dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve region\n",
					i, range.start, range.end);
			/*
			 * Once some memory has been onlined we can't
			 * assume that it can be un-onlined safely.
			 */
			if (mapped)
				continue;
			rc = -EBUSY;
			goto err_request_mem;
		}
		data->res[i] = res;

		/*
		 * Set flags appropriate for System RAM. Leave ..._BUSY clear
		 * so that add_memory() can add a child resource. Do not
		 * inherit flags from the parent since it may set new flags
		 * unknown to us that will break add_memory() below.
		 */
		res->flags = IORESOURCE_SYSTEM_RAM;

		/*
		 * Ensure that future kexec'd kernels will not treat
		 * this as RAM automatically.
		 */
		rc = add_memory_driver_managed(data->mgid, range.start,
				range_len(&range), kmem_name, MHP_NID_IS_MGID);
		if (rc) {
			dev_warn(dev, "mapping%d: %#llx-%#llx memory add failed\n",
					i, range.start, range.end);
			remove_resource(res);
			kfree(res);
			data->res[i] = NULL;
			if (mapped)
				continue;
			goto err_request_mem;
		}
		mapped++;
	}

	dev_set_drvdata(dev, data);

	return 0;

err_request_mem:
	memory_group_unregister(data->mgid);
err_reg_mgid:
	kfree(data->res_name);
err_res_name:
	kfree(data);
err_dax_kmem_data:
	clear_node_memory_type(numa_node, dax_slowmem_type);
	return rc;
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
{
	int i, success = 0;
	int node = dev_dax->target_node;
	struct device *dev = &dev_dax->dev;
	struct dax_kmem_data *data = dev_get_drvdata(dev);

	/*
	 * We have one shot for removing memory. If some memory blocks were
	 * not offlined prior to calling this function, remove_memory() will
	 * fail, and there is no way to hotremove this memory until reboot
	 * because device unbind will succeed even if we return failure.
	 */
	for (i = 0; i < dev_dax->nr_range; i++) {
		struct range range;
		int rc;

		rc = dax_kmem_range(dev_dax, i, &range);
		if (rc)
			continue;

		rc = remove_memory(range.start, range_len(&range));
		if (rc == 0) {
			remove_resource(data->res[i]);
			kfree(data->res[i]);
			data->res[i] = NULL;
			success++;
			continue;
		}
		any_hotremove_failed = true;
		dev_err(dev,
			"mapping%d: %#llx-%#llx cannot be hotremoved until the next reboot\n",
			i, range.start, range.end);
	}

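	/* Only tear everything down once every range has been removed. */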
	if (success >= dev_dax->nr_range) {
		memory_group_unregister(data->mgid);
		kfree(data->res_name);
		kfree(data);
		dev_set_drvdata(dev, NULL);
		/*
		 * Clear the memtype association on successful unplug.
		 * Otherwise we have memory blocks left which can be
		 * offlined/onlined later, and we need to keep the
		 * memory_dev_type for that. This implies this reference
		 * will be around until the next reboot.
		 */
		clear_node_memory_type(node, dax_slowmem_type);
	}
}
#else
static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
{
	/*
	 * Without hotremove, purposely leak the request_mem_region() for the
	 * device-dax range and return '0' to ->remove() attempts. The removal
	 * of the device from the driver always succeeds, but the region is
	 * permanently pinned as reserved by the unreleased
	 * request_mem_region().
	 */
	any_hotremove_failed = true;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

static struct dax_device_driver device_dax_kmem_driver = {
	.probe = dev_dax_kmem_probe,
	.remove = dev_dax_kmem_remove,
};

static int __init dax_kmem_init(void)
{
	int rc;

	/* Resource name is permanently allocated if any hotremove fails. */
	kmem_name = kstrdup_const("System RAM (kmem)", GFP_KERNEL);
	if (!kmem_name)
		return -ENOMEM;

	dax_slowmem_type = alloc_memory_type(MEMTIER_DEFAULT_DAX_ADISTANCE);
	if (IS_ERR(dax_slowmem_type)) {
		rc = PTR_ERR(dax_slowmem_type);
		goto err_dax_slowmem_type;
	}

	rc = dax_driver_register(&device_dax_kmem_driver);
	if (rc)
		goto error_dax_driver;

	return rc;

error_dax_driver:
	destroy_memory_type(dax_slowmem_type);
err_dax_slowmem_type:
	kfree_const(kmem_name);
	return rc;
}

static void __exit dax_kmem_exit(void)
{
	dax_driver_unregister(&device_dax_kmem_driver);
	if (!any_hotremove_failed)
		kfree_const(kmem_name);
	destroy_memory_type(dax_slowmem_type);
}

MODULE_AUTHOR("Intel Corporation");
MODULE_LICENSE("GPL v2");
module_init(dax_kmem_init);
module_exit(dax_kmem_exit);
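/* Register the device-dax modalias ("dax:t0*") for module autoloading. */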
MODULE_ALIAS_DAX_DEVICE(0);