memory.c

// SPDX-License-Identifier: GPL-2.0
/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <[email protected]>
 *            Dave Hansen <[email protected]>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/xarray.h>

#include <linux/atomic.h>
#include <linux/uaccess.h>

#define MEMORY_CLASS_NAME	"memory"

static const char *const online_type_to_str[] = {
        [MMOP_OFFLINE] = "offline",
        [MMOP_ONLINE] = "online",
        [MMOP_ONLINE_KERNEL] = "online_kernel",
        [MMOP_ONLINE_MOVABLE] = "online_movable",
};

int mhp_online_type_from_str(const char *str)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
                if (sysfs_streq(str, online_type_to_str[i]))
                        return i;
        }
        return -EINVAL;
}

#define to_memory_block(dev) container_of(dev, struct memory_block, dev)

static int sections_per_block;

static inline unsigned long memory_block_id(unsigned long section_nr)
{
        return section_nr / sections_per_block;
}

static inline unsigned long pfn_to_block_id(unsigned long pfn)
{
        return memory_block_id(pfn_to_section_nr(pfn));
}

static inline unsigned long phys_to_block_id(unsigned long phys)
{
        return pfn_to_block_id(PFN_DOWN(phys));
}

static int memory_subsys_online(struct device *dev);
static int memory_subsys_offline(struct device *dev);

static struct bus_type memory_subsys = {
        .name = MEMORY_CLASS_NAME,
        .dev_name = MEMORY_CLASS_NAME,
        .online = memory_subsys_online,
        .offline = memory_subsys_offline,
};

/*
 * Memory blocks are cached in a local radix tree to avoid
 * a costly linear search for the corresponding device on
 * the subsystem bus.
 */
static DEFINE_XARRAY(memory_blocks);

/*
 * Memory groups, indexed by memory group id (mgid).
 */
static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC);
#define MEMORY_GROUP_MARK_DYNAMIC	XA_MARK_1

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
        blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);
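
/*
 * Illustrative sketch (not part of this file): a subsystem that wants to
 * react to memory hotplug events registers a callback on the chain above.
 * The callback and notifier_block names below are hypothetical.
 *
 *	static int example_memory_callback(struct notifier_block *nb,
 *					   unsigned long action, void *arg)
 *	{
 *		struct memory_notify *mhp = arg;
 *
 *		switch (action) {
 *		case MEM_GOING_ONLINE:
 *			// inspect mhp->start_pfn / mhp->nr_pages; return
 *			// notifier_from_errno(-ENOMEM) to veto onlining
 *			break;
 *		case MEM_ONLINE:
 *		case MEM_OFFLINE:
 *			break;
 *		}
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block example_memory_nb = {
 *		.notifier_call = example_memory_callback,
 *	};
 *
 *	register_memory_notifier(&example_memory_nb);
 */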

static void memory_block_release(struct device *dev)
{
        struct memory_block *mem = to_memory_block(dev);

        kfree(mem);
}

unsigned long __weak memory_block_size_bytes(void)
{
        return MIN_MEMORY_BLOCK_SIZE;
}
EXPORT_SYMBOL_GPL(memory_block_size_bytes);

/*
 * Show the first physical section index (number) of this memory block.
 */
static ssize_t phys_index_show(struct device *dev,
                               struct device_attribute *attr, char *buf)
{
        struct memory_block *mem = to_memory_block(dev);
        unsigned long phys_index;

        phys_index = mem->start_section_nr / sections_per_block;

        return sysfs_emit(buf, "%08lx\n", phys_index);
}

/*
 * Legacy interface that we cannot remove. Always indicate "removable"
 * with CONFIG_MEMORY_HOTREMOVE - bad heuristic.
 */
static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
                              char *buf)
{
        return sysfs_emit(buf, "%d\n", (int)IS_ENABLED(CONFIG_MEMORY_HOTREMOVE));
}

/*
 * online, offline, going offline, etc.
 */
static ssize_t state_show(struct device *dev, struct device_attribute *attr,
                          char *buf)
{
        struct memory_block *mem = to_memory_block(dev);
        const char *output;

        /*
         * We can probably put these states in a nice little array
         * so that they're not open-coded
         */
        switch (mem->state) {
        case MEM_ONLINE:
                output = "online";
                break;
        case MEM_OFFLINE:
                output = "offline";
                break;
        case MEM_GOING_OFFLINE:
                output = "going-offline";
                break;
        default:
                WARN_ON(1);
                return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state);
        }

        return sysfs_emit(buf, "%s\n", output);
}

int memory_notify(unsigned long val, void *v)
{
        return blocking_notifier_call_chain(&memory_chain, val, v);
}

static int memory_block_online(struct memory_block *mem)
{
        unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
        unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
        unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
        struct zone *zone;
        int ret;

        zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group,
                                  start_pfn, nr_pages);

        /*
         * Although vmemmap pages have a different lifecycle than the pages
         * they describe (they remain until the memory is unplugged), doing
         * their initialization and accounting at memory onlining/offlining
         * stage helps to keep accounting easier to follow - e.g. vmemmap
         * pages belong to the same zone as the memory they back.
         */
        if (nr_vmemmap_pages) {
                ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
                if (ret)
                        return ret;
        }

        ret = online_pages(start_pfn + nr_vmemmap_pages,
                           nr_pages - nr_vmemmap_pages, zone, mem->group);
        if (ret) {
                if (nr_vmemmap_pages)
                        mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
                return ret;
        }

        /*
         * Account once onlining succeeded. If the zone was unpopulated, it is
         * now already properly populated.
         */
        if (nr_vmemmap_pages)
                adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
                                          nr_vmemmap_pages);

        mem->zone = zone;
        return ret;
}

static int memory_block_offline(struct memory_block *mem)
{
        unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
        unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
        unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
        int ret;

        if (!mem->zone)
                return -EINVAL;

        /*
         * Unaccount before offlining, such that unpopulated zone and kthreads
         * can properly be torn down in offline_pages().
         */
        if (nr_vmemmap_pages)
                adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
                                          -nr_vmemmap_pages);

        ret = offline_pages(start_pfn + nr_vmemmap_pages,
                            nr_pages - nr_vmemmap_pages, mem->zone, mem->group);
        if (ret) {
                /* offline_pages() failed. Account back. */
                if (nr_vmemmap_pages)
                        adjust_present_page_count(pfn_to_page(start_pfn),
                                                  mem->group, nr_vmemmap_pages);
                return ret;
        }

        if (nr_vmemmap_pages)
                mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);

        mem->zone = NULL;
        return ret;
}

/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(struct memory_block *mem, unsigned long action)
{
        int ret;

        switch (action) {
        case MEM_ONLINE:
                ret = memory_block_online(mem);
                break;
        case MEM_OFFLINE:
                ret = memory_block_offline(mem);
                break;
        default:
                WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
                     "%ld\n", __func__, mem->start_section_nr, action, action);
                ret = -EINVAL;
        }

        return ret;
}

static int memory_block_change_state(struct memory_block *mem,
                unsigned long to_state, unsigned long from_state_req)
{
        int ret = 0;

        if (mem->state != from_state_req)
                return -EINVAL;

        if (to_state == MEM_OFFLINE)
                mem->state = MEM_GOING_OFFLINE;

        ret = memory_block_action(mem, to_state);
        mem->state = ret ? from_state_req : to_state;

        return ret;
}

/* The device lock serializes operations on memory_subsys_[online|offline] */
static int memory_subsys_online(struct device *dev)
{
        struct memory_block *mem = to_memory_block(dev);
        int ret;

        if (mem->state == MEM_ONLINE)
                return 0;

        /*
         * When called via device_online() without configuring the online_type,
         * we want to default to MMOP_ONLINE.
         */
        if (mem->online_type == MMOP_OFFLINE)
                mem->online_type = MMOP_ONLINE;

        ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
        mem->online_type = MMOP_OFFLINE;

        return ret;
}

static int memory_subsys_offline(struct device *dev)
{
        struct memory_block *mem = to_memory_block(dev);

        if (mem->state == MEM_OFFLINE)
                return 0;

        return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
}

static ssize_t state_store(struct device *dev, struct device_attribute *attr,
                           const char *buf, size_t count)
{
        const int online_type = mhp_online_type_from_str(buf);
        struct memory_block *mem = to_memory_block(dev);
        int ret;

        if (online_type < 0)
                return -EINVAL;

        ret = lock_device_hotplug_sysfs();
        if (ret)
                return ret;

        switch (online_type) {
        case MMOP_ONLINE_KERNEL:
        case MMOP_ONLINE_MOVABLE:
        case MMOP_ONLINE:
                /* mem->online_type is protected by device_hotplug_lock */
                mem->online_type = online_type;
                ret = device_online(&mem->dev);
                break;
        case MMOP_OFFLINE:
                ret = device_offline(&mem->dev);
                break;
        default:
                ret = -EINVAL; /* should never happen */
        }

        unlock_device_hotplug();

        if (ret < 0)
                return ret;
        if (ret)
                return -EINVAL;

        return count;
}
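
/*
 * For reference (illustrative, assuming the standard sysfs layout): the state
 * attribute above backs /sys/devices/system/memory/memoryN/state, so a block
 * is typically onlined/offlined from userspace with e.g.
 *
 *	echo online_movable > /sys/devices/system/memory/memory32/state
 *	echo offline        > /sys/devices/system/memory/memory32/state
 *
 * where "memory32" is a hypothetical block id.
 */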

/*
 * Legacy interface that we cannot remove: s390x exposes the storage increment
 * covered by a memory block, allowing for identifying which memory blocks
 * comprise a storage increment. Since a memory block spans complete
 * storage increments nowadays, this interface is basically unused. Other
 * archs never exposed != 0.
 */
static ssize_t phys_device_show(struct device *dev,
                                struct device_attribute *attr, char *buf)
{
        struct memory_block *mem = to_memory_block(dev);
        unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);

        return sysfs_emit(buf, "%d\n",
                          arch_get_memory_phys_device(start_pfn));
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static int print_allowed_zone(char *buf, int len, int nid,
                              struct memory_group *group,
                              unsigned long start_pfn, unsigned long nr_pages,
                              int online_type, struct zone *default_zone)
{
        struct zone *zone;

        zone = zone_for_pfn_range(online_type, nid, group, start_pfn, nr_pages);
        if (zone == default_zone)
                return 0;

        return sysfs_emit_at(buf, len, " %s", zone->name);
}

static ssize_t valid_zones_show(struct device *dev,
                                struct device_attribute *attr, char *buf)
{
        struct memory_block *mem = to_memory_block(dev);
        unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
        unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
        struct memory_group *group = mem->group;
        struct zone *default_zone;
        int nid = mem->nid;
        int len = 0;

        /*
         * Check the existing zone. Make sure that we do that only on the
         * online nodes otherwise the page_zone is not reliable
         */
        if (mem->state == MEM_ONLINE) {
                /*
                 * If !mem->zone, the memory block spans multiple zones and
                 * cannot get offlined.
                 */
                default_zone = mem->zone;
                if (!default_zone)
                        return sysfs_emit(buf, "%s\n", "none");
                len += sysfs_emit_at(buf, len, "%s", default_zone->name);
                goto out;
        }

        default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, group,
                                          start_pfn, nr_pages);

        len += sysfs_emit_at(buf, len, "%s", default_zone->name);
        len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
                                  MMOP_ONLINE_KERNEL, default_zone);
        len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
                                  MMOP_ONLINE_MOVABLE, default_zone);
out:
        len += sysfs_emit_at(buf, len, "\n");
        return len;
}
static DEVICE_ATTR_RO(valid_zones);
#endif

static DEVICE_ATTR_RO(phys_index);
static DEVICE_ATTR_RW(state);
static DEVICE_ATTR_RO(phys_device);
static DEVICE_ATTR_RO(removable);

/*
 * Show the memory block size (shared by all memory blocks).
 */
static ssize_t block_size_bytes_show(struct device *dev,
                                     struct device_attribute *attr, char *buf)
{
        return sysfs_emit(buf, "%lx\n", memory_block_size_bytes());
}

static DEVICE_ATTR_RO(block_size_bytes);

/*
 * Memory auto online policy.
 */
static ssize_t auto_online_blocks_show(struct device *dev,
                                       struct device_attribute *attr, char *buf)
{
        return sysfs_emit(buf, "%s\n",
                          online_type_to_str[mhp_default_online_type]);
}

static ssize_t auto_online_blocks_store(struct device *dev,
                                        struct device_attribute *attr,
                                        const char *buf, size_t count)
{
        const int online_type = mhp_online_type_from_str(buf);

        if (online_type < 0)
                return -EINVAL;

        mhp_default_online_type = online_type;
        return count;
}

static DEVICE_ATTR_RW(auto_online_blocks);

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace. The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
                           const char *buf, size_t count)
{
        u64 phys_addr;
        int nid, ret;
        unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

        ret = kstrtoull(buf, 0, &phys_addr);
        if (ret)
                return ret;

        if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
                return -EINVAL;

        ret = lock_device_hotplug_sysfs();
        if (ret)
                return ret;

        nid = memory_add_physaddr_to_nid(phys_addr);
        ret = __add_memory(nid, phys_addr,
                           MIN_MEMORY_BLOCK_SIZE * sections_per_block,
                           MHP_NONE);

        if (ret)
                goto out;

        ret = count;
out:
        unlock_device_hotplug();
        return ret;
}

static DEVICE_ATTR_WO(probe);
#endif
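
/*
 * For reference (illustrative): on architectures that select
 * CONFIG_ARCH_MEMORY_PROBE, the attribute above appears as
 * /sys/devices/system/memory/probe and takes the physical start address of
 * the memory to add, which must be aligned to the memory block size, e.g.
 *
 *	echo 0x100000000 > /sys/devices/system/memory/probe
 *
 * The address value is only an example.
 */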

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t soft_offline_page_store(struct device *dev,
                                       struct device_attribute *attr,
                                       const char *buf, size_t count)
{
        int ret;
        u64 pfn;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        if (kstrtoull(buf, 0, &pfn) < 0)
                return -EINVAL;
        pfn >>= PAGE_SHIFT;
        ret = soft_offline_page(pfn, 0);
        return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t hard_offline_page_store(struct device *dev,
                                       struct device_attribute *attr,
                                       const char *buf, size_t count)
{
        int ret;
        u64 pfn;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        if (kstrtoull(buf, 0, &pfn) < 0)
                return -EINVAL;
        pfn >>= PAGE_SHIFT;
        ret = memory_failure(pfn, MF_SW_SIMULATED);
        if (ret == -EOPNOTSUPP)
                ret = 0;
        return ret ? ret : count;
}

static DEVICE_ATTR_WO(soft_offline_page);
static DEVICE_ATTR_WO(hard_offline_page);
#endif
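
/*
 * For reference (illustrative): the two attributes above appear as
 * /sys/devices/system/memory/soft_offline_page and .../hard_offline_page.
 * Note that despite the "pfn" variable name, the value written is a physical
 * address in bytes; the stores shift it right by PAGE_SHIFT themselves, e.g.
 *
 *	echo 0x200000 > /sys/devices/system/memory/soft_offline_page
 *
 * The address value is only an example.
 */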

/* See phys_device_show(). */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
        return 0;
}

/*
 * A reference for the returned memory block device is acquired.
 *
 * Called under device_hotplug_lock.
 */
static struct memory_block *find_memory_block_by_id(unsigned long block_id)
{
        struct memory_block *mem;

        mem = xa_load(&memory_blocks, block_id);
        if (mem)
                get_device(&mem->dev);
        return mem;
}

/*
 * Called under device_hotplug_lock.
 */
struct memory_block *find_memory_block(unsigned long section_nr)
{
        unsigned long block_id = memory_block_id(section_nr);

        return find_memory_block_by_id(block_id);
}

static struct attribute *memory_memblk_attrs[] = {
        &dev_attr_phys_index.attr,
        &dev_attr_state.attr,
        &dev_attr_phys_device.attr,
        &dev_attr_removable.attr,
#ifdef CONFIG_MEMORY_HOTREMOVE
        &dev_attr_valid_zones.attr,
#endif
        NULL
};

static const struct attribute_group memory_memblk_attr_group = {
        .attrs = memory_memblk_attrs,
};

static const struct attribute_group *memory_memblk_attr_groups[] = {
        &memory_memblk_attr_group,
        NULL,
};

static int __add_memory_block(struct memory_block *memory)
{
        int ret;

        memory->dev.bus = &memory_subsys;
        memory->dev.id = memory->start_section_nr / sections_per_block;
        memory->dev.release = memory_block_release;
        memory->dev.groups = memory_memblk_attr_groups;
        memory->dev.offline = memory->state == MEM_OFFLINE;

        ret = device_register(&memory->dev);
        if (ret) {
                put_device(&memory->dev);
                return ret;
        }
        ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory,
                              GFP_KERNEL));
        if (ret)
                device_unregister(&memory->dev);

        return ret;
}

static struct zone *early_node_zone_for_memory_block(struct memory_block *mem,
                                                     int nid)
{
        const unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
        const unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
        struct zone *zone, *matching_zone = NULL;
        pg_data_t *pgdat = NODE_DATA(nid);
        int i;

        /*
         * This logic only works for early memory, when the applicable zones
         * already span the memory block. We don't expect overlapping zones on
         * a single node for early memory. So if we're told that some PFNs
         * of a node fall into this memory block, we can assume that all node
         * zones that intersect with the memory block are actually applicable.
         * No need to look at the memmap.
         */
        for (i = 0; i < MAX_NR_ZONES; i++) {
                zone = pgdat->node_zones + i;
                if (!populated_zone(zone))
                        continue;
                if (!zone_intersects(zone, start_pfn, nr_pages))
                        continue;
                if (!matching_zone) {
                        matching_zone = zone;
                        continue;
                }
                /* Spans multiple zones ... */
                matching_zone = NULL;
                break;
        }
        return matching_zone;
}

#ifdef CONFIG_NUMA
/**
 * memory_block_add_nid() - Indicate that system RAM falling into this memory
 *			    block device (partially) belongs to the given node.
 * @mem: The memory block device.
 * @nid: The node id.
 * @context: The memory initialization context.
 *
 * Indicate that system RAM falling into this memory block (partially) belongs
 * to the given node. If the context indicates ("early") that we are adding the
 * node during node device subsystem initialization, this will also properly
 * set/adjust mem->zone based on the zone ranges of the given node.
 */
void memory_block_add_nid(struct memory_block *mem, int nid,
                          enum meminit_context context)
{
        if (context == MEMINIT_EARLY && mem->nid != nid) {
                /*
                 * For early memory we have to determine the zone when setting
                 * the node id and handle multiple nodes spanning a single
                 * memory block by indicating via zone == NULL that we're not
                 * dealing with a single zone. So if we're setting the node id
                 * the first time, determine if there is a single zone. If we're
                 * setting the node id a second time to a different node,
                 * invalidate the single detected zone.
                 */
                if (mem->nid == NUMA_NO_NODE)
                        mem->zone = early_node_zone_for_memory_block(mem, nid);
                else
                        mem->zone = NULL;
        }

        /*
         * If this memory block spans multiple nodes, we only indicate
         * the last processed node. If we span multiple nodes (not applicable
         * to hotplugged memory), zone == NULL will prohibit memory offlining
         * and consequently unplug.
         */
        mem->nid = nid;
}
#endif

static int add_memory_block(unsigned long block_id, unsigned long state,
                            unsigned long nr_vmemmap_pages,
                            struct memory_group *group)
{
        struct memory_block *mem;
        int ret = 0;

        mem = find_memory_block_by_id(block_id);
        if (mem) {
                put_device(&mem->dev);
                return -EEXIST;
        }
        mem = kzalloc(sizeof(*mem), GFP_KERNEL);
        if (!mem)
                return -ENOMEM;

        mem->start_section_nr = block_id * sections_per_block;
        mem->state = state;
        mem->nid = NUMA_NO_NODE;
        mem->nr_vmemmap_pages = nr_vmemmap_pages;
        INIT_LIST_HEAD(&mem->group_next);

#ifndef CONFIG_NUMA
        if (state == MEM_ONLINE)
                /*
                 * MEM_ONLINE at this point implies early memory. With NUMA,
                 * we'll determine the zone when setting the node id via
                 * memory_block_add_nid(). Memory hotplug updates the zone
                 * manually when memory onlining/offlining succeeds.
                 */
                mem->zone = early_node_zone_for_memory_block(mem, NUMA_NO_NODE);
#endif /* CONFIG_NUMA */

        ret = __add_memory_block(mem);
        if (ret)
                return ret;

        if (group) {
                mem->group = group;
                list_add(&mem->group_next, &group->memory_blocks);
        }

        return 0;
}

static int __init add_boot_memory_block(unsigned long base_section_nr)
{
        int section_count = 0;
        unsigned long nr;

        for (nr = base_section_nr; nr < base_section_nr + sections_per_block;
             nr++)
                if (present_section_nr(nr))
                        section_count++;

        if (section_count == 0)
                return 0;
        return add_memory_block(memory_block_id(base_section_nr),
                                MEM_ONLINE, 0, NULL);
}

static int add_hotplug_memory_block(unsigned long block_id,
                                    unsigned long nr_vmemmap_pages,
                                    struct memory_group *group)
{
        return add_memory_block(block_id, MEM_OFFLINE, nr_vmemmap_pages, group);
}

static void remove_memory_block(struct memory_block *memory)
{
        if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
                return;

        WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);

        if (memory->group) {
                list_del(&memory->group_next);
                memory->group = NULL;
        }

        /* drop the ref. we got via find_memory_block() */
        put_device(&memory->dev);
        device_unregister(&memory->dev);
}

/*
 * Create memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * will be initialized as offline.
 *
 * Called under device_hotplug_lock.
 */
int create_memory_block_devices(unsigned long start, unsigned long size,
                                unsigned long vmemmap_pages,
                                struct memory_group *group)
{
        const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
        unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
        struct memory_block *mem;
        unsigned long block_id;
        int ret = 0;

        if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
                         !IS_ALIGNED(size, memory_block_size_bytes())))
                return -EINVAL;

        for (block_id = start_block_id; block_id != end_block_id; block_id++) {
                ret = add_hotplug_memory_block(block_id, vmemmap_pages, group);
                if (ret)
                        break;
        }
        if (ret) {
                end_block_id = block_id;
                for (block_id = start_block_id; block_id != end_block_id;
                     block_id++) {
                        mem = find_memory_block_by_id(block_id);
                        if (WARN_ON_ONCE(!mem))
                                continue;
                        remove_memory_block(mem);
                }
        }
        return ret;
}

/*
 * Remove memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * have to be offline.
 *
 * Called under device_hotplug_lock.
 */
void remove_memory_block_devices(unsigned long start, unsigned long size)
{
        const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
        const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
        struct memory_block *mem;
        unsigned long block_id;

        if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
                         !IS_ALIGNED(size, memory_block_size_bytes())))
                return;

        for (block_id = start_block_id; block_id != end_block_id; block_id++) {
                mem = find_memory_block_by_id(block_id);
                if (WARN_ON_ONCE(!mem))
                        continue;
                unregister_memory_block_under_nodes(mem);
                remove_memory_block(mem);
        }
}

static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
        &dev_attr_probe.attr,
#endif

#ifdef CONFIG_MEMORY_FAILURE
        &dev_attr_soft_offline_page.attr,
        &dev_attr_hard_offline_page.attr,
#endif

        &dev_attr_block_size_bytes.attr,
        &dev_attr_auto_online_blocks.attr,
        NULL
};

static const struct attribute_group memory_root_attr_group = {
        .attrs = memory_root_attrs,
};

static const struct attribute_group *memory_root_attr_groups[] = {
        &memory_root_attr_group,
        NULL,
};

/*
 * Initialize the sysfs support for memory devices. At the time this function
 * is called, we cannot have concurrent creation/deletion of memory block
 * devices, so the device_hotplug_lock is not needed.
 */
void __init memory_dev_init(void)
{
        int ret;
        unsigned long block_sz, nr;

        /* Validate the configured memory block size */
        block_sz = memory_block_size_bytes();
        if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
                panic("Memory block size not suitable: 0x%lx\n", block_sz);
        sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

        ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
        if (ret)
                panic("%s() failed to register subsystem: %d\n", __func__, ret);

        /*
         * Create entries for memory sections that were found
         * during boot and have been initialized
         */
        for (nr = 0; nr <= __highest_present_section_nr;
             nr += sections_per_block) {
                ret = add_boot_memory_block(nr);
                if (ret)
                        panic("%s() failed to add memory block: %d\n", __func__,
                              ret);
        }
}

/**
 * walk_memory_blocks - walk through all present memory blocks overlapped
 *			by the range [start, start + size)
 *
 * @start: start address of the memory range
 * @size: size of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory section walked
 *
 * This function walks through all present memory blocks overlapped by the
 * range [start, start + size), calling func on each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 *
 * Called under device_hotplug_lock.
 */
int walk_memory_blocks(unsigned long start, unsigned long size,
                       void *arg, walk_memory_blocks_func_t func)
{
        const unsigned long start_block_id = phys_to_block_id(start);
        const unsigned long end_block_id = phys_to_block_id(start + size - 1);
        struct memory_block *mem;
        unsigned long block_id;
        int ret = 0;

        if (!size)
                return 0;

        for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
                mem = find_memory_block_by_id(block_id);
                if (!mem)
                        continue;

                ret = func(mem, arg);
                put_device(&mem->dev);
                if (ret)
                        break;
        }
        return ret;
}
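
/*
 * Illustrative sketch (not part of this file): a typical caller passes a
 * callback matching walk_memory_blocks_func_t. The names below are
 * hypothetical.
 *
 *	static int example_count_online(struct memory_block *mem, void *arg)
 *	{
 *		unsigned long *nr_online = arg;
 *
 *		if (mem->state == MEM_ONLINE)
 *			(*nr_online)++;
 *		return 0;
 *	}
 *
 *	// with device_hotplug_lock held:
 *	unsigned long nr_online = 0;
 *	walk_memory_blocks(start, size, &nr_online, example_count_online);
 */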

struct for_each_memory_block_cb_data {
        walk_memory_blocks_func_t func;
        void *arg;
};

static int for_each_memory_block_cb(struct device *dev, void *data)
{
        struct memory_block *mem = to_memory_block(dev);
        struct for_each_memory_block_cb_data *cb_data = data;

        return cb_data->func(mem, cb_data->arg);
}

/**
 * for_each_memory_block - walk through all present memory blocks
 *
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks, calling func on
 * each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 */
int for_each_memory_block(void *arg, walk_memory_blocks_func_t func)
{
        struct for_each_memory_block_cb_data cb_data = {
                .func = func,
                .arg = arg,
        };

        return bus_for_each_dev(&memory_subsys, NULL, &cb_data,
                                for_each_memory_block_cb);
}

/*
 * This is an internal helper to unify allocation and initialization of
 * memory groups. Note that the passed memory group will be copied to a
 * dynamically allocated memory group. After this call, the passed
 * memory group should no longer be used.
 */
static int memory_group_register(struct memory_group group)
{
        struct memory_group *new_group;
        uint32_t mgid;
        int ret;

        if (!node_possible(group.nid))
                return -EINVAL;

        new_group = kzalloc(sizeof(group), GFP_KERNEL);
        if (!new_group)
                return -ENOMEM;
        *new_group = group;
        INIT_LIST_HEAD(&new_group->memory_blocks);

        ret = xa_alloc(&memory_groups, &mgid, new_group, xa_limit_31b,
                       GFP_KERNEL);
        if (ret) {
                kfree(new_group);
                return ret;
        } else if (group.is_dynamic) {
                xa_set_mark(&memory_groups, mgid, MEMORY_GROUP_MARK_DYNAMIC);
        }
        return mgid;
}

/**
 * memory_group_register_static() - Register a static memory group.
 * @nid: The node id.
 * @max_pages: The maximum number of pages we'll have in this static memory
 *	       group.
 *
 * Register a new static memory group and return the memory group id.
 * All memory in the group belongs to a single unit, such as a DIMM. All
 * memory belonging to a static memory group is added in one go to be removed
 * in one go -- it's static.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if max_pages is invalid (0). Otherwise,
 * returns the new memory group id.
 */
int memory_group_register_static(int nid, unsigned long max_pages)
{
        struct memory_group group = {
                .nid = nid,
                .s = {
                        .max_pages = max_pages,
                },
        };

        if (!max_pages)
                return -EINVAL;
        return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_static);
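
/*
 * Illustrative sketch (not part of this file): a memory device driver that
 * adds all memory of a DIMM-like unit at once would typically pair a static
 * memory group with add_memory(). This assumes the MHP_NID_IS_MGID flag
 * available in kernels that provide memory groups; error handling is trimmed
 * and the names and sizes are hypothetical.
 *
 *	int mgid = memory_group_register_static(nid, PFN_UP(dimm_size));
 *
 *	if (mgid < 0)
 *		return mgid;
 *	// with MHP_NID_IS_MGID, the first argument of add_memory() is the mgid
 *	ret = add_memory(mgid, dimm_start, dimm_size, MHP_NID_IS_MGID);
 *	...
 *	memory_group_unregister(mgid);
 */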

/**
 * memory_group_register_dynamic() - Register a dynamic memory group.
 * @nid: The node id.
 * @unit_pages: Unit in pages in which memory is added/removed in this dynamic
 *		memory group.
 *
 * Register a new dynamic memory group and return the memory group id.
 * Memory within a dynamic memory group is added/removed dynamically
 * in unit_pages.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if unit_pages is invalid (0, not a
 * power of two, smaller than a single memory block). Otherwise, returns the
 * new memory group id.
 */
int memory_group_register_dynamic(int nid, unsigned long unit_pages)
{
        struct memory_group group = {
                .nid = nid,
                .is_dynamic = true,
                .d = {
                        .unit_pages = unit_pages,
                },
        };

        if (!unit_pages || !is_power_of_2(unit_pages) ||
            unit_pages < PHYS_PFN(memory_block_size_bytes()))
                return -EINVAL;
        return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_dynamic);

/**
 * memory_group_unregister() - Unregister a memory group.
 * @mgid: the memory group id
 *
 * Unregister a memory group. If any memory block still belongs to this
 * memory group, unregistering will fail.
 *
 * Returns -EINVAL if the memory group id is invalid, returns -EBUSY if some
 * memory blocks still belong to this memory group and returns 0 if
 * unregistering succeeded.
 */
int memory_group_unregister(int mgid)
{
        struct memory_group *group;

        if (mgid < 0)
                return -EINVAL;

        group = xa_load(&memory_groups, mgid);
        if (!group)
                return -EINVAL;
        if (!list_empty(&group->memory_blocks))
                return -EBUSY;
        xa_erase(&memory_groups, mgid);
        kfree(group);
        return 0;
}
EXPORT_SYMBOL_GPL(memory_group_unregister);

/*
 * This is an internal helper only to be used in core memory hotplug code to
 * lookup a memory group. We don't care about locking, as we don't expect a
 * memory group to get unregistered while adding memory to it -- because
 * the group and the memory is managed by the same driver.
 */
struct memory_group *memory_group_find_by_id(int mgid)
{
        return xa_load(&memory_groups, mgid);
}

/*
 * This is an internal helper only to be used in core memory hotplug code to
 * walk all dynamic memory groups excluding a given memory group, either
 * belonging to a specific node, or belonging to any node.
 */
int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
                               struct memory_group *excluded, void *arg)
{
        struct memory_group *group;
        unsigned long index;
        int ret = 0;

        xa_for_each_marked(&memory_groups, index, group,
                           MEMORY_GROUP_MARK_DYNAMIC) {
                if (group == excluded)
                        continue;
#ifdef CONFIG_NUMA
                if (nid != NUMA_NO_NODE && group->nid != nid)
                        continue;
#endif /* CONFIG_NUMA */
                ret = func(group, arg);
                if (ret)
                        break;
        }
        return ret;
}