// SPDX-License-Identifier: GPL-2.0
/*
 * sparse memory mappings.
 */
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/mmzone.h>
#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/bootmem_info.h>

#include "internal.h"
#include <asm/dma.h>

/*
 * Permanent SPARSEMEM data:
 *
 * 1) mem_section	- memory sections, mem_map's for valid memory
 */
#ifdef CONFIG_SPARSEMEM_EXTREME
struct mem_section **mem_section;
#else
struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
	____cacheline_internodealigned_in_smp;
#endif
EXPORT_SYMBOL(mem_section);

#ifdef NODE_NOT_IN_PAGE_FLAGS
/*
 * If we did not store the node number in the page then we have to
 * do a lookup in the section_to_node_table in order to find which
 * node the page belongs to.
 */
#if MAX_NUMNODES <= 256
static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#else
static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#endif

int page_to_nid(const struct page *page)
{
	return section_to_node_table[page_to_section(page)];
}
EXPORT_SYMBOL(page_to_nid);

static void set_section_nid(unsigned long section_nr, int nid)
{
	section_to_node_table[section_nr] = nid;
}
#else /* !NODE_NOT_IN_PAGE_FLAGS */
static inline void set_section_nid(unsigned long section_nr, int nid)
{
}
#endif

#ifdef CONFIG_SPARSEMEM_EXTREME
static noinline struct mem_section __ref *sparse_index_alloc(int nid)
{
	struct mem_section *section = NULL;
	unsigned long array_size = SECTIONS_PER_ROOT *
				   sizeof(struct mem_section);

	if (slab_is_available()) {
		section = kzalloc_node(array_size, GFP_KERNEL, nid);
	} else {
		section = memblock_alloc_node(array_size, SMP_CACHE_BYTES,
					      nid);
		if (!section)
			panic("%s: Failed to allocate %lu bytes nid=%d\n",
			      __func__, array_size, nid);
	}

	return section;
}

static int __meminit sparse_index_init(unsigned long section_nr, int nid)
{
	unsigned long root = SECTION_NR_TO_ROOT(section_nr);
	struct mem_section *section;

	/*
	 * An existing section is possible in the sub-section hotplug
	 * case. First hot-add instantiates, follow-on hot-add reuses
	 * the existing section.
	 *
	 * The mem_hotplug_lock resolves the apparent race below.
	 */
	if (mem_section[root])
		return 0;

	section = sparse_index_alloc(nid);
	if (!section)
		return -ENOMEM;

	mem_section[root] = section;

	return 0;
}
#else /* !SPARSEMEM_EXTREME */
static inline int sparse_index_init(unsigned long section_nr, int nid)
{
	return 0;
}
#endif

/*
 * During early boot, before section_mem_map is used for an actual
 * mem_map, we use section_mem_map to store the section's NUMA
 * node. This keeps us from having to use another data structure. The
 * node information is cleared just before we store the real mem_map.
 */
static inline unsigned long sparse_encode_early_nid(int nid)
{
	return ((unsigned long)nid << SECTION_NID_SHIFT);
}

static inline int sparse_early_nid(struct mem_section *section)
{
	return (section->section_mem_map >> SECTION_NID_SHIFT);
}

/* Validate the physical addressing limitations of the model */
static void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
						unsigned long *end_pfn)
{
	unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);

	/*
	 * Sanity checks - do not allow an architecture to pass
	 * in larger pfns than the maximum scope of sparsemem:
	 */
	if (*start_pfn > max_sparsemem_pfn) {
		mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
			"Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
			*start_pfn, *end_pfn, max_sparsemem_pfn);
		WARN_ON_ONCE(1);
		*start_pfn = max_sparsemem_pfn;
		*end_pfn = max_sparsemem_pfn;
	} else if (*end_pfn > max_sparsemem_pfn) {
		mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
			"End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
			*start_pfn, *end_pfn, max_sparsemem_pfn);
		WARN_ON_ONCE(1);
		*end_pfn = max_sparsemem_pfn;
	}
}

/*
 * There are a number of times that we loop over NR_MEM_SECTIONS,
 * looking for section_present() on each. But, when we have very
 * large physical address spaces, NR_MEM_SECTIONS can also be
 * very large which makes the loops quite long.
 *
 * Keeping track of this gives us an easy way to break out of
 * those loops early.
 */
unsigned long __highest_present_section_nr;
static void __section_mark_present(struct mem_section *ms,
		unsigned long section_nr)
{
	if (section_nr > __highest_present_section_nr)
		__highest_present_section_nr = section_nr;

	ms->section_mem_map |= SECTION_MARKED_PRESENT;
}

#define for_each_present_section_nr(start, section_nr)		\
	for (section_nr = next_present_section_nr(start-1);	\
	     ((section_nr != -1) &&				\
	      (section_nr <= __highest_present_section_nr));	\
	     section_nr = next_present_section_nr(section_nr))

static inline unsigned long first_present_section_nr(void)
{
	return next_present_section_nr(-1);
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static void subsection_mask_set(unsigned long *map, unsigned long pfn,
		unsigned long nr_pages)
{
	int idx = subsection_map_index(pfn);
	int end = subsection_map_index(pfn + nr_pages - 1);

	bitmap_set(map, idx, end - idx + 1);
}
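
/*
 * Mark the sub-sections spanning [pfn, pfn + nr_pages) as present in the
 * subsection_map of every section the range falls into.
 */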
void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
{
	int end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
	unsigned long nr, start_sec = pfn_to_section_nr(pfn);

	if (!nr_pages)
		return;

	for (nr = start_sec; nr <= end_sec; nr++) {
		struct mem_section *ms;
		unsigned long pfns;

		pfns = min(nr_pages, PAGES_PER_SECTION
				- (pfn & ~PAGE_SECTION_MASK));
		ms = __nr_to_section(nr);
		subsection_mask_set(ms->usage->subsection_map, pfn, pfns);

		pr_debug("%s: sec: %lu pfns: %lu set(%d, %d)\n", __func__, nr,
				pfns, subsection_map_index(pfn),
				subsection_map_index(pfn + pfns - 1));

		pfn += pfns;
		nr_pages -= pfns;
	}
}
#else
void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
{
}
#endif

/* Record a memory area against a node. */
static void __init memory_present(int nid, unsigned long start, unsigned long end)
{
	unsigned long pfn;

#ifdef CONFIG_SPARSEMEM_EXTREME
	if (unlikely(!mem_section)) {
		unsigned long size, align;

		size = sizeof(struct mem_section *) * NR_SECTION_ROOTS;
		align = 1 << (INTERNODE_CACHE_SHIFT);
		mem_section = memblock_alloc(size, align);
		if (!mem_section)
			panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
			      __func__, size, align);
	}
#endif

	start &= PAGE_SECTION_MASK;
	mminit_validate_memmodel_limits(&start, &end);
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
		unsigned long section = pfn_to_section_nr(pfn);
		struct mem_section *ms;

		sparse_index_init(section, nid);
		set_section_nid(section, nid);

		ms = __nr_to_section(section);
		if (!ms->section_mem_map) {
			ms->section_mem_map = sparse_encode_early_nid(nid) |
							SECTION_IS_ONLINE;
			__section_mark_present(ms, section);
		}
	}
}

/*
 * Mark all memblocks as present using memory_present().
 * This is a convenience function that is useful to mark all of the system's
 * memory as present during initialization.
 */
static void __init memblocks_present(void)
{
	unsigned long start, end;
	int i, nid;

	for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid)
		memory_present(nid, start, end);
}

/*
 * Subtle, we encode the real pfn into the mem_map such that
 * the identity pfn - section_mem_map will return the actual
 * physical page frame number.
 */
static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
{
	unsigned long coded_mem_map =
		(unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
	BUILD_BUG_ON(SECTION_MAP_LAST_BIT > PFN_SECTION_SHIFT);
	BUG_ON(coded_mem_map & ~SECTION_MAP_MASK);
	return coded_mem_map;
}
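
/*
 * Example: for a section whose first pfn is S and whose memmap starts at M,
 * the coded value is the pointer M - S. Decoding adds the section's first
 * pfn back, so pfn_to_page(pfn) reduces to decoded_map + pfn for any pfn in
 * that section. The section state flags live in the low bits that
 * SECTION_MAP_MASK masks off.
 */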

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Decode mem_map from the coded memmap
 */
struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
{
	/* mask off the extra low bits of information */
	coded_mem_map &= SECTION_MAP_MASK;
	return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
}
#endif /* CONFIG_MEMORY_HOTPLUG */

static void __meminit sparse_init_one_section(struct mem_section *ms,
		unsigned long pnum, struct page *mem_map,
		struct mem_section_usage *usage, unsigned long flags)
{
	ms->section_mem_map &= ~SECTION_MAP_MASK;
	ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum)
		| SECTION_HAS_MEM_MAP | flags;
	ms->usage = usage;
}
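
/* Bytes needed for one section's pageblock flags bitmap. */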
static unsigned long usemap_size(void)
{
	return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
}

size_t mem_section_usage_size(void)
{
	return sizeof(struct mem_section_usage) + usemap_size();
}
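
/*
 * Physical address of a node's pglist_data (the static contig_page_data
 * when !CONFIG_NUMA).
 */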
static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat)
{
#ifndef CONFIG_NUMA
	VM_BUG_ON(pgdat != &contig_page_data);
	return __pa_symbol(&contig_page_data);
#else
	return __pa(pgdat);
#endif
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static struct mem_section_usage * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
					 unsigned long size)
{
	struct mem_section_usage *usage;
	unsigned long goal, limit;
	int nid;
	/*
	 * A page may contain usemaps for other sections preventing the
	 * page being freed and making a section unremovable while
	 * other sections referencing the usemap remain active. Similarly,
	 * a pgdat can prevent a section being removed. If section A
	 * contains a pgdat and section B contains the usemap, both
	 * sections become inter-dependent. This allocates usemaps
	 * from the same section as the pgdat where possible to avoid
	 * this problem.
	 */
	goal = pgdat_to_phys(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
	limit = goal + (1UL << PA_SECTION_SHIFT);
	nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
again:
	usage = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
	if (!usage && limit) {
		limit = 0;
		goto again;
	}
	return usage;
}
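
/*
 * Report when a node's usemap lands in a different section than its pgdat:
 * that creates a cross-section dependency for memory hot-remove.
 */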
static void __init check_usemap_section_nr(int nid,
		struct mem_section_usage *usage)
{
	unsigned long usemap_snr, pgdat_snr;
	static unsigned long old_usemap_snr;
	static unsigned long old_pgdat_snr;
	struct pglist_data *pgdat = NODE_DATA(nid);
	int usemap_nid;

	/* First call */
	if (!old_usemap_snr) {
		old_usemap_snr = NR_MEM_SECTIONS;
		old_pgdat_snr = NR_MEM_SECTIONS;
	}

	usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
	pgdat_snr = pfn_to_section_nr(pgdat_to_phys(pgdat) >> PAGE_SHIFT);
	if (usemap_snr == pgdat_snr)
		return;

	if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr)
		/* skip redundant message */
		return;

	old_usemap_snr = usemap_snr;
	old_pgdat_snr = pgdat_snr;

	usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
	if (usemap_nid != nid) {
		pr_info("node %d must be removed before remove section %ld\n",
			nid, usemap_snr);
		return;
	}
	/*
	 * There is a circular dependency.
	 * Some platforms allow un-removable section because they will just
	 * gather other removable sections for dynamic partitioning.
	 * Just notify un-removable section's number here.
	 */
	pr_info("Section %ld and %ld (node %d) have a circular dependency on usemap and pgdat allocations\n",
		usemap_snr, pgdat_snr, nid);
}
#else
static struct mem_section_usage * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
					 unsigned long size)
{
	return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id);
}

static void __init check_usemap_section_nr(int nid,
		struct mem_section_usage *usage)
{
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static unsigned long __init section_map_size(void)
{
	return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE);
}

#else
static unsigned long __init section_map_size(void)
{
	return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
}

struct page __init *__populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
		struct dev_pagemap *pgmap)
{
	unsigned long size = section_map_size();
	struct page *map = sparse_buffer_alloc(size);
	phys_addr_t addr = __pa(MAX_DMA_ADDRESS);

	if (map)
		return map;

	map = memmap_alloc(size, size, addr, nid, false);
	if (!map)
		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n",
		      __func__, size, PAGE_SIZE, nid, &addr);

	return map;
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

static void *sparsemap_buf __meminitdata;
static void *sparsemap_buf_end __meminitdata;

static inline void __meminit sparse_buffer_free(unsigned long size)
{
	WARN_ON(!sparsemap_buf || size == 0);
	memblock_free(sparsemap_buf, size);
}

static void __init sparse_buffer_init(unsigned long size, int nid)
{
	phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
	WARN_ON(sparsemap_buf);	/* forgot to call sparse_buffer_fini()? */
	/*
	 * Pre-allocated buffer is mainly used by __populate_section_memmap
	 * and we want it to be properly aligned to the section size - this is
	 * especially the case for VMEMMAP which maps memmap to PMDs
	 */
	sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true);
	sparsemap_buf_end = sparsemap_buf + size;
}

static void __init sparse_buffer_fini(void)
{
	unsigned long size = sparsemap_buf_end - sparsemap_buf;

	if (sparsemap_buf && size > 0)
		sparse_buffer_free(size);
	sparsemap_buf = NULL;
}
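
/*
 * Hand out a size-aligned chunk of the pre-allocated sparsemap buffer, or
 * return NULL when it is exhausted so the caller can fall back to a fresh
 * allocation. Alignment padding is returned to memblock.
 */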
void * __meminit sparse_buffer_alloc(unsigned long size)
{
	void *ptr = NULL;

	if (sparsemap_buf) {
		ptr = (void *) roundup((unsigned long)sparsemap_buf, size);
		if (ptr + size > sparsemap_buf_end)
			ptr = NULL;
		else {
			/* Free redundant aligned space */
			if ((unsigned long)(ptr - sparsemap_buf) > 0)
				sparse_buffer_free((unsigned long)(ptr - sparsemap_buf));
			sparsemap_buf = ptr + size;
		}
	}
	return ptr;
}

void __weak __meminit vmemmap_populate_print_last(void)
{
}

/*
 * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end)
 * and the number of present sections in this node is map_count.
 */
static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
				   unsigned long pnum_end,
				   unsigned long map_count)
{
	struct mem_section_usage *usage;
	unsigned long pnum;
	struct page *map;

	usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
			mem_section_usage_size() * map_count);
	if (!usage) {
		pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
		goto failed;
	}
	sparse_buffer_init(map_count * section_map_size(), nid);
	for_each_present_section_nr(pnum_begin, pnum) {
		unsigned long pfn = section_nr_to_pfn(pnum);

		if (pnum >= pnum_end)
			break;

		map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
				nid, NULL, NULL);
		if (!map) {
			pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
			       __func__, nid);
			pnum_begin = pnum;
			sparse_buffer_fini();
			goto failed;
		}
		check_usemap_section_nr(nid, usage);
		sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage,
				SECTION_IS_EARLY);
		usage = (void *) usage + mem_section_usage_size();
	}
	sparse_buffer_fini();
	return;
failed:
	/* We failed to allocate, mark all the following pnums as not present */
	for_each_present_section_nr(pnum_begin, pnum) {
		struct mem_section *ms;

		if (pnum >= pnum_end)
			break;
		ms = __nr_to_section(pnum);
		ms->section_mem_map = 0;
	}
}

/*
 * Allocate the accumulated non-linear sections, allocate a mem_map
 * for each and record the physical to section mapping.
 */
void __init sparse_init(void)
{
	unsigned long pnum_end, pnum_begin, map_count = 1;
	int nid_begin;

	memblocks_present();

	pnum_begin = first_present_section_nr();
	nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));

	/* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
	set_pageblock_order();

	for_each_present_section_nr(pnum_begin + 1, pnum_end) {
		int nid = sparse_early_nid(__nr_to_section(pnum_end));

		if (nid == nid_begin) {
			map_count++;
			continue;
		}
		/* Init node with sections in range [pnum_begin, pnum_end) */
		sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
		nid_begin = nid;
		pnum_begin = pnum_end;
		map_count = 1;
	}
	/* cover the last node */
	sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
	vmemmap_populate_print_last();
}

#ifdef CONFIG_MEMORY_HOTPLUG

/* Mark all memory sections within the pfn range as online */
void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		unsigned long section_nr = pfn_to_section_nr(pfn);
		struct mem_section *ms;

		/* onlining code should never touch invalid ranges */
		if (WARN_ON(!valid_section_nr(section_nr)))
			continue;

		ms = __nr_to_section(section_nr);
		ms->section_mem_map |= SECTION_IS_ONLINE;
	}
}

/* Mark all memory sections within the pfn range as offline */
void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		unsigned long section_nr = pfn_to_section_nr(pfn);
		struct mem_section *ms;

		/*
		 * TODO this needs some double checking. Offlining code makes
		 * sure to check pfn_valid but those checks might be just bogus
		 */
		if (WARN_ON(!valid_section_nr(section_nr)))
			continue;

		ms = __nr_to_section(section_nr);
		ms->section_mem_map &= ~SECTION_IS_ONLINE;
	}
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static struct page * __meminit populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
		struct dev_pagemap *pgmap)
{
	return __populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap);
}

static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
		struct vmem_altmap *altmap)
{
	unsigned long start = (unsigned long) pfn_to_page(pfn);
	unsigned long end = start + nr_pages * sizeof(struct page);

	vmemmap_free(start, end, altmap);
}
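
/* Free the vmemmap range backing a whole section's boot-time memmap. */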
static void free_map_bootmem(struct page *memmap)
{
	unsigned long start = (unsigned long)memmap;
	unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);

	vmemmap_free(start, end, NULL);
}
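
/*
 * Clear the bits for [pfn, pfn + nr_pages) in the section's subsection_map.
 * Returns -EINVAL (with a warning) if any part of the range was not active.
 */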
static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages)
{
	DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
	DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
	struct mem_section *ms = __pfn_to_section(pfn);
	unsigned long *subsection_map = ms->usage
		? &ms->usage->subsection_map[0] : NULL;

	subsection_mask_set(map, pfn, nr_pages);
	if (subsection_map)
		bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION);

	if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
				"section already deactivated (%#lx + %ld)\n",
				pfn, nr_pages))
		return -EINVAL;

	bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);

	return 0;
}

static bool is_subsection_map_empty(struct mem_section *ms)
{
	return bitmap_empty(&ms->usage->subsection_map[0],
			    SUBSECTIONS_PER_SECTION);
}
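
/*
 * Set the bits for [pfn, pfn + nr_pages) in the section's subsection_map.
 * Returns -EINVAL for an empty range and -EEXIST if any of the subsections
 * are already populated.
 */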
static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
{
	struct mem_section *ms = __pfn_to_section(pfn);
	DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
	unsigned long *subsection_map;
	int rc = 0;

	subsection_mask_set(map, pfn, nr_pages);

	subsection_map = &ms->usage->subsection_map[0];

	if (bitmap_empty(map, SUBSECTIONS_PER_SECTION))
		rc = -EINVAL;
	else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION))
		rc = -EEXIST;
	else
		bitmap_or(subsection_map, map, subsection_map,
				SUBSECTIONS_PER_SECTION);

	return rc;
}
#else
struct page * __meminit populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
		struct dev_pagemap *pgmap)
{
	return kvmalloc_node(array_size(sizeof(struct page),
					PAGES_PER_SECTION), GFP_KERNEL, nid);
}

static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
		struct vmem_altmap *altmap)
{
	kvfree(pfn_to_page(pfn));
}

static void free_map_bootmem(struct page *memmap)
{
	unsigned long maps_section_nr, removing_section_nr, i;
	unsigned long magic, nr_pages;
	struct page *page = virt_to_page(memmap);

	nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
		>> PAGE_SHIFT;

	for (i = 0; i < nr_pages; i++, page++) {
		magic = page->index;

		BUG_ON(magic == NODE_INFO);

		maps_section_nr = pfn_to_section_nr(page_to_pfn(page));
		removing_section_nr = page_private(page);

		/*
		 * When this function is called, the section being removed is
		 * in the logically offlined state, which means all of its
		 * pages are isolated from the page allocator. If the
		 * section's memmap is placed on that same section, it must
		 * not be freed; otherwise the page allocator could hand it
		 * out even though it is about to be removed physically.
		 */
		if (maps_section_nr != removing_section_nr)
			put_page_bootmem(page);
	}
}

static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages)
{
	return 0;
}

static bool is_subsection_map_empty(struct mem_section *ms)
{
	return true;
}

static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
{
	return 0;
}
#endif /* CONFIG_SPARSEMEM_VMEMMAP */

/*
 * To deactivate a memory region, there are 3 cases to handle across
 * two configurations (SPARSEMEM_VMEMMAP={y,n}):
 *
 * 1. deactivation of a partial hot-added section (only possible in
 *    the SPARSEMEM_VMEMMAP=y case).
 *      a) section was present at memory init.
 *      b) section was hot-added post memory init.
 * 2. deactivation of a complete hot-added section.
 * 3. deactivation of a complete section from memory init.
 *
 * For 1, when the subsection_map is not empty we will not be freeing the
 * usage map, but still need to free the vmemmap range.
 *
 * For 2 and 3, the SPARSEMEM_VMEMMAP={y,n} cases are unified.
 */
static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
		struct vmem_altmap *altmap)
{
	struct mem_section *ms = __pfn_to_section(pfn);
	bool section_is_early = early_section(ms);
	struct page *memmap = NULL;
	bool empty;

	if (clear_subsection_map(pfn, nr_pages))
		return;

	empty = is_subsection_map_empty(ms);
	if (empty) {
		unsigned long section_nr = pfn_to_section_nr(pfn);

		/*
		 * When removing an early section, the usage map is kept (as the
		 * usage maps of other sections fall into the same page). It
		 * will be re-used when re-adding the section - which is then no
		 * longer an early section. If the usage map is PageReserved, it
		 * was allocated during boot.
		 */
		if (!PageReserved(virt_to_page(ms->usage))) {
			kfree(ms->usage);
			ms->usage = NULL;
		}
		memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
		/*
		 * Mark the section invalid so that valid_section()
		 * returns false. This prevents code from dereferencing
		 * the ms->usage array.
		 */
		ms->section_mem_map &= ~SECTION_HAS_MEM_MAP;
	}

	/*
	 * The memmap of early sections is always fully populated. See
	 * section_activate() and pfn_valid().
	 */
	if (!section_is_early)
		depopulate_section_memmap(pfn, nr_pages, altmap);
	else if (memmap)
		free_map_bootmem(memmap);

	if (empty)
		ms->section_mem_map = (unsigned long)NULL;
}
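
/*
 * Allocate the usage map for the section as needed, record the sub-sections
 * being added, and populate a memmap for the range. Returns the memmap on
 * success, an ERR_PTR() on failure, or the already populated memmap when
 * hot-adding part of an early section.
 */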
static struct page * __meminit section_activate(int nid, unsigned long pfn,
		unsigned long nr_pages, struct vmem_altmap *altmap,
		struct dev_pagemap *pgmap)
{
	struct mem_section *ms = __pfn_to_section(pfn);
	struct mem_section_usage *usage = NULL;
	struct page *memmap;
	int rc = 0;

	if (!ms->usage) {
		usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
		if (!usage)
			return ERR_PTR(-ENOMEM);
		ms->usage = usage;
	}

	rc = fill_subsection_map(pfn, nr_pages);
	if (rc) {
		if (usage)
			ms->usage = NULL;
		kfree(usage);
		return ERR_PTR(rc);
	}

	/*
	 * The early init code does not consider partially populated
	 * initial sections, it simply assumes that memory will never be
	 * referenced. If we hot-add memory into such a section then we
	 * do not need to populate the memmap and can simply reuse what
	 * is already there.
	 */
	if (nr_pages < PAGES_PER_SECTION && early_section(ms))
		return pfn_to_page(pfn);

	memmap = populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap);
	if (!memmap) {
		section_deactivate(pfn, nr_pages, altmap);
		return ERR_PTR(-ENOMEM);
	}

	return memmap;
}

/**
 * sparse_add_section - add a memory section, or populate an existing one
 * @nid: The node to add section on
 * @start_pfn: start pfn of the memory range
 * @nr_pages: number of pfns to add in the section
 * @altmap: alternate pfns to allocate the memmap backing store
 * @pgmap: alternate compound page geometry for devmap mappings
 *
 * This is only intended for hotplug.
 *
 * Note that only VMEMMAP supports sub-section aligned hotplug,
 * the proper alignment and size are gated by check_pfn_span().
 *
 * Return:
 * * 0		- On success.
 * * -EEXIST	- Section was already present.
 * * -ENOMEM	- Out of memory.
 */
int __meminit sparse_add_section(int nid, unsigned long start_pfn,
		unsigned long nr_pages, struct vmem_altmap *altmap,
		struct dev_pagemap *pgmap)
{
	unsigned long section_nr = pfn_to_section_nr(start_pfn);
	struct mem_section *ms;
	struct page *memmap;
	int ret;

	ret = sparse_index_init(section_nr, nid);
	if (ret < 0)
		return ret;

	memmap = section_activate(nid, start_pfn, nr_pages, altmap, pgmap);
	if (IS_ERR(memmap))
		return PTR_ERR(memmap);

	/*
	 * Poison uninitialized struct pages in order to catch invalid flags
	 * combinations.
	 */
	page_init_poison(memmap, sizeof(struct page) * nr_pages);

	ms = __nr_to_section(section_nr);
	set_section_nid(section_nr, nid);
	__section_mark_present(ms, section_nr);

	/* Align memmap to section boundary in the subsection case */
	if (section_nr_to_pfn(section_nr) != start_pfn)
		memmap = pfn_to_page(section_nr_to_pfn(section_nr));
	sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0);

	return 0;
}
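
/*
 * Tear down a hot-added (or hot-removed) range: clear hwpoison accounting
 * for the pages and deactivate the section's memmap and usage map.
 */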
void sparse_remove_section(struct mem_section *ms, unsigned long pfn,
		unsigned long nr_pages, unsigned long map_offset,
		struct vmem_altmap *altmap)
{
	clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset,
			nr_pages - map_offset);
	section_deactivate(pfn, nr_pages, altmap);
}
#endif /* CONFIG_MEMORY_HOTPLUG */