arch_numa.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477
// SPDX-License-Identifier: GPL-2.0-only
/*
 * NUMA support, based on the x86 implementation.
 *
 * Copyright (C) 2015 Cavium Inc.
 * Author: Ganapatrao Kulkarni <[email protected]>
 */
  8. #define pr_fmt(fmt) "NUMA: " fmt
  9. #include <linux/acpi.h>
  10. #include <linux/memblock.h>
  11. #include <linux/module.h>
  12. #include <linux/of.h>
  13. #include <asm/sections.h>
/* Per-node pglist_data pointers, indexed by node id; entries are stored by setup_node_data(). */
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);
/* Nodes recorded by numa_add_memblk(); cleared at the start of every numa_init() attempt. */
nodemask_t numa_nodes_parsed __initdata;
/* Early cpu -> node mapping; every slot stays NUMA_NO_NODE until early_map_cpu_to_node() writes it. */
static int cpu_to_node_map[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };
/* Number of nodes along each side of the numa_distance table (0 when no table exists). */
static int numa_distance_cnt;
/* Flattened distance table, indexed as [from * numa_distance_cnt + to]; NULL when not allocated. */
static u8 *numa_distance;
/* True when "numa=off" was given on the command line or after dummy_numa_init() succeeded. */
bool numa_off;
  21. static __init int numa_parse_early_param(char *opt)
  22. {
  23. if (!opt)
  24. return -EINVAL;
  25. if (str_has_prefix(opt, "off"))
  26. numa_off = true;
  27. return 0;
  28. }
  29. early_param("numa", numa_parse_early_param);
/* Per-node CPU masks, indexed by node id; allocated and cleared by setup_node_to_cpumask_map(). */
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);
  32. #ifdef CONFIG_DEBUG_PER_CPU_MAPS
  33. /*
  34. * Returns a pointer to the bitmask of CPUs on Node 'node'.
  35. */
  36. const struct cpumask *cpumask_of_node(int node)
  37. {
  38. if (node == NUMA_NO_NODE)
  39. return cpu_all_mask;
  40. if (WARN_ON(node < 0 || node >= nr_node_ids))
  41. return cpu_none_mask;
  42. if (WARN_ON(node_to_cpumask_map[node] == NULL))
  43. return cpu_online_mask;
  44. return node_to_cpumask_map[node];
  45. }
  46. EXPORT_SYMBOL(cpumask_of_node);
  47. #endif
  48. static void numa_update_cpu(unsigned int cpu, bool remove)
  49. {
  50. int nid = cpu_to_node(cpu);
  51. if (nid == NUMA_NO_NODE)
  52. return;
  53. if (remove)
  54. cpumask_clear_cpu(cpu, node_to_cpumask_map[nid]);
  55. else
  56. cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
  57. }
  58. void numa_add_cpu(unsigned int cpu)
  59. {
  60. numa_update_cpu(cpu, false);
  61. }
  62. void numa_remove_cpu(unsigned int cpu)
  63. {
  64. numa_update_cpu(cpu, true);
  65. }
  66. void numa_clear_node(unsigned int cpu)
  67. {
  68. numa_remove_cpu(cpu);
  69. set_cpu_numa_node(cpu, NUMA_NO_NODE);
  70. }
  71. /*
  72. * Allocate node_to_cpumask_map based on number of available nodes
  73. * Requires node_possible_map to be valid.
  74. *
  75. * Note: cpumask_of_node() is not valid until after this is done.
  76. * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
  77. */
  78. static void __init setup_node_to_cpumask_map(void)
  79. {
  80. int node;
  81. /* setup nr_node_ids if not done yet */
  82. if (nr_node_ids == MAX_NUMNODES)
  83. setup_nr_node_ids();
  84. /* allocate and clear the mapping */
  85. for (node = 0; node < nr_node_ids; node++) {
  86. alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
  87. cpumask_clear(node_to_cpumask_map[node]);
  88. }
  89. /* cpumask_of_node() will now work */
  90. pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
  91. }
  92. /*
  93. * Set the cpu to node and mem mapping
  94. */
  95. void numa_store_cpu_info(unsigned int cpu)
  96. {
  97. set_cpu_numa_node(cpu, cpu_to_node_map[cpu]);
  98. }
  99. void __init early_map_cpu_to_node(unsigned int cpu, int nid)
  100. {
  101. /* fallback to node 0 */
  102. if (nid < 0 || nid >= MAX_NUMNODES || numa_off)
  103. nid = 0;
  104. cpu_to_node_map[cpu] = nid;
  105. /*
  106. * We should set the numa node of cpu0 as soon as possible, because it
  107. * has already been set up online before. cpu_to_node(0) will soon be
  108. * called.
  109. */
  110. if (!cpu)
  111. set_cpu_numa_node(cpu, nid);
  112. }
  113. #ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA
/* Per-CPU area offset for each CPU, indexed by CPU number; filled in by setup_per_cpu_areas(). */
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);
  116. static int __init early_cpu_to_node(int cpu)
  117. {
  118. return cpu_to_node_map[cpu];
  119. }
  120. static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
  121. {
  122. return node_distance(early_cpu_to_node(from), early_cpu_to_node(to));
  123. }
  124. void __init setup_per_cpu_areas(void)
  125. {
  126. unsigned long delta;
  127. unsigned int cpu;
  128. int rc = -EINVAL;
  129. if (pcpu_chosen_fc != PCPU_FC_PAGE) {
  130. /*
  131. * Always reserve area for module percpu variables. That's
  132. * what the legacy allocator did.
  133. */
  134. rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
  135. PERCPU_DYNAMIC_RESERVE, PAGE_SIZE,
  136. pcpu_cpu_distance,
  137. early_cpu_to_node);
  138. #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
  139. if (rc < 0)
  140. pr_warn("PERCPU: %s allocator failed (%d), falling back to page size\n",
  141. pcpu_fc_names[pcpu_chosen_fc], rc);
  142. #endif
  143. }
  144. #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
  145. if (rc < 0)
  146. rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, early_cpu_to_node);
  147. #endif
  148. if (rc < 0)
  149. panic("Failed to initialize percpu areas (err=%d).", rc);
  150. delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
  151. for_each_possible_cpu(cpu)
  152. __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
  153. }
  154. #endif
  155. /**
  156. * numa_add_memblk() - Set node id to memblk
  157. * @nid: NUMA node ID of the new memblk
  158. * @start: Start address of the new memblk
  159. * @end: End address of the new memblk
  160. *
  161. * RETURNS:
  162. * 0 on success, -errno on failure.
  163. */
  164. int __init numa_add_memblk(int nid, u64 start, u64 end)
  165. {
  166. int ret;
  167. ret = memblock_set_node(start, (end - start), &memblock.memory, nid);
  168. if (ret < 0) {
  169. pr_err("memblock [0x%llx - 0x%llx] failed to add on node %d\n",
  170. start, (end - 1), nid);
  171. return ret;
  172. }
  173. node_set(nid, numa_nodes_parsed);
  174. return ret;
  175. }
  176. /*
  177. * Initialize NODE_DATA for a node on the local memory
  178. */
  179. static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
  180. {
  181. const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
  182. u64 nd_pa;
  183. void *nd;
  184. int tnid;
  185. if (start_pfn >= end_pfn)
  186. pr_info("Initmem setup node %d [<memory-less node>]\n", nid);
  187. nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
  188. if (!nd_pa)
  189. panic("Cannot allocate %zu bytes for node %d data\n",
  190. nd_size, nid);
  191. nd = __va(nd_pa);
  192. /* report and initialize */
  193. pr_info("NODE_DATA [mem %#010Lx-%#010Lx]\n",
  194. nd_pa, nd_pa + nd_size - 1);
  195. tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
  196. if (tnid != nid)
  197. pr_info("NODE_DATA(%d) on node %d\n", nid, tnid);
  198. node_data[nid] = nd;
  199. memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
  200. NODE_DATA(nid)->node_id = nid;
  201. NODE_DATA(nid)->node_start_pfn = start_pfn;
  202. NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
  203. }
  204. /*
  205. * numa_free_distance
  206. *
  207. * The current table is freed.
  208. */
  209. void __init numa_free_distance(void)
  210. {
  211. size_t size;
  212. if (!numa_distance)
  213. return;
  214. size = numa_distance_cnt * numa_distance_cnt *
  215. sizeof(numa_distance[0]);
  216. memblock_free(numa_distance, size);
  217. numa_distance_cnt = 0;
  218. numa_distance = NULL;
  219. }
  220. /*
  221. * Create a new NUMA distance table.
  222. */
  223. static int __init numa_alloc_distance(void)
  224. {
  225. size_t size;
  226. int i, j;
  227. size = nr_node_ids * nr_node_ids * sizeof(numa_distance[0]);
  228. numa_distance = memblock_alloc(size, PAGE_SIZE);
  229. if (WARN_ON(!numa_distance))
  230. return -ENOMEM;
  231. numa_distance_cnt = nr_node_ids;
  232. /* fill with the default distances */
  233. for (i = 0; i < numa_distance_cnt; i++)
  234. for (j = 0; j < numa_distance_cnt; j++)
  235. numa_distance[i * numa_distance_cnt + j] = i == j ?
  236. LOCAL_DISTANCE : REMOTE_DISTANCE;
  237. pr_debug("Initialized distance table, cnt=%d\n", numa_distance_cnt);
  238. return 0;
  239. }
  240. /**
  241. * numa_set_distance() - Set inter node NUMA distance from node to node.
  242. * @from: the 'from' node to set distance
  243. * @to: the 'to' node to set distance
  244. * @distance: NUMA distance
  245. *
  246. * Set the distance from node @from to @to to @distance.
  247. * If distance table doesn't exist, a warning is printed.
  248. *
  249. * If @from or @to is higher than the highest known node or lower than zero
  250. * or @distance doesn't make sense, the call is ignored.
  251. */
  252. void __init numa_set_distance(int from, int to, int distance)
  253. {
  254. if (!numa_distance) {
  255. pr_warn_once("Warning: distance table not allocated yet\n");
  256. return;
  257. }
  258. if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
  259. from < 0 || to < 0) {
  260. pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
  261. from, to, distance);
  262. return;
  263. }
  264. if ((u8)distance != distance ||
  265. (from == to && distance != LOCAL_DISTANCE)) {
  266. pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
  267. from, to, distance);
  268. return;
  269. }
  270. numa_distance[from * numa_distance_cnt + to] = distance;
  271. }
  272. /*
  273. * Return NUMA distance @from to @to
  274. */
  275. int __node_distance(int from, int to)
  276. {
  277. if (from >= numa_distance_cnt || to >= numa_distance_cnt)
  278. return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
  279. return numa_distance[from * numa_distance_cnt + to];
  280. }
  281. EXPORT_SYMBOL(__node_distance);
  282. static int __init numa_register_nodes(void)
  283. {
  284. int nid;
  285. struct memblock_region *mblk;
  286. /* Check that valid nid is set to memblks */
  287. for_each_mem_region(mblk) {
  288. int mblk_nid = memblock_get_region_node(mblk);
  289. phys_addr_t start = mblk->base;
  290. phys_addr_t end = mblk->base + mblk->size - 1;
  291. if (mblk_nid == NUMA_NO_NODE || mblk_nid >= MAX_NUMNODES) {
  292. pr_warn("Warning: invalid memblk node %d [mem %pap-%pap]\n",
  293. mblk_nid, &start, &end);
  294. return -EINVAL;
  295. }
  296. }
  297. /* Finally register nodes. */
  298. for_each_node_mask(nid, numa_nodes_parsed) {
  299. unsigned long start_pfn, end_pfn;
  300. get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
  301. setup_node_data(nid, start_pfn, end_pfn);
  302. node_set_online(nid);
  303. }
  304. /* Setup online nodes to actual nodes*/
  305. node_possible_map = numa_nodes_parsed;
  306. return 0;
  307. }
  308. static int __init numa_init(int (*init_func)(void))
  309. {
  310. int ret;
  311. nodes_clear(numa_nodes_parsed);
  312. nodes_clear(node_possible_map);
  313. nodes_clear(node_online_map);
  314. ret = numa_alloc_distance();
  315. if (ret < 0)
  316. return ret;
  317. ret = init_func();
  318. if (ret < 0)
  319. goto out_free_distance;
  320. if (nodes_empty(numa_nodes_parsed)) {
  321. pr_info("No NUMA configuration found\n");
  322. ret = -EINVAL;
  323. goto out_free_distance;
  324. }
  325. ret = numa_register_nodes();
  326. if (ret < 0)
  327. goto out_free_distance;
  328. setup_node_to_cpumask_map();
  329. return 0;
  330. out_free_distance:
  331. numa_free_distance();
  332. return ret;
  333. }
  334. /**
  335. * dummy_numa_init() - Fallback dummy NUMA init
  336. *
  337. * Used if there's no underlying NUMA architecture, NUMA initialization
  338. * fails, or NUMA is disabled on the command line.
  339. *
  340. * Must online at least one node (node 0) and add memory blocks that cover all
  341. * allowed memory. It is unlikely that this function fails.
  342. *
  343. * Return: 0 on success, -errno on failure.
  344. */
  345. static int __init dummy_numa_init(void)
  346. {
  347. phys_addr_t start = memblock_start_of_DRAM();
  348. phys_addr_t end = memblock_end_of_DRAM() - 1;
  349. int ret;
  350. if (numa_off)
  351. pr_info("NUMA disabled\n"); /* Forced off on command line. */
  352. pr_info("Faking a node at [mem %pap-%pap]\n", &start, &end);
  353. ret = numa_add_memblk(0, start, end + 1);
  354. if (ret) {
  355. pr_err("NUMA init failed\n");
  356. return ret;
  357. }
  358. numa_off = true;
  359. return 0;
  360. }
  361. #ifdef CONFIG_ACPI_NUMA
  362. static int __init arch_acpi_numa_init(void)
  363. {
  364. int ret;
  365. ret = acpi_numa_init();
  366. if (ret) {
  367. pr_info("Failed to initialise from firmware\n");
  368. return ret;
  369. }
  370. return srat_disabled() ? -EINVAL : 0;
  371. }
  372. #else
  373. static int __init arch_acpi_numa_init(void)
  374. {
  375. return -EOPNOTSUPP;
  376. }
  377. #endif
  378. /**
  379. * arch_numa_init() - Initialize NUMA
  380. *
  381. * Try each configured NUMA initialization method until one succeeds. The
  382. * last fallback is dummy single node config encompassing whole memory.
  383. */
  384. void __init arch_numa_init(void)
  385. {
  386. if (!numa_off) {
  387. if (!acpi_disabled && !numa_init(arch_acpi_numa_init))
  388. return;
  389. if (acpi_disabled && !numa_init(of_numa_init))
  390. return;
  391. }
  392. numa_init(dummy_numa_init);
  393. }