// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * pSeries NUMA support
 *
 * Copyright (C) 2002 Anton Blanchard <[email protected]>, IBM
 */
#define pr_fmt(fmt) "numa: " fmt

#include <linux/threads.h>
#include <linux/memblock.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/export.h>
#include <linux/nodemask.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/of.h>
#include <linux/pfn.h>
#include <linux/cpuset.h>
#include <linux/node.h>
#include <linux/stop_machine.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <asm/cputhreads.h>
#include <asm/sparsemem.h>
#include <asm/smp.h>
#include <asm/topology.h>
#include <asm/firmware.h>
#include <asm/paca.h>
#include <asm/hvcall.h>
#include <asm/setup.h>
#include <asm/vdso.h>
#include <asm/drmem.h>

static int numa_enabled = 1;

static char *cmdline __initdata;

int numa_cpu_lookup_table[NR_CPUS];
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
struct pglist_data *node_data[MAX_NUMNODES];

EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(node_to_cpumask_map);
EXPORT_SYMBOL(node_data);

static int primary_domain_index;
static int n_mem_addr_cells, n_mem_size_cells;

#define FORM0_AFFINITY 0
#define FORM1_AFFINITY 1
#define FORM2_AFFINITY 2
static int affinity_form;

#define MAX_DISTANCE_REF_POINTS 4
static int distance_ref_points_depth;
static const __be32 *distance_ref_points;
static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
static int numa_distance_table[MAX_NUMNODES][MAX_NUMNODES] = {
	[0 ... MAX_NUMNODES - 1] = { [0 ... MAX_NUMNODES - 1] = -1 }
};
static int numa_id_index_table[MAX_NUMNODES] = { [0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE };

/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 */
static void __init setup_node_to_cpumask_map(void)
{
	unsigned int node;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES)
		setup_nr_node_ids();

	/* allocate the map */
	for_each_node(node)
		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);

	/* cpumask_of_node() will now work */
	pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}
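
/*
 * Consume the next "numa=fake=" boundary from the command line and bump
 * the fake node id once end_pfn crosses it. Returns 1 when a new fake
 * node was created, 0 otherwise.
 */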
static int __init fake_numa_create_new_node(unsigned long end_pfn,
					    unsigned int *nid)
{
	unsigned long long mem;
	char *p = cmdline;
	static unsigned int fake_nid;
	static unsigned long long curr_boundary;

	/*
	 * Modify node id, iff we started creating NUMA nodes
	 * We want to continue from where we left off the last time
	 */
	if (fake_nid)
		*nid = fake_nid;
	/*
	 * In case there are no more arguments to parse, the
	 * node_id should be the same as the last fake node id
	 * (we've handled this above).
	 */
	if (!p)
		return 0;

	mem = memparse(p, &p);
	if (!mem)
		return 0;

	if (mem < curr_boundary)
		return 0;

	curr_boundary = mem;

	if ((end_pfn << PAGE_SHIFT) > mem) {
		/*
		 * Skip commas and spaces
		 */
		while (*p == ',' || *p == ' ' || *p == '\t')
			p++;

		cmdline = p;
		fake_nid++;
		*nid = fake_nid;
		pr_debug("created new fake_node with id %d\n", fake_nid);
		return 1;
	}
	return 0;
}

static void __init reset_numa_cpu_lookup_table(void)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu)
		numa_cpu_lookup_table[cpu] = -1;
}

void map_cpu_to_node(int cpu, int node)
{
	update_numa_cpu_lookup_table(cpu, node);

	if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node]))) {
		pr_debug("adding cpu %d to node %d\n", cpu, node);
		cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
	}
}

#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR)
void unmap_cpu_from_node(unsigned long cpu)
{
	int node = numa_cpu_lookup_table[cpu];

	if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) {
		cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
		pr_debug("removing cpu %lu from node %d\n", cpu, node);
	} else {
		pr_warn("Warning: cpu %lu not found in node %d\n", cpu, node);
	}
}
#endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
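
/*
 * Convert an associativity array (without its leading length cell) to a
 * node id, or NUMA_NO_NODE when NUMA is disabled, the array is too
 * short, or the id is invalid.
 */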
static int __associativity_to_nid(const __be32 *associativity,
				  int max_array_sz)
{
	int nid;
	/*
	 * primary_domain_index is a 1-based array index.
	 */
	int index = primary_domain_index - 1;

	if (!numa_enabled || index >= max_array_sz)
		return NUMA_NO_NODE;

	nid = of_read_number(&associativity[index], 1);

	/* POWER4 LPAR uses 0xffff as invalid node */
	if (nid == 0xffff || nid >= nr_node_ids)
		nid = NUMA_NO_NODE;
	return nid;
}

/*
 * Returns nid in the range [0..nr_node_ids - 1], or -1 if no useful NUMA
 * information is found.
 */
static int associativity_to_nid(const __be32 *associativity)
{
	int array_sz = of_read_number(associativity, 1);

	/* Skip the first element in the associativity array */
	return __associativity_to_nid((associativity + 1), array_sz);
}
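
/* Bucket a FORM2 node distance: 0 = local, 1 = remote, 2 = beyond remote. */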
static int __cpu_form2_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
{
	int dist;
	int node1, node2;

	node1 = associativity_to_nid(cpu1_assoc);
	node2 = associativity_to_nid(cpu2_assoc);

	dist = numa_distance_table[node1][node2];
	if (dist <= LOCAL_DISTANCE)
		return 0;
	else if (dist <= REMOTE_DISTANCE)
		return 1;
	else
		return 2;
}

static int __cpu_form1_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
{
	int dist = 0;
	int i, index;

	for (i = 0; i < distance_ref_points_depth; i++) {
		index = be32_to_cpu(distance_ref_points[i]);
		if (cpu1_assoc[index] == cpu2_assoc[index])
			break;
		dist++;
	}
	return dist;
}

int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
{
	/* We should not get called with FORM0 */
	VM_WARN_ON(affinity_form == FORM0_AFFINITY);
	if (affinity_form == FORM1_AFFINITY)
		return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc);
	return __cpu_form2_relative_distance(cpu1_assoc, cpu2_assoc);
}

/* must hold reference to node during call */
static const __be32 *of_get_associativity(struct device_node *dev)
{
	return of_get_property(dev, "ibm,associativity", NULL);
}

int __node_distance(int a, int b)
{
	int i;
	int distance = LOCAL_DISTANCE;

	if (affinity_form == FORM2_AFFINITY)
		return numa_distance_table[a][b];
	else if (affinity_form == FORM0_AFFINITY)
		return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);

	for (i = 0; i < distance_ref_points_depth; i++) {
		if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
			break;

		/* Double the distance for each NUMA level */
		distance *= 2;
	}

	return distance;
}
EXPORT_SYMBOL(__node_distance);

/* Returns the nid associated with the given device tree node,
 * or -1 if not found.
 */
static int of_node_to_nid_single(struct device_node *device)
{
	int nid = NUMA_NO_NODE;
	const __be32 *tmp;

	tmp = of_get_associativity(device);
	if (tmp)
		nid = associativity_to_nid(tmp);
	return nid;
}

/* Walk the device tree upwards, looking for an associativity id */
int of_node_to_nid(struct device_node *device)
{
	int nid = NUMA_NO_NODE;

	of_node_get(device);
	while (device) {
		nid = of_node_to_nid_single(device);
		if (nid != -1)
			break;

		device = of_get_next_parent(device);
	}
	of_node_put(device);

	return nid;
}
EXPORT_SYMBOL(of_node_to_nid);
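
/*
 * Record a node's FORM1 domain ids in distance_lookup_table so that
 * __node_distance() can later compare two nodes level by level.
 */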
static void __initialize_form1_numa_distance(const __be32 *associativity,
					     int max_array_sz)
{
	int i, nid;

	if (affinity_form != FORM1_AFFINITY)
		return;

	nid = __associativity_to_nid(associativity, max_array_sz);
	if (nid != NUMA_NO_NODE) {
		for (i = 0; i < distance_ref_points_depth; i++) {
			const __be32 *entry;
			int index = be32_to_cpu(distance_ref_points[i]) - 1;

			/*
			 * broken hierarchy, return with broken distance table
			 */
			if (WARN(index >= max_array_sz, "Broken ibm,associativity property"))
				return;

			entry = &associativity[index];
			distance_lookup_table[nid][i] = of_read_number(entry, 1);
		}
	}
}

static void initialize_form1_numa_distance(const __be32 *associativity)
{
	int array_sz;

	array_sz = of_read_number(associativity, 1);
	/* Skip the first element in the associativity array */
	__initialize_form1_numa_distance(associativity + 1, array_sz);
}

/*
 * Used to update distance information w.r.t. a newly added node.
 */
void update_numa_distance(struct device_node *node)
{
	int nid;

	if (affinity_form == FORM0_AFFINITY)
		return;
	else if (affinity_form == FORM1_AFFINITY) {
		const __be32 *associativity;

		associativity = of_get_associativity(node);
		if (!associativity)
			return;

		initialize_form1_numa_distance(associativity);
		return;
	}

	/* FORM2 affinity */
	nid = of_node_to_nid_single(node);
	if (nid == NUMA_NO_NODE)
		return;

	/*
	 * With FORM2 we expect NUMA distance of all possible NUMA
	 * nodes to be provided during boot.
	 */
	WARN(numa_distance_table[nid][nid] == -1,
	     "NUMA distance details for node %d not provided\n", nid);
}
EXPORT_SYMBOL_GPL(update_numa_distance);

/*
 * ibm,numa-lookup-index-table = {N, domainid1, domainid2, ..., domainidN}
 * ibm,numa-distance-table = { N, 1, 2, 4, 5, 1, 6, ..., N elements }
 */
static void __init initialize_form2_numa_distance_lookup_table(void)
{
	int i, j;
	struct device_node *root;
	const __u8 *form2_distances;
	const __be32 *numa_lookup_index;
	int form2_distances_length;
	int max_numa_index, distance_index;

	if (firmware_has_feature(FW_FEATURE_OPAL))
		root = of_find_node_by_path("/ibm,opal");
	else
		root = of_find_node_by_path("/rtas");
	if (!root)
		root = of_find_node_by_path("/");

	numa_lookup_index = of_get_property(root, "ibm,numa-lookup-index-table", NULL);
	max_numa_index = of_read_number(&numa_lookup_index[0], 1);

	/* first element of the array is the size and is encode-int */
	form2_distances = of_get_property(root, "ibm,numa-distance-table", NULL);
	form2_distances_length = of_read_number((const __be32 *)&form2_distances[0], 1);
	/* Skip the size which is encoded int */
	form2_distances += sizeof(__be32);

	pr_debug("form2_distances_len = %d, numa_dist_indexes_len = %d\n",
		 form2_distances_length, max_numa_index);

	for (i = 0; i < max_numa_index; i++)
		/* +1 skip the max_numa_index in the property */
		numa_id_index_table[i] = of_read_number(&numa_lookup_index[i + 1], 1);

	if (form2_distances_length != max_numa_index * max_numa_index) {
		WARN(1, "Wrong NUMA distance information\n");
		form2_distances = NULL; // don't use it
	}

	distance_index = 0;
	for (i = 0; i < max_numa_index; i++) {
		for (j = 0; j < max_numa_index; j++) {
			int nodeA = numa_id_index_table[i];
			int nodeB = numa_id_index_table[j];
			int dist;

			if (form2_distances)
				dist = form2_distances[distance_index++];
			else if (nodeA == nodeB)
				dist = LOCAL_DISTANCE;
			else
				dist = REMOTE_DISTANCE;

			numa_distance_table[nodeA][nodeB] = dist;
			pr_debug("dist[%d][%d]=%d ", nodeA, nodeB, dist);
		}
	}

	of_node_put(root);
}
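
/*
 * Detect which affinity form the firmware provides and return the
 * 1-based index of the primary NUMA domain within ibm,associativity,
 * or -1 if ibm,associativity-reference-points cannot be parsed.
 */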
static int __init find_primary_domain_index(void)
{
	int index;
	struct device_node *root;

	/*
	 * Check for which form of affinity.
	 */
	if (firmware_has_feature(FW_FEATURE_OPAL)) {
		affinity_form = FORM1_AFFINITY;
	} else if (firmware_has_feature(FW_FEATURE_FORM2_AFFINITY)) {
		pr_debug("Using form 2 affinity\n");
		affinity_form = FORM2_AFFINITY;
	} else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) {
		pr_debug("Using form 1 affinity\n");
		affinity_form = FORM1_AFFINITY;
	} else
		affinity_form = FORM0_AFFINITY;

	if (firmware_has_feature(FW_FEATURE_OPAL))
		root = of_find_node_by_path("/ibm,opal");
	else
		root = of_find_node_by_path("/rtas");
	if (!root)
		root = of_find_node_by_path("/");

	/*
	 * This property is a set of 32-bit integers, each representing
	 * an index into the ibm,associativity nodes.
	 *
	 * With form 0 affinity the first integer is for an SMP configuration
	 * (should be all 0's) and the second is for a normal NUMA
	 * configuration. We have only one level of NUMA.
	 *
	 * With form 1 affinity the first integer is the most significant
	 * NUMA boundary and the following are progressively less significant
	 * boundaries. There can be more than one level of NUMA.
	 */
	distance_ref_points = of_get_property(root,
					"ibm,associativity-reference-points",
					&distance_ref_points_depth);
	if (!distance_ref_points) {
		pr_debug("ibm,associativity-reference-points not found.\n");
		goto err;
	}

	distance_ref_points_depth /= sizeof(int);
	if (affinity_form == FORM0_AFFINITY) {
		if (distance_ref_points_depth < 2) {
			pr_warn("short ibm,associativity-reference-points\n");
			goto err;
		}

		index = of_read_number(&distance_ref_points[1], 1);
	} else {
		/*
		 * Both FORM1 and FORM2 affinity find the primary domain details
		 * at the same offset.
		 */
		index = of_read_number(distance_ref_points, 1);
	}

	/*
	 * Warn and cap if the hardware supports more than
	 * MAX_DISTANCE_REF_POINTS domains.
	 */
	if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
		pr_warn("distance array capped at %d entries\n",
			MAX_DISTANCE_REF_POINTS);
		distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
	}

	of_node_put(root);
	return index;

err:
	of_node_put(root);
	return -1;
}

static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
{
	struct device_node *memory = NULL;

	memory = of_find_node_by_type(memory, "memory");
	if (!memory)
		panic("numa.c: No memory nodes found!");

	*n_addr_cells = of_n_addr_cells(memory);
	*n_size_cells = of_n_size_cells(memory);
	of_node_put(memory);
}
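
/* Read an n-cell big-endian value and advance the buffer pointer past it. */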
static unsigned long read_n_cells(int n, const __be32 **buf)
{
	unsigned long result = 0;

	while (n--) {
		result = (result << 32) | of_read_number(*buf, 1);
		(*buf)++;
	}
	return result;
}

struct assoc_arrays {
	u32 n_arrays;
	u32 array_sz;
	const __be32 *arrays;
};

/*
 * Retrieve and validate the list of associativity arrays for drconf
 * memory from the ibm,associativity-lookup-arrays property of the
 * device tree.
 *
 * The layout of the ibm,associativity-lookup-arrays property is a number N
 * indicating the number of associativity arrays, followed by a number M
 * indicating the size of each associativity array, followed by a list
 * of N associativity arrays.
 */
static int of_get_assoc_arrays(struct assoc_arrays *aa)
{
	struct device_node *memory;
	const __be32 *prop;
	u32 len;

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (!memory)
		return -1;

	prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
	if (!prop || len < 2 * sizeof(unsigned int)) {
		of_node_put(memory);
		return -1;
	}

	aa->n_arrays = of_read_number(prop++, 1);
	aa->array_sz = of_read_number(prop++, 1);

	of_node_put(memory);

	/* Now that we know the number of arrays and size of each array,
	 * revalidate the size of the property read in.
	 */
	if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
		return -1;

	aa->arrays = prop;
	return 0;
}
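
/*
 * Like of_drconf_to_nid_single() below, but additionally seeds the
 * FORM1 distance table from the LMB's associativity lookup array.
 */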
static int __init get_nid_and_numa_distance(struct drmem_lmb *lmb)
{
	struct assoc_arrays aa = { .arrays = NULL };
	int default_nid = NUMA_NO_NODE;
	int nid = default_nid;
	int rc, index;

	if ((primary_domain_index < 0) || !numa_enabled)
		return default_nid;

	rc = of_get_assoc_arrays(&aa);
	if (rc)
		return default_nid;

	if (primary_domain_index <= aa.array_sz &&
	    !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
		const __be32 *associativity;

		index = lmb->aa_index * aa.array_sz;
		associativity = &aa.arrays[index];
		nid = __associativity_to_nid(associativity, aa.array_sz);
		if (nid > 0 && affinity_form == FORM1_AFFINITY) {
			/*
			 * lookup array associativity entries have
			 * no length of the array as the first element.
			 */
			__initialize_form1_numa_distance(associativity, aa.array_sz);
		}
	}
	return nid;
}

/*
 * This is like of_node_to_nid_single() for memory represented in the
 * ibm,dynamic-reconfiguration-memory node.
 */
int of_drconf_to_nid_single(struct drmem_lmb *lmb)
{
	struct assoc_arrays aa = { .arrays = NULL };
	int default_nid = NUMA_NO_NODE;
	int nid = default_nid;
	int rc, index;

	if ((primary_domain_index < 0) || !numa_enabled)
		return default_nid;

	rc = of_get_assoc_arrays(&aa);
	if (rc)
		return default_nid;

	if (primary_domain_index <= aa.array_sz &&
	    !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
		const __be32 *associativity;

		index = lmb->aa_index * aa.array_sz;
		associativity = &aa.arrays[index];
		nid = __associativity_to_nid(associativity, aa.array_sz);
	}
	return nid;
}

#ifdef CONFIG_PPC_SPLPAR

static int __vphn_get_associativity(long lcpu, __be32 *associativity)
{
	long rc, hwid;

	/*
	 * On a shared lpar, device tree will not have node associativity.
	 * At this time lppaca, or its __old_status field may not be
	 * updated. Hence kernel cannot detect if it's on a shared lpar. So
	 * request an explicit associativity irrespective of whether the
	 * lpar is shared or dedicated. Use the device tree property as a
	 * fallback. cpu_to_phys_id is only valid between
	 * smp_setup_cpu_maps() and smp_setup_pacas().
	 */
	if (firmware_has_feature(FW_FEATURE_VPHN)) {
		if (cpu_to_phys_id)
			hwid = cpu_to_phys_id[lcpu];
		else
			hwid = get_hard_smp_processor_id(lcpu);

		rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity);
		if (rc == H_SUCCESS)
			return 0;
	}

	return -1;
}

static int vphn_get_nid(long lcpu)
{
	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};

	if (!__vphn_get_associativity(lcpu, associativity))
		return associativity_to_nid(associativity);

	return NUMA_NO_NODE;
}

#else

static int __vphn_get_associativity(long lcpu, __be32 *associativity)
{
	return -1;
}

static int vphn_get_nid(long unused)
{
	return NUMA_NO_NODE;
}

#endif /* CONFIG_PPC_SPLPAR */

/*
 * Figure out to which domain a cpu belongs and stick it there.
 * Return the id of the domain used.
 */
static int numa_setup_cpu(unsigned long lcpu)
{
	struct device_node *cpu;
	int fcpu = cpu_first_thread_sibling(lcpu);
	int nid = NUMA_NO_NODE;

	if (!cpu_present(lcpu)) {
		set_cpu_numa_node(lcpu, first_online_node);
		return first_online_node;
	}

	/*
	 * If a valid cpu-to-node mapping is already available, use it
	 * directly instead of querying the firmware, since it represents
	 * the most recent mapping notified to us by the platform (eg: VPHN).
	 * The cpu-to-node binding is the same for all threads of a core,
	 * so if a valid mapping already exists for the first thread of
	 * the core, use it.
	 */
	nid = numa_cpu_lookup_table[fcpu];
	if (nid >= 0) {
		map_cpu_to_node(lcpu, nid);
		return nid;
	}

	nid = vphn_get_nid(lcpu);
	if (nid != NUMA_NO_NODE)
		goto out_present;

	cpu = of_get_cpu_node(lcpu, NULL);
	if (!cpu) {
		WARN_ON(1);
		if (cpu_present(lcpu))
			goto out_present;
		else
			goto out;
	}

	nid = of_node_to_nid_single(cpu);
	of_node_put(cpu);

out_present:
	if (nid < 0 || !node_possible(nid))
		nid = first_online_node;

	/*
	 * Update for the first thread of the core. All threads of a core
	 * have to be part of the same node. This not only avoids querying
	 * for every other thread in the core, but also avoids a case where
	 * a virtual node associativity change causes subsequent threads of
	 * a core to be associated with a different nid. However, if the
	 * first thread is already online, expect it to have a valid mapping.
	 */
	if (fcpu != lcpu) {
		WARN_ON(cpu_online(fcpu));
		map_cpu_to_node(fcpu, nid);
	}

	map_cpu_to_node(lcpu, nid);
out:
	return nid;
}

static void verify_cpu_node_mapping(int cpu, int node)
{
	int base, sibling, i;

	/* Verify that all the threads in the core belong to the same node */
	base = cpu_first_thread_sibling(cpu);

	for (i = 0; i < threads_per_core; i++) {
		sibling = base + i;

		if (sibling == cpu || cpu_is_offline(sibling))
			continue;

		if (cpu_to_node(sibling) != node) {
			WARN(1, "CPU thread siblings %d and %d don't belong"
				" to the same node!\n", cpu, sibling);
			break;
		}
	}
}

/* Must run before sched domains notifier. */
static int ppc_numa_cpu_prepare(unsigned int cpu)
{
	int nid;

	nid = numa_setup_cpu(cpu);
	verify_cpu_node_mapping(cpu, nid);
	return 0;
}

static int ppc_numa_cpu_dead(unsigned int cpu)
{
	return 0;
}

/*
 * Check and possibly modify a memory region to enforce the memory limit.
 *
 * Returns the size the region should have to enforce the memory limit.
 * This will either be the original value of size, a truncated value,
 * or zero. If the returned value of size is 0 the region should be
 * discarded as it lies wholly above the memory limit.
 */
static unsigned long __init numa_enforce_memory_limit(unsigned long start,
						      unsigned long size)
{
	/*
	 * We use memblock_end_of_DRAM() in here instead of memory_limit because
	 * we've already adjusted it for the limit and it takes care of
	 * having memory holes below the limit. Also, in the case of
	 * iommu_is_off, memory_limit is not set but is implicitly enforced.
	 */
	if (start + size <= memblock_end_of_DRAM())
		return size;

	if (start >= memblock_end_of_DRAM())
		return 0;

	return memblock_end_of_DRAM() - start;
}

/*
 * Reads the counter for a given entry in
 * linux,drconf-usable-memory property
 */
static inline int __init read_usm_ranges(const __be32 **usm)
{
	/*
	 * For each lmb in ibm,dynamic-memory a corresponding
	 * entry in linux,drconf-usable-memory property contains
	 * a counter followed by that many (base, size) tuples.
	 * Read the counter from linux,drconf-usable-memory.
	 */
	return read_n_cells(n_mem_size_cells, usm);
}

/*
 * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
 * node. This assumes n_mem_{addr,size}_cells have been set.
 */
static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
				       const __be32 **usm,
				       void *data)
{
	unsigned int ranges, is_kexec_kdump = 0;
	unsigned long base, size, sz;
	int nid;

	/*
	 * Skip this block if the reserved bit is set in flags (0x80)
	 * or if the block is not assigned to this partition (0x8)
	 */
	if ((lmb->flags & DRCONF_MEM_RESERVED)
	    || !(lmb->flags & DRCONF_MEM_ASSIGNED))
		return 0;

	if (*usm)
		is_kexec_kdump = 1;

	base = lmb->base_addr;
	size = drmem_lmb_size();
	ranges = 1;

	if (is_kexec_kdump) {
		ranges = read_usm_ranges(usm);
		if (!ranges) /* there are no (base, size) tuples */
			return 0;
	}

	do {
		if (is_kexec_kdump) {
			base = read_n_cells(n_mem_addr_cells, usm);
			size = read_n_cells(n_mem_size_cells, usm);
		}

		nid = get_nid_and_numa_distance(lmb);
		fake_numa_create_new_node(((base + size) >> PAGE_SHIFT), &nid);
		node_set_online(nid);
		sz = numa_enforce_memory_limit(base, size);
		if (sz)
			memblock_set_node(base, sz, &memblock.memory, nid);
	} while (--ranges);

	return 0;
}
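
/*
 * Walk the device tree, initialize the distance tables and set a node
 * online for every CPU and memory range found. Returns 0 on success,
 * or a negative value when NUMA information is absent or disabled.
 */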
static int __init parse_numa_properties(void)
{
	struct device_node *memory;
	int default_nid = 0;
	unsigned long i;
	const __be32 *associativity;

	if (numa_enabled == 0) {
		pr_warn("disabled by user\n");
		return -1;
	}

	primary_domain_index = find_primary_domain_index();

	if (primary_domain_index < 0) {
		/*
		 * If we fail to parse primary_domain_index from the device
		 * tree, mark NUMA disabled and boot with NUMA disabled.
		 */
		numa_enabled = false;
		return primary_domain_index;
	}

	pr_debug("associativity depth for CPU/Memory: %d\n", primary_domain_index);

	/*
	 * If it is FORM2, initialize the distance table here.
	 */
	if (affinity_form == FORM2_AFFINITY)
		initialize_form2_numa_distance_lookup_table();

	/*
	 * Even though we connect cpus to numa domains later in SMP
	 * init, we need to know the node ids now. This is because
	 * each node to be onlined must have NODE_DATA etc backing it.
	 */
	for_each_present_cpu(i) {
		__be32 vphn_assoc[VPHN_ASSOC_BUFSIZE];
		struct device_node *cpu;
		int nid = NUMA_NO_NODE;

		memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32));

		if (__vphn_get_associativity(i, vphn_assoc) == 0) {
			nid = associativity_to_nid(vphn_assoc);
			initialize_form1_numa_distance(vphn_assoc);
		} else {
			/*
			 * Don't fall back to default_nid yet -- we will plug
			 * cpus into nodes once the memory scan has discovered
			 * the topology.
			 */
			cpu = of_get_cpu_node(i, NULL);
			BUG_ON(!cpu);

			associativity = of_get_associativity(cpu);
			if (associativity) {
				nid = associativity_to_nid(associativity);
				initialize_form1_numa_distance(associativity);
			}
			of_node_put(cpu);
		}

		/* node_set_online() is UB if 'nid' is negative */
		if (likely(nid >= 0))
			node_set_online(nid);
	}

	get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);

	for_each_node_by_type(memory, "memory") {
		unsigned long start;
		unsigned long size;
		int nid;
		int ranges;
		const __be32 *memcell_buf;
		unsigned int len;

		memcell_buf = of_get_property(memory,
			"linux,usable-memory", &len);
		if (!memcell_buf || len <= 0)
			memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* ranges in cell */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
new_range:
		/* these are order-sensitive, and modify the buffer pointer */
		start = read_n_cells(n_mem_addr_cells, &memcell_buf);
		size = read_n_cells(n_mem_size_cells, &memcell_buf);

		/*
		 * Assumption: either all memory nodes or none will
		 * have associativity properties. If none, then
		 * everything goes to default_nid.
		 */
		associativity = of_get_associativity(memory);
		if (associativity) {
			nid = associativity_to_nid(associativity);
			initialize_form1_numa_distance(associativity);
		} else
			nid = default_nid;

		fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
		node_set_online(nid);

		size = numa_enforce_memory_limit(start, size);
		if (size)
			memblock_set_node(start, size, &memblock.memory, nid);

		if (--ranges)
			goto new_range;
	}

	/*
	 * Now do the same thing for each MEMBLOCK listed in the
	 * ibm,dynamic-memory property in the
	 * ibm,dynamic-reconfiguration-memory node.
	 */
	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		walk_drmem_lmbs(memory, NULL, numa_setup_drmem_lmb);
		of_node_put(memory);
	}

	return 0;
}
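
/*
 * No usable NUMA information: place all memory on node 0, or on the
 * fake nodes requested via numa=fake=.
 */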
static void __init setup_nonnuma(void)
{
	unsigned long top_of_ram = memblock_end_of_DRAM();
	unsigned long total_ram = memblock_phys_mem_size();
	unsigned long start_pfn, end_pfn;
	unsigned int nid = 0;
	int i;

	pr_debug("Top of RAM: 0x%lx, Total RAM: 0x%lx\n", top_of_ram, total_ram);
	pr_debug("Memory hole size: %ldMB\n", (top_of_ram - total_ram) >> 20);

	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
		fake_numa_create_new_node(end_pfn, &nid);
		memblock_set_node(PFN_PHYS(start_pfn),
				  PFN_PHYS(end_pfn - start_pfn),
				  &memblock.memory, nid);
		node_set_online(nid);
	}
}

void __init dump_numa_cpu_topology(void)
{
	unsigned int node;
	unsigned int cpu, count;

	if (!numa_enabled)
		return;

	for_each_online_node(node) {
		pr_info("Node %d CPUs:", node);

		count = 0;
		/*
		 * If we used a CPU iterator here we would miss printing
		 * the holes in the cpumap.
		 */
		for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
			if (cpumask_test_cpu(cpu,
					node_to_cpumask_map[node])) {
				if (count == 0)
					pr_cont(" %u", cpu);
				++count;
			} else {
				if (count > 1)
					pr_cont("-%u", cpu - 1);
				count = 0;
			}
		}

		if (count > 1)
			pr_cont("-%u", nr_cpu_ids - 1);
		pr_cont("\n");
	}
}

/* Initialize NODE_DATA for a node on the local memory */
static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
{
	u64 spanned_pages = end_pfn - start_pfn;
	const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
	u64 nd_pa;
	void *nd;
	int tnid;

	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
	if (!nd_pa)
		panic("Cannot allocate %zu bytes for node %d data\n",
		      nd_size, nid);

	nd = __va(nd_pa);

	/* report and initialize */
	pr_info("  NODE_DATA [mem %#010Lx-%#010Lx]\n",
		nd_pa, nd_pa + nd_size - 1);
	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
	if (tnid != nid)
		pr_info("    NODE_DATA(%d) on node %d\n", nid, tnid);

	node_data[nid] = nd;
	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
	NODE_DATA(nid)->node_id = nid;
	NODE_DATA(nid)->node_start_pfn = start_pfn;
	NODE_DATA(nid)->node_spanned_pages = spanned_pages;
}

static void __init find_possible_nodes(void)
{
	struct device_node *rtas;
	const __be32 *domains = NULL;
	int prop_length, max_nodes;
	u32 i;

	if (!numa_enabled)
		return;

	rtas = of_find_node_by_path("/rtas");
	if (!rtas)
		return;

	/*
	 * ibm,current-associativity-domains is a fairly recent property. If
	 * it doesn't exist, then fall back on ibm,max-associativity-domains.
	 * Current denotes what the platform can support compared to max
	 * which denotes what the Hypervisor can support.
	 *
	 * If the LPAR is migratable, new nodes might be activated after a LPM,
	 * so we should consider the max number in that case.
	 */
	if (!of_get_property(of_root, "ibm,migratable-partition", NULL))
		domains = of_get_property(rtas,
					  "ibm,current-associativity-domains",
					  &prop_length);
	if (!domains) {
		domains = of_get_property(rtas, "ibm,max-associativity-domains",
					  &prop_length);
		if (!domains)
			goto out;
	}

	max_nodes = of_read_number(&domains[primary_domain_index], 1);
	pr_info("Partition configured for %d NUMA nodes.\n", max_nodes);

	for (i = 0; i < max_nodes; i++) {
		if (!node_possible(i))
			node_set(i, node_possible_map);
	}

	prop_length /= sizeof(int);
	if (prop_length > primary_domain_index + 2)
		coregroup_enabled = 1;

out:
	of_node_put(rtas);
}

void __init mem_topology_setup(void)
{
	int cpu;

	max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
	min_low_pfn = MEMORY_START >> PAGE_SHIFT;

	/*
	 * Linux/mm assumes node 0 to be online at boot. However this is not
	 * true on PowerPC, where node 0 is similar to any other node: it
	 * could be a cpuless, memoryless node. So force node 0 to be offline
	 * for now. This prevents a cpuless, memoryless node 0 from showing
	 * up unnecessarily as online. If a node has cpus or memory that need
	 * to be online, then the node will be marked online anyway.
	 */
	node_set_offline(0);

	if (parse_numa_properties())
		setup_nonnuma();

	/*
	 * Modify the set of possible NUMA nodes to reflect information
	 * available about the set of online nodes, and the set of nodes
	 * that we expect to make use of for this platform's affinity
	 * calculations.
	 */
	nodes_and(node_possible_map, node_possible_map, node_online_map);

	find_possible_nodes();

	setup_node_to_cpumask_map();

	reset_numa_cpu_lookup_table();

	for_each_possible_cpu(cpu) {
		/*
		 * Powerpc with CONFIG_NUMA always used to have a node 0,
		 * even if it was memoryless or cpuless. For all cpus that
		 * are possible but not present, cpu_to_node() would point
		 * to node 0. To remove a cpuless, memoryless dummy node,
		 * powerpc needs to make sure all possible but not present
		 * cpu_to_node mappings are set to a proper node.
		 */
		numa_setup_cpu(cpu);
	}
}

void __init initmem_init(void)
{
	int nid;

	memblock_dump_all();

	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn;

		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
		setup_node_data(nid, start_pfn, end_pfn);
	}

	sparse_init();

	/*
	 * We need the numa_cpu_lookup_table to be accurate for all CPUs,
	 * even before we online them, so that we can use cpu_to_{node,mem}
	 * early in boot, cf. smp_prepare_cpus().
	 * _nocalls() + manual invocation is used because cpuhp is not yet
	 * initialized for the boot CPU.
	 */
	cpuhp_setup_state_nocalls(CPUHP_POWER_NUMA_PREPARE, "powerpc/numa:prepare",
				  ppc_numa_cpu_prepare, ppc_numa_cpu_dead);
}
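
/*
 * Parse the numa=off and numa=fake=<boundary>[,...] early parameters;
 * fake values are memory boundaries for successive fake nodes, consumed
 * later by fake_numa_create_new_node().
 */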
static int __init early_numa(char *p)
{
	if (!p)
		return 0;

	if (strstr(p, "off"))
		numa_enabled = 0;

	p = strstr(p, "fake=");
	if (p)
		cmdline = p + strlen("fake=");

	return 0;
}
early_param("numa", early_numa);

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Find the node associated with a hot added memory section for
 * memory represented in the device tree by the property
 * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory.
 */
static int hot_add_drconf_scn_to_nid(unsigned long scn_addr)
{
	struct drmem_lmb *lmb;
	unsigned long lmb_size;
	int nid = NUMA_NO_NODE;

	lmb_size = drmem_lmb_size();

	for_each_drmem_lmb(lmb) {
		/* skip this block if it is reserved or not assigned to
		 * this partition */
		if ((lmb->flags & DRCONF_MEM_RESERVED)
		    || !(lmb->flags & DRCONF_MEM_ASSIGNED))
			continue;

		if ((scn_addr < lmb->base_addr)
		    || (scn_addr >= (lmb->base_addr + lmb_size)))
			continue;

		nid = of_drconf_to_nid_single(lmb);
		break;
	}

	return nid;
}

/*
 * Find the node associated with a hot added memory section for memory
 * represented in the device tree as a node (i.e. memory@XXXX) for
 * each memblock.
 */
static int hot_add_node_scn_to_nid(unsigned long scn_addr)
{
	struct device_node *memory;
	int nid = NUMA_NO_NODE;

	for_each_node_by_type(memory, "memory") {
		unsigned long start, size;
		int ranges;
		const __be32 *memcell_buf;
		unsigned int len;

		memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* ranges in cell */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);

		while (ranges--) {
			start = read_n_cells(n_mem_addr_cells, &memcell_buf);
			size = read_n_cells(n_mem_size_cells, &memcell_buf);

			if ((scn_addr < start) || (scn_addr >= (start + size)))
				continue;

			nid = of_node_to_nid_single(memory);
			break;
		}

		if (nid >= 0)
			break;
	}

	of_node_put(memory);

	return nid;
}

/*
 * Find the node associated with a hot added memory section. Section
 * corresponds to a SPARSEMEM section, not a MEMBLOCK. It is assumed that
 * sections are fully contained within a single MEMBLOCK.
 */
int hot_add_scn_to_nid(unsigned long scn_addr)
{
	struct device_node *memory = NULL;
	int nid;

	if (!numa_enabled)
		return first_online_node;

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		nid = hot_add_drconf_scn_to_nid(scn_addr);
		of_node_put(memory);
	} else {
		nid = hot_add_node_scn_to_nid(scn_addr);
	}

	if (nid < 0 || !node_possible(nid))
		nid = first_online_node;

	return nid;
}
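
/*
 * Highest address that dynamically reconfigurable memory may reach:
 * prefer ibm,lrdr-capacity, else the end of the drmem LMB map.
 */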
static u64 hot_add_drconf_memory_max(void)
{
	struct device_node *memory = NULL;
	struct device_node *dn = NULL;
	const __be64 *lrdr = NULL;

	dn = of_find_node_by_path("/rtas");
	if (dn) {
		lrdr = of_get_property(dn, "ibm,lrdr-capacity", NULL);
		of_node_put(dn);
		if (lrdr)
			return be64_to_cpup(lrdr);
	}

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		of_node_put(memory);
		return drmem_lmb_memory_max();
	}
	return 0;
}

/*
 * memory_hotplug_max - return max address of memory that may be added
 *
 * This is currently only used on systems that support drconfig memory
 * hotplug.
 */
u64 memory_hotplug_max(void)
{
	return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM());
}
#endif /* CONFIG_MEMORY_HOTPLUG */

/* Virtual Processor Home Node (VPHN) support */
#ifdef CONFIG_PPC_SPLPAR
static int topology_inited;

/*
 * Retrieve the new associativity information for a virtual processor's
 * home node.
 */
static long vphn_get_associativity(unsigned long cpu,
				   __be32 *associativity)
{
	long rc;

	rc = hcall_vphn(get_hard_smp_processor_id(cpu),
			VPHN_FLAG_VCPU, associativity);

	switch (rc) {
	case H_SUCCESS:
		pr_debug("VPHN hcall succeeded. Reset polling...\n");
		goto out;

	case H_FUNCTION:
		pr_err_ratelimited("VPHN unsupported. Disabling polling...\n");
		break;
	case H_HARDWARE:
		pr_err_ratelimited("hcall_vphn() experienced a hardware fault "
				"preventing VPHN. Disabling polling...\n");
		break;
	case H_PARAMETER:
		pr_err_ratelimited("hcall_vphn() was passed an invalid parameter. "
				"Disabling polling...\n");
		break;
	default:
		pr_err_ratelimited("hcall_vphn() returned %ld. Disabling polling...\n",
				rc);
		break;
	}
out:
	return rc;
}

void find_and_update_cpu_nid(int cpu)
{
	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
	int new_nid;

	/* Use associativity from first thread for all siblings */
	if (vphn_get_associativity(cpu, associativity))
		return;

	/* Do not have previous associativity, so find it now. */
	new_nid = associativity_to_nid(associativity);

	if (new_nid < 0 || !node_possible(new_nid))
		new_nid = first_online_node;
	else
		// Associate node <-> cpu, so cpu_up() calls
		// try_online_node() on the right node.
		set_cpu_numa_node(cpu, new_nid);

	pr_debug("%s:%d cpu %d nid %d\n", __func__, __LINE__, cpu, new_nid);
}
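
/*
 * Return the coregroup id for a cpu; fall back to the core id when
 * coregroups are disabled or VPHN associativity is unavailable.
 */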
int cpu_to_coregroup_id(int cpu)
{
	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
	int index;

	/* valid cpu ids are [0, nr_cpu_ids) */
	if (cpu < 0 || cpu >= nr_cpu_ids)
		return -1;

	if (!coregroup_enabled)
		goto out;

	if (!firmware_has_feature(FW_FEATURE_VPHN))
		goto out;

	if (vphn_get_associativity(cpu, associativity))
		goto out;

	index = of_read_number(associativity, 1);
	if (index > primary_domain_index + 1)
		return of_read_number(&associativity[index - 1], 1);

out:
	return cpu_to_core_id(cpu);
}

static int topology_update_init(void)
{
	topology_inited = 1;
	return 0;
}
device_initcall(topology_update_init);
#endif /* CONFIG_PPC_SPLPAR */