// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Firmware Assisted dump: A robust mechanism to get reliable kernel crash
 * dump with assistance from firmware. This approach does not use kexec;
 * instead, firmware assists in booting the kdump kernel while preserving
 * memory contents. Most of the code implementation has been adapted from
 * the phyp assisted dump implementation written by Linas Vepstas and
 * Manish Ahuja.
 *
 * Copyright 2011 IBM Corporation
 * Author: Mahesh Salgaonkar <[email protected]>
 */
#undef DEBUG
#define pr_fmt(fmt) "fadump: " fmt

#include <linux/string.h>
#include <linux/memblock.h>
#include <linux/delay.h>
#include <linux/seq_file.h>
#include <linux/crash_dump.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/cma.h>
#include <linux/hugetlb.h>
#include <linux/debugfs.h>
#include <linux/of.h>
#include <linux/of_fdt.h>

#include <asm/page.h>
#include <asm/fadump.h>
#include <asm/fadump-internal.h>
#include <asm/setup.h>
#include <asm/interrupt.h>

/*
 * The CPU that acquired the lock to trigger the fadump crash should
 * wait for other CPUs to enter.
 *
 * The timeout is in milliseconds.
 */
#define CRASH_TIMEOUT		500

static struct fw_dump fw_dump;

static void __init fadump_reserve_crash_area(u64 base);

#ifndef CONFIG_PRESERVE_FA_DUMP

static struct kobject *fadump_kobj;

static atomic_t cpus_in_fadump;
static DEFINE_MUTEX(fadump_mutex);

static struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0, false };

#define RESERVED_RNGS_SZ	16384 /* 16K - 128 entries */
#define RESERVED_RNGS_CNT	(RESERVED_RNGS_SZ / \
				 sizeof(struct fadump_memory_range))
static struct fadump_memory_range rngs[RESERVED_RNGS_CNT];
static struct fadump_mrange_info
reserved_mrange_info = { "reserved", rngs, RESERVED_RNGS_SZ, 0, RESERVED_RNGS_CNT, true };

static void __init early_init_dt_scan_reserved_ranges(unsigned long node);

#ifdef CONFIG_CMA
static struct cma *fadump_cma;

/*
 * fadump_cma_init() - Initialize CMA area from a fadump reserved memory
 *
 * This function initializes a CMA area from fadump reserved memory.
 * The total size of fadump reserved memory covers boot memory size
 * + cpu data size + hpte size and metadata.
 * Initialize only the area equivalent to boot memory size for CMA use.
 * The remaining portion of fadump reserved memory is not given to CMA,
 * and pages for that portion stay reserved. Boot memory size is aligned
 * per CMA requirement to satisfy the cma_init_reserved_mem() call.
 * Even if CMA init fails, the memory reservation is still in place,
 * so fadump can continue regardless.
 */
static int __init fadump_cma_init(void)
{
	unsigned long long base, size;
	int rc;

	if (!fw_dump.fadump_enabled)
		return 0;

	/*
	 * Do not use CMA if user has provided fadump=nocma kernel parameter.
	 * Return 1 to continue with fadump old behaviour.
	 */
	if (fw_dump.nocma)
		return 1;

	base = fw_dump.reserve_dump_area_start;
	size = fw_dump.boot_memory_size;

	if (!size)
		return 0;

	rc = cma_init_reserved_mem(base, size, 0, "fadump_cma", &fadump_cma);
	if (rc) {
		pr_err("Failed to init cma area for firmware-assisted dump, %d\n", rc);
		/*
		 * Though the CMA init has failed we still have memory
		 * reservation with us. The reserved memory will be
		 * blocked from production system usage. Hence return 1,
		 * so that we can continue with fadump.
		 */
		return 1;
	}

	/*
	 * If CMA activation fails, keep the pages reserved, instead of
	 * exposing them to buddy allocator. Same as 'fadump=nocma' case.
	 */
	cma_reserve_pages_on_error(fadump_cma);

	/* The CMA area for fadump is now successfully initialized. */
	pr_info("Initialized 0x%lx bytes cma area at %ldMB from 0x%lx "
		"bytes of memory reserved for firmware-assisted dump\n",
		cma_get_size(fadump_cma),
		(unsigned long)cma_get_base(fadump_cma) >> 20,
		fw_dump.reserve_dump_area_size);
	return 1;
}
#else
static int __init fadump_cma_init(void) { return 1; }
#endif /* CONFIG_CMA */

/* Scan the Firmware Assisted dump configuration details. */
int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname,
				      int depth, void *data)
{
	if (depth == 0) {
		early_init_dt_scan_reserved_ranges(node);
		return 0;
	}

	if (depth != 1)
		return 0;

	if (strcmp(uname, "rtas") == 0) {
		rtas_fadump_dt_scan(&fw_dump, node);
		return 1;
	}

	if (strcmp(uname, "ibm,opal") == 0) {
		opal_fadump_dt_scan(&fw_dump, node);
		return 1;
	}

	return 0;
}

/*
 * If fadump is registered, check if the memory provided
 * falls within boot memory area and reserved memory area.
 */
int is_fadump_memory_area(u64 addr, unsigned long size)
{
	u64 d_start, d_end;

	if (!fw_dump.dump_registered)
		return 0;

	if (!size)
		return 0;

	d_start = fw_dump.reserve_dump_area_start;
	d_end = d_start + fw_dump.reserve_dump_area_size;
	if (((addr + size) > d_start) && (addr <= d_end))
		return 1;

	return (addr <= fw_dump.boot_mem_top);
}

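/*
 * A fadump crash can be triggered only when the dump is registered with
 * firmware and the crash info header address is known.
 */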
int should_fadump_crash(void)
{
	if (!fw_dump.dump_registered || !fw_dump.fadumphdr_addr)
		return 0;
	return 1;
}

int is_fadump_active(void)
{
	return fw_dump.dump_active;
}

/*
 * Returns true if there are no holes in the memory area between d_start
 * and d_end, false otherwise.
 */
static bool is_fadump_mem_area_contiguous(u64 d_start, u64 d_end)
{
	phys_addr_t reg_start, reg_end;
	bool ret = false;
	u64 i, start, end;

	for_each_mem_range(i, &reg_start, &reg_end) {
		start = max_t(u64, d_start, reg_start);
		end = min_t(u64, d_end, reg_end);
		if (d_start < end) {
			/* Memory hole from d_start to start */
			if (start > d_start)
				break;

			if (end == d_end) {
				ret = true;
				break;
			}

			d_start = end + 1;
		}
	}

	return ret;
}

/*
 * Returns true if there are no holes in the boot memory area,
 * false otherwise.
 */
bool is_fadump_boot_mem_contiguous(void)
{
	unsigned long d_start, d_end;
	bool ret = false;
	int i;

	for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) {
		d_start = fw_dump.boot_mem_addr[i];
		d_end = d_start + fw_dump.boot_mem_sz[i];

		ret = is_fadump_mem_area_contiguous(d_start, d_end);
		if (!ret)
			break;
	}

	return ret;
}

/*
 * Returns true if there are no holes in the reserved memory area,
 * false otherwise.
 */
bool is_fadump_reserved_mem_contiguous(void)
{
	u64 d_start, d_end;

	d_start = fw_dump.reserve_dump_area_start;
	d_end = d_start + fw_dump.reserve_dump_area_size;
	return is_fadump_mem_area_contiguous(d_start, d_end);
}

/* Print firmware assisted dump configurations for debugging purpose. */
static void __init fadump_show_config(void)
{
	int i;

	pr_debug("Support for firmware-assisted dump (fadump): %s\n",
		 (fw_dump.fadump_supported ? "present" : "no support"));

	if (!fw_dump.fadump_supported)
		return;

	pr_debug("Fadump enabled    : %s\n",
		 (fw_dump.fadump_enabled ? "yes" : "no"));
	pr_debug("Dump Active       : %s\n",
		 (fw_dump.dump_active ? "yes" : "no"));
	pr_debug("Dump section sizes:\n");
	pr_debug("    CPU state data size: %lx\n", fw_dump.cpu_state_data_size);
	pr_debug("    HPTE region size   : %lx\n", fw_dump.hpte_region_size);
	pr_debug("    Boot memory size   : %lx\n", fw_dump.boot_memory_size);
	pr_debug("    Boot memory top    : %llx\n", fw_dump.boot_mem_top);
	pr_debug("Boot memory regions cnt: %llx\n", fw_dump.boot_mem_regs_cnt);
	for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) {
		pr_debug("[%03d] base = %llx, size = %llx\n", i,
			 fw_dump.boot_mem_addr[i], fw_dump.boot_mem_sz[i]);
	}
}

/**
 * fadump_calculate_reserve_size(): reserve variable boot area 5% of System RAM
 *
 * Function to find the largest memory size we need to reserve during early
 * boot process. This will be the size of the memory that is required for a
 * kernel to boot successfully.
 *
 * This function has been taken from phyp-assisted dump feature implementation.
 *
 * returns larger of 256MB or 5% rounded down to multiples of 256MB.
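 *
 * For example, a system with 64GB of RAM yields 64GB / 20 = 3.2GB, which is
 * rounded down to 3GB (the nearest 256MB multiple) before the memory_limit
 * and platform minimum checks below are applied.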
 *
 * TODO: Come up with better approach to find out more accurate memory size
 * that is required for a kernel to boot successfully.
 */
static __init u64 fadump_calculate_reserve_size(void)
{
	u64 base, size, bootmem_min;
	int ret;

	if (fw_dump.reserve_bootvar)
		pr_warn("'fadump_reserve_mem=' parameter is deprecated in favor of 'crashkernel=' parameter.\n");

	/*
	 * Check if the size is specified through crashkernel= cmdline
	 * option. If yes, then use that but ignore base as fadump reserves
	 * memory at a predefined offset.
	 */
	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
				&size, &base);
	if (ret == 0 && size > 0) {
		unsigned long max_size;

		if (fw_dump.reserve_bootvar)
			pr_info("Using 'crashkernel=' parameter for memory reservation.\n");

		fw_dump.reserve_bootvar = (unsigned long)size;

		/*
		 * Adjust if the boot memory size specified is above
		 * the upper limit.
		 */
		max_size = memblock_phys_mem_size() / MAX_BOOT_MEM_RATIO;
		if (fw_dump.reserve_bootvar > max_size) {
			fw_dump.reserve_bootvar = max_size;
			pr_info("Adjusted boot memory size to %luMB\n",
				(fw_dump.reserve_bootvar >> 20));
		}

		return fw_dump.reserve_bootvar;
	} else if (fw_dump.reserve_bootvar) {
		/*
		 * 'fadump_reserve_mem=' is being used to reserve memory
		 * for firmware-assisted dump.
		 */
		return fw_dump.reserve_bootvar;
	}

	/* divide by 20 to get 5% of value */
	size = memblock_phys_mem_size() / 20;

	/* round it down to multiples of 256MB */
	size = size & ~0x0FFFFFFFUL;

	/* Truncate to memory_limit. We don't want to over-reserve the memory. */
	if (memory_limit && size > memory_limit)
		size = memory_limit;

	bootmem_min = fw_dump.ops->fadump_get_bootmem_min();
	return (size > bootmem_min ? size : bootmem_min);
}

/*
 * Calculate the total memory size required to be reserved for
 * firmware-assisted dump registration.
 */
static unsigned long __init get_fadump_area_size(void)
{
	unsigned long size = 0;

	size += fw_dump.cpu_state_data_size;
	size += fw_dump.hpte_region_size;
	/*
	 * Account for pagesize alignment of boot memory area destination
	 * address. This facilitates mmap reading of the first kernel's memory.
	 */
	size = PAGE_ALIGN(size);
	size += fw_dump.boot_memory_size;
	size += sizeof(struct fadump_crash_info_header);
	size += sizeof(struct elfhdr); /* ELF core header.*/
	size += sizeof(struct elf_phdr); /* place holder for cpu notes */
	/* Program headers for crash memory regions. */
	size += sizeof(struct elf_phdr) * (memblock_num_regions(memory) + 2);

	size = PAGE_ALIGN(size);

	/* This is to hold kernel metadata on platforms that support it */
	size += (fw_dump.ops->fadump_get_metadata_size ?
		 fw_dump.ops->fadump_get_metadata_size() : 0);
	return size;
}

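/*
 * Record one boot memory region in fw_dump. Returns 0 when the
 * FADUMP_MAX_MEM_REGS limit is hit, 1 on success.
 */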
static int __init add_boot_mem_region(unsigned long rstart,
				      unsigned long rsize)
{
	int i = fw_dump.boot_mem_regs_cnt++;

	if (fw_dump.boot_mem_regs_cnt > FADUMP_MAX_MEM_REGS) {
		fw_dump.boot_mem_regs_cnt = FADUMP_MAX_MEM_REGS;
		return 0;
	}

	pr_debug("Added boot memory range[%d] [%#016lx-%#016lx)\n",
		 i, rstart, (rstart + rsize));
	fw_dump.boot_mem_addr[i] = rstart;
	fw_dump.boot_mem_sz[i] = rsize;
	return 1;
}

/*
 * Firmware usually has a hard limit on the data it can copy per region.
 * Honour that by splitting a memory range into multiple regions.
 */
static int __init add_boot_mem_regions(unsigned long mstart,
				       unsigned long msize)
{
	unsigned long rstart, rsize, max_size;
	int ret = 1;

	rstart = mstart;
	max_size = fw_dump.max_copy_size ? fw_dump.max_copy_size : msize;
	while (msize) {
		if (msize > max_size)
			rsize = max_size;
		else
			rsize = msize;

		ret = add_boot_mem_region(rstart, rsize);
		if (!ret)
			break;

		msize -= rsize;
		rstart += rsize;
	}

	return ret;
}

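/*
 * Walk memblock until boot_memory_size worth of memory is covered,
 * registering each chunk as a boot memory region. Memory holes are
 * accounted for so that boot_mem_top reflects the end address of the
 * covered range.
 */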
static int __init fadump_get_boot_mem_regions(void)
{
	unsigned long size, cur_size, hole_size, last_end;
	unsigned long mem_size = fw_dump.boot_memory_size;
	phys_addr_t reg_start, reg_end;
	int ret = 1;
	u64 i;

	fw_dump.boot_mem_regs_cnt = 0;

	last_end = 0;
	hole_size = 0;
	cur_size = 0;
	for_each_mem_range(i, &reg_start, &reg_end) {
		size = reg_end - reg_start;
		hole_size += (reg_start - last_end);

		if ((cur_size + size) >= mem_size) {
			size = (mem_size - cur_size);
			ret = add_boot_mem_regions(reg_start, size);
			break;
		}

		mem_size -= size;
		cur_size += size;
		ret = add_boot_mem_regions(reg_start, size);
		if (!ret)
			break;

		last_end = reg_end;
	}
	fw_dump.boot_mem_top = PAGE_ALIGN(fw_dump.boot_memory_size + hole_size);

	return ret;
}

/*
 * Returns true if the given range overlaps with any of the reserved
 * memory ranges starting at index *idx, and updates *idx to the index
 * of the overlapping range. Returns false otherwise.
 */
static bool __init overlaps_reserved_ranges(u64 base, u64 end, int *idx)
{
	bool ret = false;
	int i;

	for (i = *idx; i < reserved_mrange_info.mem_range_cnt; i++) {
		u64 rbase = reserved_mrange_info.mem_ranges[i].base;
		u64 rend = rbase + reserved_mrange_info.mem_ranges[i].size;

		if (end <= rbase)
			break;

		if ((end > rbase) && (base < rend)) {
			*idx = i;
			ret = true;
			break;
		}
	}

	return ret;
}

/*
 * Locate a suitable memory area to reserve memory for FADump. While at it,
 * lookup reserved-ranges & avoid overlap with them, as they are used by F/W.
 */
static u64 __init fadump_locate_reserve_mem(u64 base, u64 size)
{
	struct fadump_memory_range *mrngs;
	phys_addr_t mstart, mend;
	int idx = 0;
	u64 i, ret = 0;

	mrngs = reserved_mrange_info.mem_ranges;
	for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE,
				&mstart, &mend, NULL) {
		pr_debug("%llu) mstart: %llx, mend: %llx, base: %llx\n",
			 i, mstart, mend, base);
		if (mstart > base)
			base = PAGE_ALIGN(mstart);

		while ((mend > base) && ((mend - base) >= size)) {
			if (!overlaps_reserved_ranges(base, base + size, &idx)) {
				ret = base;
				goto out;
			}

			base = mrngs[idx].base + mrngs[idx].size;
			base = PAGE_ALIGN(base);
		}
	}

out:
	return ret;
}

int __init fadump_reserve_mem(void)
{
	u64 base, size, mem_boundary, bootmem_min;
	int ret = 1;

	if (!fw_dump.fadump_enabled)
		return 0;

	if (!fw_dump.fadump_supported) {
		pr_info("Firmware-Assisted Dump is not supported on this hardware\n");
		goto error_out;
	}

	/*
	 * Initialize boot memory size.
	 * If dump is active then we have already calculated the size during
	 * the first kernel's boot.
	 */
	if (!fw_dump.dump_active) {
		fw_dump.boot_memory_size =
			PAGE_ALIGN(fadump_calculate_reserve_size());
#ifdef CONFIG_CMA
		if (!fw_dump.nocma) {
			fw_dump.boot_memory_size =
				ALIGN(fw_dump.boot_memory_size,
				      CMA_MIN_ALIGNMENT_BYTES);
		}
#endif

		bootmem_min = fw_dump.ops->fadump_get_bootmem_min();
		if (fw_dump.boot_memory_size < bootmem_min) {
			pr_err("Can't enable fadump with boot memory size (0x%lx) less than 0x%llx\n",
			       fw_dump.boot_memory_size, bootmem_min);
			goto error_out;
		}

		if (!fadump_get_boot_mem_regions()) {
			pr_err("Too many holes in boot memory area to enable fadump\n");
			goto error_out;
		}
	}

	/*
	 * Calculate the memory boundary.
	 * If memory_limit is less than actual memory boundary then reserve
	 * the memory for fadump beyond the memory_limit and adjust the
	 * memory_limit accordingly, so that the running kernel can run with
	 * specified memory_limit.
	 */
	if (memory_limit && memory_limit < memblock_end_of_DRAM()) {
		size = get_fadump_area_size();
		if ((memory_limit + size) < memblock_end_of_DRAM())
			memory_limit += size;
		else
			memory_limit = memblock_end_of_DRAM();
		printk(KERN_INFO "Adjusted memory_limit for firmware-assisted"
				" dump, now %#016llx\n", memory_limit);
	}
	if (memory_limit)
		mem_boundary = memory_limit;
	else
		mem_boundary = memblock_end_of_DRAM();

	base = fw_dump.boot_mem_top;
	size = get_fadump_area_size();
	fw_dump.reserve_dump_area_size = size;
	if (fw_dump.dump_active) {
		pr_info("Firmware-assisted dump is active.\n");

#ifdef CONFIG_HUGETLB_PAGE
		/*
		 * FADump capture kernel doesn't care much about hugepages.
		 * In fact, handling hugepages in capture kernel is asking for
		 * trouble. So, disable HugeTLB support when fadump is active.
		 */
		hugetlb_disabled = true;
#endif
		/*
		 * If last boot has crashed then reserve all the memory
		 * above boot memory size so that we don't touch it until
		 * dump is written to disk by userspace tool. This memory
		 * can be released for general use by invalidating fadump.
		 */
		fadump_reserve_crash_area(base);

		pr_debug("fadumphdr_addr = %#016lx\n", fw_dump.fadumphdr_addr);
		pr_debug("Reserve dump area start address: 0x%lx\n",
			 fw_dump.reserve_dump_area_start);
	} else {
		/*
		 * Reserve memory at an offset closer to bottom of the RAM to
		 * minimize the impact of memory hot-remove operation.
		 */
		base = fadump_locate_reserve_mem(base, size);

		if (!base || (base + size > mem_boundary)) {
			pr_err("Failed to find memory chunk for reservation!\n");
			goto error_out;
		}
		fw_dump.reserve_dump_area_start = base;

		/*
		 * Calculate the kernel metadata address and register it with
		 * f/w if the platform supports.
		 */
		if (fw_dump.ops->fadump_setup_metadata &&
		    (fw_dump.ops->fadump_setup_metadata(&fw_dump) < 0))
			goto error_out;

		if (memblock_reserve(base, size)) {
			pr_err("Failed to reserve memory!\n");
			goto error_out;
		}

		pr_info("Reserved %lldMB of memory at %#016llx (System RAM: %lldMB)\n",
			(size >> 20), base, (memblock_phys_mem_size() >> 20));
		ret = fadump_cma_init();
	}

	return ret;
error_out:
	fw_dump.fadump_enabled = 0;
	fw_dump.reserve_dump_area_size = 0;
	return 0;
}

/* Look for fadump= cmdline option. */
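/* Accepted values: fadump=on, fadump=off, fadump=nocma (on, without CMA). */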
static int __init early_fadump_param(char *p)
{
	if (!p)
		return 1;

	if (strncmp(p, "on", 2) == 0)
		fw_dump.fadump_enabled = 1;
	else if (strncmp(p, "off", 3) == 0)
		fw_dump.fadump_enabled = 0;
	else if (strncmp(p, "nocma", 5) == 0) {
		fw_dump.fadump_enabled = 1;
		fw_dump.nocma = 1;
	}

	return 0;
}
early_param("fadump", early_fadump_param);

/*
 * Look for fadump_reserve_mem= cmdline option.
 * TODO: Remove references to the 'fadump_reserve_mem=' parameter once
 * the 'crashkernel=' parameter is fully adopted.
 */
static int __init early_fadump_reserve_mem(char *p)
{
	if (p)
		fw_dump.reserve_bootvar = memparse(p, &p);
	return 0;
}
early_param("fadump_reserve_mem", early_fadump_reserve_mem);

void crash_fadump(struct pt_regs *regs, const char *str)
{
	unsigned int msecs;
	struct fadump_crash_info_header *fdh = NULL;
	int old_cpu, this_cpu;
	/* Do not include first CPU */
	unsigned int ncpus = num_online_cpus() - 1;

	if (!should_fadump_crash())
		return;

	/*
	 * old_cpu == -1 means this is the first CPU which has come here,
	 * go ahead and trigger fadump.
	 *
	 * old_cpu != -1 means some other CPU is already on its way
	 * to trigger fadump, just keep looping here.
	 */
	this_cpu = smp_processor_id();
	old_cpu = cmpxchg(&crashing_cpu, -1, this_cpu);

	if (old_cpu != -1) {
		atomic_inc(&cpus_in_fadump);

		/*
		 * We can't loop here indefinitely. Wait as long as fadump
		 * is in force. If we race with fadump un-registration this
		 * loop will break and then we go down to normal panic path
		 * and reboot. If fadump is in force the first crashing
		 * cpu will definitely trigger fadump.
		 */
		while (fw_dump.dump_registered)
			cpu_relax();
		return;
	}

	fdh = __va(fw_dump.fadumphdr_addr);
	fdh->crashing_cpu = crashing_cpu;
	crash_save_vmcoreinfo();

	if (regs)
		fdh->regs = *regs;
	else
		ppc_save_regs(&fdh->regs);

	fdh->cpu_mask = *cpu_online_mask;

	/*
	 * If we came in via system reset, wait a while for the secondary
	 * CPUs to enter.
	 */
	if (TRAP(&(fdh->regs)) == INTERRUPT_SYSTEM_RESET) {
		msecs = CRASH_TIMEOUT;
		while ((atomic_read(&cpus_in_fadump) < ncpus) && (--msecs > 0))
			mdelay(1);
	}

	fw_dump.ops->fadump_trigger(fdh, str);
}

u32 *__init fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs)
{
	struct elf_prstatus prstatus;

	memset(&prstatus, 0, sizeof(prstatus));
	/*
	 * FIXME: How do I get PID? Do I really need it?
	 * prstatus.pr_pid = ????
	 */
	elf_core_copy_regs(&prstatus.pr_reg, regs);
	buf = append_elf_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS,
			      &prstatus, sizeof(prstatus));
	return buf;
}

void __init fadump_update_elfcore_header(char *bufp)
{
	struct elf_phdr *phdr;

	bufp += sizeof(struct elfhdr);

	/* First note is a place holder for cpu notes info. */
	phdr = (struct elf_phdr *)bufp;

	if (phdr->p_type == PT_NOTE) {
		phdr->p_paddr	= __pa(fw_dump.cpu_notes_buf_vaddr);
		phdr->p_offset	= phdr->p_paddr;
		phdr->p_filesz	= fw_dump.cpu_notes_buf_size;
		phdr->p_memsz	= fw_dump.cpu_notes_buf_size;
	}
}

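/*
 * Allocate a zeroed buffer and mark each of its pages reserved so the
 * memory is treated as kernel-owned for the buffer's lifetime.
 */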
static void *__init fadump_alloc_buffer(unsigned long size)
{
	unsigned long count, i;
	struct page *page;
	void *vaddr;

	vaddr = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
	if (!vaddr)
		return NULL;

	count = PAGE_ALIGN(size) / PAGE_SIZE;
	page = virt_to_page(vaddr);
	for (i = 0; i < count; i++)
		mark_page_reserved(page + i);
	return vaddr;
}

static void fadump_free_buffer(unsigned long vaddr, unsigned long size)
{
	free_reserved_area((void *)vaddr, (void *)(vaddr + size), -1, NULL);
}

s32 __init fadump_setup_cpu_notes_buf(u32 num_cpus)
{
	/* Allocate buffer to hold cpu crash notes. */
	fw_dump.cpu_notes_buf_size = num_cpus * sizeof(note_buf_t);
	fw_dump.cpu_notes_buf_size = PAGE_ALIGN(fw_dump.cpu_notes_buf_size);
	fw_dump.cpu_notes_buf_vaddr =
		(unsigned long)fadump_alloc_buffer(fw_dump.cpu_notes_buf_size);
	if (!fw_dump.cpu_notes_buf_vaddr) {
		pr_err("Failed to allocate %ld bytes for CPU notes buffer\n",
		       fw_dump.cpu_notes_buf_size);
		return -ENOMEM;
	}

	pr_debug("Allocated buffer for cpu notes of size %ld at 0x%lx\n",
		 fw_dump.cpu_notes_buf_size,
		 fw_dump.cpu_notes_buf_vaddr);
	return 0;
}

void fadump_free_cpu_notes_buf(void)
{
	if (!fw_dump.cpu_notes_buf_vaddr)
		return;

	fadump_free_buffer(fw_dump.cpu_notes_buf_vaddr,
			   fw_dump.cpu_notes_buf_size);
	fw_dump.cpu_notes_buf_vaddr = 0;
	fw_dump.cpu_notes_buf_size = 0;
}

static void fadump_free_mem_ranges(struct fadump_mrange_info *mrange_info)
{
	if (mrange_info->is_static) {
		mrange_info->mem_range_cnt = 0;
		return;
	}

	kfree(mrange_info->mem_ranges);
	memset((void *)((u64)mrange_info + RNG_NAME_SZ), 0,
	       (sizeof(struct fadump_mrange_info) - RNG_NAME_SZ));
}

/*
 * Allocate or reallocate mem_ranges array in incremental units
 * of PAGE_SIZE.
 */
static int fadump_alloc_mem_ranges(struct fadump_mrange_info *mrange_info)
{
	struct fadump_memory_range *new_array;
	u64 new_size;

	new_size = mrange_info->mem_ranges_sz + PAGE_SIZE;
	pr_debug("Allocating %llu bytes of memory for %s memory ranges\n",
		 new_size, mrange_info->name);

	new_array = krealloc(mrange_info->mem_ranges, new_size, GFP_KERNEL);
	if (new_array == NULL) {
		pr_err("Insufficient memory for setting up %s memory ranges\n",
		       mrange_info->name);
		fadump_free_mem_ranges(mrange_info);
		return -ENOMEM;
	}

	mrange_info->mem_ranges = new_array;
	mrange_info->mem_ranges_sz = new_size;
	mrange_info->max_mem_ranges = (new_size /
				       sizeof(struct fadump_memory_range));
	return 0;
}

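/*
 * Add the memory range [base, end) to mrange_info, folding it into the
 * previous range when adjacent, and growing the ranges array on demand.
 */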
static inline int fadump_add_mem_range(struct fadump_mrange_info *mrange_info,
				       u64 base, u64 end)
{
	struct fadump_memory_range *mem_ranges = mrange_info->mem_ranges;
	bool is_adjacent = false;
	u64 start, size;

	if (base == end)
		return 0;

	/*
	 * Fold adjacent memory ranges to bring down the memory ranges/
	 * PT_LOAD segments count.
	 */
	if (mrange_info->mem_range_cnt) {
		start = mem_ranges[mrange_info->mem_range_cnt - 1].base;
		size  = mem_ranges[mrange_info->mem_range_cnt - 1].size;

		/*
		 * Boot memory area needs separate PT_LOAD segment(s) as it
		 * is moved to a different location at the time of crash.
		 * So, fold only if the region is not boot memory area.
		 */
		if ((start + size) == base && start >= fw_dump.boot_mem_top)
			is_adjacent = true;
	}
	if (!is_adjacent) {
		/* resize the array on reaching the limit */
		if (mrange_info->mem_range_cnt == mrange_info->max_mem_ranges) {
			int ret;

			if (mrange_info->is_static) {
				pr_err("Reached array size limit for %s memory ranges\n",
				       mrange_info->name);
				return -ENOSPC;
			}

			ret = fadump_alloc_mem_ranges(mrange_info);
			if (ret)
				return ret;

			/* Update to the new resized array */
			mem_ranges = mrange_info->mem_ranges;
		}

		start = base;
		mem_ranges[mrange_info->mem_range_cnt].base = start;
		mrange_info->mem_range_cnt++;
	}

	mem_ranges[mrange_info->mem_range_cnt - 1].size = (end - start);
	pr_debug("%s_memory_range[%d] [%#016llx-%#016llx], %#llx bytes\n",
		 mrange_info->name, (mrange_info->mem_range_cnt - 1),
		 start, end - 1, (end - start));
	return 0;
}

static int fadump_exclude_reserved_area(u64 start, u64 end)
{
	u64 ra_start, ra_end;
	int ret = 0;

	ra_start = fw_dump.reserve_dump_area_start;
	ra_end = ra_start + fw_dump.reserve_dump_area_size;

	if ((ra_start < end) && (ra_end > start)) {
		if ((start < ra_start) && (end > ra_end)) {
			ret = fadump_add_mem_range(&crash_mrange_info,
						   start, ra_start);
			if (ret)
				return ret;

			ret = fadump_add_mem_range(&crash_mrange_info,
						   ra_end, end);
		} else if (start < ra_start) {
			ret = fadump_add_mem_range(&crash_mrange_info,
						   start, ra_start);
		} else if (ra_end < end) {
			ret = fadump_add_mem_range(&crash_mrange_info,
						   ra_end, end);
		}
	} else {
		ret = fadump_add_mem_range(&crash_mrange_info, start, end);
	}

	return ret;
}

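/*
 * Fill in a minimal ELF core header; PT_NOTE and PT_LOAD program headers
 * are appended later by fadump_create_elfcore_headers().
 */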
static int fadump_init_elfcore_header(char *bufp)
{
	struct elfhdr *elf;

	elf = (struct elfhdr *) bufp;
	bufp += sizeof(struct elfhdr);
	memcpy(elf->e_ident, ELFMAG, SELFMAG);
	elf->e_ident[EI_CLASS] = ELF_CLASS;
	elf->e_ident[EI_DATA] = ELF_DATA;
	elf->e_ident[EI_VERSION] = EV_CURRENT;
	elf->e_ident[EI_OSABI] = ELF_OSABI;
	memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
	elf->e_type = ET_CORE;
	elf->e_machine = ELF_ARCH;
	elf->e_version = EV_CURRENT;
	elf->e_entry = 0;
	elf->e_phoff = sizeof(struct elfhdr);
	elf->e_shoff = 0;

	if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2))
		elf->e_flags = 2;
	else if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V1))
		elf->e_flags = 1;
	else
		elf->e_flags = 0;

	elf->e_ehsize = sizeof(struct elfhdr);
	elf->e_phentsize = sizeof(struct elf_phdr);
	elf->e_phnum = 0;
	elf->e_shentsize = 0;
	elf->e_shnum = 0;
	elf->e_shstrndx = 0;

	return 0;
}

/*
 * Traverse through memblock structure and setup crash memory ranges. These
 * ranges will be used to create PT_LOAD program headers in elfcore header.
 */
static int fadump_setup_crash_memory_ranges(void)
{
	u64 i, start, end;
	int ret;

	pr_debug("Setup crash memory ranges.\n");
	crash_mrange_info.mem_range_cnt = 0;

	/*
	 * Boot memory region(s) registered with firmware are moved to
	 * different location at the time of crash. Create separate program
	 * header(s) for this memory chunk(s) with the correct offset.
	 */
	for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) {
		start = fw_dump.boot_mem_addr[i];
		end = start + fw_dump.boot_mem_sz[i];
		ret = fadump_add_mem_range(&crash_mrange_info, start, end);
		if (ret)
			return ret;
	}

	for_each_mem_range(i, &start, &end) {
		/*
		 * skip the memory chunk that is already added
		 * (0 through boot_memory_top).
		 */
		if (start < fw_dump.boot_mem_top) {
			if (end > fw_dump.boot_mem_top)
				start = fw_dump.boot_mem_top;
			else
				continue;
		}

		/* add this range excluding the reserved dump area. */
		ret = fadump_exclude_reserved_area(start, end);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * If the given physical address falls within the boot memory region then
 * return the relocated address that points to the dump region reserved
 * for saving initial boot memory contents.
 */
static inline unsigned long fadump_relocate(unsigned long paddr)
{
	unsigned long raddr, rstart, rend, rlast, hole_size;
	int i;

	hole_size = 0;
	rlast = 0;
	raddr = paddr;
	for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) {
		rstart = fw_dump.boot_mem_addr[i];
		rend = rstart + fw_dump.boot_mem_sz[i];
		hole_size += (rstart - rlast);

		if (paddr >= rstart && paddr < rend) {
			raddr += fw_dump.boot_mem_dest_addr - hole_size;
			break;
		}

		rlast = rend;
	}

	pr_debug("vmcoreinfo: paddr = 0x%lx, raddr = 0x%lx\n", paddr, raddr);
	return raddr;
}

static int fadump_create_elfcore_headers(char *bufp)
{
	unsigned long long raddr, offset;
	struct elf_phdr *phdr;
	struct elfhdr *elf;
	int i, j;

	fadump_init_elfcore_header(bufp);
	elf = (struct elfhdr *)bufp;
	bufp += sizeof(struct elfhdr);

	/*
	 * setup ELF PT_NOTE, place holder for cpu notes info. The notes info
	 * will be populated during second kernel boot after crash. Hence
	 * this PT_NOTE will always be the first elf note.
	 *
	 * NOTE: Any new ELF note addition should be placed after this note.
	 */
	phdr = (struct elf_phdr *)bufp;
	bufp += sizeof(struct elf_phdr);
	phdr->p_type = PT_NOTE;
	phdr->p_flags = 0;
	phdr->p_vaddr = 0;
	phdr->p_align = 0;
	phdr->p_offset = 0;
	phdr->p_paddr = 0;
	phdr->p_filesz = 0;
	phdr->p_memsz = 0;

	(elf->e_phnum)++;

	/* setup ELF PT_NOTE for vmcoreinfo */
	phdr = (struct elf_phdr *)bufp;
	bufp += sizeof(struct elf_phdr);
	phdr->p_type	= PT_NOTE;
	phdr->p_flags	= 0;
	phdr->p_vaddr	= 0;
	phdr->p_align	= 0;

	phdr->p_paddr	= fadump_relocate(paddr_vmcoreinfo_note());
	phdr->p_offset	= phdr->p_paddr;
	phdr->p_memsz	= phdr->p_filesz = VMCOREINFO_NOTE_SIZE;

	/* Increment number of program headers. */
	(elf->e_phnum)++;

	/* setup PT_LOAD sections. */
	j = 0;
	offset = 0;
	raddr = fw_dump.boot_mem_addr[0];
	for (i = 0; i < crash_mrange_info.mem_range_cnt; i++) {
		u64 mbase, msize;

		mbase = crash_mrange_info.mem_ranges[i].base;
		msize = crash_mrange_info.mem_ranges[i].size;
		if (!msize)
			continue;

		phdr = (struct elf_phdr *)bufp;
		bufp += sizeof(struct elf_phdr);
		phdr->p_type	= PT_LOAD;
		phdr->p_flags	= PF_R|PF_W|PF_X;
		phdr->p_offset	= mbase;

		if (mbase == raddr) {
			/*
			 * The entire real memory region will be moved by
			 * firmware to the specified destination_address.
			 * Hence set the correct offset.
			 */
			phdr->p_offset = fw_dump.boot_mem_dest_addr + offset;
			if (j < (fw_dump.boot_mem_regs_cnt - 1)) {
				offset += fw_dump.boot_mem_sz[j];
				raddr = fw_dump.boot_mem_addr[++j];
			}
		}

		phdr->p_paddr = mbase;
		phdr->p_vaddr = (unsigned long)__va(mbase);
		phdr->p_filesz = msize;
		phdr->p_memsz = msize;
		phdr->p_align = 0;

		/* Increment number of program headers. */
		(elf->e_phnum)++;
	}
	return 0;
}

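/*
 * Initialize the crash info header at 'addr' and return the address just
 * past it, which is where the ELF core header is placed.
 */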
static unsigned long init_fadump_header(unsigned long addr)
{
	struct fadump_crash_info_header *fdh;

	if (!addr)
		return 0;

	fdh = __va(addr);
	addr += sizeof(struct fadump_crash_info_header);

	memset(fdh, 0, sizeof(struct fadump_crash_info_header));
	fdh->magic_number = FADUMP_CRASH_INFO_MAGIC;
	fdh->elfcorehdr_addr = addr;
	/* We will set the crashing cpu id in crash_fadump() during crash. */
	fdh->crashing_cpu = FADUMP_CPU_UNKNOWN;
	/*
	 * When the LPAR is terminated by PHYP, ensure all possible CPUs'
	 * register data is processed while exporting the vmcore.
	 */
	fdh->cpu_mask = *cpu_possible_mask;

	return addr;
}

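/*
 * Set up crash memory ranges, the crash info header and the ELF core
 * headers, then register the dump with firmware.
 */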
static int register_fadump(void)
{
	unsigned long addr;
	void *vaddr;
	int ret;

	/*
	 * If no memory is reserved then we cannot register for
	 * firmware-assisted dump.
	 */
	if (!fw_dump.reserve_dump_area_size)
		return -ENODEV;

	ret = fadump_setup_crash_memory_ranges();
	if (ret)
		return ret;

	addr = fw_dump.fadumphdr_addr;
	/* Initialize fadump crash info header. */
	addr = init_fadump_header(addr);
	vaddr = __va(addr);

	pr_debug("Creating ELF core headers at %#016lx\n", addr);
	fadump_create_elfcore_headers(vaddr);

	/* register the future kernel dump with firmware. */
	pr_debug("Registering for firmware-assisted kernel dump...\n");
	return fw_dump.ops->fadump_register(&fw_dump);
}

void fadump_cleanup(void)
{
	if (!fw_dump.fadump_supported)
		return;

	/* Invalidate the registration only if dump is active. */
	if (fw_dump.dump_active) {
		pr_debug("Invalidating firmware-assisted dump registration\n");
		fw_dump.ops->fadump_invalidate(&fw_dump);
	} else if (fw_dump.dump_registered) {
		/* Un-register Firmware-assisted dump if it was registered. */
		fw_dump.ops->fadump_unregister(&fw_dump);
		fadump_free_mem_ranges(&crash_mrange_info);
	}

	if (fw_dump.ops->fadump_cleanup)
		fw_dump.ops->fadump_cleanup(&fw_dump);
}

static void fadump_free_reserved_memory(unsigned long start_pfn,
					unsigned long end_pfn)
{
	unsigned long pfn;
	unsigned long time_limit = jiffies + HZ;

	pr_info("freeing reserved memory (0x%llx - 0x%llx)\n",
		PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));

	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
		free_reserved_page(pfn_to_page(pfn));

		if (time_after(jiffies, time_limit)) {
			cond_resched();
			time_limit = jiffies + HZ;
		}
	}
}

/*
 * Skip memory holes and free memory that was actually reserved.
 */
static void fadump_release_reserved_area(u64 start, u64 end)
{
	unsigned long reg_spfn, reg_epfn;
	u64 tstart, tend, spfn, epfn;
	int i;

	spfn = PHYS_PFN(start);
	epfn = PHYS_PFN(end);

	for_each_mem_pfn_range(i, MAX_NUMNODES, &reg_spfn, &reg_epfn, NULL) {
		tstart = max_t(u64, spfn, reg_spfn);
		tend   = min_t(u64, epfn, reg_epfn);

		if (tstart < tend) {
			fadump_free_reserved_memory(tstart, tend);

			if (tend == epfn)
				break;

			spfn = tend;
		}
	}
}

/*
 * Sort the mem ranges in-place and merge adjacent ranges
 * to minimize the memory ranges count.
 */
static void sort_and_merge_mem_ranges(struct fadump_mrange_info *mrange_info)
{
	struct fadump_memory_range *mem_ranges;
	u64 base, size;
	int i, j, idx;

	if (!mrange_info->mem_range_cnt)
		return;

	/* Sort the memory ranges */
	mem_ranges = mrange_info->mem_ranges;
	for (i = 0; i < mrange_info->mem_range_cnt; i++) {
		idx = i;
		for (j = (i + 1); j < mrange_info->mem_range_cnt; j++) {
			if (mem_ranges[idx].base > mem_ranges[j].base)
				idx = j;
		}
		if (idx != i)
			swap(mem_ranges[idx], mem_ranges[i]);
	}

	/* Merge adjacent reserved ranges */
	idx = 0;
	for (i = 1; i < mrange_info->mem_range_cnt; i++) {
		base = mem_ranges[i-1].base;
		size = mem_ranges[i-1].size;
		if (mem_ranges[i].base == (base + size))
			mem_ranges[idx].size += mem_ranges[i].size;
		else {
			idx++;
			if (i == idx)
				continue;

			mem_ranges[idx] = mem_ranges[i];
		}
	}
	mrange_info->mem_range_cnt = idx + 1;
}

/*
 * Scan reserved-ranges to consider them while reserving/releasing
 * memory for FADump.
 */
static void __init early_init_dt_scan_reserved_ranges(unsigned long node)
{
	const __be32 *prop;
	int len, ret = -1;
	unsigned long i;

	/* reserved-ranges already scanned */
	if (reserved_mrange_info.mem_range_cnt != 0)
		return;

	prop = of_get_flat_dt_prop(node, "reserved-ranges", &len);
	if (!prop)
		return;

	/*
	 * Each reserved range is an (address,size) pair, 2 cells each,
	 * totalling 4 cells per range.
	 */
	for (i = 0; i < len / (sizeof(*prop) * 4); i++) {
		u64 base, size;

		base = of_read_number(prop + (i * 4) + 0, 2);
		size = of_read_number(prop + (i * 4) + 2, 2);

		if (size) {
			ret = fadump_add_mem_range(&reserved_mrange_info,
						   base, base + size);
			if (ret < 0) {
				pr_warn("some reserved ranges are ignored!\n");
				break;
			}
		}
	}

	/* Compact reserved ranges */
	sort_and_merge_mem_ranges(&reserved_mrange_info);
}

/*
 * Release the memory that was reserved during early boot to preserve the
 * crashed kernel's memory contents except reserved dump area (permanent
 * reservation) and reserved ranges used by F/W. The released memory will
 * be available for general use.
 */
static void fadump_release_memory(u64 begin, u64 end)
{
	u64 ra_start, ra_end, tstart;
	int i, ret;

	ra_start = fw_dump.reserve_dump_area_start;
	ra_end = ra_start + fw_dump.reserve_dump_area_size;

	/*
	 * If the reserved ranges array limit is hit, overwrite the last
	 * reserved memory range with the reserved dump area to ensure it is
	 * excluded from the memory being released (reused for next FADump
	 * registration).
	 */
	if (reserved_mrange_info.mem_range_cnt ==
	    reserved_mrange_info.max_mem_ranges)
		reserved_mrange_info.mem_range_cnt--;

	ret = fadump_add_mem_range(&reserved_mrange_info, ra_start, ra_end);
	if (ret != 0)
		return;

	/* Get the reserved ranges list in order first. */
	sort_and_merge_mem_ranges(&reserved_mrange_info);

	/* Exclude reserved ranges and release remaining memory */
	tstart = begin;
	for (i = 0; i < reserved_mrange_info.mem_range_cnt; i++) {
		ra_start = reserved_mrange_info.mem_ranges[i].base;
		ra_end = ra_start + reserved_mrange_info.mem_ranges[i].size;

		if (tstart >= ra_end)
			continue;

		if (tstart < ra_start)
			fadump_release_reserved_area(tstart, ra_start);
		tstart = ra_end;
	}

	if (tstart < end)
		fadump_release_reserved_area(tstart, end);
}

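/*
 * Invalidate the active dump, release its memory for general use and
 * re-initialize the dump structures so FADump can be registered again.
 */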
static void fadump_invalidate_release_mem(void)
{
	mutex_lock(&fadump_mutex);
	if (!fw_dump.dump_active) {
		mutex_unlock(&fadump_mutex);
		return;
	}

	fadump_cleanup();
	mutex_unlock(&fadump_mutex);

	fadump_release_memory(fw_dump.boot_mem_top, memblock_end_of_DRAM());
	fadump_free_cpu_notes_buf();

	/*
	 * Setup kernel metadata and initialize the kernel dump
	 * memory structure for FADump re-registration.
	 */
	if (fw_dump.ops->fadump_setup_metadata &&
	    (fw_dump.ops->fadump_setup_metadata(&fw_dump) < 0))
		pr_warn("Failed to setup kernel metadata!\n");
	fw_dump.ops->fadump_init_mem_struct(&fw_dump);
}

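/*
 * Writing 1 to /sys/kernel/fadump/release_mem (after the vmcore has been
 * saved) invalidates the dump and frees the reserved memory, e.g.:
 *	echo 1 > /sys/kernel/fadump/release_mem
 */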
static ssize_t release_mem_store(struct kobject *kobj,
				 struct kobj_attribute *attr,
				 const char *buf, size_t count)
{
	int input = -1;

	if (!fw_dump.dump_active)
		return -EPERM;

	if (kstrtoint(buf, 0, &input))
		return -EINVAL;

	if (input == 1) {
		/*
		 * Take away the '/proc/vmcore'. We are releasing the dump
		 * memory, hence it will not be valid anymore.
		 */
#ifdef CONFIG_PROC_VMCORE
		vmcore_cleanup();
#endif
		fadump_invalidate_release_mem();
	} else {
		return -EINVAL;
	}

	return count;
}

/* Release the reserved memory and disable the FADump */
static void __init unregister_fadump(void)
{
	fadump_cleanup();
	fadump_release_memory(fw_dump.reserve_dump_area_start,
			      fw_dump.reserve_dump_area_size);
	fw_dump.fadump_enabled = 0;
	kobject_put(fadump_kobj);
}

static ssize_t enabled_show(struct kobject *kobj,
			    struct kobj_attribute *attr,
			    char *buf)
{
	return sprintf(buf, "%d\n", fw_dump.fadump_enabled);
}

static ssize_t mem_reserved_show(struct kobject *kobj,
				 struct kobj_attribute *attr,
				 char *buf)
{
	return sprintf(buf, "%ld\n", fw_dump.reserve_dump_area_size);
}

static ssize_t registered_show(struct kobject *kobj,
			       struct kobj_attribute *attr,
			       char *buf)
{
	return sprintf(buf, "%d\n", fw_dump.dump_registered);
}

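/*
 * Writing 0 or 1 to /sys/kernel/fadump/registered un-registers or
 * (re-)registers the dump with firmware, e.g.:
 *	echo 1 > /sys/kernel/fadump/registered
 */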
static ssize_t registered_store(struct kobject *kobj,
				struct kobj_attribute *attr,
				const char *buf, size_t count)
{
	int ret = 0;
	int input = -1;

	if (!fw_dump.fadump_enabled || fw_dump.dump_active)
		return -EPERM;

	if (kstrtoint(buf, 0, &input))
		return -EINVAL;

	mutex_lock(&fadump_mutex);

	switch (input) {
	case 0:
		if (fw_dump.dump_registered == 0)
			goto unlock_out;

		/* Un-register Firmware-assisted dump */
		pr_debug("Un-register firmware-assisted dump\n");
		fw_dump.ops->fadump_unregister(&fw_dump);
		break;
	case 1:
		if (fw_dump.dump_registered == 1) {
			/* Un-register Firmware-assisted dump */
			fw_dump.ops->fadump_unregister(&fw_dump);
		}
		/* Register Firmware-assisted dump */
		ret = register_fadump();
		break;
	default:
		ret = -EINVAL;
		break;
	}

unlock_out:
	mutex_unlock(&fadump_mutex);
	return ret < 0 ? ret : count;
}

static int fadump_region_show(struct seq_file *m, void *private)
{
	if (!fw_dump.fadump_enabled)
		return 0;

	mutex_lock(&fadump_mutex);
	fw_dump.ops->fadump_region_show(&fw_dump, m);
	mutex_unlock(&fadump_mutex);
	return 0;
}

static struct kobj_attribute release_attr = __ATTR_WO(release_mem);
static struct kobj_attribute enable_attr = __ATTR_RO(enabled);
static struct kobj_attribute register_attr = __ATTR_RW(registered);
static struct kobj_attribute mem_reserved_attr = __ATTR_RO(mem_reserved);

static struct attribute *fadump_attrs[] = {
	&enable_attr.attr,
	&register_attr.attr,
	&mem_reserved_attr.attr,
	NULL,
};

ATTRIBUTE_GROUPS(fadump);

DEFINE_SHOW_ATTRIBUTE(fadump_region);

static void __init fadump_init_files(void)
{
	int rc = 0;

	fadump_kobj = kobject_create_and_add("fadump", kernel_kobj);
	if (!fadump_kobj) {
		pr_err("failed to create fadump kobject\n");
		return;
	}

	debugfs_create_file("fadump_region", 0444, arch_debugfs_dir, NULL,
			    &fadump_region_fops);

	if (fw_dump.dump_active) {
		rc = sysfs_create_file(fadump_kobj, &release_attr.attr);
		if (rc)
			pr_err("unable to create release_mem sysfs file (%d)\n",
			       rc);
	}

	rc = sysfs_create_groups(fadump_kobj, fadump_groups);
	if (rc) {
		pr_err("sysfs group creation failed (%d), unregistering FADump\n",
		       rc);
		unregister_fadump();
		return;
	}

	/*
	 * The FADump sysfs files were moved from kernel_kobj to fadump_kobj;
	 * create symlinks at the old location to maintain backward
	 * compatibility.
	 *
	 * - fadump_enabled -> fadump/enabled
	 * - fadump_registered -> fadump/registered
	 * - fadump_release_mem -> fadump/release_mem
	 */
	rc = compat_only_sysfs_link_entry_to_kobj(kernel_kobj, fadump_kobj,
						  "enabled", "fadump_enabled");
	if (rc) {
		pr_err("unable to create fadump_enabled symlink (%d)\n", rc);
		return;
	}

	rc = compat_only_sysfs_link_entry_to_kobj(kernel_kobj, fadump_kobj,
						  "registered",
						  "fadump_registered");
	if (rc) {
		pr_err("unable to create fadump_registered symlink (%d)\n", rc);
		sysfs_remove_link(kernel_kobj, "fadump_enabled");
		return;
	}

	if (fw_dump.dump_active) {
		rc = compat_only_sysfs_link_entry_to_kobj(kernel_kobj,
							  fadump_kobj,
							  "release_mem",
							  "fadump_release_mem");
		if (rc)
			pr_err("unable to create fadump_release_mem symlink (%d)\n",
			       rc);
	}
}

/*
 * Prepare for firmware-assisted dump.
 */
int __init setup_fadump(void)
{
	if (!fw_dump.fadump_supported)
		return 0;

	fadump_init_files();
	fadump_show_config();

	if (!fw_dump.fadump_enabled)
		return 1;

	/*
	 * If dump data is available then see if it is valid and prepare for
	 * saving it to the disk.
	 */
	if (fw_dump.dump_active) {
		/*
		 * if dump process fails then invalidate the registration
		 * and release memory before proceeding for re-registration.
		 */
		if (fw_dump.ops->fadump_process(&fw_dump) < 0)
			fadump_invalidate_release_mem();
	}
	/* Initialize the kernel dump memory structure and register with f/w */
	else if (fw_dump.reserve_dump_area_size) {
		fw_dump.ops->fadump_init_mem_struct(&fw_dump);
		register_fadump();
	}

	/*
	 * In case of panic, fadump is triggered via ppc_panic_event()
	 * panic notifier. Setting crash_kexec_post_notifiers to 'true'
	 * lets panic() function take crash friendly path before panic
	 * notifiers are invoked.
	 */
	crash_kexec_post_notifiers = true;

	return 1;
}
/*
 * Use subsys_initcall_sync() here because there is a dependency on
 * crash_save_vmcoreinfo_init(), which must run first so that vmcoreinfo
 * initialization is done before registering with f/w.
 */
subsys_initcall_sync(setup_fadump);

#else /* CONFIG_PRESERVE_FA_DUMP */
/* Scan the Firmware Assisted dump configuration details. */
int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname,
				      int depth, void *data)
{
	if ((depth != 1) || (strcmp(uname, "ibm,opal") != 0))
		return 0;

	opal_fadump_dt_scan(&fw_dump, node);
	return 1;
}

/*
 * When dump is active but PRESERVE_FA_DUMP is enabled in the kernel,
 * preserve crash data. The subsequent memory preserving kernel boot
 * is likely to process this crash data.
 */
int __init fadump_reserve_mem(void)
{
	if (fw_dump.dump_active) {
		/*
		 * If last boot has crashed then reserve all the memory
		 * above boot memory to preserve crash data.
		 */
		pr_info("Preserving crash data for processing in next boot.\n");
		fadump_reserve_crash_area(fw_dump.boot_mem_top);
	} else {
		pr_debug("FADump-aware kernel..\n");
	}

	return 1;
}
#endif /* CONFIG_PRESERVE_FA_DUMP */

/* Preserve everything above the base address */
static void __init fadump_reserve_crash_area(u64 base)
{
	u64 i, mstart, mend, msize;

	for_each_mem_range(i, &mstart, &mend) {
		msize = mend - mstart;

		if ((mstart + msize) < base)
			continue;

		if (mstart < base) {
			msize -= (base - mstart);
			mstart = base;
		}

		pr_info("Reserving %lluMB of memory at %#016llx for preserving crash data\n",
			(msize >> 20), mstart);
		memblock_reserve(mstart, msize);
	}
}

unsigned long __init arch_reserved_kernel_pages(void)
{
	return memblock_reserved_size() / PAGE_SIZE;
}