dump_pagetables.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * Debug helper to dump the current kernel pagetables of the system
  4. * so that we can see what the various memory ranges are set to.
  5. *
  6. * (C) Copyright 2008 Intel Corporation
  7. *
  8. * Author: Arjan van de Ven <[email protected]>
  9. */
  10. #include <linux/debugfs.h>
  11. #include <linux/kasan.h>
  12. #include <linux/mm.h>
  13. #include <linux/init.h>
  14. #include <linux/sched.h>
  15. #include <linux/seq_file.h>
  16. #include <linux/highmem.h>
  17. #include <linux/pci.h>
  18. #include <linux/ptdump.h>
  19. #include <asm/e820/types.h>
  20. /*
  21. * The dumper groups pagetable entries of the same type into one, and for
  22. * that it needs to keep some state when walking, and flush this state
  23. * when a "break" in the continuity is found.
  24. */
  25. struct pg_state {
  26. struct ptdump_state ptdump;
  27. int level;
  28. pgprotval_t current_prot;
  29. pgprotval_t effective_prot;
  30. pgprotval_t prot_levels[5];
  31. unsigned long start_address;
  32. const struct addr_marker *marker;
  33. unsigned long lines;
  34. bool to_dmesg;
  35. bool check_wx;
  36. unsigned long wx_pages;
  37. struct seq_file *seq;
  38. };
  39. struct addr_marker {
  40. unsigned long start_address;
  41. const char *name;
  42. unsigned long max_lines;
  43. };
  44. /* Address space markers hints */
  45. #ifdef CONFIG_X86_64
  46. enum address_markers_idx {
  47. USER_SPACE_NR = 0,
  48. KERNEL_SPACE_NR,
  49. #ifdef CONFIG_MODIFY_LDT_SYSCALL
  50. LDT_NR,
  51. #endif
  52. LOW_KERNEL_NR,
  53. VMALLOC_START_NR,
  54. VMEMMAP_START_NR,
  55. #ifdef CONFIG_KASAN
  56. KASAN_SHADOW_START_NR,
  57. KASAN_SHADOW_END_NR,
  58. #endif
  59. CPU_ENTRY_AREA_NR,
  60. #ifdef CONFIG_X86_ESPFIX64
  61. ESPFIX_START_NR,
  62. #endif
  63. #ifdef CONFIG_EFI
  64. EFI_END_NR,
  65. #endif
  66. HIGH_KERNEL_NR,
  67. MODULES_VADDR_NR,
  68. MODULES_END_NR,
  69. FIXADDR_START_NR,
  70. END_OF_SPACE_NR,
  71. };
  72. static struct addr_marker address_markers[] = {
  73. [USER_SPACE_NR] = { 0, "User Space" },
  74. [KERNEL_SPACE_NR] = { (1UL << 63), "Kernel Space" },
  75. [LOW_KERNEL_NR] = { 0UL, "Low Kernel Mapping" },
  76. [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" },
  77. [VMEMMAP_START_NR] = { 0UL, "Vmemmap" },
  78. #ifdef CONFIG_KASAN
  79. /*
  80. * These fields get initialized with the (dynamic)
  81. * KASAN_SHADOW_{START,END} values in pt_dump_init().
  82. */
  83. [KASAN_SHADOW_START_NR] = { 0UL, "KASAN shadow" },
  84. [KASAN_SHADOW_END_NR] = { 0UL, "KASAN shadow end" },
  85. #endif
  86. #ifdef CONFIG_MODIFY_LDT_SYSCALL
  87. [LDT_NR] = { 0UL, "LDT remap" },
  88. #endif
  89. [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
  90. #ifdef CONFIG_X86_ESPFIX64
  91. [ESPFIX_START_NR] = { ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
  92. #endif
  93. #ifdef CONFIG_EFI
  94. [EFI_END_NR] = { EFI_VA_END, "EFI Runtime Services" },
  95. #endif
  96. [HIGH_KERNEL_NR] = { __START_KERNEL_map, "High Kernel Mapping" },
  97. [MODULES_VADDR_NR] = { MODULES_VADDR, "Modules" },
  98. [MODULES_END_NR] = { MODULES_END, "End Modules" },
  99. [FIXADDR_START_NR] = { FIXADDR_START, "Fixmap Area" },
  100. [END_OF_SPACE_NR] = { -1, NULL }
  101. };
  102. #define INIT_PGD ((pgd_t *) &init_top_pgt)
  103. #else /* CONFIG_X86_64 */
  104. enum address_markers_idx {
  105. USER_SPACE_NR = 0,
  106. KERNEL_SPACE_NR,
  107. VMALLOC_START_NR,
  108. VMALLOC_END_NR,
  109. #ifdef CONFIG_HIGHMEM
  110. PKMAP_BASE_NR,
  111. #endif
  112. #ifdef CONFIG_MODIFY_LDT_SYSCALL
  113. LDT_NR,
  114. #endif
  115. CPU_ENTRY_AREA_NR,
  116. FIXADDR_START_NR,
  117. END_OF_SPACE_NR,
  118. };
  119. static struct addr_marker address_markers[] = {
  120. [USER_SPACE_NR] = { 0, "User Space" },
  121. [KERNEL_SPACE_NR] = { PAGE_OFFSET, "Kernel Mapping" },
  122. [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" },
  123. [VMALLOC_END_NR] = { 0UL, "vmalloc() End" },
  124. #ifdef CONFIG_HIGHMEM
  125. [PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" },
  126. #endif
  127. #ifdef CONFIG_MODIFY_LDT_SYSCALL
  128. [LDT_NR] = { 0UL, "LDT remap" },
  129. #endif
  130. [CPU_ENTRY_AREA_NR] = { 0UL, "CPU entry area" },
  131. [FIXADDR_START_NR] = { 0UL, "Fixmap area" },
  132. [END_OF_SPACE_NR] = { -1, NULL }
  133. };
  134. #define INIT_PGD (swapper_pg_dir)
  135. #endif /* !CONFIG_X86_64 */
  136. /* Multipliers for offsets within the PTEs */
  137. #define PTE_LEVEL_MULT (PAGE_SIZE)
  138. #define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
  139. #define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
  140. #define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
  141. #define PGD_LEVEL_MULT (PTRS_PER_P4D * P4D_LEVEL_MULT)
  142. #define pt_dump_seq_printf(m, to_dmesg, fmt, args...) \
  143. ({ \
  144. if (to_dmesg) \
  145. printk(KERN_INFO fmt, ##args); \
  146. else \
  147. if (m) \
  148. seq_printf(m, fmt, ##args); \
  149. })
  150. #define pt_dump_cont_printf(m, to_dmesg, fmt, args...) \
  151. ({ \
  152. if (to_dmesg) \
  153. printk(KERN_CONT fmt, ##args); \
  154. else \
  155. if (m) \
  156. seq_printf(m, fmt, ##args); \
  157. })
  158. /*
  159. * Print a readable form of a pgprot_t to the seq_file
  160. */
  161. static void printk_prot(struct seq_file *m, pgprotval_t pr, int level, bool dmsg)
  162. {
  163. static const char * const level_name[] =
  164. { "pgd", "p4d", "pud", "pmd", "pte" };
  165. if (!(pr & _PAGE_PRESENT)) {
  166. /* Not present */
  167. pt_dump_cont_printf(m, dmsg, " ");
  168. } else {
  169. if (pr & _PAGE_USER)
  170. pt_dump_cont_printf(m, dmsg, "USR ");
  171. else
  172. pt_dump_cont_printf(m, dmsg, " ");
  173. if (pr & _PAGE_RW)
  174. pt_dump_cont_printf(m, dmsg, "RW ");
  175. else
  176. pt_dump_cont_printf(m, dmsg, "ro ");
  177. if (pr & _PAGE_PWT)
  178. pt_dump_cont_printf(m, dmsg, "PWT ");
  179. else
  180. pt_dump_cont_printf(m, dmsg, " ");
  181. if (pr & _PAGE_PCD)
  182. pt_dump_cont_printf(m, dmsg, "PCD ");
  183. else
  184. pt_dump_cont_printf(m, dmsg, " ");
  185. /* Bit 7 has a different meaning on level 3 vs 4 */
  186. if (level <= 3 && pr & _PAGE_PSE)
  187. pt_dump_cont_printf(m, dmsg, "PSE ");
  188. else
  189. pt_dump_cont_printf(m, dmsg, " ");
  190. if ((level == 4 && pr & _PAGE_PAT) ||
  191. ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE))
  192. pt_dump_cont_printf(m, dmsg, "PAT ");
  193. else
  194. pt_dump_cont_printf(m, dmsg, " ");
  195. if (pr & _PAGE_GLOBAL)
  196. pt_dump_cont_printf(m, dmsg, "GLB ");
  197. else
  198. pt_dump_cont_printf(m, dmsg, " ");
  199. if (pr & _PAGE_NX)
  200. pt_dump_cont_printf(m, dmsg, "NX ");
  201. else
  202. pt_dump_cont_printf(m, dmsg, "x ");
  203. }
  204. pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]);
  205. }
  206. static void note_wx(struct pg_state *st, unsigned long addr)
  207. {
  208. unsigned long npages;
  209. npages = (addr - st->start_address) / PAGE_SIZE;
  210. #ifdef CONFIG_PCI_BIOS
  211. /*
  212. * If PCI BIOS is enabled, the PCI BIOS area is forced to WX.
  213. * Inform about it, but avoid the warning.
  214. */
  215. if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN &&
  216. addr <= PAGE_OFFSET + BIOS_END) {
  217. pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages);
  218. return;
  219. }
  220. #endif
  221. /* Account the WX pages */
  222. st->wx_pages += npages;
  223. WARN_ONCE(__supported_pte_mask & _PAGE_NX,
  224. "x86/mm: Found insecure W+X mapping at address %pS\n",
  225. (void *)st->start_address);
  226. }
  227. static void effective_prot(struct ptdump_state *pt_st, int level, u64 val)
  228. {
  229. struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
  230. pgprotval_t prot = val & PTE_FLAGS_MASK;
  231. pgprotval_t effective;
  232. if (level > 0) {
  233. pgprotval_t higher_prot = st->prot_levels[level - 1];
  234. effective = (higher_prot & prot & (_PAGE_USER | _PAGE_RW)) |
  235. ((higher_prot | prot) & _PAGE_NX);
  236. } else {
  237. effective = prot;
  238. }
  239. st->prot_levels[level] = effective;
  240. }
  241. /*
  242. * This function gets called on a break in a continuous series
  243. * of PTE entries; the next one is different so we need to
  244. * print what we collected so far.
  245. */
  246. static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
  247. u64 val)
  248. {
  249. struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
  250. pgprotval_t new_prot, new_eff;
  251. pgprotval_t cur, eff;
  252. static const char units[] = "BKMGTPE";
  253. struct seq_file *m = st->seq;
  254. new_prot = val & PTE_FLAGS_MASK;
  255. if (!val)
  256. new_eff = 0;
  257. else
  258. new_eff = st->prot_levels[level];
  259. /*
  260. * If we have a "break" in the series, we need to flush the state that
  261. * we have now. "break" is either changing perms, levels or
  262. * address space marker.
  263. */
  264. cur = st->current_prot;
  265. eff = st->effective_prot;
  266. if (st->level == -1) {
  267. /* First entry */
  268. st->current_prot = new_prot;
  269. st->effective_prot = new_eff;
  270. st->level = level;
  271. st->marker = address_markers;
  272. st->lines = 0;
  273. pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
  274. st->marker->name);
  275. } else if (new_prot != cur || new_eff != eff || level != st->level ||
  276. addr >= st->marker[1].start_address) {
  277. const char *unit = units;
  278. unsigned long delta;
  279. int width = sizeof(unsigned long) * 2;
  280. if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX))
  281. note_wx(st, addr);
  282. /*
  283. * Now print the actual finished series
  284. */
  285. if (!st->marker->max_lines ||
  286. st->lines < st->marker->max_lines) {
  287. pt_dump_seq_printf(m, st->to_dmesg,
  288. "0x%0*lx-0x%0*lx ",
  289. width, st->start_address,
  290. width, addr);
  291. delta = addr - st->start_address;
  292. while (!(delta & 1023) && unit[1]) {
  293. delta >>= 10;
  294. unit++;
  295. }
  296. pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ",
  297. delta, *unit);
  298. printk_prot(m, st->current_prot, st->level,
  299. st->to_dmesg);
  300. }
  301. st->lines++;
  302. /*
  303. * We print markers for special areas of address space,
  304. * such as the start of vmalloc space etc.
  305. * This helps in the interpretation.
  306. */
  307. if (addr >= st->marker[1].start_address) {
  308. if (st->marker->max_lines &&
  309. st->lines > st->marker->max_lines) {
  310. unsigned long nskip =
  311. st->lines - st->marker->max_lines;
  312. pt_dump_seq_printf(m, st->to_dmesg,
  313. "... %lu entr%s skipped ... \n",
  314. nskip,
  315. nskip == 1 ? "y" : "ies");
  316. }
  317. st->marker++;
  318. st->lines = 0;
  319. pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
  320. st->marker->name);
  321. }
  322. st->start_address = addr;
  323. st->current_prot = new_prot;
  324. st->effective_prot = new_eff;
  325. st->level = level;
  326. }
  327. }
  328. static void ptdump_walk_pgd_level_core(struct seq_file *m,
  329. struct mm_struct *mm, pgd_t *pgd,
  330. bool checkwx, bool dmesg)
  331. {
  332. const struct ptdump_range ptdump_ranges[] = {
  333. #ifdef CONFIG_X86_64
  334. {0, PTRS_PER_PGD * PGD_LEVEL_MULT / 2},
  335. {GUARD_HOLE_END_ADDR, ~0UL},
  336. #else
  337. {0, ~0UL},
  338. #endif
  339. {0, 0}
  340. };
  341. struct pg_state st = {
  342. .ptdump = {
  343. .note_page = note_page,
  344. .effective_prot = effective_prot,
  345. .range = ptdump_ranges
  346. },
  347. .level = -1,
  348. .to_dmesg = dmesg,
  349. .check_wx = checkwx,
  350. .seq = m
  351. };
  352. ptdump_walk_pgd(&st.ptdump, mm, pgd);
  353. if (!checkwx)
  354. return;
  355. if (st.wx_pages)
  356. pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n",
  357. st.wx_pages);
  358. else
  359. pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n");
  360. }
  361. void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm)
  362. {
  363. ptdump_walk_pgd_level_core(m, mm, mm->pgd, false, true);
  364. }
  365. void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm,
  366. bool user)
  367. {
  368. pgd_t *pgd = mm->pgd;
  369. #ifdef CONFIG_PAGE_TABLE_ISOLATION
  370. if (user && boot_cpu_has(X86_FEATURE_PTI))
  371. pgd = kernel_to_user_pgdp(pgd);
  372. #endif
  373. ptdump_walk_pgd_level_core(m, mm, pgd, false, false);
  374. }
  375. EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
  376. void ptdump_walk_user_pgd_level_checkwx(void)
  377. {
  378. #ifdef CONFIG_PAGE_TABLE_ISOLATION
  379. pgd_t *pgd = INIT_PGD;
  380. if (!(__supported_pte_mask & _PAGE_NX) ||
  381. !boot_cpu_has(X86_FEATURE_PTI))
  382. return;
  383. pr_info("x86/mm: Checking user space page tables\n");
  384. pgd = kernel_to_user_pgdp(pgd);
  385. ptdump_walk_pgd_level_core(NULL, &init_mm, pgd, true, false);
  386. #endif
  387. }
  388. void ptdump_walk_pgd_level_checkwx(void)
  389. {
  390. ptdump_walk_pgd_level_core(NULL, &init_mm, INIT_PGD, true, false);
  391. }
  392. static int __init pt_dump_init(void)
  393. {
  394. /*
  395. * Various markers are not compile-time constants, so assign them
  396. * here.
  397. */
  398. #ifdef CONFIG_X86_64
  399. address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET;
  400. address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
  401. address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START;
  402. #ifdef CONFIG_MODIFY_LDT_SYSCALL
  403. address_markers[LDT_NR].start_address = LDT_BASE_ADDR;
  404. #endif
  405. #ifdef CONFIG_KASAN
  406. address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START;
  407. address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END;
  408. #endif
  409. #endif
  410. #ifdef CONFIG_X86_32
  411. address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
  412. address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
  413. # ifdef CONFIG_HIGHMEM
  414. address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
  415. # endif
  416. address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
  417. address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE;
  418. # ifdef CONFIG_MODIFY_LDT_SYSCALL
  419. address_markers[LDT_NR].start_address = LDT_BASE_ADDR;
  420. # endif
  421. #endif
  422. return 0;
  423. }
  424. __initcall(pt_dump_init);