  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * Copyright 2016, Rashmica Gupta, IBM Corp.
  4. *
  5. * This traverses the kernel pagetables and dumps the
  6. * information about the used sections of memory to
  7. * /sys/kernel/debug/kernel_pagetables.
  8. *
  9. * Derived from the arm64 implementation:
  10. * Copyright (c) 2014, The Linux Foundation, Laura Abbott.
  11. * (C) Copyright 2008 Intel Corporation, Arjan van de Ven.
  12. */
  13. #include <linux/debugfs.h>
  14. #include <linux/fs.h>
  15. #include <linux/hugetlb.h>
  16. #include <linux/io.h>
  17. #include <linux/mm.h>
  18. #include <linux/highmem.h>
  19. #include <linux/ptdump.h>
  20. #include <linux/sched.h>
  21. #include <linux/seq_file.h>
  22. #include <asm/fixmap.h>
  23. #include <linux/const.h>
  24. #include <linux/kasan.h>
  25. #include <asm/page.h>
  26. #include <asm/hugetlb.h>
  27. #include <mm/mmu_decl.h>
  28. #include "ptdump.h"
  29. /*
  30. * To visualise what is happening,
  31. *
  32. * - PTRS_PER_P** = how many entries there are in the corresponding P**
  33. * - P**_SHIFT = how many bits of the address we use to index into the
  34. * corresponding P**
  35. * - P**_SIZE is how much memory we can access through the table - not the
  36. * size of the table itself.
  37. * P**={PGD, PUD, PMD, PTE}
  38. *
  39. *
  40. * Each entry of the PGD points to a PUD. Each entry of a PUD points to a
  41. * PMD. Each entry of a PMD points to a PTE. And every PTE entry points to
  42. * a page.
  43. *
  44. * In the case where there are only 3 levels, the PUD is folded into the
  45. * PGD: every PUD has only one entry which points to the PMD.
  46. *
  47. * The page dumper groups page table entries of the same type into a single
  48. * description. It uses pg_state to track the range information while
  49. * iterating over the PTE entries. When the continuity is broken it then
  50. * dumps out a description of the range - ie PTEs that are virtually contiguous
  51. * with the same PTE flags are chunked together. This is to make it clear how
  52. * different areas of the kernel virtual memory are used.
  53. *
  54. */
/*
 * State carried while walking the kernel page tables.  A contiguous run
 * of identically-mapped pages is accumulated here and emitted as a
 * single output line when the run ends (see note_page()).
 */
struct pg_state {
	struct ptdump_state ptdump;	/* generic ptdump walker state; must be first for container_of() */
	struct seq_file *seq;		/* output target; NULL for the silent W+X check */
	const struct addr_marker *marker; /* current region label; advanced as markers are crossed */
	unsigned long start_address;	/* virtual address where the current run began */
	unsigned long start_pa;		/* physical address of the first page in the run */
	int level;			/* page-table level of the run; -1 = nothing seen yet */
	u64 current_flags;		/* PTE flag bits (masked by pg_level[].mask) common to the run */
	bool check_wx;			/* when set, warn about W+X mappings in note_prot_wx() */
	unsigned long wx_pages;		/* running count of pages found mapped W+X */
};
/* A labelled boundary in the dumped virtual address space. */
struct addr_marker {
	unsigned long start_address;	/* filled in at boot by populate_markers() */
	const char *name;		/* label printed as "---[ name ]---" */
};
/*
 * Region labels printed as "---[ name ]---" headers during the dump.
 * The start_address fields are filled in at boot by populate_markers();
 * the entry order here (including the #ifdef structure) must match the
 * order of assignments made there.
 */
static struct addr_marker address_markers[] = {
	{ 0, "Start of kernel VM" },
#ifdef MODULES_VADDR
	{ 0, "modules start" },
	{ 0, "modules end" },
#endif
	{ 0, "vmalloc() Area" },
	{ 0, "vmalloc() End" },
#ifdef CONFIG_PPC64
	{ 0, "isa I/O start" },
	{ 0, "isa I/O end" },
	{ 0, "phb I/O start" },
	{ 0, "phb I/O end" },
	{ 0, "I/O remap start" },
	{ 0, "I/O remap end" },
	{ 0, "vmemmap start" },
#else
	{ 0, "Early I/O remap start" },
	{ 0, "Early I/O remap end" },
#ifdef CONFIG_HIGHMEM
	{ 0, "Highmem PTEs start" },
	{ 0, "Highmem PTEs end" },
#endif
	{ 0, "Fixmap start" },
	{ 0, "Fixmap end" },
#endif
#ifdef CONFIG_KASAN
	{ 0, "kasan shadow mem start" },
	{ 0, "kasan shadow mem end" },
#endif
	{ -1, NULL },	/* sentinel: -1UL is never exceeded, stops marker advance */
};
/*
 * Address range handed to the generic walker.  Defaults to everything
 * above TASK_SIZE_MAX; ptdump_init() adjusts it on CONFIG_PPC64.
 */
static struct ptdump_range ptdump_range[] __ro_after_init = {
	{TASK_SIZE_MAX, ~0UL},
	{0, 0}	/* terminator */
};
/*
 * seq_printf()/seq_putc() wrappers that become no-ops when @m is NULL,
 * so the same walk code serves both the debugfs dump and the silent
 * W+X check (which passes .seq = NULL).
 */
#define pt_dump_seq_printf(m, fmt, args...)	\
({						\
	if (m)					\
		seq_printf(m, fmt, ##args);	\
})

#define pt_dump_seq_putc(m, c)	\
({				\
	if (m)			\
		seq_putc(m, c);	\
})
  116. void pt_dump_size(struct seq_file *m, unsigned long size)
  117. {
  118. static const char units[] = " KMGTPE";
  119. const char *unit = units;
  120. /* Work out what appropriate unit to use */
  121. while (!(size & 1023) && unit[1]) {
  122. size >>= 10;
  123. unit++;
  124. }
  125. pt_dump_seq_printf(m, "%9lu%c ", size, *unit);
  126. }
  127. static void dump_flag_info(struct pg_state *st, const struct flag_info
  128. *flag, u64 pte, int num)
  129. {
  130. unsigned int i;
  131. for (i = 0; i < num; i++, flag++) {
  132. const char *s = NULL;
  133. u64 val;
  134. /* flag not defined so don't check it */
  135. if (flag->mask == 0)
  136. continue;
  137. /* Some 'flags' are actually values */
  138. if (flag->is_val) {
  139. val = pte & flag->val;
  140. if (flag->shift)
  141. val = val >> flag->shift;
  142. pt_dump_seq_printf(st->seq, " %s:%llx", flag->set, val);
  143. } else {
  144. if ((pte & flag->mask) == flag->val)
  145. s = flag->set;
  146. else
  147. s = flag->clear;
  148. if (s)
  149. pt_dump_seq_printf(st->seq, " %s", s);
  150. }
  151. st->current_flags &= ~flag->mask;
  152. }
  153. if (st->current_flags != 0)
  154. pt_dump_seq_printf(st->seq, " unknown flags:%llx", st->current_flags);
  155. }
/*
 * Print the virtual range, starting physical address and human-readable
 * size of the run accumulated in @st, which ends just below @addr.
 */
static void dump_addr(struct pg_state *st, unsigned long addr)
{
/* Address width matches the platform's virtual address size. */
#ifdef CONFIG_PPC64
#define REG "0x%016lx"
#else
#define REG "0x%08lx"
#endif

	/* @addr is the first address past the run, hence the -1. */
	pt_dump_seq_printf(st->seq, REG "-" REG " ", st->start_address, addr - 1);
	pt_dump_seq_printf(st->seq, " " REG " ", st->start_pa);
	pt_dump_size(st->seq, addr - st->start_address);
}
  167. static void note_prot_wx(struct pg_state *st, unsigned long addr)
  168. {
  169. pte_t pte = __pte(st->current_flags);
  170. if (!IS_ENABLED(CONFIG_DEBUG_WX) || !st->check_wx)
  171. return;
  172. if (!pte_write(pte) || !pte_exec(pte))
  173. return;
  174. WARN_ONCE(1, "powerpc/mm: Found insecure W+X mapping at address %p/%pS\n",
  175. (void *)st->start_address, (void *)st->start_address);
  176. st->wx_pages += (addr - st->start_address) / PAGE_SIZE;
  177. }
  178. static void note_page_update_state(struct pg_state *st, unsigned long addr, int level, u64 val)
  179. {
  180. u64 flag = level >= 0 ? val & pg_level[level].mask : 0;
  181. u64 pa = val & PTE_RPN_MASK;
  182. st->level = level;
  183. st->current_flags = flag;
  184. st->start_address = addr;
  185. st->start_pa = pa;
  186. while (addr >= st->marker[1].start_address) {
  187. st->marker++;
  188. pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
  189. }
  190. }
/*
 * Callback invoked by the generic walker for every entry visited.
 * Extends the current run when the entry matches, or flushes the run
 * (one output line) and starts a new one when continuity breaks.
 */
static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val)
{
	u64 flag = level >= 0 ? val & pg_level[level].mask : 0;
	struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);

	/* At first no level is set */
	if (st->level == -1) {
		pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
		note_page_update_state(st, addr, level, val);
	/*
	 * Dump the section of virtual memory when:
	 *   - the PTE flags from one entry to the next differs.
	 *   - we change levels in the tree.
	 *   - the address is in a different section of memory and is thus
	 *     used for a different purpose, regardless of the flags.
	 */
	} else if (flag != st->current_flags || level != st->level ||
		   addr >= st->marker[1].start_address) {

		/* Only print runs that were actually mapped (non-zero flags). */
		if (st->current_flags) {
			note_prot_wx(st, addr);
			dump_addr(st, addr);

			/* Dump all the flags */
			if (pg_level[st->level].flag)
				dump_flag_info(st, pg_level[st->level].flag,
					       st->current_flags,
					       pg_level[st->level].num);

			pt_dump_seq_putc(st->seq, '\n');
		}

		/*
		 * Address indicates we have passed the end of the
		 * current section of virtual memory
		 */
		note_page_update_state(st, addr, level, val);
	}
}
/*
 * Fill in the boot-time addresses of address_markers[].  The order of
 * assignments here (including the #ifdef structure) must mirror the
 * entry order of address_markers[] exactly.
 */
static void populate_markers(void)
{
	int i = 0;

#ifdef CONFIG_PPC64
	address_markers[i++].start_address = PAGE_OFFSET;
#else
	address_markers[i++].start_address = TASK_SIZE;
#endif
#ifdef MODULES_VADDR
	address_markers[i++].start_address = MODULES_VADDR;
	address_markers[i++].start_address = MODULES_END;
#endif
	address_markers[i++].start_address = VMALLOC_START;
	address_markers[i++].start_address = VMALLOC_END;
#ifdef CONFIG_PPC64
	address_markers[i++].start_address = ISA_IO_BASE;
	address_markers[i++].start_address = ISA_IO_END;
	address_markers[i++].start_address = PHB_IO_BASE;
	address_markers[i++].start_address = PHB_IO_END;
	address_markers[i++].start_address = IOREMAP_BASE;
	address_markers[i++].start_address = IOREMAP_END;
	/* NOTE(review): unclear why Book3S 64 needs H_VMEMMAP_START here - confirm */
#ifdef CONFIG_PPC_BOOK3S_64
	address_markers[i++].start_address = H_VMEMMAP_START;
#else
	address_markers[i++].start_address = VMEMMAP_BASE;
#endif
#else /* !CONFIG_PPC64 */
	address_markers[i++].start_address = ioremap_bot;
	address_markers[i++].start_address = IOREMAP_TOP;
#ifdef CONFIG_HIGHMEM
	address_markers[i++].start_address = PKMAP_BASE;
	address_markers[i++].start_address = PKMAP_ADDR(LAST_PKMAP);
#endif
	address_markers[i++].start_address = FIXADDR_START;
	address_markers[i++].start_address = FIXADDR_TOP;
#endif /* CONFIG_PPC64 */
#ifdef CONFIG_KASAN
	address_markers[i++].start_address = KASAN_SHADOW_START;
	address_markers[i++].start_address = KASAN_SHADOW_END;
#endif
}
/*
 * seq_file show callback for /sys/kernel/debug/kernel_page_tables:
 * walks the kernel page tables and prints one line per contiguous run
 * of identically-mapped pages via note_page().
 */
static int ptdump_show(struct seq_file *m, void *v)
{
	struct pg_state st = {
		.seq = m,
		.marker = address_markers,
		.level = -1,	/* nothing seen yet; see note_page() */
		.ptdump = {
			.note_page = note_page,
			.range = ptdump_range,
		}
	};

	/* Traverse kernel page tables */
	ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
	return 0;
}

DEFINE_SHOW_ATTRIBUTE(ptdump);	/* generates ptdump_fops for debugfs */
  284. static void __init build_pgtable_complete_mask(void)
  285. {
  286. unsigned int i, j;
  287. for (i = 0; i < ARRAY_SIZE(pg_level); i++)
  288. if (pg_level[i].flag)
  289. for (j = 0; j < pg_level[i].num; j++)
  290. pg_level[i].mask |= pg_level[i].flag[j].mask;
  291. }
#ifdef CONFIG_DEBUG_WX
/*
 * Walk the kernel page tables without producing any dump output,
 * warning about and counting pages mapped both writable and executable.
 */
void ptdump_check_wx(void)
{
	struct pg_state st = {
		.seq = NULL,	/* no output: pt_dump_seq_* become no-ops */
		.marker = (struct addr_marker[]) {	/* minimal marker list: no region headers wanted */
			{ 0, NULL},
			{ -1, NULL},
		},
		.level = -1,
		.check_wx = true,	/* enables note_prot_wx() */
		.ptdump = {
			.note_page = note_page,
			.range = ptdump_range,
		}
	};

	ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);

	if (st.wx_pages)
		pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n",
			st.wx_pages);
	else
		pr_info("Checked W+X mappings: passed, no W+X pages found\n");
}
#endif
/*
 * Boot-time setup: fix up the walk range for 64-bit, populate the
 * marker addresses and per-level flag masks, then expose the dump via
 * debugfs when CONFIG_PTDUMP_DEBUGFS is enabled.
 */
static int __init ptdump_init(void)
{
#ifdef CONFIG_PPC64
	/* Kernel VA space starts at KERN_VIRT_START (hash) or PAGE_OFFSET (radix). */
	if (!radix_enabled())
		ptdump_range[0].start = KERN_VIRT_START;
	else
		ptdump_range[0].start = PAGE_OFFSET;

	ptdump_range[0].end = PAGE_OFFSET + (PGDIR_SIZE * PTRS_PER_PGD);
#endif

	populate_markers();
	build_pgtable_complete_mask();

	if (IS_ENABLED(CONFIG_PTDUMP_DEBUGFS))
		debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops);
	return 0;
}
device_initcall(ptdump_init);