page_pinner.c 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434
  1. // SPDX-License-Identifier: GPL-2.0
  2. #include <linux/debugfs.h>
  3. #include <linux/mm.h>
  4. #include <linux/slab.h>
  5. #include <linux/uaccess.h>
  6. #include <linux/memblock.h>
  7. #include <linux/stacktrace.h>
  8. #include <linux/page_pinner.h>
  9. #include <linux/jump_label.h>
  10. #include <linux/migrate.h>
  11. #include <linux/stackdepot.h>
  12. #include <linux/seq_file.h>
  13. #include <linux/sched/clock.h>
  14. #include "internal.h"
/* Max stack-trace depth captured for each recorded pin event. */
#define PAGE_PINNER_STACK_DEPTH 16

/* Number of slots in the capture ring buffer; tunable via debugfs "buffer_size". */
static unsigned long pp_buf_size = 4096;
/*
 * Per-page state kept in the page_ext area (see page_pinner_ops).
 * Tracks the first time a migration failure was observed on the page.
 */
struct page_pinner {
	/* NOTE(review): never written in this file — presumably set elsewhere; confirm. */
	depot_stack_handle_t handle;
	/* boottime us of the first migration failure; 0 = none recorded */
	u64 ts_usec;
	/* NOTE(review): only reset to 0 here; presumably incremented by callers elsewhere. */
	atomic_t count;
};
/* Kind of event stored in a captured_pinner record. */
enum pp_state {
	PP_PUT,			/* pin released (__page_pinner_put_page) */
	PP_FREE,		/* page freed while flagged migration-failed */
	PP_FAIL_DETECTED,	/* migration failure observed (__page_pinner_failure_detect) */
};
/*
 * Snapshot of one pin event as stored in the ring buffer.  The struct
 * page fields are copied at capture time so the record remains
 * meaningful after the page has been freed or reused.
 */
struct captured_pinner {
	depot_stack_handle_t handle;	/* stack of the capture site */
	union {
		u64 ts_usec;	/* event time (PP_FREE / PP_FAIL_DETECTED) */
		u64 elapsed;	/* pin duration in us (PP_PUT) */
	};
	/* struct page fields */
	unsigned long pfn;
	int count;
	int mapcount;
	struct address_space *mapping;
	unsigned long flags;
	enum pp_state state;
};
/* Ring buffer of captured pin events; all fields guarded by @lock. */
struct page_pinner_buffer {
	spinlock_t lock;
	unsigned long index;	/* next slot to write; wraps at pp_buf_size */
	struct captured_pinner *buffer;
};
/* alloc_contig failed pinner */
static struct page_pinner_buffer pp_buffer;

/* Set by the "page_pinner" early param; gates init_page_pinner(). */
static bool page_pinner_enabled;
DEFINE_STATIC_KEY_FALSE(page_pinner_inited);
EXPORT_SYMBOL_GPL(page_pinner_inited);

/* Failure tracking defaults to on; toggled via debugfs "failure_tracking". */
DEFINE_STATIC_KEY_TRUE(failure_tracking);

/* Canned stack handle used when stack_depot_save() fails at capture time. */
static depot_stack_handle_t failure_handle;
/*
 * "page_pinner" on the kernel command line enables the feature; any
 * parameter value is ignored.
 */
static int __init early_page_pinner_param(char *buf)
{
	page_pinner_enabled = true;
	return 0;
}
early_param("page_pinner", early_page_pinner_param);
/* page_ext callback: allocate per-page space only when the feature is on. */
static bool need_page_pinner(void)
{
	return page_pinner_enabled;
}
  63. static noinline void register_failure_stack(void)
  64. {
  65. unsigned long entries[4];
  66. unsigned int nr_entries;
  67. nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
  68. failure_handle = stack_depot_save(entries, nr_entries, GFP_KERNEL);
  69. }
  70. static void init_page_pinner(void)
  71. {
  72. if (!page_pinner_enabled)
  73. return;
  74. pp_buffer.buffer = kvmalloc_array(pp_buf_size, sizeof(*pp_buffer.buffer),
  75. GFP_KERNEL);
  76. if (!pp_buffer.buffer) {
  77. pr_info("page_pinner disabled due to failure of buffer allocation\n");
  78. return;
  79. }
  80. spin_lock_init(&pp_buffer.lock);
  81. pp_buffer.index = 0;
  82. register_failure_stack();
  83. static_branch_enable(&page_pinner_inited);
  84. }
/* Registers this feature's per-page data with the page_ext framework. */
struct page_ext_operations page_pinner_ops = {
	.size = sizeof(struct page_pinner),
	.need = need_page_pinner,
	.init = init_page_pinner,
};
/* Locate this feature's data inside a page's page_ext area. */
static inline struct page_pinner *get_page_pinner(struct page_ext *page_ext)
{
	return (void *)page_ext + page_pinner_ops.offset;
}
  94. static noinline depot_stack_handle_t save_stack(gfp_t flags)
  95. {
  96. unsigned long entries[PAGE_PINNER_STACK_DEPTH];
  97. depot_stack_handle_t handle;
  98. unsigned int nr_entries;
  99. nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
  100. handle = stack_depot_save(entries, nr_entries, flags);
  101. if (!handle)
  102. handle = failure_handle;
  103. return handle;
  104. }
  105. static void capture_page_state(struct page *page,
  106. struct captured_pinner *record)
  107. {
  108. record->flags = page->flags;
  109. record->mapping = page_mapping(page);
  110. record->pfn = page_to_pfn(page);
  111. record->count = page_count(page);
  112. record->mapcount = page_mapcount(page);
  113. }
  114. static void add_record(struct page_pinner_buffer *pp_buf,
  115. struct captured_pinner *record)
  116. {
  117. unsigned long flags;
  118. unsigned int idx;
  119. spin_lock_irqsave(&pp_buf->lock, flags);
  120. idx = pp_buf->index++;
  121. pp_buf->index %= pp_buf_size;
  122. pp_buf->buffer[idx] = *record;
  123. spin_unlock_irqrestore(&pp_buf->lock, flags);
  124. }
  125. void __free_page_pinner(struct page *page, unsigned int order)
  126. {
  127. struct page_pinner *page_pinner;
  128. struct page_ext *page_ext;
  129. int i;
  130. /* free_page could be called before buffer is initialized */
  131. if (!pp_buffer.buffer)
  132. return;
  133. page_ext = page_ext_get(page);
  134. if (unlikely(!page_ext))
  135. return;
  136. for (i = 0; i < (1 << order); i++) {
  137. struct captured_pinner record;
  138. if (!test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags))
  139. continue;
  140. page_pinner = get_page_pinner(page_ext);
  141. record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
  142. record.ts_usec = (u64)ktime_to_us(ktime_get_boottime());
  143. record.state = PP_FREE;
  144. capture_page_state(page, &record);
  145. add_record(&pp_buffer, &record);
  146. atomic_set(&page_pinner->count, 0);
  147. page_pinner->ts_usec = 0;
  148. clear_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags);
  149. page_ext = page_ext_next(page_ext);
  150. }
  151. page_ext_put(page_ext);
  152. }
/*
 * Format one captured record into a temporary kernel buffer and copy it
 * to userspace.  Returns bytes copied, -ENOMEM on allocation failure or
 * output truncation, -EFAULT if the copy to user fails.
 */
static ssize_t
print_page_pinner(char __user *buf, size_t count, struct captured_pinner *record)
{
	int ret;
	unsigned long *entries;
	unsigned int nr_entries;
	char *kbuf;

	/* Cap the scratch buffer at one page regardless of user-supplied count. */
	count = min_t(size_t, count, PAGE_SIZE);
	kbuf = kmalloc(count, GFP_KERNEL);
	if (!kbuf)
		return -ENOMEM;

	if (record->state == PP_PUT) {
		/* For PP_PUT the union member holds a duration, not a timestamp. */
		ret = snprintf(kbuf, count, "At least, pinned for %llu us\n",
			       record->elapsed);
	} else {
		/* Split boottime us into seconds + remainder for display. */
		u64 ts_usec = record->ts_usec;
		unsigned long rem_usec = do_div(ts_usec, 1000000);

		ret = snprintf(kbuf, count,
			       "%s [%5lu.%06lu]\n",
			       record->state == PP_FREE ? "Freed at" :
			       "Failure detected at",
			       (unsigned long)ts_usec, rem_usec);
	}

	/* snprintf returns the would-be length; >= count means truncated. */
	if (ret >= count)
		goto err;

	/* Print information relevant to grouping pages by mobility */
	ret += snprintf(kbuf + ret, count - ret,
			"PFN 0x%lx Block %lu count %d mapcount %d mapping %pS Flags %#lx(%pGp)\n",
			record->pfn,
			record->pfn >> pageblock_order,
			record->count, record->mapcount,
			record->mapping,
			record->flags, &record->flags);
	if (ret >= count)
		goto err;

	nr_entries = stack_depot_fetch(record->handle, &entries);
	ret += stack_trace_snprint(kbuf + ret, count - ret, entries,
				   nr_entries, 0);
	if (ret >= count)
		goto err;

	ret += snprintf(kbuf + ret, count - ret, "\n");
	if (ret >= count)
		goto err;

	if (copy_to_user(buf, kbuf, ret))
		ret = -EFAULT;

	kfree(kbuf);
	return ret;

err:
	kfree(kbuf);
	return -ENOMEM;
}
/*
 * Record a migration failure on @page: flag its page_ext so subsequent
 * put/free events are captured, remember the earliest failure time, and
 * append a PP_FAIL_DETECTED record to the ring buffer.
 */
void __page_pinner_failure_detect(struct page *page)
{
	struct page_ext *page_ext;
	struct page_pinner *page_pinner;
	struct captured_pinner record;
	u64 now;

	if (!static_branch_unlikely(&failure_tracking))
		return;

	page_ext = page_ext_get(page);
	if (unlikely(!page_ext))
		return;

	/* Only the first failure per page is recorded until the page is freed. */
	if (test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags)) {
		page_ext_put(page_ext);
		return;
	}

	now = (u64)ktime_to_us(ktime_get_boottime());
	page_pinner = get_page_pinner(page_ext);
	/* Keep the earliest timestamp so PP_PUT measures elapsed from first failure. */
	if (!page_pinner->ts_usec)
		page_pinner->ts_usec = now;
	set_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags);
	record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
	record.ts_usec = now;
	record.state = PP_FAIL_DETECTED;
	capture_page_state(page, &record);
	add_record(&pp_buffer, &record);
	page_ext_put(page_ext);
}
EXPORT_SYMBOL_GPL(__page_pinner_failure_detect);
/*
 * Called when a pin on a page is dropped: if the page was previously
 * flagged migration-failed, record how long it stayed pinned since the
 * failure was first detected (PP_PUT record).
 */
void __page_pinner_put_page(struct page *page)
{
	struct page_ext *page_ext;
	struct page_pinner *page_pinner;
	struct captured_pinner record;
	u64 now, ts_usec;

	if (!static_branch_unlikely(&failure_tracking))
		return;

	page_ext = page_ext_get(page);
	if (unlikely(!page_ext))
		return;

	/* Only pages flagged by __page_pinner_failure_detect() are of interest. */
	if (!test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags)) {
		page_ext_put(page_ext);
		return;
	}

	page_pinner = get_page_pinner(page_ext);
	record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
	now = (u64)ktime_to_us(ktime_get_boottime());
	ts_usec = page_pinner->ts_usec;

	/* Clamp to 0 in case ts_usec was never set or the clock is ahead of it. */
	if (now > ts_usec)
		record.elapsed = now - ts_usec;
	else
		record.elapsed = 0;
	record.state = PP_PUT;
	capture_page_state(page, &record);
	add_record(&pp_buffer, &record);
	page_ext_put(page_ext);
}
EXPORT_SYMBOL_GPL(__page_pinner_put_page);
/*
 * debugfs read: emits one captured record per read() call, newest first.
 * *ppos counts how many records back from the current write position.
 */
static ssize_t read_buffer(struct file *file, char __user *buf,
			   size_t count, loff_t *ppos)
{
	u64 tmp;
	loff_t i, idx;
	struct captured_pinner record;
	unsigned long flags;

	if (!static_branch_unlikely(&failure_tracking))
		return -EINVAL;

	/* Past the ring size means every slot has been visited. */
	if (*ppos >= pp_buf_size)
		return 0;

	i = *ppos;
	*ppos = i + 1;

	/*
	 * reading the records in the reverse order with newest one
	 * being read first followed by older ones
	 */
	tmp = pp_buffer.index - 1 - i + pp_buf_size;
	idx = do_div(tmp, pp_buf_size);	/* do_div: tmp /= size, returns remainder */

	/* Copy under the lock so a concurrent add_record() can't tear the entry. */
	spin_lock_irqsave(&pp_buffer.lock, flags);
	record = pp_buffer.buffer[idx];
	spin_unlock_irqrestore(&pp_buffer.lock, flags);

	/*
	 * Zero handle is treated as "slot never written" — NOTE(review):
	 * this assumes the buffer is zero-initialized at allocation; verify.
	 */
	if (!record.handle)
		return 0;

	return print_page_pinner(buf, count, &record);
}
/* debugfs "buffer" file: repeated reads walk records newest-first. */
static const struct file_operations proc_buffer_operations = {
	.read = read_buffer,
};
  290. static int failure_tracking_set(void *data, u64 val)
  291. {
  292. bool on;
  293. on = (bool)val;
  294. if (on)
  295. static_branch_enable(&failure_tracking);
  296. else
  297. static_branch_disable(&failure_tracking);
  298. return 0;
  299. }
/* debugfs getter: report whether failure tracking is currently enabled. */
static int failure_tracking_get(void *data, u64 *val)
{
	*val = static_branch_unlikely(&failure_tracking);
	return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(failure_tracking_fops,
			 failure_tracking_get,
			 failure_tracking_set, "%llu\n");
  308. static int buffer_size_set(void *data, u64 val)
  309. {
  310. unsigned long flags;
  311. struct captured_pinner *new, *old;
  312. new = kvmalloc_array(val, sizeof(*new), GFP_KERNEL);
  313. if (!new)
  314. return -ENOMEM;
  315. spin_lock_irqsave(&pp_buffer.lock, flags);
  316. old = pp_buffer.buffer;
  317. pp_buffer.buffer = new;
  318. pp_buffer.index = 0;
  319. pp_buf_size = val;
  320. spin_unlock_irqrestore(&pp_buffer.lock, flags);
  321. kvfree(old);
  322. return 0;
  323. }
/* debugfs getter: report the current ring buffer size. */
static int buffer_size_get(void *data, u64 *val)
{
	*val = pp_buf_size;
	return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(buffer_size_fops,
			 buffer_size_get,
			 buffer_size_set, "%llu\n");
  332. static int __init page_pinner_init(void)
  333. {
  334. struct dentry *pp_debugfs_root;
  335. if (!static_branch_unlikely(&page_pinner_inited))
  336. return 0;
  337. pr_info("page_pinner enabled\n");
  338. pp_debugfs_root = debugfs_create_dir("page_pinner", NULL);
  339. debugfs_create_file("buffer", 0444,
  340. pp_debugfs_root, NULL,
  341. &proc_buffer_operations);
  342. debugfs_create_file("failure_tracking", 0644,
  343. pp_debugfs_root, NULL,
  344. &failure_tracking_fops);
  345. debugfs_create_file("buffer_size", 0644,
  346. pp_debugfs_root, NULL,
  347. &buffer_size_fops);
  348. return 0;
  349. }
  350. late_initcall(page_pinner_init)