// SPDX-License-Identifier: GPL-2.0-only
/*
 * BTS PMU driver for perf
 * Copyright (c) 2013-2014, Intel Corporation.
 */

#undef DEBUG

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/coredump.h>
#include <linux/sizes.h>

#include <asm/perf_event.h>

#include "../perf_event.h"
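
/*
 * Per-CPU BTS tracing context: the in-flight AUX output handle, a backup
 * of the debug_store fields that tracing clobbers, and the state machine
 * described below.
 */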
struct bts_ctx {
	struct perf_output_handle	handle;
	struct debug_store		ds_back;
	int				state;
};

/* BTS context states: */
enum {
	/* no ongoing AUX transactions */
	BTS_STATE_STOPPED = 0,
	/* AUX transaction is on, BTS tracing is disabled */
	BTS_STATE_INACTIVE,
	/* AUX transaction is on, BTS tracing is running */
	BTS_STATE_ACTIVE,
};

static DEFINE_PER_CPU(struct bts_ctx, bts_ctx);
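
/*
 * Each BTS record is three 8-byte fields: branch-from address, branch-to
 * address and a flags word, 24 bytes in total.
 */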
#define BTS_RECORD_SIZE		24
#define BTS_SAFETY_MARGIN	4080
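
/*
 * One physically contiguous chunk of the AUX buffer:
 * @page:	  first page of a (possibly high-order) allocation;
 * @size:	  usable size, trimmed to a multiple of BTS_RECORD_SIZE;
 * @offset:	  offset of this chunk within the AUX buffer;
 * @displacement: bytes skipped at the start so that records don't
 *		  straddle the chunk boundary.
 */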
struct bts_phys {
	struct page	*page;
	unsigned long	size;
	unsigned long	offset;
	unsigned long	displacement;
};

struct bts_buffer {
	size_t		real_size;	/* multiple of BTS_RECORD_SIZE */
	unsigned int	nr_pages;
	unsigned int	nr_bufs;
	unsigned int	cur_buf;
	bool		snapshot;
	local_t		data_size;
	local_t		head;
	unsigned long	end;
	void		**data_pages;
	struct bts_phys	buf[];
};

static struct pmu bts_pmu;
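
/*
 * The AUX page allocator stashes the allocation order of high-order pages
 * in page_private(); single pages have PagePrivate() clear.
 */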
static int buf_nr_pages(struct page *page)
{
	if (!PagePrivate(page))
		return 1;

	return 1 << page_private(page);
}

static size_t buf_size(struct page *page)
{
	return buf_nr_pages(page) * PAGE_SIZE;
}
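
/*
 * Set up the AUX buffer: coalesce high-order pages into bts_phys chunks
 * and trim each chunk so that BTS records never cross a physical boundary.
 */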
static void *
bts_buffer_setup_aux(struct perf_event *event, void **pages,
		     int nr_pages, bool overwrite)
{
	struct bts_buffer *buf;
	struct page *page;
	int cpu = event->cpu;
	int node = (cpu == -1) ? cpu : cpu_to_node(cpu);
	unsigned long offset;
	size_t size = nr_pages << PAGE_SHIFT;
	int pg, nbuf, pad;

	/* count all the high order buffers */
	for (pg = 0, nbuf = 0; pg < nr_pages;) {
		page = virt_to_page(pages[pg]);
		pg += buf_nr_pages(page);
		nbuf++;
	}

	/*
	 * to avoid interrupts in overwrite mode, only allow one physical
	 * buffer
	 */
	if (overwrite && nbuf > 1)
		return NULL;

	buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node);
	if (!buf)
		return NULL;

	buf->nr_pages = nr_pages;
	buf->nr_bufs = nbuf;
	buf->snapshot = overwrite;
	buf->data_pages = pages;
	buf->real_size = size - size % BTS_RECORD_SIZE;

	for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) {
		unsigned int __nr_pages;

		page = virt_to_page(pages[pg]);
		__nr_pages = buf_nr_pages(page);
		buf->buf[nbuf].page = page;
		buf->buf[nbuf].offset = offset;
		buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0);
		buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement;
		pad = buf->buf[nbuf].size % BTS_RECORD_SIZE;
		buf->buf[nbuf].size -= pad;

		pg += __nr_pages;
		offset += __nr_pages << PAGE_SHIFT;
	}

	return buf;
}

static void bts_buffer_free_aux(void *data)
{
	kfree(data);
}

static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx)
{
	return buf->buf[idx].offset + buf->buf[idx].displacement;
}
static void
bts_config_buffer(struct bts_buffer *buf)
{
	int cpu = raw_smp_processor_id();
	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
	struct bts_phys *phys = &buf->buf[buf->cur_buf];
	unsigned long index, thresh = 0, end = phys->size;
	struct page *page = phys->page;

	index = local_read(&buf->head);

	if (!buf->snapshot) {
		if (buf->end < phys->offset + buf_size(page))
			end = buf->end - phys->offset - phys->displacement;

		index -= phys->offset + phys->displacement;

		if (end - index > BTS_SAFETY_MARGIN)
			thresh = end - BTS_SAFETY_MARGIN;
		else if (end - index > BTS_RECORD_SIZE)
			thresh = end - BTS_RECORD_SIZE;
		else
			thresh = end;
	}

	ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement;
	ds->bts_index = ds->bts_buffer_base + index;
	ds->bts_absolute_maximum = ds->bts_buffer_base + end;
	ds->bts_interrupt_threshold = !buf->snapshot
		? ds->bts_buffer_base + thresh
		: ds->bts_absolute_maximum + BTS_RECORD_SIZE;
}
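
/* Zero the unused tail of the chunk so consumers don't see stale bytes. */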
static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head)
{
	unsigned long index = head - phys->offset;

	memset(page_address(phys->page) + index, 0, phys->size - index);
}
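
/*
 * Fold the hardware write pointer into the AUX accounting: advance
 * buf::head, grow data_size by the amount of new data and flag truncation
 * if the hardware ran into the absolute maximum.
 */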
static void bts_update(struct bts_ctx *bts)
{
	int cpu = raw_smp_processor_id();
	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
	struct bts_buffer *buf = perf_get_aux(&bts->handle);
	unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head;

	if (!buf)
		return;

	head = index + bts_buffer_offset(buf, buf->cur_buf);
	old = local_xchg(&buf->head, head);

	if (!buf->snapshot) {
		if (old == head)
			return;

		if (ds->bts_index >= ds->bts_absolute_maximum)
			perf_aux_output_flag(&bts->handle,
					     PERF_AUX_FLAG_TRUNCATED);

		/*
		 * old and head are always in the same physical buffer, so we
		 * can subtract them to get the data size.
		 */
		local_add(head - old, &buf->data_size);
	} else {
		local_set(&buf->data_size, head);
	}

	/*
	 * Since BTS is coherent, just add a compiler barrier to ensure
	 * BTS updating is ordered against bts::handle::event.
	 */
	barrier();
}

static int
bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle);

/*
 * Ordering PMU callbacks wrt themselves and the PMI is done by means
 * of bts::state, which:
 *  - is set when bts::handle::event is valid, that is, between
 *    perf_aux_output_begin() and perf_aux_output_end();
 *  - is zero otherwise;
 *  - is ordered against bts::handle::event with a compiler barrier.
 */

static void __bts_event_start(struct perf_event *event)
{
	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
	struct bts_buffer *buf = perf_get_aux(&bts->handle);
	u64 config = 0;

	if (!buf->snapshot)
		config |= ARCH_PERFMON_EVENTSEL_INT;
	if (!event->attr.exclude_kernel)
		config |= ARCH_PERFMON_EVENTSEL_OS;
	if (!event->attr.exclude_user)
		config |= ARCH_PERFMON_EVENTSEL_USR;

	bts_config_buffer(buf);

	/*
	 * local barrier to make sure that ds configuration made it
	 * before we enable BTS and bts::state goes ACTIVE
	 */
	wmb();

	/* INACTIVE/STOPPED -> ACTIVE */
	WRITE_ONCE(bts->state, BTS_STATE_ACTIVE);

	intel_pmu_enable_bts(config);
}
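
/*
 * PMU ::start callback: begin an AUX transaction, carve out writable
 * space, back up the debug_store fields we are about to overwrite and
 * kick the hardware.
 */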
static void bts_event_start(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
	struct bts_buffer *buf;

	buf = perf_aux_output_begin(&bts->handle, event);
	if (!buf)
		goto fail_stop;

	if (bts_buffer_reset(buf, &bts->handle))
		goto fail_end_stop;

	bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base;
	bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
	bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;

	perf_event_itrace_started(event);
	event->hw.state = 0;

	__bts_event_start(event);

	return;

fail_end_stop:
	perf_aux_output_end(&bts->handle, 0);

fail_stop:
	event->hw.state = PERF_HES_STOPPED;
}

static void __bts_event_stop(struct perf_event *event, int state)
{
	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);

	/* ACTIVE -> INACTIVE(PMI)/STOPPED(->stop()) */
	WRITE_ONCE(bts->state, state);

	/*
	 * No extra synchronization is mandated by the documentation to have
	 * BTS data stores globally visible.
	 */
	intel_pmu_disable_bts();
}

static void bts_event_stop(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
	struct bts_buffer *buf = NULL;
	int state = READ_ONCE(bts->state);

	if (state == BTS_STATE_ACTIVE)
		__bts_event_stop(event, BTS_STATE_STOPPED);

	if (state != BTS_STATE_STOPPED)
		buf = perf_get_aux(&bts->handle);

	event->hw.state |= PERF_HES_STOPPED;

	if (flags & PERF_EF_UPDATE) {
		bts_update(bts);

		if (buf) {
			if (buf->snapshot)
				bts->handle.head =
					local_xchg(&buf->data_size,
						   buf->nr_pages << PAGE_SHIFT);
			perf_aux_output_end(&bts->handle,
					    local_xchg(&buf->data_size, 0));
		}

		cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
		cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base;
		cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum;
		cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold;
	}
}

void intel_bts_enable_local(void)
{
	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
	int state = READ_ONCE(bts->state);

	/*
	 * Here we transition from INACTIVE to ACTIVE;
	 * if we instead are STOPPED from the interrupt handler,
	 * stay that way. Can't be ACTIVE here though.
	 */
	if (WARN_ON_ONCE(state == BTS_STATE_ACTIVE))
		return;

	if (state == BTS_STATE_STOPPED)
		return;

	if (bts->handle.event)
		__bts_event_start(bts->handle.event);
}

void intel_bts_disable_local(void)
{
	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);

	/*
	 * Here we transition from ACTIVE to INACTIVE;
	 * do nothing for STOPPED or INACTIVE.
	 */
	if (READ_ONCE(bts->state) != BTS_STATE_ACTIVE)
		return;

	if (bts->handle.event)
		__bts_event_stop(bts->handle.event, BTS_STATE_INACTIVE);
}
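
/*
 * Pick the next stretch of the AUX buffer for the hardware to write into:
 * skip over exhausted or too-small chunks, stay close to the consumer's
 * wakeup watermark and record the new end in buf::end.
 */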
static int
bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle)
{
	unsigned long head, space, next_space, pad, gap, skip, wakeup;
	unsigned int next_buf;
	struct bts_phys *phys, *next_phys;
	int ret;

	if (buf->snapshot)
		return 0;

	head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1);

	phys = &buf->buf[buf->cur_buf];
	space = phys->offset + phys->displacement + phys->size - head;
	pad = space;
	if (space > handle->size) {
		space = handle->size;
		space -= space % BTS_RECORD_SIZE;
	}
	if (space <= BTS_SAFETY_MARGIN) {
		/* See if next phys buffer has more space */
		next_buf = buf->cur_buf + 1;
		if (next_buf >= buf->nr_bufs)
			next_buf = 0;
		next_phys = &buf->buf[next_buf];
		gap = buf_size(phys->page) - phys->displacement - phys->size +
		      next_phys->displacement;
		skip = pad + gap;
		if (handle->size >= skip) {
			next_space = next_phys->size;
			if (next_space + skip > handle->size) {
				next_space = handle->size - skip;
				next_space -= next_space % BTS_RECORD_SIZE;
			}
			if (next_space > space || !space) {
				if (pad)
					bts_buffer_pad_out(phys, head);
				ret = perf_aux_output_skip(handle, skip);
				if (ret)
					return ret;
				/* Advance to next phys buffer */
				phys = next_phys;
				space = next_space;
				head = phys->offset + phys->displacement;
				/*
				 * After this, cur_buf and head won't match ds
				 * anymore, so we must not be racing with
				 * bts_update().
				 */
				buf->cur_buf = next_buf;
				local_set(&buf->head, head);
			}
		}
	}

	/* Don't go far beyond wakeup watermark */
	wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup -
		 handle->head;
	if (space > wakeup) {
		space = wakeup;
		space -= space % BTS_RECORD_SIZE;
	}

	buf->end = head + space;

	/*
	 * If we have no space, the lost notification would have been sent when
	 * we hit absolute_maximum - see bts_update()
	 */
	if (!space)
		return -ENOSPC;

	return 0;
}
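
/*
 * PMI handler: returns nonzero when the NMI was ours. Closes the current
 * AUX transaction so the consumer gets woken up and opens a new one; on
 * failure the state machine goes to STOPPED.
 */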
int intel_bts_interrupt(void)
{
	struct debug_store *ds = this_cpu_ptr(&cpu_hw_events)->ds;
	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
	struct perf_event *event = bts->handle.event;
	struct bts_buffer *buf;
	s64 old_head;
	int err = -ENOSPC, handled = 0;

	/*
	 * The only surefire way of knowing if this NMI is ours is by checking
	 * the write ptr against the PMI threshold.
	 */
	if (ds && (ds->bts_index >= ds->bts_interrupt_threshold))
		handled = 1;

	/*
	 * this is wrapped in intel_bts_enable_local/intel_bts_disable_local,
	 * so we can only be INACTIVE or STOPPED
	 */
	if (READ_ONCE(bts->state) == BTS_STATE_STOPPED)
		return handled;

	buf = perf_get_aux(&bts->handle);
	if (!buf)
		return handled;

	/*
	 * Skip snapshot counters: they don't use the interrupt, but
	 * there's no other way of telling, because the pointer will
	 * keep moving.
	 */
	if (buf->snapshot)
		return 0;

	old_head = local_read(&buf->head);
	bts_update(bts);

	/* no new data */
	if (old_head == local_read(&buf->head))
		return handled;

	perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0));

	buf = perf_aux_output_begin(&bts->handle, event);
	if (buf)
		err = bts_buffer_reset(buf, &bts->handle);

	if (err) {
		WRITE_ONCE(bts->state, BTS_STATE_STOPPED);

		if (buf) {
			/*
			 * BTS_STATE_STOPPED should be visible before
			 * cleared handle::event
			 */
			barrier();
			perf_aux_output_end(&bts->handle, 0);
		}
	}

	return 1;
}
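
/*
 * PMU ::add/::del callbacks; only one BTS event may be scheduled on a
 * CPU at a time, hence the -EBUSY checks below.
 */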
static void bts_event_del(struct perf_event *event, int mode)
{
	bts_event_stop(event, PERF_EF_UPDATE);
}

static int bts_event_add(struct perf_event *event, int mode)
{
	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct hw_perf_event *hwc = &event->hw;

	event->hw.state = PERF_HES_STOPPED;

	if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
		return -EBUSY;

	if (bts->handle.event)
		return -EBUSY;

	if (mode & PERF_EF_START) {
		bts_event_start(event, 0);
		if (hwc->state & PERF_HES_STOPPED)
			return -EINVAL;
	}

	return 0;
}

static void bts_event_destroy(struct perf_event *event)
{
	x86_release_hardware();
	x86_del_exclusive(x86_lbr_exclusive_bts);
}

static int bts_event_init(struct perf_event *event)
{
	int ret;

	if (event->attr.type != bts_pmu.type)
		return -ENOENT;

	/*
	 * BTS leaks kernel addresses even when CPL0 tracing is
	 * disabled, so disallow intel_bts driver for unprivileged
	 * users on paranoid systems since it provides trace data
	 * to the user in a zero-copy fashion.
	 *
	 * Note that the default paranoia setting permits unprivileged
	 * users to profile the kernel.
	 */
	if (event->attr.exclude_kernel) {
		ret = perf_allow_kernel(&event->attr);
		if (ret)
			return ret;
	}

	if (x86_add_exclusive(x86_lbr_exclusive_bts))
		return -EBUSY;

	ret = x86_reserve_hardware();
	if (ret) {
		x86_del_exclusive(x86_lbr_exclusive_bts);
		return ret;
	}

	event->destroy = bts_event_destroy;

	return 0;
}

static void bts_event_read(struct perf_event *event)
{
}

static __init int bts_init(void)
{
	if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts)
		return -ENODEV;

	if (boot_cpu_has(X86_FEATURE_PTI)) {
		/*
		 * BTS hardware writes through a virtual memory map; we must
		 * either use the kernel physical map or the user mapping of
		 * the AUX buffer.
		 *
		 * However, since this driver supports per-CPU and per-task
		 * inherit, we cannot use the user mapping: it will not be
		 * available if we're not running the owning process.
		 *
		 * With PTI we can't use the kernel map either, because it's
		 * not there when we run userspace.
		 *
		 * For now, disable this driver when using PTI.
		 */
		return -ENODEV;
	}

	bts_pmu.capabilities	= PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE |
				  PERF_PMU_CAP_EXCLUSIVE;
	bts_pmu.task_ctx_nr	= perf_sw_context;
	bts_pmu.event_init	= bts_event_init;
	bts_pmu.add		= bts_event_add;
	bts_pmu.del		= bts_event_del;
	bts_pmu.start		= bts_event_start;
	bts_pmu.stop		= bts_event_stop;
	bts_pmu.read		= bts_event_read;
	bts_pmu.setup_aux	= bts_buffer_setup_aux;
	bts_pmu.free_aux	= bts_buffer_free_aux;

	return perf_pmu_register(&bts_pmu, "intel_bts", -1);
}
arch_initcall(bts_init);