// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2004 Benjamin Herrenschmidt, IBM Corp.
 * <benh@kernel.crashing.org>
 */

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/stddef.h>
#include <linux/unistd.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/elf.h>
#include <linux/security.h>
#include <linux/memblock.h>
#include <linux/syscalls.h>
#include <linux/time_namespace.h>
#include <vdso/datapage.h>

#include <asm/syscall.h>
#include <asm/processor.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
#include <asm/machdep.h>
#include <asm/cputable.h>
#include <asm/sections.h>
#include <asm/firmware.h>
#include <asm/vdso.h>
#include <asm/vdso_datapage.h>
#include <asm/setup.h>

/* The alignment of the vDSO */
#define VDSO_ALIGNMENT	(1 << 16)

extern char vdso32_start, vdso32_end;
extern char vdso64_start, vdso64_end;

long sys_ni_syscall(void);

/*
 * The vdso data page (aka. systemcfg for old ppc64 fans) is here.
 * Once the early boot kernel code no longer needs to muck around
 * with it, it will become dynamically allocated.
 */
static union {
	struct vdso_arch_data	data;
	u8			page[PAGE_SIZE];
} vdso_data_store __page_aligned_data;
struct vdso_arch_data *vdso_data = &vdso_data_store.data;

enum vvar_pages {
	VVAR_DATA_PAGE_OFFSET,
	VVAR_TIMENS_PAGE_OFFSET,
	VVAR_NR_PAGES,
};
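
/*
 * mremap() of the vDSO is allowed as long as its size is unchanged;
 * record the new base address so the kernel keeps pointing at the
 * relocated code (e.g. for signal trampoline handling).
 */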
static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma,
		       unsigned long text_size)
{
	unsigned long new_size = new_vma->vm_end - new_vma->vm_start;

	if (new_size != text_size)
		return -EINVAL;

	current->mm->context.vdso = (void __user *)new_vma->vm_start;

	return 0;
}

static int vdso32_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma)
{
	return vdso_mremap(sm, new_vma, &vdso32_end - &vdso32_start);
}

static int vdso64_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma)
{
	return vdso_mremap(sm, new_vma, &vdso64_end - &vdso64_start);
}

static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
			     struct vm_area_struct *vma, struct vm_fault *vmf);

static struct vm_special_mapping vvar_spec __ro_after_init = {
	.name = "[vvar]",
	.fault = vvar_fault,
};

static struct vm_special_mapping vdso32_spec __ro_after_init = {
	.name = "[vdso]",
	.mremap = vdso32_mremap,
};

static struct vm_special_mapping vdso64_spec __ro_after_init = {
	.name = "[vdso]",
	.mremap = vdso64_mremap,
};

#ifdef CONFIG_TIME_NS
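/* Return the generic vDSO time data embedded in a powerpc vvar page. */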
  84. struct vdso_data *arch_get_vdso_data(void *vvar_page)
  85. {
  86. return ((struct vdso_arch_data *)vvar_page)->data;
  87. }
  88. /*
  89. * The vvar mapping contains data for a specific time namespace, so when a task
  90. * changes namespace we must unmap its vvar data for the old namespace.
  91. * Subsequent faults will map in data for the new namespace.
  92. *
  93. * For more details see timens_setup_vdso_data().
  94. */
  95. int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
  96. {
  97. struct mm_struct *mm = task->mm;
  98. VMA_ITERATOR(vmi, mm, 0);
  99. struct vm_area_struct *vma;
  100. mmap_read_lock(mm);
  101. for_each_vma(vmi, vma) {
  102. unsigned long size = vma->vm_end - vma->vm_start;
  103. if (vma_is_special_mapping(vma, &vvar_spec))
  104. zap_page_range(vma, vma->vm_start, size);
  105. }
  106. mmap_read_unlock(mm);
  107. return 0;
  108. }
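
/*
 * Return the time namespace vvar page for the faulting task, or NULL:
 * remote accesses (a foreign mm) are not supported for the vvar area.
 */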
static struct page *find_timens_vvar_page(struct vm_area_struct *vma)
{
	if (likely(vma->vm_mm == current->mm))
		return current->nsproxy->time_ns->vvar_page;

	/*
	 * VM_PFNMAP | VM_IO protect .fault() handler from being called
	 * through interfaces like /proc/$pid/mem or
	 * process_vm_{readv,writev}() as long as there's no .access()
	 * in special_mapping_vmops.
	 * For more details see check_vma_flags() and __access_remote_vm().
	 */
	WARN(1, "vvar_page accessed remotely");

	return NULL;
}
#else
static struct page *find_timens_vvar_page(struct vm_area_struct *vma)
{
	return NULL;
}
#endif
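
/*
 * Fault handler for the vvar mapping: page 0 is the vDSO data page
 * (replaced by the namespace page for tasks in a non-init time
 * namespace), and page 1 then exposes the real vDSO data page.
 */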
static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
			     struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *timens_page = find_timens_vvar_page(vma);
	unsigned long pfn;

	switch (vmf->pgoff) {
	case VVAR_DATA_PAGE_OFFSET:
		if (timens_page)
			pfn = page_to_pfn(timens_page);
		else
			pfn = virt_to_pfn(vdso_data);
		break;
#ifdef CONFIG_TIME_NS
	case VVAR_TIMENS_PAGE_OFFSET:
		/*
		 * If a task belongs to a time namespace then a namespace
		 * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and
		 * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET
		 * offset.
		 * See also the comment near timens_setup_vdso_data().
		 */
		if (!timens_page)
			return VM_FAULT_SIGBUS;

		pfn = virt_to_pfn(vdso_data);
		break;
#endif /* CONFIG_TIME_NS */
	default:
		return VM_FAULT_SIGBUS;
	}

	return vmf_insert_pfn(vma, vmf->address, pfn);
}

/*
 * This is called from binfmt_elf; we create the special vma for the
 * vDSO and insert it into the mm struct tree.
 */
static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
	unsigned long vdso_size, vdso_base, mappings_size;
	struct vm_special_mapping *vdso_spec;
	unsigned long vvar_size = VVAR_NR_PAGES * PAGE_SIZE;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;

	if (is_32bit_task()) {
		vdso_spec = &vdso32_spec;
		vdso_size = &vdso32_end - &vdso32_start;
	} else {
		vdso_spec = &vdso64_spec;
		vdso_size = &vdso64_end - &vdso64_start;
	}

	mappings_size = vdso_size + vvar_size;
	mappings_size += (VDSO_ALIGNMENT - 1) & PAGE_MASK;

	/*
	 * Pick a base address for the vDSO in process space.
	 * Add enough to the size so that the result can be aligned.
	 */
	vdso_base = get_unmapped_area(NULL, 0, mappings_size, 0, 0);
	if (IS_ERR_VALUE(vdso_base))
		return vdso_base;

	/* Add required alignment. */
	vdso_base = ALIGN(vdso_base, VDSO_ALIGNMENT);

	/*
	 * Put vDSO base into mm struct. We need to do this before calling
	 * install_special_mapping or the perf counter mmap tracking code
	 * will fail to recognise it as a vDSO.
	 */
	mm->context.vdso = (void __user *)vdso_base + vvar_size;

	vma = _install_special_mapping(mm, vdso_base, vvar_size,
				       VM_READ | VM_MAYREAD | VM_IO |
				       VM_DONTDUMP | VM_PFNMAP, &vvar_spec);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	/*
	 * Our vma flags don't have VM_WRITE, so by default the process isn't
	 * allowed to write to those pages.
	 * gdb can break that via the ptrace interface and thus trigger COW on
	 * those pages, but it is then your responsibility never to do that on
	 * the "data" page of the vDSO, or you'll stop getting kernel updates
	 * and your nice userland gettimeofday will be totally dead.
	 * It's fine to use that for setting breakpoints in the vDSO code
	 * pages though.
	 */
	vma = _install_special_mapping(mm, vdso_base + vvar_size, vdso_size,
				       VM_READ | VM_EXEC | VM_MAYREAD |
				       VM_MAYWRITE | VM_MAYEXEC, vdso_spec);
	if (IS_ERR(vma))
		do_munmap(mm, vdso_base, vvar_size, NULL);

	return PTR_ERR_OR_ZERO(vma);
}

int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
	struct mm_struct *mm = current->mm;
	int rc;

	mm->context.vdso = NULL;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	rc = __arch_setup_additional_pages(bprm, uses_interp);
	if (rc)
		mm->context.vdso = NULL;

	mmap_write_unlock(mm);

	return rc;
}
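
/*
 * Apply kernel-style feature fixups to a named section of the 32-bit
 * or 64-bit vDSO image, using the symbols exported by its linker
 * script to locate the section bounds.
 */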
#define VDSO_DO_FIXUPS(type, value, bits, sec) do {					\
	void *__start = (void *)VDSO##bits##_SYMBOL(&vdso##bits##_start, sec##_start);	\
	void *__end = (void *)VDSO##bits##_SYMBOL(&vdso##bits##_start, sec##_end);	\
											\
	do_##type##_fixups((value), __start, __end);					\
} while (0)
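
/* Patch the feature-dependent code paths of the vDSO images at boot. */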
static void __init vdso_fixup_features(void)
{
#ifdef CONFIG_PPC64
	VDSO_DO_FIXUPS(feature, cur_cpu_spec->cpu_features, 64, ftr_fixup);
	VDSO_DO_FIXUPS(feature, cur_cpu_spec->mmu_features, 64, mmu_ftr_fixup);
	VDSO_DO_FIXUPS(feature, powerpc_firmware_features, 64, fw_ftr_fixup);
	VDSO_DO_FIXUPS(lwsync, cur_cpu_spec->cpu_features, 64, lwsync_fixup);
#endif /* CONFIG_PPC64 */

#ifdef CONFIG_VDSO32
	VDSO_DO_FIXUPS(feature, cur_cpu_spec->cpu_features, 32, ftr_fixup);
	VDSO_DO_FIXUPS(feature, cur_cpu_spec->mmu_features, 32, mmu_ftr_fixup);
#ifdef CONFIG_PPC64
	VDSO_DO_FIXUPS(feature, powerpc_firmware_features, 32, fw_ftr_fixup);
#endif /* CONFIG_PPC64 */
	VDSO_DO_FIXUPS(lwsync, cur_cpu_spec->cpu_features, 32, lwsync_fixup);
#endif
}

/*
 * Called from setup_arch to initialize the bitmap of available
 * syscalls in the systemcfg page.
 */
static void __init vdso_setup_syscall_map(void)
{
	unsigned int i;
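
	/*
	 * Each 32-bit map word covers 32 syscalls, MSB first: syscall i
	 * sets bit 31 - (i & 0x1f) of word i >> 5.
	 */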
	for (i = 0; i < NR_syscalls; i++) {
		if (sys_call_table[i] != (void *)&sys_ni_syscall)
			vdso_data->syscall_map[i >> 5] |= 0x80000000UL >> (i & 0x1f);
		if (IS_ENABLED(CONFIG_COMPAT) &&
		    compat_sys_call_table[i] != (void *)&sys_ni_syscall)
			vdso_data->compat_syscall_map[i >> 5] |= 0x80000000UL >> (i & 0x1f);
	}
}

#ifdef CONFIG_PPC64
int vdso_getcpu_init(void)
{
	unsigned long cpu, node, val;

	/*
	 * SPRG_VDSO contains the CPU in the bottom 16 bits and the NUMA node
	 * in the next 16 bits. The VDSO uses this to implement getcpu().
	 */
	cpu = get_cpu();
	WARN_ON_ONCE(cpu > 0xffff);

	node = cpu_to_node(cpu);
	WARN_ON_ONCE(node > 0xffff);

	val = (cpu & 0xffff) | ((node & 0xffff) << 16);
	mtspr(SPRN_SPRG_VDSO_WRITE, val);
	get_paca()->sprg_vdso = val;

	put_cpu();

	return 0;
}
/* We need to call this before SMP init */
early_initcall(vdso_getcpu_init);
#endif
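
/*
 * Build a NULL-terminated list of struct page pointers covering a vDSO
 * image, suitable for the .pages field of a vm_special_mapping.
 */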
static struct page ** __init vdso_setup_pages(void *start, void *end)
{
	int i;
	struct page **pagelist;
	int pages = (end - start) >> PAGE_SHIFT;

	pagelist = kcalloc(pages + 1, sizeof(struct page *), GFP_KERNEL);
	if (!pagelist)
		panic("%s: Cannot allocate page list for VDSO", __func__);

	for (i = 0; i < pages; i++)
		pagelist[i] = virt_to_page(start + i * PAGE_SIZE);

	return pagelist;
}
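
/*
 * Late boot initialisation: fill in the systemcfg compatibility
 * fields, apply the feature fixups and build the page lists for the
 * vDSO images.
 */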
static int __init vdso_init(void)
{
#ifdef CONFIG_PPC64
	/*
	 * Fill up the "systemcfg" stuff for backward compatibility.
	 */
	strcpy((char *)vdso_data->eye_catcher, "SYSTEMCFG:PPC64");
	vdso_data->version.major = SYSTEMCFG_MAJOR;
	vdso_data->version.minor = SYSTEMCFG_MINOR;
	vdso_data->processor = mfspr(SPRN_PVR);
	/*
	 * Fake the old platform number for pSeries and add
	 * in LPAR bit if necessary.
	 */
	vdso_data->platform = 0x100;
	if (firmware_has_feature(FW_FEATURE_LPAR))
		vdso_data->platform |= 1;
	vdso_data->physicalMemorySize = memblock_phys_mem_size();
	vdso_data->dcache_size = ppc64_caches.l1d.size;
	vdso_data->dcache_line_size = ppc64_caches.l1d.line_size;
	vdso_data->icache_size = ppc64_caches.l1i.size;
	vdso_data->icache_line_size = ppc64_caches.l1i.line_size;
	vdso_data->dcache_block_size = ppc64_caches.l1d.block_size;
	vdso_data->icache_block_size = ppc64_caches.l1i.block_size;
	vdso_data->dcache_log_block_size = ppc64_caches.l1d.log_block_size;
	vdso_data->icache_log_block_size = ppc64_caches.l1i.log_block_size;
#endif /* CONFIG_PPC64 */

	vdso_setup_syscall_map();

	vdso_fixup_features();

	if (IS_ENABLED(CONFIG_VDSO32))
		vdso32_spec.pages = vdso_setup_pages(&vdso32_start, &vdso32_end);
	if (IS_ENABLED(CONFIG_PPC64))
		vdso64_spec.pages = vdso_setup_pages(&vdso64_start, &vdso64_end);
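
	/* Publish the page lists set up above before the vDSO can be mapped. */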
	smp_wmb();

	return 0;
}
arch_initcall(vdso_init);