mmu_internal.h

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __KVM_X86_MMU_INTERNAL_H
#define __KVM_X86_MMU_INTERNAL_H

#include <linux/types.h>
#include <linux/kvm_host.h>
#include <asm/kvm_host.h>

#undef MMU_DEBUG

#ifdef MMU_DEBUG
extern bool dbg;

#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
#define rmap_printk(fmt, args...) do { if (dbg) printk("%s: " fmt, __func__, ## args); } while (0)
#define MMU_WARN_ON(x) WARN_ON(x)
#else
#define pgprintk(x...) do { } while (0)
#define rmap_printk(x...) do { } while (0)
#define MMU_WARN_ON(x) do { } while (0)
#endif

/* Page table builder macros common to shadow (host) PTEs and guest PTEs. */
#define __PT_LEVEL_SHIFT(level, bits_per_level) \
        (PAGE_SHIFT + ((level) - 1) * (bits_per_level))
#define __PT_INDEX(address, level, bits_per_level) \
        (((address) >> __PT_LEVEL_SHIFT(level, bits_per_level)) & ((1 << (bits_per_level)) - 1))

#define __PT_LVL_ADDR_MASK(base_addr_mask, level, bits_per_level) \
        ((base_addr_mask) & ~((1ULL << (PAGE_SHIFT + (((level) - 1) * (bits_per_level)))) - 1))
#define __PT_LVL_OFFSET_MASK(base_addr_mask, level, bits_per_level) \
        ((base_addr_mask) & ((1ULL << (PAGE_SHIFT + (((level) - 1) * (bits_per_level)))) - 1))

#define __PT_ENT_PER_PAGE(bits_per_level) (1 << (bits_per_level))
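
/*
 * Worked example (a sketch, assuming PAGE_SHIFT == 12 and 64-bit PTEs, i.e.
 * 512 entries per table and thus bits_per_level == 9, as in a standard
 * 4-level x86-64 walk):
 *
 *      __PT_LEVEL_SHIFT(1, 9) == 12    (4KiB page offset)
 *      __PT_LEVEL_SHIFT(2, 9) == 21    (2MiB)
 *      __PT_LEVEL_SHIFT(3, 9) == 30    (1GiB)
 *      __PT_LEVEL_SHIFT(4, 9) == 39
 *
 *      __PT_INDEX(addr, 2, 9)  == ((addr) >> 21) & 511
 *      __PT_ENT_PER_PAGE(9)    == 512
 */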

/*
 * Unlike regular MMU roots, PAE "roots", a.k.a. PDPTEs/PDPTRs, have a PRESENT
 * bit, and thus are guaranteed to be non-zero when valid.  And, when a guest
 * PDPTR is !PRESENT, its corresponding PAE root cannot be set to INVALID_PAGE,
 * as the CPU would treat that as a PRESENT PDPTR with reserved bits set.  Use
 * '0' instead of INVALID_PAGE to indicate an invalid PAE root.
 */
#define INVALID_PAE_ROOT        0
#define IS_VALID_PAE_ROOT(x)    (!!(x))
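
/*
 * Illustrative usage sketch (assumes the mmu->pae_root[] array the shadow MMU
 * keeps for 32-bit PAE guests; nothing here is defined by this header): code
 * that walks the four PAE roots typically skips unset entries, e.g.
 *
 *      for (i = 0; i < 4; i++) {
 *              if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
 *                      continue;
 *              ... tear down or sync the root ...
 *      }
 */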

typedef u64 __rcu *tdp_ptep_t;

struct kvm_mmu_page {
        /*
         * Note, "link" through "spt" fit in a single 64 byte cache line on
         * 64-bit kernels, keep it that way unless there's a reason not to.
         */
        struct list_head link;
        struct hlist_node hash_link;

        bool tdp_mmu_page;
        bool unsync;
        union {
                u8 mmu_valid_gen;

                /* Only accessed under slots_lock. */
                bool tdp_mmu_scheduled_root_to_zap;
        };
        bool lpage_disallowed; /* Can't be replaced by an equiv large page */

        /*
         * The following two entries are used to key the shadow page in the
         * hash table.
         */
        union kvm_mmu_page_role role;
        gfn_t gfn;

        u64 *spt;

        /*
         * Stores the result of the guest translation being shadowed by each
         * SPTE. KVM shadows two types of guest translations: nGPA -> GPA
         * (shadow EPT/NPT) and GVA -> GPA (traditional shadow paging). In both
         * cases the result of the translation is a GPA and a set of access
         * constraints.
         *
         * The GFN is stored in the upper bits (PAGE_SHIFT) and the shadowed
         * access permissions are stored in the lower bits. Note, for
         * convenience and uniformity across guests, the access permissions are
         * stored in KVM format (e.g. ACC_EXEC_MASK) not the raw guest format.
         */
        u64 *shadowed_translation;
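
        /*
         * Illustrative unpacking sketch (the ACC_* masks come from mmu.h and
         * are assumed here, not defined in this header):
         *
         *      gfn    = sp->shadowed_translation[index] >> PAGE_SHIFT;
         *      access = sp->shadowed_translation[index] & ACC_ALL;
         */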

        /* Currently serving as active root */
        union {
                int root_count;
                refcount_t tdp_mmu_root_count;
        };
        unsigned int unsync_children;
        union {
                struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
                tdp_ptep_t ptep;
        };
        DECLARE_BITMAP(unsync_child_bitmap, 512);

        struct list_head lpage_disallowed_link;
#ifdef CONFIG_X86_32
        /*
         * Used out of the mmu-lock to avoid reading spte values while an
         * update is in progress; see the comments in __get_spte_lockless().
         */
        int clear_spte_count;
#endif

        /* Number of writes since the last time traversal visited this page. */
        atomic_t write_flooding_count;

#ifdef CONFIG_X86_64
        /* Used for freeing the page asynchronously if it is a TDP MMU page. */
        struct rcu_head rcu_head;
#endif
};

extern struct kmem_cache *mmu_page_header_cache;

static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
{
        struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);

        return (struct kvm_mmu_page *)page_private(page);
}

static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
{
        return to_shadow_page(__pa(sptep));
}
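
/*
 * Illustrative sketch: because the backing struct page's private field points
 * back at the metadata, the owning shadow page (and thus its role and gfn)
 * can be recovered in O(1) from any SPTE pointer, e.g.
 *
 *      struct kvm_mmu_page *sp = sptep_to_sp(sptep);
 *      int level      = sp->role.level;
 *      gfn_t base_gfn = sp->gfn;
 */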

static inline int kvm_mmu_role_as_id(union kvm_mmu_page_role role)
{
        return role.smm ? 1 : 0;
}

static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
{
        return kvm_mmu_role_as_id(sp->role);
}

static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp)
{
        /*
         * When using the EPT page-modification log, the GPAs in the CPU dirty
         * log would come from L2 rather than L1. Therefore, we need to rely
         * on write protection to record dirty pages, which bypasses PML, since
         * writes now result in a vmexit. Note, the check on CPU dirty logging
         * being enabled is mandatory as the bits used to denote WP-only SPTEs
         * are reserved for PAE paging (32-bit KVM).
         */
        return kvm_x86_ops.cpu_dirty_log_size && sp->role.guest_mode;
}

int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
                            gfn_t gfn, bool can_unsync, bool prefetch);

void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn);
void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn);
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
                                    struct kvm_memory_slot *slot, u64 gfn,
                                    int min_level);
void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
                                        u64 start_gfn, u64 pages);
unsigned int pte_list_count(struct kvm_rmap_head *rmap_head);

extern int nx_huge_pages;
static inline bool is_nx_huge_page_enabled(struct kvm *kvm)
{
        return READ_ONCE(nx_huge_pages) && !kvm->arch.disable_nx_huge_pages;
}

struct kvm_page_fault {
        /* arguments to kvm_mmu_do_page_fault. */
        const gpa_t addr;
        const u32 error_code;
        const bool prefetch;

        /* Derived from error_code. */
        const bool exec;
        const bool write;
        const bool present;
        const bool rsvd;
        const bool user;

        /* Derived from mmu and global state. */
        const bool is_tdp;
        const bool nx_huge_page_workaround_enabled;

        /*
         * Whether a >4KB mapping can be created or is forbidden due to NX
         * hugepages.
         */
        bool huge_page_disallowed;

        /*
         * Maximum page size that can be created for this fault; input to
         * FNAME(fetch), __direct_map and kvm_tdp_mmu_map.
         */
        u8 max_level;

        /*
         * Page size that can be created based on the max_level and the
         * page size used by the host mapping.
         */
        u8 req_level;

        /*
         * Page size that will be created based on the req_level and
         * huge_page_disallowed.
         */
        u8 goal_level;

        /* Shifted addr, or result of guest page table walk if addr is a gva. */
        gfn_t gfn;

        /* The memslot containing gfn. May be NULL. */
        struct kvm_memory_slot *slot;

        /* Outputs of kvm_faultin_pfn. */
        kvm_pfn_t pfn;
        hva_t hva;
        bool map_writable;
};
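
/*
 * Illustrative relationship between the three level fields (a sketch of the
 * intended flow as described above, not a definition from this header):
 *
 *      max_level:  upper bound chosen before the fault is handled
 *      req_level:  min(max_level, largest level backed by the host mapping),
 *                  computed by kvm_mmu_hugepage_adjust()
 *      goal_level: req_level, unless huge_page_disallowed keeps the mapping
 *                  at PG_LEVEL_4K
 */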

int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);

/*
 * Return values of handle_mmio_page_fault(), mmu.page_fault(), fast_page_fault(),
 * and of course kvm_mmu_do_page_fault().
 *
 * RET_PF_CONTINUE: So far, so good, keep handling the page fault.
 * RET_PF_RETRY: let CPU fault again on the address.
 * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
 * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
 * RET_PF_FIXED: The faulting entry has been fixed.
 * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU.
 *
 * Any names added to this enum should be exported to userspace for use in
 * tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h
 *
 * Note, all values must be greater than or equal to zero so as not to encroach
 * on -errno return values. Somewhat arbitrarily use '0' for CONTINUE, which
 * will allow for efficient machine code when checking for CONTINUE, e.g.
 * "TEST %rax, %rax, JNZ", as all "stop!" values are non-zero.
 */
enum {
        RET_PF_CONTINUE = 0,
        RET_PF_RETRY,
        RET_PF_EMULATE,
        RET_PF_INVALID,
        RET_PF_FIXED,
        RET_PF_SPURIOUS,
};
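
/*
 * Illustrative sketch of how a caller might consume these values after
 * kvm_mmu_do_page_fault(); the exact dispatch lives in the fault handlers,
 * this is only an assumed outline:
 *
 *      r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false);
 *      if (r < 0)
 *              return r;       (-errno, propagate)
 *      if (r == RET_PF_RETRY)
 *              return 1;       (re-enter the guest and let it refault)
 *      if (r == RET_PF_EMULATE)
 *              ...emulate the faulting instruction...
 */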

static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
                                        u32 err, bool prefetch)
{
        struct kvm_page_fault fault = {
                .addr = cr2_or_gpa,
                .error_code = err,
                .exec = err & PFERR_FETCH_MASK,
                .write = err & PFERR_WRITE_MASK,
                .present = err & PFERR_PRESENT_MASK,
                .rsvd = err & PFERR_RSVD_MASK,
                .user = err & PFERR_USER_MASK,
                .prefetch = prefetch,
                .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault),
                .nx_huge_page_workaround_enabled =
                        is_nx_huge_page_enabled(vcpu->kvm),

                .max_level = KVM_MAX_HUGEPAGE_LEVEL,
                .req_level = PG_LEVEL_4K,
                .goal_level = PG_LEVEL_4K,
        };
        int r;

        /*
         * Async #PF "faults", a.k.a. prefetch faults, are not faults from the
         * guest perspective and have already been counted at the time of the
         * original fault.
         */
        if (!prefetch)
                vcpu->stat.pf_taken++;

        if (IS_ENABLED(CONFIG_RETPOLINE) && fault.is_tdp)
                r = kvm_tdp_page_fault(vcpu, &fault);
        else
                r = vcpu->arch.mmu->page_fault(vcpu, &fault);

        /*
         * Similar to above, prefetch faults aren't truly spurious, and the
         * async #PF path doesn't do emulation. Do count faults that are fixed
         * by the async #PF handler though, otherwise they'll never be counted.
         */
        if (r == RET_PF_FIXED)
                vcpu->stat.pf_fixed++;
        else if (prefetch)
                ;
        else if (r == RET_PF_EMULATE)
                vcpu->stat.pf_emulate++;
        else if (r == RET_PF_SPURIOUS)
                vcpu->stat.pf_spurious++;

        return r;
}

int kvm_mmu_max_mapping_level(struct kvm *kvm,
                              const struct kvm_memory_slot *slot, gfn_t gfn,
                              int max_level);
void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);

void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);

void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);

#endif /* __KVM_X86_MMU_INTERNAL_H */