diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index cbc2a665d4d165dd7e15dff80aadf0ac02e44c8e..21e3df27aee47c7da4e13390ed3f09c1630a684d 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2635,58 +2635,6 @@ kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs. Default is 0 (don't ignore, but inject #GP) - kvm.enable_pmu=[KVM,X86] - If enabled, KVM will virtualize PMU functionality based - on the virtual CPU model defined by userspace. This - can be overridden on a per-VM basis via - KVM_CAP_PMU_CAPABILITY. - - If disabled, KVM will not virtualize PMU functionality, - e.g. MSRs, PMCs, PMIs, etc., even if userspace defines - a virtual CPU model that contains PMU assets. - - Note, KVM's vPMU support implicitly requires running - with an in-kernel local APIC, e.g. to deliver PMIs to - the guest. Running without an in-kernel local APIC is - not supported, though KVM will allow such a combination - (with severely degraded functionality). - - See also enable_mediated_pmu. - - Default is Y (on). - - kvm-{amd,intel}.enable_mediated_pmu=[KVM,AMD,INTEL] - If enabled, KVM will provide a mediated virtual PMU, - instead of the default perf-based virtual PMU (if - kvm.enable_pmu is true and PMU is enumerated via the - virtual CPU model). - - With a perf-based vPMU, KVM operates as a user of perf, - i.e. emulates guest PMU counters using perf events. - KVM-created perf events are managed by perf as regular - (guest-only) events, e.g. are scheduled in/out, contend - for hardware resources, etc. Using a perf-based vPMU - allows guest and host usage of the PMU to co-exist, but - incurs non-trivial overhead and can result in silently - dropped guest events (due to resource contention). - - With a mediated vPMU, hardware PMU state is context - switched around the world switch to/from the guest. - KVM mediates which events the guest can utilize, but - gives the guest direct access to all other PMU assets - when possible (KVM may intercept some accesses if the - virtual CPU model provides a subset of hardware PMU - functionality). Using a mediated vPMU significantly - reduces PMU virtualization overhead and eliminates lost - guest events, but is mutually exclusive with using perf - to profile (some) KVM guests from the host. Enabling a - mediated vPMU effectively makes the host's PMU - unavailable to perf for the duration of any VM that has - been created with an in-kernel local APIC (and PMU - support). - - Default is N (off). - kvm.eager_page_split= [KVM,X86] Controls whether or not KVM will try to proactively split all huge pages during dirty logging. diff --git a/anolis/configs/L1-RECOMMEND/x86/CONFIG_PERF_GUEST_MEDIATED_PMU b/anolis/configs/L1-RECOMMEND/x86/CONFIG_PERF_GUEST_MEDIATED_PMU deleted file mode 100644 index 6669c65d1763df33f7b6088ec02b99b92cbafc66..0000000000000000000000000000000000000000 --- a/anolis/configs/L1-RECOMMEND/x86/CONFIG_PERF_GUEST_MEDIATED_PMU +++ /dev/null @@ -1 +0,0 @@ -CONFIG_PERF_GUEST_MEDIATED_PMU=y diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c index 6e0138b1ffeb99d5b278edc73d94dbeec105e204..07cb165e30f69d19fc2615b43e731cfabbc1aeaa 100644 --- a/arch/x86/entry/entry_fred.c +++ b/arch/x86/entry/entry_fred.c @@ -114,7 +114,6 @@ static idtentry_t sysvec_table[NR_SYSTEM_VECTORS] __ro_after_init = { SYSVEC(IRQ_WORK_VECTOR, irq_work), - SYSVEC(PERF_GUEST_MEDIATED_PMI_VECTOR, perf_guest_mediated_pmi_handler), SYSVEC(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), SYSVEC(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), SYSVEC(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 3f5bbb5e9a99091f939a0a03573672fdf8a22d9b..5c93241b426226ca77a20b667275db0c55a96163 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1408,8 +1408,6 @@ static int __init amd_core_pmu_init(void) amd_pmu_global_cntr_mask = x86_pmu.cntr_mask64; - x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_MEDIATED_VPMU; - /* Update PMC handling functions */ x86_pmu.enable_all = amd_pmu_v2_enable_all; x86_pmu.disable_all = amd_pmu_v2_disable_all; diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 7ff54fad1f96644ed998ab27f0fb7a0444a16329..00c3cb6f2c75aa1afbe451ff3b153a5c9aee4c66 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -54,8 +54,6 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .pmu = &pmu, }; -static DEFINE_PER_CPU(bool, guest_lvtpc_loaded); - DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key); DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key); DEFINE_STATIC_KEY_FALSE(perf_is_hybrid); @@ -1764,25 +1762,6 @@ void perf_events_lapic_init(void) apic_write(APIC_LVTPC, APIC_DM_NMI); } -#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU -void perf_load_guest_lvtpc(u32 guest_lvtpc) -{ - u32 masked = guest_lvtpc & APIC_LVT_MASKED; - - apic_write(APIC_LVTPC, - APIC_DM_FIXED | PERF_GUEST_MEDIATED_PMI_VECTOR | masked); - this_cpu_write(guest_lvtpc_loaded, true); -} -EXPORT_SYMBOL_GPL(perf_load_guest_lvtpc); - -void perf_put_guest_lvtpc(void) -{ - this_cpu_write(guest_lvtpc_loaded, false); - apic_write(APIC_LVTPC, APIC_DM_NMI); -} -EXPORT_SYMBOL_GPL(perf_put_guest_lvtpc); -#endif /* CONFIG_PERF_GUEST_MEDIATED_PMU */ - static int perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) { @@ -1790,17 +1769,6 @@ perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) u64 finish_clock; int ret; - /* - * Ignore all NMIs when the CPU's LVTPC is configured to route PMIs to - * PERF_GUEST_MEDIATED_PMI_VECTOR, i.e. when an NMI time can't be due - * to a PMI. Attempting to handle a PMI while the guest's context is - * loaded will generate false positives and clobber guest state. Note, - * the LVTPC is switched to/from the dedicated mediated PMI IRQ vector - * while host events are quiesced. - */ - if (this_cpu_read(guest_lvtpc_loaded)) - return NMI_DONE; - /* * All PMUs/events that share this PMI handler should make sure to * increment active_events for their events. @@ -3155,12 +3123,11 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) cap->version = x86_pmu.version; cap->num_counters_gp = x86_pmu_num_counters(NULL); cap->num_counters_fixed = x86_pmu_num_counters_fixed(NULL); - cap->bit_width_gp = cap->num_counters_gp ? x86_pmu.cntval_bits : 0; - cap->bit_width_fixed = cap->num_counters_fixed ? x86_pmu.cntval_bits : 0; + cap->bit_width_gp = x86_pmu.cntval_bits; + cap->bit_width_fixed = x86_pmu.cntval_bits; cap->events_mask = (unsigned int)x86_pmu.events_maskl; cap->events_mask_len = x86_pmu.events_mask_len; cap->pebs_ept = x86_pmu.pebs_ept; - cap->mediated = !!(pmu.capabilities & PERF_PMU_CAP_MEDIATED_VPMU); } EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability); diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 673995657a98ef838102cc71dc0739c93bb59418..3558f653009db194ab24d6b511b9664136a57a0f 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -19,9 +19,6 @@ typedef struct { unsigned int kvm_posted_intr_ipis; unsigned int kvm_posted_intr_wakeup_ipis; unsigned int kvm_posted_intr_nested_ipis; -#endif -#ifdef CONFIG_GUEST_PERF_EVENTS - unsigned int perf_guest_mediated_pmis; #endif unsigned int x86_platform_ipis; /* arch dependent */ unsigned int apic_perf_irqs; diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index cc912f4215ee8d087dbcdb07e83d996e5c4fa5f6..86362db4a0d080ea215171fe22a0574555bccc3d 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -746,12 +746,6 @@ DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested # define fred_sysvec_kvm_posted_intr_nested_ipi NULL #endif -# ifdef CONFIG_GUEST_PERF_EVENTS -DECLARE_IDTENTRY_SYSVEC(PERF_GUEST_MEDIATED_PMI_VECTOR, sysvec_perf_guest_mediated_pmi_handler); -#else -# define fred_sysvec_perf_guest_mediated_pmi_handler NULL -# endif - # ifdef CONFIG_X86_POSTED_MSI DECLARE_IDTENTRY_SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR, sysvec_posted_msi_notification); #else diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index b43f81ece62e0b0ee85441c8a2b85f78fefef1fa..65b5f47fa1f41b0ec6fb8c253c2252766855b150 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -77,9 +77,7 @@ */ #define IRQ_WORK_VECTOR 0xf6 -/* IRQ vector for PMIs when running a guest with a mediated PMU. */ -#define PERF_GUEST_MEDIATED_PMI_VECTOR 0xf5 - +/* 0xf5 - unused, was UV_BAU_MESSAGE */ #define DEFERRED_ERROR_VECTOR 0xf4 /* Vector on which hypervisor callbacks will be delivered */ diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h index 3e8ddfe355222b2bd3da552543eb985e69c716db..058bc636356a1133ad151457d8bf0b56528e7f39 100644 --- a/arch/x86/include/asm/kvm-x86-pmu-ops.h +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h @@ -26,9 +26,5 @@ KVM_X86_PMU_OP_OPTIONAL(reset) KVM_X86_PMU_OP_OPTIONAL(deliver_pmi) KVM_X86_PMU_OP_OPTIONAL(cleanup) -KVM_X86_PMU_OP_OPTIONAL(write_global_ctrl) -KVM_X86_PMU_OP_OPTIONAL(mediated_load) -KVM_X86_PMU_OP_OPTIONAL(mediated_put) - #undef KVM_X86_PMU_OP #undef KVM_X86_PMU_OP_OPTIONAL diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 73609473c90bbd69dd3bb47d112cabae52b32a17..819a564f8212d3670641eb15db5bb4612ac0a06e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -502,7 +502,6 @@ struct kvm_pmc { u64 counter; u64 prev_counter; u64 eventsel; - u64 eventsel_hw; struct perf_event *perf_event; struct kvm_vcpu *vcpu; /* @@ -526,7 +525,6 @@ struct kvm_pmu { unsigned nr_arch_fixed_counters; unsigned available_event_types; u64 fixed_ctr_ctrl; - u64 fixed_ctr_ctrl_hw; u64 fixed_ctr_ctrl_rsvd; u64 global_ctrl; u64 global_status; @@ -1409,7 +1407,6 @@ struct kvm_arch { bool bus_lock_detection_enabled; bool enable_pmu; - bool created_mediated_pmu; u32 notify_window; u32 notify_vmexit_flags; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 3124074ce87814db11ac13fcb14cccf6f89ec018..09e57313148b9d22663841e4acccb6288cf8c252 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -1210,7 +1210,6 @@ #define MSR_CORE_PERF_GLOBAL_STATUS 0x0000038e #define MSR_CORE_PERF_GLOBAL_CTRL 0x0000038f #define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x00000390 -#define MSR_CORE_PERF_GLOBAL_STATUS_SET 0x00000391 #define MSR_PERF_METRICS 0x00000329 diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index b027ef9bf4d097e976a14b02bc7f309a5c182ba2..126b242b0073be116c3e1d95e037b1d22e1e5bac 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -329,7 +329,6 @@ struct x86_pmu_capability { unsigned int events_mask; int events_mask_len; unsigned int pebs_ept :1; - unsigned int mediated :1; }; /* @@ -802,14 +801,6 @@ static inline void perf_events_lapic_init(void) { } static inline void perf_check_microcode(void) { } #endif -#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU -extern void perf_load_guest_lvtpc(u32 guest_lvtpc); -extern void perf_put_guest_lvtpc(void); -#else -static inline void perf_load_guest_lvtpc(u32 guest_lvtpc) { } -static inline void perf_put_guest_lvtpc(void) { } -#endif - #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data); extern void x86_perf_get_lbr(struct x86_pmu_lbr *lbr); diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 5091db307722dba6a2e892240f9dc459ef9dcbfe..280c0b1834b22ed873ad1a844811e9ebac19cda0 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -154,9 +154,6 @@ static const __initconst struct idt_data apic_idts[] = { INTG(POSTED_INTR_WAKEUP_VECTOR, asm_sysvec_kvm_posted_intr_wakeup_ipi), INTG(POSTED_INTR_NESTED_VECTOR, asm_sysvec_kvm_posted_intr_nested_ipi), # endif -#ifdef CONFIG_GUEST_PERF_EVENTS - INTG(PERF_GUEST_MEDIATED_PMI_VECTOR, asm_sysvec_perf_guest_mediated_pmi_handler), -#endif # ifdef CONFIG_IRQ_WORK INTG(IRQ_WORK_VECTOR, asm_sysvec_irq_work), # endif diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 44d3968d45b648ecd5309e86404effd71aade977..4e132df60e6cc40e95a4d46633e261b44e449c46 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -186,13 +186,6 @@ int arch_show_interrupts(struct seq_file *p, int prec) irq_stats(j)->kvm_posted_intr_wakeup_ipis); seq_puts(p, " Posted-interrupt wakeup event\n"); #endif -#ifdef CONFIG_GUEST_PERF_EVENTS - seq_printf(p, "%*s: ", prec, "VPMI"); - for_each_online_cpu(j) - seq_printf(p, "%10u ", - irq_stats(j)->perf_guest_mediated_pmis); - seq_puts(p, " Perf Guest Mediated PMI\n"); -#endif #ifdef CONFIG_X86_POSTED_MSI seq_printf(p, "%*s: ", prec, "PMN"); for_each_online_cpu(j) @@ -317,18 +310,6 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_x86_platform_ipi) } #endif -#ifdef CONFIG_GUEST_PERF_EVENTS -/* - * Handler for PERF_GUEST_MEDIATED_PMI_VECTOR. - */ -DEFINE_IDTENTRY_SYSVEC(sysvec_perf_guest_mediated_pmi_handler) -{ - apic_eoi(); - inc_irq_stat(perf_guest_mediated_pmis); - perf_guest_handle_mediated_pmi(); -} -#endif - #ifdef CONFIG_HAVE_KVM static void dummy_handler(void) {} static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 7b927e68fee7ba30d41c1c4362aa1c1d7ed171e2..d4660e04fc20797eabea332fb954ab893d6c2402 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -40,7 +40,6 @@ config KVM select SCHED_INFO select PERF_EVENTS select GUEST_PERF_EVENTS - select PERF_GUEST_MEDIATED_PMU select HAVE_KVM_MSI select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_NO_POLL diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index f5cec1e8b93b79d3c803a9cb6920fed82f4064fc..7fa6513e09ae19254384a28819a39de2f70af831 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -29,65 +29,6 @@ struct x86_pmu_capability __read_mostly kvm_pmu_cap; EXPORT_SYMBOL_GPL(kvm_pmu_cap); -void kvm_init_pmu_capability(struct kvm_pmu_ops *pmu_ops) -{ - bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; - int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS; - - /* - * Hybrid PMUs don't play nice with virtualization without careful - * configuration by userspace, and KVM's APIs for reporting supported - * vPMU features do not account for hybrid PMUs. Disable vPMU support - * for hybrid PMUs until KVM gains a way to let userspace opt-in. - */ - if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) - enable_pmu = false; - - if (enable_pmu) { - perf_get_x86_pmu_capability(&kvm_pmu_cap); - - /* - * WARN if perf did NOT disable hardware PMU if the number of - * architecturally required GP counters aren't present, i.e. if - * there are a non-zero number of counters, but fewer than what - * is architecturally required. - */ - if (!kvm_pmu_cap.num_counters_gp || - WARN_ON_ONCE(kvm_pmu_cap.num_counters_gp < min_nr_gp_ctrs)) - enable_pmu = false; - else if (is_intel && !kvm_pmu_cap.version) - enable_pmu = false; - } - - if (!enable_pmu || !enable_mediated_pmu || !kvm_pmu_cap.mediated || - !pmu_ops->is_mediated_pmu_supported(&kvm_pmu_cap)) - enable_mediated_pmu = false; - - if (!enable_mediated_pmu) - pmu_ops->write_global_ctrl = NULL; - - if (!enable_pmu) { - memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); - return; - } - - kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2); - kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp, - pmu_ops->MAX_NR_GP_COUNTERS); - kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, - KVM_PMC_MAX_FIXED); -} - -void kvm_handle_guest_mediated_pmi(void) -{ - struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); - - if (WARN_ON_ONCE(!vcpu || !kvm_vcpu_has_mediated_pmu(vcpu))) - return; - - kvm_make_request(KVM_REQ_PMI, vcpu); -} - /* Precise Distribution of Instructions Retired (PDIR) */ static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = { X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL), @@ -455,25 +396,6 @@ static bool pmc_event_is_allowed(struct kvm_pmc *pmc) check_pmu_event_filter(pmc); } -static void kvm_mediated_pmu_refresh_event_filter(struct kvm_pmc *pmc) -{ - bool allowed = check_pmu_event_filter(pmc); - struct kvm_pmu *pmu = pmc_to_pmu(pmc); - - if (pmc_is_gp(pmc)) { - pmc->eventsel_hw &= ~ARCH_PERFMON_EVENTSEL_ENABLE; - if (allowed) - pmc->eventsel_hw |= pmc->eventsel & - ARCH_PERFMON_EVENTSEL_ENABLE; - } else { - u64 mask = intel_fixed_bits_by_idx(pmc->idx - INTEL_PMC_IDX_FIXED, 0xf); - - pmu->fixed_ctr_ctrl_hw &= ~mask; - if (allowed) - pmu->fixed_ctr_ctrl_hw |= pmu->fixed_ctr_ctrl & mask; - } -} - static void reprogram_counter(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); @@ -481,11 +403,6 @@ static void reprogram_counter(struct kvm_pmc *pmc) u64 new_config = eventsel; u8 fixed_ctr_ctrl; - if (kvm_vcpu_has_mediated_pmu(pmu_to_vcpu(pmu))) { - kvm_mediated_pmu_refresh_event_filter(pmc); - return; - } - pmc_pause_counter(pmc); if (!pmc_event_is_allowed(pmc)) @@ -625,46 +542,6 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) return 0; } -static bool kvm_need_any_pmc_intercept(struct kvm_vcpu *vcpu) -{ - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - - if (!kvm_vcpu_has_mediated_pmu(vcpu)) - return true; - - /* - * Note! Check *host* PMU capabilities, not KVM's PMU capabilities, as - * KVM's capabilities are constrained based on KVM support, i.e. KVM's - * capabilities themselves may be a subset of hardware capabilities. - */ - return pmu->nr_arch_gp_counters != kvm_pmu_cap.num_counters_gp || - pmu->nr_arch_fixed_counters != kvm_pmu_cap.num_counters_fixed; -} - -bool kvm_need_perf_global_ctrl_intercept(struct kvm_vcpu *vcpu) -{ - return kvm_need_any_pmc_intercept(vcpu) || - !kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)); -} -EXPORT_SYMBOL_GPL(kvm_need_perf_global_ctrl_intercept); - -bool kvm_need_rdpmc_intercept(struct kvm_vcpu *vcpu) -{ - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - - /* - * VMware allows access to these Pseduo-PMCs even when read via RDPMC - * in Ring3 when CR4.PCE=0. - */ - if (enable_vmware_backdoor) - return true; - - return kvm_need_any_pmc_intercept(vcpu) || - pmu->counter_bitmask[KVM_PMC_GP] != (BIT_ULL(kvm_pmu_cap.bit_width_gp) - 1) || - pmu->counter_bitmask[KVM_PMC_FIXED] != (BIT_ULL(kvm_pmu_cap.bit_width_fixed) - 1); -} -EXPORT_SYMBOL_GPL(kvm_need_rdpmc_intercept); - void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) { if (lapic_in_kernel(vcpu)) { @@ -760,12 +637,6 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) pmu->global_ctrl = data; reprogram_counters(pmu, diff); } - /* - * Unconditionally forward writes to vendor code, i.e. to the - * VMC{B,S}, as pmu->global_ctrl is per-VCPU, not per-VMC{B,S}. - */ - if (kvm_vcpu_has_mediated_pmu(vcpu)) - static_call_cond(kvm_x86_pmu_write_global_ctrl)(data); break; case MSR_CORE_PERF_GLOBAL_OVF_CTRL: /* @@ -809,14 +680,11 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu) pmc_stop_counter(pmc); pmc->counter = 0; - if (pmc_is_gp(pmc)) { + if (pmc_is_gp(pmc)) pmc->eventsel = 0; - pmc->eventsel_hw = 0; - } } - pmu->fixed_ctr_ctrl = pmu->fixed_ctr_ctrl_hw = 0; - pmu->global_ctrl = pmu->global_status = 0; + pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0; static_call_cond(kvm_x86_pmu_reset)(vcpu); } @@ -865,12 +733,8 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) * in the global controls). Emulate that behavior when refreshing the * PMU so that userspace doesn't need to manually set PERF_GLOBAL_CTRL. */ - if (pmu->nr_arch_gp_counters && - (kvm_pmu_has_perf_global_ctrl(pmu) || kvm_vcpu_has_mediated_pmu(vcpu))) + if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters) pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0); - - if (kvm_vcpu_has_mediated_pmu(vcpu)) - static_call_cond(kvm_x86_pmu_write_global_ctrl)(pmu->global_ctrl); } void kvm_pmu_init(struct kvm_vcpu *vcpu) @@ -914,44 +778,11 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu) kvm_pmu_reset(vcpu); } -static bool pmc_is_pmi_enabled(struct kvm_pmc *pmc) -{ - u8 fixed_ctr_ctrl; - - if (pmc_is_gp(pmc)) - return pmc->eventsel & ARCH_PERFMON_EVENTSEL_INT; - - fixed_ctr_ctrl = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl, - pmc->idx - INTEL_PMC_IDX_FIXED); - return fixed_ctr_ctrl & INTEL_FIXED_0_ENABLE_PMI; -} - static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) { - struct kvm_vcpu *vcpu = pmc->vcpu; - - /* - * For perf-based PMUs, request reprogramming, which will consult - * both emulated and hardware-generated events to detect overflow. - */ - if (!kvm_vcpu_has_mediated_pmu(vcpu)) { - pmc->prev_counter = pmc->counter; - pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc); - kvm_pmu_request_counter_reprogram(pmc); - return; - } - - /* - * For mediated PMUs, pmc->counter is updated when the vCPU's PMU is - * put, and will be loaded into hardware when the PMU is loaded. Simply - * increment the counter and signal overflow if it wraps to zero. - */ + pmc->prev_counter = pmc->counter; pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc); - if (!pmc->counter) { - pmc_to_pmu(pmc)->global_status |= BIT_ULL(pmc->idx); - if (pmc_is_pmi_enabled(pmc)) - kvm_make_request(KVM_REQ_PMI, vcpu); - } + kvm_pmu_request_counter_reprogram(pmc); } static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc, @@ -1141,126 +972,3 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) kfree(filter); return r; } - -static __always_inline u32 fixed_counter_msr(u32 idx) -{ - return kvm_pmu_ops.FIXED_COUNTER_BASE + idx * kvm_pmu_ops.MSR_STRIDE; -} - -static __always_inline u32 gp_counter_msr(u32 idx) -{ - return kvm_pmu_ops.GP_COUNTER_BASE + idx * kvm_pmu_ops.MSR_STRIDE; -} - -static __always_inline u32 gp_eventsel_msr(u32 idx) -{ - return kvm_pmu_ops.GP_EVENTSEL_BASE + idx * kvm_pmu_ops.MSR_STRIDE; -} - -static void kvm_pmu_load_guest_pmcs(struct kvm_vcpu *vcpu) -{ - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - struct kvm_pmc *pmc; - u32 i; - - /* - * No need to zero out unexposed GP/fixed counters/selectors since RDPMC - * is intercepted if hardware has counters that aren't visible to the - * guest (KVM will inject #GP as appropriate). - */ - for (i = 0; i < pmu->nr_arch_gp_counters; i++) { - pmc = &pmu->gp_counters[i]; - - if (pmc->counter != native_read_pmc(i)) - wrmsrl(gp_counter_msr(i), pmc->counter); - wrmsrl(gp_eventsel_msr(i), pmc->eventsel_hw); - } - for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { - pmc = &pmu->fixed_counters[i]; - - if (pmc->counter != native_read_pmc(INTEL_PMC_FIXED_RDPMC_BASE | i)) - wrmsrl(fixed_counter_msr(i), pmc->counter); - } -} - -void kvm_mediated_pmu_load(struct kvm_vcpu *vcpu) -{ - if (!kvm_vcpu_has_mediated_pmu(vcpu) || - KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm)) - return; - - lockdep_assert_irqs_disabled(); - - perf_load_guest_context(); - - /* - * Explicitly clear PERF_GLOBAL_CTRL, as "loading" the guest's context - * disables all individual counters (if any were enabled), but doesn't - * globally disable the entire PMU. Loading event selectors and PMCs - * with guest values while PERF_GLOBAL_CTRL is non-zero will generate - * unexpected events and PMIs. - * - * VMX will enable/disable counters at VM-Enter/VM-Exit by atomically - * loading PERF_GLOBAL_CONTROL. SVM effectively performs the switch by - * configuring all events to be GUEST_ONLY. Clear PERF_GLOBAL_CONTROL - * even for SVM to minimize the damage if a perf event is left enabled, - * and to ensure a consistent starting state. - */ - wrmsrl(kvm_pmu_ops.PERF_GLOBAL_CTRL, 0); - - perf_load_guest_lvtpc(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTPC)); - - kvm_pmu_load_guest_pmcs(vcpu); - - static_call_cond(kvm_x86_pmu_mediated_load)(vcpu); -} - -static void kvm_pmu_put_guest_pmcs(struct kvm_vcpu *vcpu) -{ - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - struct kvm_pmc *pmc; - u32 i; - - /* - * Clear selectors and counters to ensure hardware doesn't count using - * guest controls when the host (perf) restores its state. - */ - for (i = 0; i < pmu->nr_arch_gp_counters; i++) { - pmc = &pmu->gp_counters[i]; - - pmc->counter = native_read_pmc(i); - if (pmc->counter) - wrmsrl(gp_counter_msr(i), 0); - if (pmc->eventsel_hw) - wrmsrl(gp_eventsel_msr(i), 0); - } - - for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { - pmc = &pmu->fixed_counters[i]; - - pmc->counter = native_read_pmc(INTEL_PMC_FIXED_RDPMC_BASE | i); - if (pmc->counter) - wrmsrl(fixed_counter_msr(i), 0); - } -} - -void kvm_mediated_pmu_put(struct kvm_vcpu *vcpu) -{ - if (!kvm_vcpu_has_mediated_pmu(vcpu) || - KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm)) - return; - - lockdep_assert_irqs_disabled(); - - /* - * Defer handling of PERF_GLOBAL_CTRL to vendor code. On Intel, it's - * atomically cleared on VM-Exit, i.e. doesn't need to be clear here. - */ - static_call_cond(kvm_x86_pmu_mediated_put)(vcpu); - - kvm_pmu_put_guest_pmcs(vcpu); - - perf_put_guest_lvtpc(); - - perf_put_guest_context(); -} diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index a345c29e85c561718a2e32a1d82fd6f91190a9ff..004121dfddd643e0bb744cbe5e6b952bb4ecc030 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -37,26 +37,13 @@ struct kvm_pmu_ops { void (*deliver_pmi)(struct kvm_vcpu *vcpu); void (*cleanup)(struct kvm_vcpu *vcpu); - bool (*is_mediated_pmu_supported)(struct x86_pmu_capability *host_pmu); - void (*mediated_load)(struct kvm_vcpu *vcpu); - void (*mediated_put)(struct kvm_vcpu *vcpu); - void (*write_global_ctrl)(u64 global_ctrl); - const u64 EVENTSEL_EVENT; const int MAX_NR_GP_COUNTERS; const int MIN_NR_GP_COUNTERS; - - const u32 PERF_GLOBAL_CTRL; - const u32 GP_EVENTSEL_BASE; - const u32 GP_COUNTER_BASE; - const u32 FIXED_COUNTER_BASE; - const u32 MSR_STRIDE; }; void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops); -void kvm_handle_guest_mediated_pmi(void); - static inline bool kvm_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu) { /* @@ -71,11 +58,6 @@ static inline bool kvm_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu) return pmu->version > 1; } -static inline bool kvm_vcpu_has_mediated_pmu(struct kvm_vcpu *vcpu) -{ - return enable_mediated_pmu && vcpu_to_pmu(vcpu)->version; -} - static inline u64 pmc_bitmask(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); @@ -87,9 +69,6 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc) { u64 counter, enabled, running; - if (kvm_vcpu_has_mediated_pmu(pmc->vcpu)) - return pmc->counter & pmc_bitmask(pmc); - counter = pmc->counter; if (pmc->perf_event && !pmc->is_paused) counter += perf_event_read_value(pmc->perf_event, @@ -100,11 +79,6 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc) static inline void pmc_write_counter(struct kvm_pmc *pmc, u64 val) { - if (kvm_vcpu_has_mediated_pmu(pmc->vcpu)) { - pmc->counter = val & pmc_bitmask(pmc); - return; - } - pmc->counter += val - pmc_read_counter(pmc); pmc->counter &= pmc_bitmask(pmc); } @@ -190,7 +164,47 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) extern struct x86_pmu_capability kvm_pmu_cap; -void kvm_init_pmu_capability(struct kvm_pmu_ops *pmu_ops); +static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) +{ + bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; + int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS; + + /* + * Hybrid PMUs don't play nice with virtualization without careful + * configuration by userspace, and KVM's APIs for reporting supported + * vPMU features do not account for hybrid PMUs. Disable vPMU support + * for hybrid PMUs until KVM gains a way to let userspace opt-in. + */ + if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) + enable_pmu = false; + + if (enable_pmu) { + perf_get_x86_pmu_capability(&kvm_pmu_cap); + + /* + * WARN if perf did NOT disable hardware PMU if the number of + * architecturally required GP counters aren't present, i.e. if + * there are a non-zero number of counters, but fewer than what + * is architecturally required. + */ + if (!kvm_pmu_cap.num_counters_gp || + WARN_ON_ONCE(kvm_pmu_cap.num_counters_gp < min_nr_gp_ctrs)) + enable_pmu = false; + else if (is_intel && !kvm_pmu_cap.version) + enable_pmu = false; + } + + if (!enable_pmu) { + memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); + return; + } + + kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2); + kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp, + pmu_ops->MAX_NR_GP_COUNTERS); + kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, + KVM_PMC_MAX_FIXED); +} static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc) { @@ -225,11 +239,6 @@ static inline bool pmc_is_globally_enabled(struct kvm_pmc *pmc) return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); } -static inline bool kvm_pmu_is_fastpath_emulation_allowed(struct kvm_vcpu *vcpu) -{ - return !kvm_vcpu_has_mediated_pmu(vcpu); -} - void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); @@ -245,12 +254,7 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu); int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp); void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id); -void kvm_mediated_pmu_load(struct kvm_vcpu *vcpu); -void kvm_mediated_pmu_put(struct kvm_vcpu *vcpu); - bool is_vmware_backdoor_pmc(u32 pmc_idx); -bool kvm_need_perf_global_ctrl_intercept(struct kvm_vcpu *vcpu); -bool kvm_need_rdpmc_intercept(struct kvm_vcpu *vcpu); extern struct kvm_pmu_ops intel_pmu_ops; extern struct kvm_pmu_ops amd_pmu_ops; diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index f6cb924451a3d9894ab1918be4f11b8e9a789c78..1313c1812c8eb55c62e9bebc52ff8d19db1c58fc 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -171,8 +171,6 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) data &= ~pmu->reserved_bits; if (data != pmc->eventsel) { pmc->eventsel = data; - pmc->eventsel_hw = (data & ~AMD64_EVENTSEL_HOSTONLY) | - AMD64_EVENTSEL_GUESTONLY; kvm_pmu_request_counter_reprogram(pmc); } return 0; @@ -220,37 +218,6 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters); } -static bool amd_pmu_is_mediated_pmu_supported(struct x86_pmu_capability *host_pmu) -{ - return host_pmu->version >= 2; -} - -static void amd_mediated_pmu_load(struct kvm_vcpu *vcpu) -{ - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - u64 global_status; - - rdmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, global_status); - /* Clear host global_status MSR if non-zero. */ - if (global_status) - wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, global_status); - - wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET, pmu->global_status); - wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, pmu->global_ctrl); -} - -static void amd_mediated_pmu_put(struct kvm_vcpu *vcpu) -{ - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - - wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0); - rdmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, pmu->global_status); - - /* Clear global status bits if non-zero */ - if (pmu->global_status) - wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, pmu->global_status); -} - static void amd_pmu_init(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -278,18 +245,7 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = { .set_msr = amd_pmu_set_msr, .refresh = amd_pmu_refresh, .init = amd_pmu_init, - - .is_mediated_pmu_supported = amd_pmu_is_mediated_pmu_supported, - .mediated_load = amd_mediated_pmu_load, - .mediated_put = amd_mediated_pmu_put, - .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT, .MAX_NR_GP_COUNTERS = KVM_AMD_PMC_MAX_GENERIC, .MIN_NR_GP_COUNTERS = AMD64_NUM_COUNTERS, - - .PERF_GLOBAL_CTRL = MSR_AMD64_PERF_CNTR_GLOBAL_CTL, - .GP_EVENTSEL_BASE = MSR_F15H_PERF_CTL0, - .GP_COUNTER_BASE = MSR_F15H_PERF_CTR0, - .FIXED_COUNTER_BASE = 0, - .MSR_STRIDE = 2, }; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 886387467005cabbf331cafb236fe0c71d897ce8..0f02c5911a2f06d4afd595630ef115c6601d9acc 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -4608,10 +4608,12 @@ static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; - if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) - svm_set_intercept_for_msr(vcpu, MSR_TSC_AUX, MSR_TYPE_RW, - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && - !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)); + if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) { + bool v_tsc_aux = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) || + guest_cpuid_has(vcpu, X86_FEATURE_RDPID); + + set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, v_tsc_aux, v_tsc_aux); + } /* * For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if @@ -4628,9 +4630,9 @@ static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm) */ if (guest_can_use(vcpu, X86_FEATURE_XSAVES) && guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) - svm_disable_intercept_for_msr(vcpu, MSR_IA32_XSS, MSR_TYPE_RW); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 1, 1); else - svm_enable_intercept_for_msr(vcpu, MSR_IA32_XSS, MSR_TYPE_RW); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 0, 0); } void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm) @@ -4702,8 +4704,8 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) svm_clr_intercept(svm, INTERCEPT_XSETBV); /* Clear intercepts on selected MSRs */ - svm_disable_intercept_for_msr(vcpu, MSR_EFER, MSR_TYPE_RW); - svm_disable_intercept_for_msr(vcpu, MSR_IA32_CR_PAT, MSR_TYPE_RW); + set_msr_interception(vcpu, svm->msrpm, MSR_EFER, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_CR_PAT, 1, 1); } void sev_init_vmcb(struct vcpu_svm *svm) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index cb5b3eb557d317f714dfc1bcc1c3526e4e4ee69e..27727c657deee740e0942f93e216a9e57e757243 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -112,20 +112,6 @@ static const struct svm_direct_access_msrs { { .index = MSR_IA32_CR_PAT, .always = false }, { .index = MSR_AMD64_SEV_ES_GHCB, .always = true }, { .index = MSR_TSC_AUX, .always = false }, - { .index = MSR_K7_PERFCTR0, .always = false }, - { .index = MSR_K7_PERFCTR1, .always = false }, - { .index = MSR_K7_PERFCTR2, .always = false }, - { .index = MSR_K7_PERFCTR3, .always = false }, - { .index = MSR_F15H_PERF_CTR0, .always = false }, - { .index = MSR_F15H_PERF_CTR1, .always = false }, - { .index = MSR_F15H_PERF_CTR2, .always = false }, - { .index = MSR_F15H_PERF_CTR3, .always = false }, - { .index = MSR_F15H_PERF_CTR4, .always = false }, - { .index = MSR_F15H_PERF_CTR5, .always = false }, - { .index = MSR_AMD64_PERF_CNTR_GLOBAL_CTL, .always = false }, - { .index = MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, .always = false }, - { .index = MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, .always = false }, - { .index = MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET, .always = false }, { .index = X2APIC_MSR(APIC_ID), .always = false }, { .index = X2APIC_MSR(APIC_LVR), .always = false }, { .index = X2APIC_MSR(APIC_TASKPRI), .always = false }, @@ -257,8 +243,6 @@ module_param(intercept_smi, bool, 0444); bool vnmi = true; module_param(vnmi, bool, 0444); -module_param(enable_mediated_pmu, bool, 0444); - /* * Allow set guest PAT to WB in some non-passthrough * application scenarios to enhance performance. @@ -680,6 +664,7 @@ static int svm_hardware_enable(void) __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT); } + /* * Get OSVW bits. * @@ -903,73 +888,11 @@ static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm, svm->nested.force_msr_bitmap_recalc = true; } -void svm_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) -{ - struct vcpu_svm *svm = to_svm(vcpu); - u32 *msrpm = svm->msrpm; - u8 bit_read, bit_write; - unsigned long tmp; - u32 offset; - - /* Note, the shadow intercept bitmaps have inverted polarity. */ - set_shadow_msr_intercept(vcpu, msr, type & MSR_TYPE_R, type & MSR_TYPE_W); - - /* Don't disable interception for MSRs userspace wants to handle. */ - if ((type & MSR_TYPE_R) && - !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) - type &= ~MSR_TYPE_R; - - if ((type & MSR_TYPE_W) && - !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) - type &= ~MSR_TYPE_W; - - offset = svm_msrpm_offset(msr); - if (KVM_BUG_ON(offset == MSR_INVALID, vcpu->kvm)) - return; - - bit_read = 2 * (msr & 0x0f); - bit_write = 2 * (msr & 0x0f) + 1; - tmp = msrpm[offset]; - - if (type & MSR_TYPE_R) - clear_bit(bit_read, &tmp); - - if (type & MSR_TYPE_W) - clear_bit(bit_write, &tmp); - - msrpm[offset] = tmp; - svm_hv_vmcb_dirty_nested_enlightenments(vcpu); - svm->nested.force_msr_bitmap_recalc = true; -} - -void svm_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) +void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, + int read, int write) { - struct vcpu_svm *svm = to_svm(vcpu); - u32 *msrpm = svm->msrpm; - u8 bit_read, bit_write; - unsigned long tmp; - u32 offset; - - set_shadow_msr_intercept(vcpu, msr, - !(type & MSR_TYPE_R), !(type & MSR_TYPE_W)); - - offset = svm_msrpm_offset(msr); - if (KVM_BUG_ON(offset == MSR_INVALID, vcpu->kvm)) - return; - - bit_read = 2 * (msr & 0x0f); - bit_write = 2 * (msr & 0x0f) + 1; - tmp = msrpm[offset]; - - if (type & MSR_TYPE_R) - set_bit(bit_read, &tmp); - - if (type & MSR_TYPE_W) - set_bit(bit_write, &tmp); - - msrpm[offset] = tmp; - svm_hv_vmcb_dirty_nested_enlightenments(vcpu); - svm->nested.force_msr_bitmap_recalc = true; + set_shadow_msr_intercept(vcpu, msr, read, write); + set_msr_interception_bitmap(vcpu, msrpm, msr, read, write); } u32 *svm_vcpu_alloc_msrpm(void) @@ -992,12 +915,9 @@ void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm) int i; for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { - u32 msr = direct_access_msrs[i].index; - if (!direct_access_msrs[i].always) continue; - set_shadow_msr_intercept(vcpu, msr, 1, 1); - set_msr_interception_bitmap(vcpu, msrpm, msr, 1, 1); + set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1); } } @@ -1017,7 +937,8 @@ void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) if ((index < APIC_BASE_MSR) || (index > APIC_BASE_MSR + 0xff)) continue; - svm_set_intercept_for_msr(&svm->vcpu, index, MSR_TYPE_RW, intercept); + set_msr_interception(&svm->vcpu, svm->msrpm, index, + !intercept, !intercept); } svm->x2avic_msrs_intercepted = intercept; @@ -1028,40 +949,6 @@ void svm_vcpu_free_msrpm(u32 *msrpm) __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE)); } -static void svm_recalc_pmu_msr_intercepts(struct kvm_vcpu *vcpu) -{ - bool intercept = !kvm_vcpu_has_mediated_pmu(vcpu); - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - int i; - - if (!enable_mediated_pmu) - return; - - /* Legacy counters are always available for AMD CPUs with a PMU. */ - for (i = 0; i < min(pmu->nr_arch_gp_counters, AMD64_NUM_COUNTERS); i++) - svm_set_intercept_for_msr(vcpu, MSR_K7_PERFCTR0 + i, - MSR_TYPE_RW, intercept); - - intercept |= !guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE); - for (i = 0; i < pmu->nr_arch_gp_counters; i++) - svm_set_intercept_for_msr(vcpu, MSR_F15H_PERF_CTR + 2 * i, - MSR_TYPE_RW, intercept); - - for ( ; i < kvm_pmu_cap.num_counters_gp; i++) - svm_enable_intercept_for_msr(vcpu, MSR_F15H_PERF_CTR + 2 * i, - MSR_TYPE_RW); - - intercept = kvm_need_perf_global_ctrl_intercept(vcpu); - svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_CTL, - MSR_TYPE_RW, intercept); - svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, - MSR_TYPE_RW, intercept); - svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, - MSR_TYPE_RW, intercept); - svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET, - MSR_TYPE_RW, intercept); -} - static void svm_msr_filter_changed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1079,7 +966,6 @@ static void svm_msr_filter_changed(struct kvm_vcpu *vcpu) set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write); } - svm_recalc_pmu_msr_intercepts(vcpu); } static void add_msr_offset(u32 offset) @@ -1142,13 +1028,18 @@ static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu) if (intercept == svm->lbr_msrs_intercepted) return; - svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHFROMIP, MSR_TYPE_RW, intercept); - svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHTOIP, MSR_TYPE_RW, intercept); - svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTFROMIP, MSR_TYPE_RW, intercept); - svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTTOIP, MSR_TYPE_RW, intercept); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, + !intercept, !intercept); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, + !intercept, !intercept); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, + !intercept, !intercept); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, + !intercept, !intercept); if (sev_es_guest(vcpu->kvm)) - svm_set_intercept_for_msr(vcpu, MSR_IA32_DEBUGCTLMSR, MSR_TYPE_RW, intercept); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_DEBUGCTLMSR, + !intercept, !intercept); svm->lbr_msrs_intercepted = intercept; } @@ -1331,11 +1222,6 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu, else svm_set_intercept(svm, INTERCEPT_RDTSCP); } - - if (kvm_need_rdpmc_intercept(vcpu)) - svm_set_intercept(svm, INTERCEPT_RDPMC); - else - svm_clr_intercept(svm, INTERCEPT_RDPMC); } static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu) @@ -1352,8 +1238,8 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu) svm_set_intercept(svm, INTERCEPT_VMSAVE); svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; - svm_enable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); - svm_enable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0); } else { /* * If hardware supports Virtual VMLOAD VMSAVE then enable it @@ -1365,10 +1251,9 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu) svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; } /* No need to intercept these MSRs */ - svm_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); - svm_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1); } - svm_recalc_pmu_msr_intercepts(vcpu); } static void svm_set_guest_pat(struct vcpu_svm *svm, u64 *g_pat) @@ -1507,8 +1392,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu) * of MSR_IA32_SPEC_CTRL. */ if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL)) - svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW, - !guest_has_spec_ctrl_msr(vcpu)); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); if (enable_apicv && irqchip_in_kernel(vcpu->kvm)) avic_init_vmcb(svm, vmcb); @@ -3269,7 +3153,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * We update the L1 MSR bit as well since it will end up * touching the MSR anyway now. */ - svm_disable_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); break; case MSR_AMD64_VIRT_SPEC_CTRL: if (!msr->host_initiated && @@ -4549,9 +4433,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET; - if (!msr_write_intercepted(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_CTL)) - rdmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, vcpu_to_pmu(vcpu)->global_ctrl); - /* * We need to handle MC intercepts here before the vcpu has a chance to * change the physical cpu @@ -4682,12 +4563,12 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) svm_recalc_instruction_intercepts(vcpu, svm); if (boot_cpu_has(X86_FEATURE_IBPB)) - svm_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W, - !guest_has_pred_cmd_msr(vcpu)); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, + !!guest_has_pred_cmd_msr(vcpu)); if (boot_cpu_has(X86_FEATURE_FLUSH_L1D)) - svm_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, - !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D)); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0, + !!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D)); if (sev_guest(vcpu->kvm)) sev_vcpu_after_set_cpuid(svm); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 8aa7782ee9822bc654dfaac1d7e6bc6e3d60d54f..60641218292f221e66451620a41a7709a4be9dde 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -32,8 +32,8 @@ #define GUEST_PAT_WB_ATTR 0x0606060606060606 -#define MAX_DIRECT_ACCESS_MSRS 62 -#define MSRPM_OFFSETS 46 +#define MAX_DIRECT_ACCESS_MSRS 48 +#define MSRPM_OFFSETS 32 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; extern int nrips; @@ -606,18 +606,8 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu); bool svm_interrupt_blocked(struct kvm_vcpu *vcpu); void svm_set_gif(struct vcpu_svm *svm, bool value); int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code); -void svm_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type); -void svm_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type); - -static inline void svm_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, - int type, bool enable_intercept) -{ - if (enable_intercept) - svm_enable_intercept_for_msr(vcpu, msr, type); - else - svm_disable_intercept_for_msr(vcpu, msr, type); -} - +void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, + int read, int write); void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool disable); void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, int trig_mode, int vec); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index a2a02fe79310caf870f74fd32c9a93cbcb2beae1..ab062a276bcaf411775e6ca273dd1e9e3cc32e8b 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -16,6 +16,10 @@ #include "../cpuid.h" #include "run_flags.h" +#define MSR_TYPE_R 1 +#define MSR_TYPE_W 2 +#define MSR_TYPE_RW 3 + #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) #ifdef CONFIG_X86_64 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 33b134b21164e63bd9478622840d55e82f84113b..7f8ca32afebd458bc4ac7d62308f468c1e67cee5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -195,10 +195,6 @@ bool __read_mostly enable_pmu = true; EXPORT_SYMBOL_GPL(enable_pmu); module_param(enable_pmu, bool, 0444); -/* Enable/disabled mediated PMU virtualization. */ -bool __read_mostly enable_mediated_pmu; -EXPORT_SYMBOL_GPL(enable_mediated_pmu); - bool __read_mostly eager_page_split = true; module_param(eager_page_split, bool, 0644); @@ -2208,9 +2204,6 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) fastpath_t ret; bool handled; - if (!kvm_pmu_is_fastpath_emulation_allowed(vcpu)) - return EXIT_FASTPATH_NONE; - kvm_vcpu_srcu_read_lock(vcpu); switch (msr) { @@ -6603,7 +6596,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, break; mutex_lock(&kvm->lock); - if (!kvm->created_vcpus && !kvm->arch.created_mediated_pmu) { + if (!kvm->created_vcpus) { kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE); r = 0; } @@ -9791,8 +9784,7 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) set_hv_tscchange_cb(kvm_hyperv_tsc_notifier); #endif - __kvm_register_perf_callbacks(ops->handle_intel_pt_intr, - enable_mediated_pmu ? kvm_handle_guest_mediated_pmi : NULL); + kvm_register_perf_callbacks(ops->handle_intel_pt_intr); if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled) kvm_caps.supported_vm_types |= BIT(KVM_X86_SW_PROTECTED_VM); @@ -11051,8 +11043,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) run_flags |= KVM_RUN_LOAD_DEBUGCTL; vcpu->arch.host_debugctl = debug_ctl; - kvm_mediated_pmu_load(vcpu); - guest_timing_enter_irqoff(); for (;;) { @@ -11122,8 +11112,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) static_call(kvm_x86_handle_exit_irqoff)(vcpu); - kvm_mediated_pmu_put(vcpu); - if (vcpu->arch.guest_fpu.xfd_err) wrmsrl(MSR_IA32_XFD_ERR, 0); @@ -12177,13 +12165,8 @@ static int sync_regs(struct kvm_vcpu *vcpu) return 0; } -#define PERF_MEDIATED_PMU_MSG \ - "Failed to enable mediated vPMU, try disabling system wide perf events and nmi_watchdog.\n" - int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { - int r; - if (kvm_check_tsc_unstable() && kvm->created_vcpus) pr_warn_once("SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); @@ -12194,29 +12177,7 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) if (id >= kvm->arch.max_vcpu_ids) return -EINVAL; - /* - * Note, any actions done by .vcpu_create() must be idempotent with - * respect to creating multiple vCPUs, and therefore are not undone if - * creating a vCPU fails (including failure during pre-create). - */ - r = static_call(kvm_x86_vcpu_precreate)(kvm); - if (r) - return r; - - if (enable_mediated_pmu && kvm->arch.enable_pmu && - !kvm->arch.created_mediated_pmu) { - if (irqchip_in_kernel(kvm)) { - r = perf_create_mediated_pmu(); - if (r) { - pr_warn_ratelimited(PERF_MEDIATED_PMU_MSG); - return r; - } - kvm->arch.created_mediated_pmu = true; - } else { - kvm->arch.enable_pmu = false; - } - } - return 0; + return static_call(kvm_x86_vcpu_precreate)(kvm); } int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) @@ -12878,8 +12839,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm) } kvm_unload_vcpu_mmus(kvm); static_call_cond(kvm_x86_vm_destroy)(kvm); - if (kvm->arch.created_mediated_pmu) - perf_release_mediated_pmu(); kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1)); kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 137f15f10c89adb0290e355b61f9f519762c2b76..f9cbd355b14d35a969edd29fc922a7a45b13b691 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -9,10 +9,6 @@ #include "kvm_cache_regs.h" #include "kvm_emulate.h" -#define MSR_TYPE_R 1 -#define MSR_TYPE_W 2 -#define MSR_TYPE_RW 3 - struct kvm_caps { /* control of guest tsc rate supported? */ bool has_tsc_control; @@ -334,7 +330,6 @@ extern u64 host_arch_capabilities; extern struct kvm_caps kvm_caps; extern bool enable_pmu; -extern bool enable_mediated_pmu; /* * Get a filtered version of KVM's supported XCR0 that strips out dynamic diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8f174ac509e2819d48123f979466a0d4d28e2237..8a599257d0b9f948119e4c40a6dc89f54f75623a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1682,8 +1682,6 @@ static inline bool kvm_arch_intc_initialized(struct kvm *kvm) unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu); void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)); -void __kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void), - void (*mediated_pmi_handler)(void)); void kvm_unregister_perf_callbacks(void); #else static inline void kvm_register_perf_callbacks(void *ign) {} diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 4b8f475045e79a048a001b48e6c0099134b4680e..5c3edd9253e044f424c62731b043099b697af1bf 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -33,8 +33,6 @@ struct perf_guest_info_callbacks { unsigned int (*state)(void); unsigned long (*get_ip)(void); unsigned int (*handle_intel_pt_intr)(void); - - void (*handle_mediated_pmi)(void); }; #ifdef CONFIG_HAVE_HW_BREAKPOINT @@ -295,7 +293,6 @@ struct perf_event_pmu_context; #define PERF_PMU_CAP_NO_EXCLUDE 0x0040 #define PERF_PMU_CAP_AUX_OUTPUT 0x0080 #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 -#define PERF_PMU_CAP_MEDIATED_VPMU 0x0800 /** * pmu::scope @@ -939,12 +936,6 @@ struct perf_event_pmu_context { int rotate_necessary; }; -struct perf_time_ctx { - u64 time; - u64 stamp; - u64 offset; -}; - struct perf_event_groups { struct rb_root tree; u64 index; @@ -989,12 +980,9 @@ struct perf_event_context { /* * Context clock, runs when context enabled. */ - struct perf_time_ctx time; - - /* - * Context clock, runs when in the guest mode. - */ - struct perf_time_ctx timeguest; + u64 time; + u64 timestamp; + u64 timeoffset; /* * These fields let us detect when two contexts have both @@ -1097,8 +1085,9 @@ struct bpf_perf_event_data_kern { * This is a per-cpu dynamically allocated data structure. */ struct perf_cgroup_info { - struct perf_time_ctx time; - struct perf_time_ctx timeguest; + u64 time; + u64 timestamp; + u64 timeoffset; int active; }; @@ -1602,7 +1591,6 @@ extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs; DECLARE_STATIC_CALL(__perf_guest_state, *perf_guest_cbs->state); DECLARE_STATIC_CALL(__perf_guest_get_ip, *perf_guest_cbs->get_ip); DECLARE_STATIC_CALL(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); -DECLARE_STATIC_CALL(__perf_guest_handle_mediated_pmi, *perf_guest_cbs->handle_mediated_pmi); static inline unsigned int perf_guest_state(void) { @@ -1616,11 +1604,6 @@ static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return static_call(__perf_guest_handle_intel_pt_intr)(); } - -static inline void perf_guest_handle_mediated_pmi(void) -{ - static_call(__perf_guest_handle_mediated_pmi)(); -} extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); #else @@ -1809,14 +1792,6 @@ extern void perf_event_task_tick(void); extern int perf_event_account_interrupt(struct perf_event *event); extern int perf_event_period(struct perf_event *event, u64 value); extern u64 perf_event_pause(struct perf_event *event, bool reset); - -#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU -int perf_create_mediated_pmu(void); -void perf_release_mediated_pmu(void); -void perf_load_guest_context(void); -void perf_put_guest_context(void); -#endif - #else /* !CONFIG_PERF_EVENTS: */ static inline void * perf_aux_output_begin(struct perf_output_handle *handle, diff --git a/init/Kconfig b/init/Kconfig index 579488450a15eb4dde2d8c08aee154fb16caa6bf..cacbd644409f03ed88382fdb06af99b1e7aed35d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1901,10 +1901,6 @@ config GUEST_PERF_EVENTS bool depends on HAVE_PERF_EVENTS -config PERF_GUEST_MEDIATED_PMU - bool - depends on GUEST_PERF_EVENTS - config PERF_USE_VMALLOC bool help diff --git a/kernel/events/core.c b/kernel/events/core.c index e28d1c1b9d90acba8e10304958a9a2594747db43..02399f48c1e0327907e163a28442643b4d4c7424 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -393,19 +393,6 @@ enum event_type_t { /* see ctx_resched() for details */ EVENT_CPU = 0x8, EVENT_CGROUP = 0x10, - - /* - * EVENT_GUEST is set when scheduling in/out events between the host - * and a guest with a mediated vPMU. Among other things, EVENT_GUEST - * is used: - * - * - In perf_skip_pmu_ctx() to skip PMUs that don't support events in a - * MEDIATED_VPMU guest, i.e. don't need to be context switched. - * - To indicate the start/end point of the events in a guest. Guest - * running time is deducted for host-only (exclude_guest) events. - */ - EVENT_GUEST = 0x20, - EVENT_FLAGS = EVENT_CGROUP | EVENT_GUEST, EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, }; @@ -444,20 +431,6 @@ static cpumask_var_t perf_online_pkg_mask; static cpumask_var_t perf_online_sys_mask; static struct kmem_cache *perf_event_cache; -#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU -static DEFINE_PER_CPU(bool, guest_ctx_loaded); - -static __always_inline bool is_guest_mediated_pmu_loaded(void) -{ - return __this_cpu_read(guest_ctx_loaded); -} -#else -static __always_inline bool is_guest_mediated_pmu_loaded(void) -{ - return false; -} -#endif - /* * perf event paranoia level: * -1 - not paranoid at all @@ -734,36 +707,23 @@ do { \ ___p; \ }) -static bool perf_skip_pmu_ctx(struct perf_event_pmu_context *pmu_ctx, - enum event_type_t event_type) -{ - if ((event_type & EVENT_CGROUP) && !pmu_ctx->nr_cgroups) - return true; - if ((event_type & EVENT_GUEST) && - !(pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU)) - return true; - return false; -} - -static void perf_ctx_disable(struct perf_event_context *ctx, - enum event_type_t event_type) +static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup) { struct perf_event_pmu_context *pmu_ctx; list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - if (perf_skip_pmu_ctx(pmu_ctx, event_type)) + if (cgroup && !pmu_ctx->nr_cgroups) continue; perf_pmu_disable(pmu_ctx->pmu); } } -static void perf_ctx_enable(struct perf_event_context *ctx, - enum event_type_t event_type) +static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup) { struct perf_event_pmu_context *pmu_ctx; list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - if (perf_skip_pmu_ctx(pmu_ctx, event_type)) + if (cgroup && !pmu_ctx->nr_cgroups) continue; perf_pmu_enable(pmu_ctx->pmu); } @@ -772,57 +732,6 @@ static void perf_ctx_enable(struct perf_event_context *ctx, static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type); static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type); -static inline void update_perf_time_ctx(struct perf_time_ctx *time, u64 now, bool adv) -{ - if (adv) - time->time += now - time->stamp; - time->stamp = now; - - /* - * The above: time' = time + (now - timestamp), can be re-arranged - * into: time` = now + (time - timestamp), which gives a single value - * offset to compute future time without locks on. - * - * See perf_event_time_now(), which can be used from NMI context where - * it's (obviously) not possible to acquire ctx->lock in order to read - * both the above values in a consistent manner. - */ - WRITE_ONCE(time->offset, time->time - time->stamp); -} - -static_assert(offsetof(struct perf_event_context, timeguest) - - offsetof(struct perf_event_context, time) == - sizeof(struct perf_time_ctx)); - -#define T_TOTAL 0 -#define T_GUEST 1 - -static inline u64 __perf_event_time_ctx(struct perf_event *event, - struct perf_time_ctx *times) -{ - u64 time = times[T_TOTAL].time; - - if (event->attr.exclude_guest) - time -= times[T_GUEST].time; - - return time; -} - -static inline u64 __perf_event_time_ctx_now(struct perf_event *event, - struct perf_time_ctx *times, - u64 now) -{ - if (is_guest_mediated_pmu_loaded() && event->attr.exclude_guest) { - /* - * (now + times[total].offset) - (now + times[guest].offset) := - * times[total].offset - times[guest].offset - */ - return READ_ONCE(times[T_TOTAL].offset) - READ_ONCE(times[T_GUEST].offset); - } - - return now + READ_ONCE(times[T_TOTAL].offset); -} - #ifdef CONFIG_CGROUP_PERF static inline bool @@ -859,16 +768,12 @@ static inline int is_cgroup_event(struct perf_event *event) return event->cgrp != NULL; } -static_assert(offsetof(struct perf_cgroup_info, timeguest) - - offsetof(struct perf_cgroup_info, time) == - sizeof(struct perf_time_ctx)); - static inline u64 perf_cgroup_event_time(struct perf_event *event) { struct perf_cgroup_info *t; t = per_cpu_ptr(event->cgrp->info, event->cpu); - return __perf_event_time_ctx(event, &t->time); + return t->time; } static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) @@ -877,21 +782,20 @@ static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) t = per_cpu_ptr(event->cgrp->info, event->cpu); if (!__load_acquire(&t->active)) - return __perf_event_time_ctx(event, &t->time); - - return __perf_event_time_ctx_now(event, &t->time, now); + return t->time; + now += READ_ONCE(t->timeoffset); + return now; } -static inline void __update_cgrp_guest_time(struct perf_cgroup_info *info, u64 now, bool adv) +static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv) { - update_perf_time_ctx(&info->timeguest, now, adv); -} - -static inline void update_cgrp_time(struct perf_cgroup_info *info, u64 now) -{ - update_perf_time_ctx(&info->time, now, true); - if (is_guest_mediated_pmu_loaded()) - __update_cgrp_guest_time(info, now, true); + if (adv) + info->time += now - info->timestamp; + info->timestamp = now; + /* + * see update_context_time() + */ + WRITE_ONCE(info->timeoffset, info->time - info->timestamp); } static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) @@ -907,7 +811,7 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); - update_cgrp_time(info, now); + __update_cgrp_time(info, now, true); if (final) __store_release(&info->active, 0); } @@ -930,11 +834,11 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) * Do not update time when cgroup is not active */ if (info->active) - update_cgrp_time(info, perf_clock()); + __update_cgrp_time(info, perf_clock(), true); } static inline void -perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest) +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) { struct perf_event_context *ctx = &cpuctx->ctx; struct perf_cgroup *cgrp = cpuctx->cgrp; @@ -954,12 +858,8 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest) for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); - if (guest) { - __update_cgrp_guest_time(info, ctx->time.stamp, false); - } else { - update_perf_time_ctx(&info->time, ctx->time.stamp, false); - __store_release(&info->active, 1); - } + __update_cgrp_time(info, ctx->timestamp, false); + __store_release(&info->active, 1); } } @@ -991,7 +891,7 @@ static void perf_cgroup_switch(struct task_struct *task) WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); - perf_ctx_disable(&cpuctx->ctx, EVENT_CGROUP); + perf_ctx_disable(&cpuctx->ctx, true); ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP); /* @@ -1007,7 +907,7 @@ static void perf_cgroup_switch(struct task_struct *task) */ ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP); - perf_ctx_enable(&cpuctx->ctx, EVENT_CGROUP); + perf_ctx_enable(&cpuctx->ctx, true); } static int perf_cgroup_ensure_storage(struct perf_event *event, @@ -1168,7 +1068,7 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, } static inline void -perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest) +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) { } @@ -1584,24 +1484,29 @@ static void perf_unpin_context(struct perf_event_context *ctx) */ static void __update_context_time(struct perf_event_context *ctx, bool adv) { - lockdep_assert_held(&ctx->lock); - - update_perf_time_ctx(&ctx->time, perf_clock(), adv); -} + u64 now = perf_clock(); -static void __update_context_guest_time(struct perf_event_context *ctx, bool adv) -{ lockdep_assert_held(&ctx->lock); - /* must be called after __update_context_time(); */ - update_perf_time_ctx(&ctx->timeguest, ctx->time.stamp, adv); + if (adv) + ctx->time += now - ctx->timestamp; + ctx->timestamp = now; + + /* + * The above: time' = time + (now - timestamp), can be re-arranged + * into: time` = now + (time - timestamp), which gives a single value + * offset to compute future time without locks on. + * + * See perf_event_time_now(), which can be used from NMI context where + * it's (obviously) not possible to acquire ctx->lock in order to read + * both the above values in a consistent manner. + */ + WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp); } static void update_context_time(struct perf_event_context *ctx) { __update_context_time(ctx, true); - if (is_guest_mediated_pmu_loaded()) - __update_context_guest_time(ctx, true); } static u64 perf_event_time(struct perf_event *event) @@ -1614,7 +1519,7 @@ static u64 perf_event_time(struct perf_event *event) if (is_cgroup_event(event)) return perf_cgroup_event_time(event); - return __perf_event_time_ctx(event, &ctx->time); + return ctx->time; } static u64 perf_event_time_now(struct perf_event *event, u64 now) @@ -1628,9 +1533,10 @@ static u64 perf_event_time_now(struct perf_event *event, u64 now) return perf_cgroup_event_time_now(event, now); if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) - return __perf_event_time_ctx(event, &ctx->time); + return ctx->time; - return __perf_event_time_ctx_now(event, &ctx->time, now); + now += READ_ONCE(ctx->timeoffset); + return now; } static enum event_type_t get_event_type(struct perf_event *event) @@ -2819,15 +2725,14 @@ static void task_ctx_sched_out(struct perf_event_context *ctx, } static void perf_event_sched_in(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx, - enum event_type_t event_type) + struct perf_event_context *ctx) { - ctx_sched_in(&cpuctx->ctx, EVENT_PINNED | event_type); + ctx_sched_in(&cpuctx->ctx, EVENT_PINNED); if (ctx) - ctx_sched_in(ctx, EVENT_PINNED | event_type); - ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE | event_type); + ctx_sched_in(ctx, EVENT_PINNED); + ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE); if (ctx) - ctx_sched_in(ctx, EVENT_FLEXIBLE | event_type); + ctx_sched_in(ctx, EVENT_FLEXIBLE); } /* @@ -2865,9 +2770,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, event_type &= EVENT_ALL; - perf_ctx_disable(&cpuctx->ctx, 0); + perf_ctx_disable(&cpuctx->ctx, false); if (task_ctx) { - perf_ctx_disable(task_ctx, 0); + perf_ctx_disable(task_ctx, false); task_ctx_sched_out(task_ctx, event_type); } @@ -2883,11 +2788,11 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, else if (event_type & EVENT_PINNED) ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE); - perf_event_sched_in(cpuctx, task_ctx, 0); + perf_event_sched_in(cpuctx, task_ctx); - perf_ctx_enable(&cpuctx->ctx, 0); + perf_ctx_enable(&cpuctx->ctx, false); if (task_ctx) - perf_ctx_enable(task_ctx, 0); + perf_ctx_enable(task_ctx, false); } void perf_pmu_resched(struct pmu *pmu) @@ -3430,9 +3335,11 @@ static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); - enum event_type_t active_type = event_type & ~EVENT_FLAGS; struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; + bool cgroup = event_type & EVENT_CGROUP; + + event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); @@ -3459,8 +3366,7 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) if (is_active & EVENT_TIME) { /* update (and stop) ctx time */ update_context_time(ctx); - /* vPMU should not stop time */ - update_cgrp_time_from_cpuctx(cpuctx, !(event_type & EVENT_GUEST) && ctx == &cpuctx->ctx); + update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx); /* * CPU-release for the below ->is_active store, * see __load_acquire() in perf_event_time_now() @@ -3468,7 +3374,7 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) barrier(); } - ctx->is_active &= ~active_type; + ctx->is_active &= ~event_type; if (!(ctx->is_active & EVENT_ALL)) ctx->is_active = 0; @@ -3478,21 +3384,10 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) cpuctx->task_ctx = NULL; } - if (event_type & EVENT_GUEST) { - /* - * Schedule out all exclude_guest events of PMU - * with PERF_PMU_CAP_MEDIATED_VPMU. - */ - is_active = EVENT_ALL; - __update_context_guest_time(ctx, false); - perf_cgroup_set_timestamp(cpuctx, true); - barrier(); - } else { - is_active ^= ctx->is_active; /* changed bits */ - } + is_active ^= ctx->is_active; /* changed bits */ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - if (perf_skip_pmu_ctx(pmu_ctx, event_type)) + if (cgroup && !pmu_ctx->nr_cgroups) continue; __pmu_ctx_sched_out(pmu_ctx, is_active); } @@ -3686,7 +3581,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { - perf_ctx_disable(ctx, 0); + perf_ctx_disable(ctx, false); /* PMIs are disabled; ctx->nr_pending is stable. */ if (local_read(&ctx->nr_pending) || @@ -3706,7 +3601,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) perf_ctx_sched_task_cb(ctx, task, false); perf_event_swap_task_ctx_data(ctx, next_ctx); - perf_ctx_enable(ctx, 0); + perf_ctx_enable(ctx, false); /* * RCU_INIT_POINTER here is safe because we've not @@ -3730,13 +3625,13 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) if (do_switch) { raw_spin_lock(&ctx->lock); - perf_ctx_disable(ctx, 0); + perf_ctx_disable(ctx, false); inside_switch: perf_ctx_sched_task_cb(ctx, task, false); task_ctx_sched_out(ctx, EVENT_ALL); - perf_ctx_enable(ctx, 0); + perf_ctx_enable(ctx, false); raw_spin_unlock(&ctx->lock); } } @@ -3989,15 +3884,10 @@ static inline void group_update_userpage(struct perf_event *group_event) event_update_userpage(event); } -struct merge_sched_data { - int can_add_hw; - enum event_type_t event_type; -}; - static int merge_sched_in(struct perf_event *event, void *data) { struct perf_event_context *ctx = event->ctx; - struct merge_sched_data *msd = data; + int *can_add_hw = data; if (event->state <= PERF_EVENT_STATE_OFF) return 0; @@ -4005,22 +3895,13 @@ static int merge_sched_in(struct perf_event *event, void *data) if (!event_filter_match(event)) return 0; - /* - * Don't schedule in any host events from PMU with - * PERF_PMU_CAP_MEDIATED_VPMU, while a guest is running. - */ - if (is_guest_mediated_pmu_loaded() && - event->pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU && - !(msd->event_type & EVENT_GUEST)) - return 0; - - if (group_can_go_on(event, msd->can_add_hw)) { + if (group_can_go_on(event, *can_add_hw)) { if (!group_sched_in(event, ctx)) list_add_tail(&event->active_list, get_event_list(event)); } if (event->state == PERF_EVENT_STATE_INACTIVE) { - msd->can_add_hw = 0; + *can_add_hw = 0; if (event->attr.pinned) { perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); @@ -4039,42 +3920,40 @@ static int merge_sched_in(struct perf_event *event, void *data) static void pmu_groups_sched_in(struct perf_event_context *ctx, struct perf_event_groups *groups, - struct pmu *pmu, - enum event_type_t event_type) + struct pmu *pmu) { - struct merge_sched_data msd = { - .can_add_hw = 1, - .event_type = event_type, - }; + int can_add_hw = 1; visit_groups_merge(ctx, groups, smp_processor_id(), pmu, - merge_sched_in, &msd); + merge_sched_in, &can_add_hw); } static void ctx_groups_sched_in(struct perf_event_context *ctx, struct perf_event_groups *groups, - enum event_type_t event_type) + bool cgroup) { struct perf_event_pmu_context *pmu_ctx; list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - if (perf_skip_pmu_ctx(pmu_ctx, event_type)) + if (cgroup && !pmu_ctx->nr_cgroups) continue; - pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu, event_type); + pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu); } } static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu) { - pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu, 0); + pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu); } static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); - enum event_type_t active_type = event_type & ~EVENT_FLAGS; int is_active = ctx->is_active; + bool cgroup = event_type & EVENT_CGROUP; + + event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); @@ -4082,11 +3961,9 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) return; if (!(is_active & EVENT_TIME)) { - /* EVENT_TIME should be active while the guest runs */ - WARN_ON_ONCE(event_type & EVENT_GUEST); /* start ctx time */ __update_context_time(ctx, false); - perf_cgroup_set_timestamp(cpuctx, false); + perf_cgroup_set_timestamp(cpuctx); /* * CPU-release for the below ->is_active store, * see __load_acquire() in perf_event_time_now() @@ -4094,7 +3971,7 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) barrier(); } - ctx->is_active |= active_type | EVENT_TIME; + ctx->is_active |= (event_type | EVENT_TIME); if (ctx->task) { if (!is_active) cpuctx->task_ctx = ctx; @@ -4102,34 +3979,18 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) WARN_ON_ONCE(cpuctx->task_ctx != ctx); } - if (event_type & EVENT_GUEST) { - /* - * Schedule in the required exclude_guest events of PMU - * with PERF_PMU_CAP_MEDIATED_VPMU. - */ - is_active = event_type & EVENT_ALL; - - /* - * Update ctx time to set the new start time for - * the exclude_guest events. - */ - update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx, false); - barrier(); - } else { - is_active ^= ctx->is_active; /* changed bits */ - } + is_active ^= ctx->is_active; /* changed bits */ /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. */ if (is_active & EVENT_PINNED) - ctx_groups_sched_in(ctx, &ctx->pinned_groups, event_type); + ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup); /* Then walk through the lower prio flexible groups */ if (is_active & EVENT_FLEXIBLE) - ctx_groups_sched_in(ctx, &ctx->flexible_groups, event_type); + ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup); } static void perf_event_context_sched_in(struct task_struct *task) @@ -4144,11 +4005,11 @@ static void perf_event_context_sched_in(struct task_struct *task) if (cpuctx->task_ctx == ctx) { perf_ctx_lock(cpuctx, ctx); - perf_ctx_disable(ctx, 0); + perf_ctx_disable(ctx, false); perf_ctx_sched_task_cb(ctx, task, true); - perf_ctx_enable(ctx, 0); + perf_ctx_enable(ctx, false); perf_ctx_unlock(cpuctx, ctx); goto rcu_unlock; } @@ -4161,7 +4022,7 @@ static void perf_event_context_sched_in(struct task_struct *task) if (!ctx->nr_events) goto unlock; - perf_ctx_disable(ctx, 0); + perf_ctx_disable(ctx, false); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -4171,18 +4032,18 @@ static void perf_event_context_sched_in(struct task_struct *task) * events, no need to flip the cpuctx's events around. */ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) { - perf_ctx_disable(&cpuctx->ctx, 0); + perf_ctx_disable(&cpuctx->ctx, false); ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE); } - perf_event_sched_in(cpuctx, ctx, 0); + perf_event_sched_in(cpuctx, ctx); perf_ctx_sched_task_cb(cpuctx->task_ctx, task, true); if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) - perf_ctx_enable(&cpuctx->ctx, 0); + perf_ctx_enable(&cpuctx->ctx, false); - perf_ctx_enable(ctx, 0); + perf_ctx_enable(ctx, false); unlock: perf_ctx_unlock(cpuctx, ctx); @@ -5157,7 +5018,6 @@ static void free_event_rcu(struct rcu_head *head) { struct perf_event *event = container_of(head, typeof(*event), rcu_head); - security_perf_event_free(event); if (event->ns) put_pid_ns(event->ns); perf_event_free_filter(event); @@ -5391,15 +5251,14 @@ static void perf_pending_task_sync(struct perf_event *event) rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE); } -static void mediated_pmu_unaccount_event(struct perf_event *event); - static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending_irq); perf_pending_task_sync(event); unaccount_event(event); - mediated_pmu_unaccount_event(event); + + security_perf_event_free(event); if (event->rb) { /* @@ -5956,133 +5815,6 @@ u64 perf_event_pause(struct perf_event *event, bool reset) } EXPORT_SYMBOL_GPL(perf_event_pause); -#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU -static atomic_t nr_include_guest_events __read_mostly; - -static atomic_t nr_mediated_pmu_vms __read_mostly; -static DEFINE_MUTEX(perf_mediated_pmu_mutex); - -/* !exclude_guest event of PMU with PERF_PMU_CAP_MEDIATED_VPMU */ -static inline bool is_include_guest_event(struct perf_event *event) -{ - if ((event->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU) && - !event->attr.exclude_guest) - return true; - - return false; -} - -static int mediated_pmu_account_event(struct perf_event *event) -{ - if (!is_include_guest_event(event)) - return 0; - - guard(mutex)(&perf_mediated_pmu_mutex); - - if (atomic_read(&nr_mediated_pmu_vms)) - return -EOPNOTSUPP; - - atomic_inc(&nr_include_guest_events); - return 0; -} - -static void mediated_pmu_unaccount_event(struct perf_event *event) -{ - if (!is_include_guest_event(event)) - return; - - atomic_dec(&nr_include_guest_events); -} - -/* - * Currently invoked at VM creation to - * - Check whether there are existing !exclude_guest events of PMU with - * PERF_PMU_CAP_MEDIATED_VPMU - * - Set nr_mediated_pmu_vms to prevent !exclude_guest event creation on - * PMUs with PERF_PMU_CAP_MEDIATED_VPMU - * - * No impact for the PMU without PERF_PMU_CAP_MEDIATED_VPMU. The perf - * still owns all the PMU resources. - */ -int perf_create_mediated_pmu(void) -{ - guard(mutex)(&perf_mediated_pmu_mutex); - if (atomic_inc_not_zero(&nr_mediated_pmu_vms)) - return 0; - - if (atomic_read(&nr_include_guest_events)) - return -EBUSY; - - atomic_inc(&nr_mediated_pmu_vms); - return 0; -} -EXPORT_SYMBOL_GPL(perf_create_mediated_pmu); - -void perf_release_mediated_pmu(void) -{ - if (WARN_ON_ONCE(!atomic_read(&nr_mediated_pmu_vms))) - return; - - atomic_dec(&nr_mediated_pmu_vms); -} -EXPORT_SYMBOL_GPL(perf_release_mediated_pmu); - -/* When loading a guest's mediated PMU, schedule out all exclude_guest events. */ -void perf_load_guest_context(void) -{ - struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); - - lockdep_assert_irqs_disabled(); - - guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx); - - if (WARN_ON_ONCE(__this_cpu_read(guest_ctx_loaded))) - return; - - perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST); - ctx_sched_out(&cpuctx->ctx, EVENT_GUEST); - if (cpuctx->task_ctx) { - perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST); - task_ctx_sched_out(cpuctx->task_ctx, EVENT_GUEST); - } - - perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST); - if (cpuctx->task_ctx) - perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST); - - __this_cpu_write(guest_ctx_loaded, true); -} -EXPORT_SYMBOL_GPL(perf_load_guest_context); - -void perf_put_guest_context(void) -{ - struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); - - lockdep_assert_irqs_disabled(); - - guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx); - - if (WARN_ON_ONCE(!__this_cpu_read(guest_ctx_loaded))) - return; - - perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST); - if (cpuctx->task_ctx) - perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST); - - perf_event_sched_in(cpuctx, cpuctx->task_ctx, EVENT_GUEST); - - if (cpuctx->task_ctx) - perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST); - perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST); - - __this_cpu_write(guest_ctx_loaded, false); -} -EXPORT_SYMBOL_GPL(perf_put_guest_context); -#else -static int mediated_pmu_account_event(struct perf_event *event) { return 0; } -static void mediated_pmu_unaccount_event(struct perf_event *event) {} -#endif - /* * Holding the top-level event's child_mutex means that any * descendant process that has inherited this event will block @@ -6463,21 +6195,22 @@ void perf_event_update_userpage(struct perf_event *event) goto unlock; /* - * Disable preemption to guarantee consistent time stamps are stored to - * the user page. - */ - preempt_disable(); - - /* - * Compute total_time_enabled, total_time_running based on snapshot - * values taken when the event was last scheduled in. + * compute total_time_enabled, total_time_running + * based on snapshot values taken when the event + * was last scheduled in. * - * We cannot simply call update_context_time() because doing so would - * lead to deadlock when called from NMI context. + * we cannot simply called update_context_time() + * because of locking issue as we can be called in + * NMI context */ calc_timer_values(event, &now, &enabled, &running); userpg = rb->user_page; + /* + * Disable preemption to guarantee consistent time stamps are stored to + * the user page. + */ + preempt_disable(); ++userpg->lock; barrier(); userpg->index = perf_event_index(event); @@ -7207,7 +6940,6 @@ struct perf_guest_info_callbacks __rcu *perf_guest_cbs; DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state); DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip); DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); -DEFINE_STATIC_CALL_RET0(__perf_guest_handle_mediated_pmi, *perf_guest_cbs->handle_mediated_pmi); void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { @@ -7222,10 +6954,6 @@ void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) if (cbs->handle_intel_pt_intr) static_call_update(__perf_guest_handle_intel_pt_intr, cbs->handle_intel_pt_intr); - - if (cbs->handle_mediated_pmi) - static_call_update(__perf_guest_handle_mediated_pmi, - cbs->handle_mediated_pmi); } EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); @@ -7237,8 +6965,8 @@ void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) rcu_assign_pointer(perf_guest_cbs, NULL); static_call_update(__perf_guest_state, (void *)&__static_call_return0); static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0); - static_call_update(__perf_guest_handle_intel_pt_intr, (void *)&__static_call_return0); - static_call_update(__perf_guest_handle_mediated_pmi, (void *)&__static_call_return0); + static_call_update(__perf_guest_handle_intel_pt_intr, + (void *)&__static_call_return0); synchronize_rcu(); } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); @@ -7670,11 +7398,13 @@ static void perf_output_read(struct perf_output_handle *handle, u64 read_format = event->attr.read_format; /* - * Compute total_time_enabled, total_time_running based on snapshot - * values taken when the event was last scheduled in. + * compute total_time_enabled, total_time_running + * based on snapshot values taken when the event + * was last scheduled in. * - * We cannot simply call update_context_time() because doing so would - * lead to deadlock when called from NMI context. + * we cannot simply called update_context_time() + * because of locking issue as we are called in + * NMI context */ if (read_format & PERF_FORMAT_TOTAL_TIMES) calc_timer_values(event, &now, &enabled, &running); @@ -11673,14 +11403,14 @@ static void task_clock_event_update(struct perf_event *event, u64 now) static void task_clock_event_start(struct perf_event *event, int flags) { - local64_set(&event->hw.prev_count, event->ctx->time.time); + local64_set(&event->hw.prev_count, event->ctx->time); perf_swevent_start_hrtimer(event); } static void task_clock_event_stop(struct perf_event *event, int flags) { perf_swevent_cancel_hrtimer(event); - task_clock_event_update(event, event->ctx->time.time); + task_clock_event_update(event, event->ctx->time); } static int task_clock_event_add(struct perf_event *event, int flags) @@ -11700,8 +11430,8 @@ static void task_clock_event_del(struct perf_event *event, int flags) static void task_clock_event_read(struct perf_event *event) { u64 now = perf_clock(); - u64 delta = now - event->ctx->time.stamp; - u64 time = event->ctx->time.time + delta; + u64 delta = now - event->ctx->timestamp; + u64 time = event->ctx->time + delta; task_clock_event_update(event, time); } @@ -12638,10 +12368,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (err) goto err_callchain_buffer; - err = mediated_pmu_account_event(event); - if (err) - goto err_callchain_buffer; - /* symmetric to unaccount_event() in _free_event() */ account_event(event); diff --git a/tools/arch/x86/include/asm/irq_vectors.h b/tools/arch/x86/include/asm/irq_vectors.h index e3ff9ef3d29bec3735d33bcac29582556deb5919..3a19904c2db6935fda03c0a7c9eeaa47e62f823c 100644 --- a/tools/arch/x86/include/asm/irq_vectors.h +++ b/tools/arch/x86/include/asm/irq_vectors.h @@ -77,8 +77,7 @@ */ #define IRQ_WORK_VECTOR 0xf6 -#define PERF_GUEST_MEDIATED_PMI_VECTOR 0xf5 - +/* 0xf5 - unused, was UV_BAU_MESSAGE */ #define DEFERRED_ERROR_VECTOR 0xf4 /* Vector on which hypervisor callbacks will be delivered */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index af83e66051b2c1acbe098a4e96b17244f75e8f6c..6c863c50d29fa3c644c5394433a956cbd67fcd66 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -6612,22 +6612,13 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { .state = kvm_guest_state, .get_ip = kvm_guest_get_ip, .handle_intel_pt_intr = NULL, - .handle_mediated_pmi = NULL, }; -void __kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void), - void (*mediated_pmi_handler)(void)) +void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)) { kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler; - kvm_guest_cbs.handle_mediated_pmi = mediated_pmi_handler; - perf_register_guest_info_callbacks(&kvm_guest_cbs); } - -void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)) -{ - __kvm_register_perf_callbacks(pt_intr_handler, NULL); -} void kvm_unregister_perf_callbacks(void) { perf_unregister_guest_info_callbacks(&kvm_guest_cbs);