From cda2eaa35948893d70145490d5d6ded546fc3bc6 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 10 Nov 2017 16:40:24 +1100 Subject: [PATCH 001/197] KVM: PPC: Book3S HV: Avoid shifts by negative amounts The kvmppc_hpte_page_shifts function decodes the actual and base page sizes for a HPTE, returning -1 if it doesn't recognize the page size encoding. This then gets used as a shift amount in various places, which is undefined behaviour. This was reported by Coverity. In fact this should never occur, since we should only get HPTEs in the HPT which have a recognized page size encoding. The only place where this might not be true is in the call to kvmppc_actual_pgsz() near the beginning of kvmppc_do_h_enter(), where we are validating the HPTE value passed in from the guest. So to fix this and eliminate the undefined behaviour, we make kvmppc_hpte_page_shifts return 0 for unrecognized page size encodings, and make kvmppc_actual_pgsz() detect that case and return 0 for the page size, which will then cause kvmppc_do_h_enter() to return an error and refuse to insert any HPTE with an unrecognized page size encoding. To ensure that we don't get undefined behaviour in compute_tlbie_rb(), we take the 4k page size path for any unrecognized page size encoding. This should never be hit in practice because it is only used on HPTE values which have previously been checked for having a recognized page size encoding. Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_book3s_64.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 735cfa35298ac7..998f7b7aaa9e5c 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -122,13 +122,13 @@ static inline int kvmppc_hpte_page_shifts(unsigned long h, unsigned long l) lphi = (l >> 16) & 0xf; switch ((l >> 12) & 0xf) { case 0: - return !lphi ? 24 : -1; /* 16MB */ + return !lphi ? 24 : 0; /* 16MB */ break; case 1: return 16; /* 64kB */ break; case 3: - return !lphi ? 34 : -1; /* 16GB */ + return !lphi ? 34 : 0; /* 16GB */ break; case 7: return (16 << 8) + 12; /* 64kB in 4kB */ @@ -140,7 +140,7 @@ static inline int kvmppc_hpte_page_shifts(unsigned long h, unsigned long l) return (24 << 8) + 12; /* 16MB in 4kB */ break; } - return -1; + return 0; } static inline int kvmppc_hpte_base_page_shift(unsigned long h, unsigned long l) @@ -159,7 +159,11 @@ static inline int kvmppc_hpte_actual_page_shift(unsigned long h, unsigned long l static inline unsigned long kvmppc_actual_pgsz(unsigned long v, unsigned long r) { - return 1ul << kvmppc_hpte_actual_page_shift(v, r); + int shift = kvmppc_hpte_actual_page_shift(v, r); + + if (shift) + return 1ul << shift; + return 0; } static inline int kvmppc_pgsize_lp_encoding(int base_shift, int actual_shift) @@ -232,7 +236,7 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, va_low ^= v >> (SID_SHIFT_1T - 16); va_low &= 0x7ff; - if (b_pgshift == 12) { + if (b_pgshift <= 12) { if (a_pgshift > 12) { sllp = (a_pgshift == 16) ? 5 : 4; rb |= sllp << 5; /* AP field */ From 117647ff936e2d9684cc881d87c0291f46669c20 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 10 Nov 2017 16:43:35 +1100 Subject: [PATCH 002/197] KVM: PPC: Book3S HV: Fix typo in kvmppc_hv_get_dirty_log_radix() This fixes a typo where the intent was to assign to 'j' in order to skip some number of bits in the dirty bitmap for a guest. The effect of the typo is benign since it means we just iterate through all the bits rather than skipping bits which we know will be zero. This issue was found by Coverity. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 58618f644c56bd..0c854816e653e2 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -573,7 +573,7 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, j = i + 1; if (npages) { set_dirty_bits(map, i, npages); - i = j + npages; + j = i + npages; } } return 0; From 4fcf361dbdbdb43038bb173e2391c4073e713745 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 20 Nov 2017 14:17:53 +1100 Subject: [PATCH 003/197] KVM: PPC: Book3S HV: Remove useless statement This removes a statement that has no effect. It should have been removed in commit 898b25b202f3 ("KVM: PPC: Book3S HV: Simplify dynamic micro-threading code", 2017-06-22) along with the loop over the piggy-backed virtual cores. This issue was reported by Coverity. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 2d46037ce93664..597498d6db2e27 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2831,7 +2831,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) */ if (!thr0_done) kvmppc_start_thread(NULL, pvc); - thr += pvc->num_threads; } /* From c0093f1a38a0fd6c32a2269f0533bb13fb95143d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 20 Nov 2017 16:12:25 +1100 Subject: [PATCH 004/197] KVM: PPC: Book3S HV: Fix conditions for starting vcpu This corrects the test that determines whether a vcpu that has just become able to run in the guest (e.g. it has just finished handling a hypercall or hypervisor page fault) and whose virtual core is already running somewhere as a "piggybacked" vcore can start immediately or not. (A piggybacked vcore is one which is executing along with another vcore as a result of dynamic micro-threading.) Previously the test tried to lock the piggybacked vcore using spin_trylock, which would always fail because the vcore was already locked, and so the vcpu would have to wait until its vcore exited the guest before it could enter. In fact the vcpu can enter if its vcore is in VCORE_PIGGYBACK state and not already exiting (or exited) the guest, so the test in VCORE_PIGGYBACK state is basically the same as for VCORE_RUNNING state. Coverity detected this as a double unlock issue, which it isn't because the spin_trylock would always fail. This will fix the apparent double unlock as well. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 597498d6db2e27..c4f0bebfc5ba48 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3175,17 +3175,8 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) * this thread straight away and have it join in. */ if (!signal_pending(current)) { - if (vc->vcore_state == VCORE_PIGGYBACK) { - if (spin_trylock(&vc->lock)) { - if (vc->vcore_state == VCORE_RUNNING && - !VCORE_IS_EXITING(vc)) { - kvmppc_create_dtl_entry(vcpu, vc); - kvmppc_start_thread(vcpu, vc); - trace_kvm_guest_enter(vcpu); - } - spin_unlock(&vc->lock); - } - } else if (vc->vcore_state == VCORE_RUNNING && + if ((vc->vcore_state == VCORE_PIGGYBACK || + vc->vcore_state == VCORE_RUNNING) && !VCORE_IS_EXITING(vc)) { kvmppc_create_dtl_entry(vcpu, vc); kvmppc_start_thread(vcpu, vc); From 9aa6825bbb7526a7fdec137b7cc3b042581cd2fc Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 20 Nov 2017 19:56:27 +1100 Subject: [PATCH 005/197] KVM: PPC: Book3S: Eliminate some unnecessary checks In an excess of caution, commit 6f63e81bda98 ("KVM: PPC: Book3S: Add MMIO emulation for FP and VSX instructions", 2017-02-21) included checks for the case that vcpu->arch.mmio_vsx_copy_nums is less than zero, even though its type is u8. This causes a Coverity warning, so we remove the check for < 0. We also adjust the associated comment to be more accurate ("4 or less" rather than "less than 4"). Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/powerpc.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 6b6c53c42ac945..c2c7ef33055325 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -1101,11 +1101,9 @@ int kvmppc_handle_vsx_load(struct kvm_run *run, struct kvm_vcpu *vcpu, { enum emulation_result emulated = EMULATE_DONE; - /* Currently, mmio_vsx_copy_nums only allowed to be less than 4 */ - if ( (vcpu->arch.mmio_vsx_copy_nums > 4) || - (vcpu->arch.mmio_vsx_copy_nums < 0) ) { + /* Currently, mmio_vsx_copy_nums only allowed to be 4 or less */ + if (vcpu->arch.mmio_vsx_copy_nums > 4) return EMULATE_FAIL; - } while (vcpu->arch.mmio_vsx_copy_nums) { emulated = __kvmppc_handle_load(run, vcpu, rt, bytes, @@ -1247,11 +1245,9 @@ int kvmppc_handle_vsx_store(struct kvm_run *run, struct kvm_vcpu *vcpu, vcpu->arch.io_gpr = rs; - /* Currently, mmio_vsx_copy_nums only allowed to be less than 4 */ - if ( (vcpu->arch.mmio_vsx_copy_nums > 4) || - (vcpu->arch.mmio_vsx_copy_nums < 0) ) { + /* Currently, mmio_vsx_copy_nums only allowed to be 4 or less */ + if (vcpu->arch.mmio_vsx_copy_nums > 4) return EMULATE_FAIL; - } while (vcpu->arch.mmio_vsx_copy_nums) { if (kvmppc_get_vsr_data(vcpu, rs, &val) == -1) From b38defdb44fb0377b38896e38ac1fc8482e68f76 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:23 -0600 Subject: [PATCH 006/197] Documentation/virtual/kvm: Add AMD Secure Encrypted Virtualization (SEV) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create a Documentation entry to describe the AMD Secure Encrypted Virtualization (SEV) feature. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Jonathan Corbet Cc: Borislav Petkov Cc: Tom Lendacky Cc: kvm@vger.kernel.org Cc: x86@kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- Documentation/virtual/kvm/00-INDEX | 3 ++ .../virtual/kvm/amd-memory-encryption.rst | 45 +++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 Documentation/virtual/kvm/amd-memory-encryption.rst diff --git a/Documentation/virtual/kvm/00-INDEX b/Documentation/virtual/kvm/00-INDEX index 69fe1a8b7ad16e..3da73aabff5ac0 100644 --- a/Documentation/virtual/kvm/00-INDEX +++ b/Documentation/virtual/kvm/00-INDEX @@ -26,3 +26,6 @@ s390-diag.txt - Diagnose hypercall description (for IBM S/390) timekeeping.txt - timekeeping virtualization for x86-based architectures. +amd-memory-encryption.txt + - notes on AMD Secure Encrypted Virtualization feature and SEV firmware + command description diff --git a/Documentation/virtual/kvm/amd-memory-encryption.rst b/Documentation/virtual/kvm/amd-memory-encryption.rst new file mode 100644 index 00000000000000..a8ef21e737db69 --- /dev/null +++ b/Documentation/virtual/kvm/amd-memory-encryption.rst @@ -0,0 +1,45 @@ +====================================== +Secure Encrypted Virtualization (SEV) +====================================== + +Overview +======== + +Secure Encrypted Virtualization (SEV) is a feature found on AMD processors. + +SEV is an extension to the AMD-V architecture which supports running +virtual machines (VMs) under the control of a hypervisor. When enabled, +the memory contents of a VM will be transparently encrypted with a key +unique to that VM. + +The hypervisor can determine the SEV support through the CPUID +instruction. The CPUID function 0x8000001f reports information related +to SEV:: + + 0x8000001f[eax]: + Bit[1] indicates support for SEV + ... + [ecx]: + Bits[31:0] Number of encrypted guests supported simultaneously + +If support for SEV is present, MSR 0xc001_0010 (MSR_K8_SYSCFG) and MSR 0xc001_0015 +(MSR_K7_HWCR) can be used to determine if it can be enabled:: + + 0xc001_0010: + Bit[23] 1 = memory encryption can be enabled + 0 = memory encryption can not be enabled + + 0xc001_0015: + Bit[0] 1 = memory encryption can be enabled + 0 = memory encryption can not be enabled + +When SEV support is available, it can be enabled in a specific VM by +setting the SEV bit before executing VMRUN.:: + + VMCB[0x90]: + Bit[1] 1 = SEV is enabled + 0 = SEV is disabled + +SEV hardware uses ASIDs to associate a memory encryption key with a VM. +Hence, the ASID for the SEV-enabled guests must be from 1 to a maximum value +defined in the CPUID 0x8000001f[ecx] field. From 18c71ce9c8822d48d2b4c50242051535d46082ac Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 4 Dec 2017 10:57:23 -0600 Subject: [PATCH 007/197] x86/CPU/AMD: Add the Secure Encrypted Virtualization CPU feature MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the CPU features to include identifying and reporting on the Secure Encrypted Virtualization (SEV) feature. SEV is identified by CPUID 0x8000001f, but requires BIOS support to enable it (set bit 23 of MSR_K8_SYSCFG and set bit 0 of MSR_K7_HWCR). Only show the SEV feature as available if reported by CPUID and enabled by BIOS. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Borislav Petkov Cc: kvm@vger.kernel.org Cc: x86@kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/msr-index.h | 2 + arch/x86/kernel/cpu/amd.c | 66 +++++++++++++++++++++--------- arch/x86/kernel/cpu/scattered.c | 1 + 4 files changed, 50 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index c0b0e9e8aa66eb..19b955adacff28 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -201,6 +201,7 @@ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ +#define X86_FEATURE_SEV ( 7*32+11) /* AMD Secure Encrypted Virtualization */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 34c4922bbc3fb5..507d3e30f7fe8c 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -382,6 +382,8 @@ #define MSR_K7_PERFCTR3 0xc0010007 #define MSR_K7_CLK_CTL 0xc001001b #define MSR_K7_HWCR 0xc0010015 +#define MSR_K7_HWCR_SMMLOCK_BIT 0 +#define MSR_K7_HWCR_SMMLOCK BIT_ULL(MSR_K7_HWCR_SMMLOCK_BIT) #define MSR_K7_FID_VID_CTL 0xc0010041 #define MSR_K7_FID_VID_STATUS 0xc0010042 diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index d58184b7cd4438..c1234aa0550ceb 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -556,6 +556,51 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) } } +static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) +{ + u64 msr; + + /* + * BIOS support is required for SME and SEV. + * For SME: If BIOS has enabled SME then adjust x86_phys_bits by + * the SME physical address space reduction value. + * If BIOS has not enabled SME then don't advertise the + * SME feature (set in scattered.c). + * For SEV: If BIOS has not enabled SEV then don't advertise the + * SEV feature (set in scattered.c). + * + * In all cases, since support for SME and SEV requires long mode, + * don't advertise the feature under CONFIG_X86_32. + */ + if (cpu_has(c, X86_FEATURE_SME) || cpu_has(c, X86_FEATURE_SEV)) { + /* Check if memory encryption is enabled */ + rdmsrl(MSR_K8_SYSCFG, msr); + if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + goto clear_all; + + /* + * Always adjust physical address bits. Even though this + * will be a value above 32-bits this is still done for + * CONFIG_X86_32 so that accurate values are reported. + */ + c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f; + + if (IS_ENABLED(CONFIG_X86_32)) + goto clear_all; + + rdmsrl(MSR_K7_HWCR, msr); + if (!(msr & MSR_K7_HWCR_SMMLOCK)) + goto clear_sev; + + return; + +clear_all: + clear_cpu_cap(c, X86_FEATURE_SME); +clear_sev: + clear_cpu_cap(c, X86_FEATURE_SEV); + } +} + static void early_init_amd(struct cpuinfo_x86 *c) { u32 dummy; @@ -627,26 +672,7 @@ static void early_init_amd(struct cpuinfo_x86 *c) if (cpu_has_amd_erratum(c, amd_erratum_400)) set_cpu_bug(c, X86_BUG_AMD_E400); - /* - * BIOS support is required for SME. If BIOS has enabled SME then - * adjust x86_phys_bits by the SME physical address space reduction - * value. If BIOS has not enabled SME then don't advertise the - * feature (set in scattered.c). Also, since the SME support requires - * long mode, don't advertise the feature under CONFIG_X86_32. - */ - if (cpu_has(c, X86_FEATURE_SME)) { - u64 msr; - - /* Check if SME is enabled */ - rdmsrl(MSR_K8_SYSCFG, msr); - if (msr & MSR_K8_SYSCFG_MEM_ENCRYPT) { - c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f; - if (IS_ENABLED(CONFIG_X86_32)) - clear_cpu_cap(c, X86_FEATURE_SME); - } else { - clear_cpu_cap(c, X86_FEATURE_SME); - } - } + early_detect_mem_encrypt(c); } static void init_amd_k8(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 05459ad3db46e2..63a78d5fe505bd 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -32,6 +32,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 }, + { X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 }, { 0, 0, 0, 0, 0 } }; From cea3a19b007a690a40fac373f22bf34ea69761a2 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 4 Dec 2017 10:57:24 -0600 Subject: [PATCH 008/197] kvm: svm: prepare for new bit definition in nested_ctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the nested_ctl variable in the vmcb_control_area structure is used to indicate nested paging support. The nested paging support field is actually defined as bit 0 of the field. In order to support a new feature flag the usage of the nested_ctl and nested paging support must be converted to operate on a single bit. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- arch/x86/include/asm/svm.h | 2 ++ arch/x86/kvm/svm.c | 7 ++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 78dd9df881577d..c936c9824405c6 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -146,6 +146,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL #define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL +#define SVM_NESTED_CTL_NP_ENABLE BIT(0) + struct __attribute__ ((__packed__)) vmcb_seg { u16 selector; u16 attrib; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 59e13a79c2e3ee..1c8158a6ee06b9 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1293,7 +1293,7 @@ static void init_vmcb(struct vcpu_svm *svm) if (npt_enabled) { /* Setup VMCB for Nested Paging */ - control->nested_ctl = 1; + control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE; clr_intercept(svm, INTERCEPT_INVLPG); clr_exception_intercept(svm, PF_VECTOR); clr_cr_intercept(svm, INTERCEPT_CR3_READ); @@ -2918,7 +2918,8 @@ static bool nested_vmcb_checks(struct vmcb *vmcb) if (vmcb->control.asid == 0) return false; - if (vmcb->control.nested_ctl && !npt_enabled) + if ((vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && + !npt_enabled) return false; return true; @@ -2932,7 +2933,7 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, else svm->vcpu.arch.hflags &= ~HF_HIF_MASK; - if (nested_vmcb->control.nested_ctl) { + if (nested_vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) { kvm_mmu_unload(&svm->vcpu); svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3; nested_svm_init_mmu_context(&svm->vcpu); From ba7c3398dc7eedbbe9d19a17ce3ab98349079fc7 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 4 Dec 2017 10:57:24 -0600 Subject: [PATCH 009/197] kvm: svm: Add SEV feature definitions to KVM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define the SEV enable bit for the VMCB control structure. The hypervisor will use this bit to enable SEV in the guest. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- arch/x86/include/asm/svm.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index c936c9824405c6..0487ac0548704d 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -147,6 +147,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL #define SVM_NESTED_CTL_NP_ENABLE BIT(0) +#define SVM_NESTED_CTL_SEV_ENABLE BIT(1) struct __attribute__ ((__packed__)) vmcb_seg { u16 selector; From 4faefff3241e02dd66bfb77bd7ca40f3aa08f62e Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:25 -0600 Subject: [PATCH 010/197] KVM: SVM: Prepare to reserve asid for SEV guest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, ASID allocation start at 1. Add a svm_vcpu_data.min_asid which allows supplying a dynamic start ASID. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh Reviewed-by: Paolo Bonzini Reviewed-by: Borislav Petkov --- arch/x86/kvm/svm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1c8158a6ee06b9..e403cf1f6ba3d3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -525,6 +525,7 @@ struct svm_cpu_data { u64 asid_generation; u32 max_asid; u32 next_asid; + u32 min_asid; struct kvm_ldttss_desc *tss_desc; struct page *save_area; @@ -782,6 +783,7 @@ static int svm_hardware_enable(void) sd->asid_generation = 1; sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; sd->next_asid = sd->max_asid + 1; + sd->min_asid = 1; gdt = get_current_gdt_rw(); sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); @@ -2088,7 +2090,7 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) { if (sd->next_asid > sd->max_asid) { ++sd->asid_generation; - sd->next_asid = 1; + sd->next_asid = sd->min_asid; svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; } From 8765d75329a386dd7742f94a1ea5fdcdea8d93d0 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:25 -0600 Subject: [PATCH 011/197] KVM: X86: Extend CPUID range to include new leaf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This CPUID leaf provides the memory encryption support information on AMD Platform. Its complete description is available in APM volume 2, Section 15.34 Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Borislav Petkov Signed-off-by: Brijesh Singh --- arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/svm.c | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0099e10eb04525..c6473ca825cdd8 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -604,7 +604,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->edx = 0; break; case 0x80000000: - entry->eax = min(entry->eax, 0x8000001a); + entry->eax = min(entry->eax, 0x8000001f); break; case 0x80000001: entry->edx &= kvm_cpuid_8000_0001_edx_x86_features; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e403cf1f6ba3d3..7e302a25bc4550 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5170,6 +5170,12 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) entry->edx |= SVM_FEATURE_NPT; break; + case 0x8000001F: + /* Support memory encryption cpuid if host supports it */ + if (boot_cpu_has(X86_FEATURE_SEV)) + cpuid(0x8000001f, &entry->eax, &entry->ebx, + &entry->ecx, &entry->edx); + } } From 5acc5c063196b4a531a761a954023c1848ec832b Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:26 -0600 Subject: [PATCH 012/197] KVM: Introduce KVM_MEMORY_ENCRYPT_OP ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the hardware supports memory encryption then the KVM_MEMORY_ENCRYPT_OP ioctl can be used by qemu to issue a platform specific memory encryption commands. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh Reviewed-by: Paolo Bonzini Reviewed-by: Borislav Petkov --- Documentation/virtual/kvm/api.txt | 16 ++++++++++++++++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/x86.c | 6 ++++++ include/uapi/linux/kvm.h | 2 ++ 4 files changed, 26 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index f670e4b9e7f33f..c8755be35543a6 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3394,6 +3394,22 @@ invalid, if invalid pages are written to (e.g. after the end of memory) or if no page table is present for the addresses (e.g. when using hugepages). +4.109 KVM_MEMORY_ENCRYPT_OP + +Capability: basic +Architectures: x86 +Type: system +Parameters: an opaque platform specific structure (in/out) +Returns: 0 on success; -1 on error + +If the platform supports creating encrypted VMs then this ioctl can be used +for issuing platform-specific memory encryption commands to manage those +encrypted VMs. + +Currently, this ioctl is used for issuing Secure Encrypted Virtualization +(SEV) commands on AMD Processors. The SEV commands are defined in +Documentation/virtual/kvm/amd-memory-encryption.txt. + 5. The kvm_run structure ------------------------ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1bfb99770c3419..c87e214d55dfdc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1066,6 +1066,8 @@ struct kvm_x86_ops { int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase); int (*enable_smi_window)(struct kvm_vcpu *vcpu); + + int (*mem_enc_op)(struct kvm *kvm, void __user *argp); }; struct kvm_arch_async_pf { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 34c85aa2e2d1d4..7bbed0c0ba7998 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4281,6 +4281,12 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_enable_cap(kvm, &cap); break; } + case KVM_MEMORY_ENCRYPT_OP: { + r = -ENOTTY; + if (kvm_x86_ops->mem_enc_op) + r = kvm_x86_ops->mem_enc_op(kvm, argp); + break; + } default: r = -ENOTTY; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 282d7613fce878..addd0cf4445fac 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1358,6 +1358,8 @@ struct kvm_s390_ucas_mapping { /* Available with KVM_CAP_S390_CMMA_MIGRATION */ #define KVM_S390_GET_CMMA_BITS _IOWR(KVMIO, 0xb8, struct kvm_s390_cmma_log) #define KVM_S390_SET_CMMA_BITS _IOW(KVMIO, 0xb9, struct kvm_s390_cmma_log) +/* Memory Encryption Commands */ +#define KVM_MEMORY_ENCRYPT_OP _IOWR(KVMIO, 0xba, unsigned long) #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) From 69eaedee411c1fc1cf123520897a96b7cf04d8a0 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:26 -0600 Subject: [PATCH 013/197] KVM: Introduce KVM_MEMORY_ENCRYPT_{UN,}REG_REGION ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If hardware supports memory encryption then KVM_MEMORY_ENCRYPT_REG_REGION and KVM_MEMORY_ENCRYPT_UNREG_REGION ioctl's can be used by userspace to register/unregister the guest memory regions which may contain the encrypted data (e.g guest RAM, PCI BAR, SMRAM etc). Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Improvements-by: Borislav Petkov Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- Documentation/virtual/kvm/api.txt | 34 +++++++++++++++++++++++++++++++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/x86.c | 24 ++++++++++++++++++++++ include/uapi/linux/kvm.h | 8 ++++++++ 4 files changed, 68 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index c8755be35543a6..c2ced6a44bbb45 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3410,6 +3410,40 @@ Currently, this ioctl is used for issuing Secure Encrypted Virtualization (SEV) commands on AMD Processors. The SEV commands are defined in Documentation/virtual/kvm/amd-memory-encryption.txt. +4.110 KVM_MEMORY_ENCRYPT_REG_REGION + +Capability: basic +Architectures: x86 +Type: system +Parameters: struct kvm_enc_region (in) +Returns: 0 on success; -1 on error + +This ioctl can be used to register a guest memory region which may +contain encrypted data (e.g. guest RAM, SMRAM etc). + +It is used in the SEV-enabled guest. When encryption is enabled, a guest +memory region may contain encrypted data. The SEV memory encryption +engine uses a tweak such that two identical plaintext pages, each at +different locations will have differing ciphertexts. So swapping or +moving ciphertext of those pages will not result in plaintext being +swapped. So relocating (or migrating) physical backing pages for the SEV +guest will require some additional steps. + +Note: The current SEV key management spec does not provide commands to +swap or migrate (move) ciphertext pages. Hence, for now we pin the guest +memory region registered with the ioctl. + +4.111 KVM_MEMORY_ENCRYPT_UNREG_REGION + +Capability: basic +Architectures: x86 +Type: system +Parameters: struct kvm_enc_region (in) +Returns: 0 on success; -1 on error + +This ioctl can be used to unregister the guest memory region registered +with KVM_MEMORY_ENCRYPT_REG_REGION ioctl above. + 5. The kvm_run structure ------------------------ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c87e214d55dfdc..58b7cc30466b42 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1068,6 +1068,8 @@ struct kvm_x86_ops { int (*enable_smi_window)(struct kvm_vcpu *vcpu); int (*mem_enc_op)(struct kvm *kvm, void __user *argp); + int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp); + int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp); }; struct kvm_arch_async_pf { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7bbed0c0ba7998..926f55cecf2e3e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4287,6 +4287,30 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_x86_ops->mem_enc_op(kvm, argp); break; } + case KVM_MEMORY_ENCRYPT_REG_REGION: { + struct kvm_enc_region region; + + r = -EFAULT; + if (copy_from_user(®ion, argp, sizeof(region))) + goto out; + + r = -ENOTTY; + if (kvm_x86_ops->mem_enc_reg_region) + r = kvm_x86_ops->mem_enc_reg_region(kvm, ®ion); + break; + } + case KVM_MEMORY_ENCRYPT_UNREG_REGION: { + struct kvm_enc_region region; + + r = -EFAULT; + if (copy_from_user(®ion, argp, sizeof(region))) + goto out; + + r = -ENOTTY; + if (kvm_x86_ops->mem_enc_unreg_region) + r = kvm_x86_ops->mem_enc_unreg_region(kvm, ®ion); + break; + } default: r = -ENOTTY; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index addd0cf4445fac..c8c65190907d61 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1361,6 +1361,14 @@ struct kvm_s390_ucas_mapping { /* Memory Encryption Commands */ #define KVM_MEMORY_ENCRYPT_OP _IOWR(KVMIO, 0xba, unsigned long) +struct kvm_enc_region { + __u64 addr; + __u64 size; +}; + +#define KVM_MEMORY_ENCRYPT_REG_REGION _IOR(KVMIO, 0xbb, struct kvm_enc_region) +#define KVM_MEMORY_ENCRYPT_UNREG_REGION _IOR(KVMIO, 0xbc, struct kvm_enc_region) + #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) #define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) From 016db9c5c39fc7f1c766daf2083211318ef93d38 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 4 Dec 2017 10:57:26 -0600 Subject: [PATCH 014/197] crypto: ccp: Build the AMD secure processor driver only with AMD CPU support This is AMD-specific hardware so present it in Kconfig only when AMD CPU support is enabled or on ARM64 where it is also used. Signed-off-by: Borislav Petkov Signed-off-by: Brijesh Singh Reviewed-by: Gary R Hook Cc: Brijesh Singh Cc: Tom Lendacky Cc: Gary Hook Cc: Herbert Xu Cc: "David S. Miller" Cc: linux-crypto@vger.kernel.org --- drivers/crypto/ccp/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/crypto/ccp/Kconfig b/drivers/crypto/ccp/Kconfig index 6d626606b9c51d..9c84f98389319c 100644 --- a/drivers/crypto/ccp/Kconfig +++ b/drivers/crypto/ccp/Kconfig @@ -1,5 +1,6 @@ config CRYPTO_DEV_CCP_DD tristate "Secure Processor device driver" + depends on CPU_SUP_AMD || ARM64 default m help Provides AMD Secure Processor device driver. From 1d57b17c60ff245b8e50813bf0fd24143ecf26d4 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:27 -0600 Subject: [PATCH 015/197] crypto: ccp: Define SEV userspace ioctl and command id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a include file which defines the ioctl and command id used for issuing SEV platform management specific commands. Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Borislav Petkov Cc: Herbert Xu Cc: Gary Hook Cc: Tom Lendacky Cc: linux-crypto@vger.kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Improvements-by: Borislav Petkov Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov Acked-by: Gary R Hook --- include/uapi/linux/psp-sev.h | 142 +++++++++++++++++++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 include/uapi/linux/psp-sev.h diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h new file mode 100644 index 00000000000000..3d77fe91239a80 --- /dev/null +++ b/include/uapi/linux/psp-sev.h @@ -0,0 +1,142 @@ +/* + * Userspace interface for AMD Secure Encrypted Virtualization (SEV) + * platform management commands. + * + * Copyright (C) 2016-2017 Advanced Micro Devices, Inc. + * + * Author: Brijesh Singh + * + * SEV spec 0.14 is available at: + * http://support.amd.com/TechDocs/55766_SEV-KM%20API_Specification.pdf + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __PSP_SEV_USER_H__ +#define __PSP_SEV_USER_H__ + +#include + +/** + * SEV platform commands + */ +enum { + SEV_FACTORY_RESET = 0, + SEV_PLATFORM_STATUS, + SEV_PEK_GEN, + SEV_PEK_CSR, + SEV_PDH_GEN, + SEV_PDH_CERT_EXPORT, + SEV_PEK_CERT_IMPORT, + + SEV_MAX, +}; + +/** + * SEV Firmware status code + */ +typedef enum { + SEV_RET_SUCCESS = 0, + SEV_RET_INVALID_PLATFORM_STATE, + SEV_RET_INVALID_GUEST_STATE, + SEV_RET_INAVLID_CONFIG, + SEV_RET_INVALID_len, + SEV_RET_ALREADY_OWNED, + SEV_RET_INVALID_CERTIFICATE, + SEV_RET_POLICY_FAILURE, + SEV_RET_INACTIVE, + SEV_RET_INVALID_ADDRESS, + SEV_RET_BAD_SIGNATURE, + SEV_RET_BAD_MEASUREMENT, + SEV_RET_ASID_OWNED, + SEV_RET_INVALID_ASID, + SEV_RET_WBINVD_REQUIRED, + SEV_RET_DFFLUSH_REQUIRED, + SEV_RET_INVALID_GUEST, + SEV_RET_INVALID_COMMAND, + SEV_RET_ACTIVE, + SEV_RET_HWSEV_RET_PLATFORM, + SEV_RET_HWSEV_RET_UNSAFE, + SEV_RET_UNSUPPORTED, + SEV_RET_MAX, +} sev_ret_code; + +/** + * struct sev_user_data_status - PLATFORM_STATUS command parameters + * + * @major: major API version + * @minor: minor API version + * @state: platform state + * @flags: platform config flags + * @build: firmware build id for API version + * @guest_count: number of active guests + */ +struct sev_user_data_status { + __u8 api_major; /* Out */ + __u8 api_minor; /* Out */ + __u8 state; /* Out */ + __u32 flags; /* Out */ + __u8 build; /* Out */ + __u32 guest_count; /* Out */ +} __packed; + +/** + * struct sev_user_data_pek_csr - PEK_CSR command parameters + * + * @address: PEK certificate chain + * @length: length of certificate + */ +struct sev_user_data_pek_csr { + __u64 address; /* In */ + __u32 length; /* In/Out */ +} __packed; + +/** + * struct sev_user_data_cert_import - PEK_CERT_IMPORT command parameters + * + * @pek_address: PEK certificate chain + * @pek_len: length of PEK certificate + * @oca_address: OCA certificate chain + * @oca_len: length of OCA certificate + */ +struct sev_user_data_pek_cert_import { + __u64 pek_cert_address; /* In */ + __u32 pek_cert_len; /* In */ + __u64 oca_cert_address; /* In */ + __u32 oca_cert_len; /* In */ +} __packed; + +/** + * struct sev_user_data_pdh_cert_export - PDH_CERT_EXPORT command parameters + * + * @pdh_address: PDH certificate address + * @pdh_len: length of PDH certificate + * @cert_chain_address: PDH certificate chain + * @cert_chain_len: length of PDH certificate chain + */ +struct sev_user_data_pdh_cert_export { + __u64 pdh_cert_address; /* In */ + __u32 pdh_cert_len; /* In/Out */ + __u64 cert_chain_address; /* In */ + __u32 cert_chain_len; /* In/Out */ +} __packed; + +/** + * struct sev_issue_cmd - SEV ioctl parameters + * + * @cmd: SEV commands to execute + * @opaque: pointer to the command structure + * @error: SEV FW return code on failure + */ +struct sev_issue_cmd { + __u32 cmd; /* In */ + __u64 data; /* In */ + __u32 error; /* Out */ +} __packed; + +#define SEV_IOC_TYPE 'S' +#define SEV_ISSUE_CMD _IOWR(SEV_IOC_TYPE, 0x0, struct sev_issue_cmd) + +#endif /* __PSP_USER_SEV_H */ From 592d5e74ddb28f66c5f4acffcd36156b1621a7c4 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:27 -0600 Subject: [PATCH 016/197] crypto: ccp: Define SEV key management command id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define Secure Encrypted Virtualization (SEV) key management command id and structure. The command definition is available in SEV KM spec 0.14 (http://support.amd.com/TechDocs/55766_SEV-KM API_Specification.pdf) Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Borislav Petkov Cc: Herbert Xu Cc: Gary Hook Cc: Tom Lendacky Cc: linux-crypto@vger.kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Improvements-by: Borislav Petkov Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov Acked-by: Gary R Hook --- include/linux/psp-sev.h | 465 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 465 insertions(+) create mode 100644 include/linux/psp-sev.h diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h new file mode 100644 index 00000000000000..4a150d17d537e0 --- /dev/null +++ b/include/linux/psp-sev.h @@ -0,0 +1,465 @@ +/* + * AMD Secure Encrypted Virtualization (SEV) driver interface + * + * Copyright (C) 2016-2017 Advanced Micro Devices, Inc. + * + * Author: Brijesh Singh + * + * SEV spec 0.14 is available at: + * http://support.amd.com/TechDocs/55766_SEV-KM API_Specification.pdf + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __PSP_SEV_H__ +#define __PSP_SEV_H__ + +#include + +#ifdef CONFIG_X86 +#include + +#define __psp_pa(x) __sme_pa(x) +#else +#define __psp_pa(x) __pa(x) +#endif + +#define SEV_FW_BLOB_MAX_SIZE 0x4000 /* 16KB */ + +/** + * SEV platform state + */ +enum sev_state { + SEV_STATE_UNINIT = 0x0, + SEV_STATE_INIT = 0x1, + SEV_STATE_WORKING = 0x2, + + SEV_STATE_MAX +}; + +/** + * SEV platform and guest management commands + */ +enum sev_cmd { + /* platform commands */ + SEV_CMD_INIT = 0x001, + SEV_CMD_SHUTDOWN = 0x002, + SEV_CMD_FACTORY_RESET = 0x003, + SEV_CMD_PLATFORM_STATUS = 0x004, + SEV_CMD_PEK_GEN = 0x005, + SEV_CMD_PEK_CSR = 0x006, + SEV_CMD_PEK_CERT_IMPORT = 0x007, + SEV_CMD_PDH_CERT_EXPORT = 0x008, + SEV_CMD_PDH_GEN = 0x009, + SEV_CMD_DF_FLUSH = 0x00A, + + /* Guest commands */ + SEV_CMD_DECOMMISSION = 0x020, + SEV_CMD_ACTIVATE = 0x021, + SEV_CMD_DEACTIVATE = 0x022, + SEV_CMD_GUEST_STATUS = 0x023, + + /* Guest launch commands */ + SEV_CMD_LAUNCH_START = 0x030, + SEV_CMD_LAUNCH_UPDATE_DATA = 0x031, + SEV_CMD_LAUNCH_UPDATE_VMSA = 0x032, + SEV_CMD_LAUNCH_MEASURE = 0x033, + SEV_CMD_LAUNCH_UPDATE_SECRET = 0x034, + SEV_CMD_LAUNCH_FINISH = 0x035, + + /* Guest migration commands (outgoing) */ + SEV_CMD_SEND_START = 0x040, + SEV_CMD_SEND_UPDATE_DATA = 0x041, + SEV_CMD_SEND_UPDATE_VMSA = 0x042, + SEV_CMD_SEND_FINISH = 0x043, + + /* Guest migration commands (incoming) */ + SEV_CMD_RECEIVE_START = 0x050, + SEV_CMD_RECEIVE_UPDATE_DATA = 0x051, + SEV_CMD_RECEIVE_UPDATE_VMSA = 0x052, + SEV_CMD_RECEIVE_FINISH = 0x053, + + /* Guest debug commands */ + SEV_CMD_DBG_DECRYPT = 0x060, + SEV_CMD_DBG_ENCRYPT = 0x061, + + SEV_CMD_MAX, +}; + +/** + * struct sev_data_init - INIT command parameters + * + * @flags: processing flags + * @tmr_address: system physical address used for SEV-ES + * @tmr_len: len of tmr_address + */ +struct sev_data_init { + u32 flags; /* In */ + u32 reserved; /* In */ + u64 tmr_address; /* In */ + u32 tmr_len; /* In */ +} __packed; + +/** + * struct sev_data_pek_csr - PEK_CSR command parameters + * + * @address: PEK certificate chain + * @len: len of certificate + */ +struct sev_data_pek_csr { + u64 address; /* In */ + u32 len; /* In/Out */ +} __packed; + +/** + * struct sev_data_cert_import - PEK_CERT_IMPORT command parameters + * + * @pek_address: PEK certificate chain + * @pek_len: len of PEK certificate + * @oca_address: OCA certificate chain + * @oca_len: len of OCA certificate + */ +struct sev_data_pek_cert_import { + u64 pek_cert_address; /* In */ + u32 pek_cert_len; /* In */ + u32 reserved; /* In */ + u64 oca_cert_address; /* In */ + u32 oca_cert_len; /* In */ +} __packed; + +/** + * struct sev_data_pdh_cert_export - PDH_CERT_EXPORT command parameters + * + * @pdh_address: PDH certificate address + * @pdh_len: len of PDH certificate + * @cert_chain_address: PDH certificate chain + * @cert_chain_len: len of PDH certificate chain + */ +struct sev_data_pdh_cert_export { + u64 pdh_cert_address; /* In */ + u32 pdh_cert_len; /* In/Out */ + u32 reserved; /* In */ + u64 cert_chain_address; /* In */ + u32 cert_chain_len; /* In/Out */ +} __packed; + +/** + * struct sev_data_decommission - DECOMMISSION command parameters + * + * @handle: handle of the VM to decommission + */ +struct sev_data_decommission { + u32 handle; /* In */ +} __packed; + +/** + * struct sev_data_activate - ACTIVATE command parameters + * + * @handle: handle of the VM to activate + * @asid: asid assigned to the VM + */ +struct sev_data_activate { + u32 handle; /* In */ + u32 asid; /* In */ +} __packed; + +/** + * struct sev_data_deactivate - DEACTIVATE command parameters + * + * @handle: handle of the VM to deactivate + */ +struct sev_data_deactivate { + u32 handle; /* In */ +} __packed; + +/** + * struct sev_data_guest_status - SEV GUEST_STATUS command parameters + * + * @handle: handle of the VM to retrieve status + * @policy: policy information for the VM + * @asid: current ASID of the VM + * @state: current state of the VM + */ +struct sev_data_guest_status { + u32 handle; /* In */ + u32 policy; /* Out */ + u32 asid; /* Out */ + u8 state; /* Out */ +} __packed; + +/** + * struct sev_data_launch_start - LAUNCH_START command parameters + * + * @handle: handle assigned to the VM + * @policy: guest launch policy + * @dh_cert_address: physical address of DH certificate blob + * @dh_cert_len: len of DH certificate blob + * @session_address: physical address of session parameters + * @session_len: len of session parameters + */ +struct sev_data_launch_start { + u32 handle; /* In/Out */ + u32 policy; /* In */ + u64 dh_cert_address; /* In */ + u32 dh_cert_len; /* In */ + u32 reserved; /* In */ + u64 session_address; /* In */ + u32 session_len; /* In */ +} __packed; + +/** + * struct sev_data_launch_update_data - LAUNCH_UPDATE_DATA command parameter + * + * @handle: handle of the VM to update + * @len: len of memory to be encrypted + * @address: physical address of memory region to encrypt + */ +struct sev_data_launch_update_data { + u32 handle; /* In */ + u32 reserved; + u64 address; /* In */ + u32 len; /* In */ +} __packed; + +/** + * struct sev_data_launch_update_vmsa - LAUNCH_UPDATE_VMSA command + * + * @handle: handle of the VM + * @address: physical address of memory region to encrypt + * @len: len of memory region to encrypt + */ +struct sev_data_launch_update_vmsa { + u32 handle; /* In */ + u32 reserved; + u64 address; /* In */ + u32 len; /* In */ +} __packed; + +/** + * struct sev_data_launch_measure - LAUNCH_MEASURE command parameters + * + * @handle: handle of the VM to process + * @address: physical address containing the measurement blob + * @len: len of measurement blob + */ +struct sev_data_launch_measure { + u32 handle; /* In */ + u32 reserved; + u64 address; /* In */ + u32 len; /* In/Out */ +} __packed; + +/** + * struct sev_data_launch_secret - LAUNCH_SECRET command parameters + * + * @handle: handle of the VM to process + * @hdr_address: physical address containing the packet header + * @hdr_len: len of packet header + * @guest_address: system physical address of guest memory region + * @guest_len: len of guest_paddr + * @trans_address: physical address of transport memory buffer + * @trans_len: len of transport memory buffer + */ +struct sev_data_launch_secret { + u32 handle; /* In */ + u32 reserved1; + u64 hdr_address; /* In */ + u32 hdr_len; /* In */ + u32 reserved2; + u64 guest_address; /* In */ + u32 guest_len; /* In */ + u32 reserved3; + u64 trans_address; /* In */ + u32 trans_len; /* In */ +} __packed; + +/** + * struct sev_data_launch_finish - LAUNCH_FINISH command parameters + * + * @handle: handle of the VM to process + */ +struct sev_data_launch_finish { + u32 handle; /* In */ +} __packed; + +/** + * struct sev_data_send_start - SEND_START command parameters + * + * @handle: handle of the VM to process + * @policy: policy information for the VM + * @pdh_cert_address: physical address containing PDH certificate + * @pdh_cert_len: len of PDH certificate + * @plat_certs_address: physical address containing platform certificate + * @plat_certs_len: len of platform certificate + * @amd_certs_address: physical address containing AMD certificate + * @amd_certs_len: len of AMD certificate + * @session_address: physical address containing Session data + * @session_len: len of session data + */ +struct sev_data_send_start { + u32 handle; /* In */ + u32 policy; /* Out */ + u64 pdh_cert_address; /* In */ + u32 pdh_cert_len; /* In */ + u32 reserved1; + u64 plat_cert_address; /* In */ + u32 plat_cert_len; /* In */ + u32 reserved2; + u64 amd_cert_address; /* In */ + u32 amd_cert_len; /* In */ + u32 reserved3; + u64 session_address; /* In */ + u32 session_len; /* In/Out */ +} __packed; + +/** + * struct sev_data_send_update - SEND_UPDATE_DATA command + * + * @handle: handle of the VM to process + * @hdr_address: physical address containing packet header + * @hdr_len: len of packet header + * @guest_address: physical address of guest memory region to send + * @guest_len: len of guest memory region to send + * @trans_address: physical address of host memory region + * @trans_len: len of host memory region + */ +struct sev_data_send_update_data { + u32 handle; /* In */ + u32 reserved1; + u64 hdr_address; /* In */ + u32 hdr_len; /* In/Out */ + u32 reserved2; + u64 guest_address; /* In */ + u32 guest_len; /* In */ + u32 reserved3; + u64 trans_address; /* In */ + u32 trans_len; /* In */ +} __packed; + +/** + * struct sev_data_send_update - SEND_UPDATE_VMSA command + * + * @handle: handle of the VM to process + * @hdr_address: physical address containing packet header + * @hdr_len: len of packet header + * @guest_address: physical address of guest memory region to send + * @guest_len: len of guest memory region to send + * @trans_address: physical address of host memory region + * @trans_len: len of host memory region + */ +struct sev_data_send_update_vmsa { + u32 handle; /* In */ + u64 hdr_address; /* In */ + u32 hdr_len; /* In/Out */ + u32 reserved2; + u64 guest_address; /* In */ + u32 guest_len; /* In */ + u32 reserved3; + u64 trans_address; /* In */ + u32 trans_len; /* In */ +} __packed; + +/** + * struct sev_data_send_finish - SEND_FINISH command parameters + * + * @handle: handle of the VM to process + */ +struct sev_data_send_finish { + u32 handle; /* In */ +} __packed; + +/** + * struct sev_data_receive_start - RECEIVE_START command parameters + * + * @handle: handle of the VM to perform receive operation + * @pdh_cert_address: system physical address containing PDH certificate blob + * @pdh_cert_len: len of PDH certificate blob + * @session_address: system physical address containing session blob + * @session_len: len of session blob + */ +struct sev_data_receive_start { + u32 handle; /* In/Out */ + u32 policy; /* In */ + u64 pdh_cert_address; /* In */ + u32 pdh_cert_len; /* In */ + u32 reserved1; + u64 session_address; /* In */ + u32 session_len; /* In */ +} __packed; + +/** + * struct sev_data_receive_update_data - RECEIVE_UPDATE_DATA command parameters + * + * @handle: handle of the VM to update + * @hdr_address: physical address containing packet header blob + * @hdr_len: len of packet header + * @guest_address: system physical address of guest memory region + * @guest_len: len of guest memory region + * @trans_address: system physical address of transport buffer + * @trans_len: len of transport buffer + */ +struct sev_data_receive_update_data { + u32 handle; /* In */ + u32 reserved1; + u64 hdr_address; /* In */ + u32 hdr_len; /* In */ + u32 reserved2; + u64 guest_address; /* In */ + u32 guest_len; /* In */ + u32 reserved3; + u64 trans_address; /* In */ + u32 trans_len; /* In */ +} __packed; + +/** + * struct sev_data_receive_update_vmsa - RECEIVE_UPDATE_VMSA command parameters + * + * @handle: handle of the VM to update + * @hdr_address: physical address containing packet header blob + * @hdr_len: len of packet header + * @guest_address: system physical address of guest memory region + * @guest_len: len of guest memory region + * @trans_address: system physical address of transport buffer + * @trans_len: len of transport buffer + */ +struct sev_data_receive_update_vmsa { + u32 handle; /* In */ + u32 reserved1; + u64 hdr_address; /* In */ + u32 hdr_len; /* In */ + u32 reserved2; + u64 guest_address; /* In */ + u32 guest_len; /* In */ + u32 reserved3; + u64 trans_address; /* In */ + u32 trans_len; /* In */ +} __packed; + +/** + * struct sev_data_receive_finish - RECEIVE_FINISH command parameters + * + * @handle: handle of the VM to finish + */ +struct sev_data_receive_finish { + u32 handle; /* In */ +} __packed; + +/** + * struct sev_data_dbg - DBG_ENCRYPT/DBG_DECRYPT command parameters + * + * @handle: handle of the VM to perform debug operation + * @src_addr: source address of data to operate on + * @dst_addr: destination address of data to operate on + * @len: len of data to operate on + */ +struct sev_data_dbg { + u32 handle; /* In */ + u32 reserved; + u64 src_addr; /* In */ + u64 dst_addr; /* In */ + u32 len; /* In */ +} __packed; + +#endif /* __PSP_SEV_H__ */ From 2a6170dfe755b167ca8d6bba2e73695f08b37c54 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:28 -0600 Subject: [PATCH 017/197] crypto: ccp: Add Platform Security Processor (PSP) device support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Platform Security Processor (PSP) is part of the AMD Secure Processor (AMD-SP) functionality. The PSP is a dedicated processor that provides support for key management commands in Secure Encrypted Virtualization (SEV) mode, along with software-based Trusted Execution Environment (TEE) to enable third-party trusted applications. Note that the key management functionality provided by the SEV firmware can be used outside of the kvm-amd driver hence it doesn't need to depend on CONFIG_KVM_AMD. Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Borislav Petkov Cc: Herbert Xu Cc: Gary Hook Cc: Tom Lendacky Cc: linux-crypto@vger.kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Improvements-by: Borislav Petkov Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- drivers/crypto/ccp/Kconfig | 11 ++++ drivers/crypto/ccp/Makefile | 1 + drivers/crypto/ccp/psp-dev.c | 105 +++++++++++++++++++++++++++++++++++ drivers/crypto/ccp/psp-dev.h | 59 ++++++++++++++++++++ drivers/crypto/ccp/sp-dev.c | 26 +++++++++ drivers/crypto/ccp/sp-dev.h | 24 +++++++- drivers/crypto/ccp/sp-pci.c | 52 +++++++++++++++++ 7 files changed, 277 insertions(+), 1 deletion(-) create mode 100644 drivers/crypto/ccp/psp-dev.c create mode 100644 drivers/crypto/ccp/psp-dev.h diff --git a/drivers/crypto/ccp/Kconfig b/drivers/crypto/ccp/Kconfig index 9c84f98389319c..b9dfae47aefd80 100644 --- a/drivers/crypto/ccp/Kconfig +++ b/drivers/crypto/ccp/Kconfig @@ -33,3 +33,14 @@ config CRYPTO_DEV_CCP_CRYPTO Support for using the cryptographic API with the AMD Cryptographic Coprocessor. This module supports offload of SHA and AES algorithms. If you choose 'M' here, this module will be called ccp_crypto. + +config CRYPTO_DEV_SP_PSP + bool "Platform Security Processor (PSP) device" + default y + depends on CRYPTO_DEV_CCP_DD && X86_64 + help + Provide support for the AMD Platform Security Processor (PSP). + The PSP is a dedicated processor that provides support for key + management commands in Secure Encrypted Virtualization (SEV) mode, + along with software-based Trusted Execution Environment (TEE) to + enable third-party trusted applications. diff --git a/drivers/crypto/ccp/Makefile b/drivers/crypto/ccp/Makefile index c4ce726b931e3d..51d1c0cf66c737 100644 --- a/drivers/crypto/ccp/Makefile +++ b/drivers/crypto/ccp/Makefile @@ -8,6 +8,7 @@ ccp-$(CONFIG_CRYPTO_DEV_SP_CCP) += ccp-dev.o \ ccp-dmaengine.o \ ccp-debugfs.o ccp-$(CONFIG_PCI) += sp-pci.o +ccp-$(CONFIG_CRYPTO_DEV_SP_PSP) += psp-dev.o obj-$(CONFIG_CRYPTO_DEV_CCP_CRYPTO) += ccp-crypto.o ccp-crypto-objs := ccp-crypto-main.o \ diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c new file mode 100644 index 00000000000000..b5789f87856038 --- /dev/null +++ b/drivers/crypto/ccp/psp-dev.c @@ -0,0 +1,105 @@ +/* + * AMD Platform Security Processor (PSP) interface + * + * Copyright (C) 2016-2017 Advanced Micro Devices, Inc. + * + * Author: Brijesh Singh + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sp-dev.h" +#include "psp-dev.h" + +static struct psp_device *psp_alloc_struct(struct sp_device *sp) +{ + struct device *dev = sp->dev; + struct psp_device *psp; + + psp = devm_kzalloc(dev, sizeof(*psp), GFP_KERNEL); + if (!psp) + return NULL; + + psp->dev = dev; + psp->sp = sp; + + snprintf(psp->name, sizeof(psp->name), "psp-%u", sp->ord); + + return psp; +} + +static irqreturn_t psp_irq_handler(int irq, void *data) +{ + return IRQ_HANDLED; +} + +int psp_dev_init(struct sp_device *sp) +{ + struct device *dev = sp->dev; + struct psp_device *psp; + int ret; + + ret = -ENOMEM; + psp = psp_alloc_struct(sp); + if (!psp) + goto e_err; + + sp->psp_data = psp; + + psp->vdata = (struct psp_vdata *)sp->dev_vdata->psp_vdata; + if (!psp->vdata) { + ret = -ENODEV; + dev_err(dev, "missing driver data\n"); + goto e_err; + } + + psp->io_regs = sp->io_map + psp->vdata->offset; + + /* Disable and clear interrupts until ready */ + iowrite32(0, psp->io_regs + PSP_P2CMSG_INTEN); + iowrite32(-1, psp->io_regs + PSP_P2CMSG_INTSTS); + + /* Request an irq */ + ret = sp_request_psp_irq(psp->sp, psp_irq_handler, psp->name, psp); + if (ret) { + dev_err(dev, "psp: unable to allocate an IRQ\n"); + goto e_err; + } + + if (sp->set_psp_master_device) + sp->set_psp_master_device(sp); + + /* Enable interrupt */ + iowrite32(-1, psp->io_regs + PSP_P2CMSG_INTEN); + + return 0; + +e_err: + sp->psp_data = NULL; + + dev_notice(dev, "psp initialization failed\n"); + + return ret; +} + +void psp_dev_destroy(struct sp_device *sp) +{ + struct psp_device *psp = sp->psp_data; + + sp_free_psp_irq(sp, psp); +} diff --git a/drivers/crypto/ccp/psp-dev.h b/drivers/crypto/ccp/psp-dev.h new file mode 100644 index 00000000000000..55b7808367c3a3 --- /dev/null +++ b/drivers/crypto/ccp/psp-dev.h @@ -0,0 +1,59 @@ +/* + * AMD Platform Security Processor (PSP) interface driver + * + * Copyright (C) 2017 Advanced Micro Devices, Inc. + * + * Author: Brijesh Singh + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __PSP_DEV_H__ +#define __PSP_DEV_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sp-dev.h" + +#define PSP_P2CMSG_INTEN 0x0110 +#define PSP_P2CMSG_INTSTS 0x0114 + +#define PSP_C2PMSG_ATTR_0 0x0118 +#define PSP_C2PMSG_ATTR_1 0x011c +#define PSP_C2PMSG_ATTR_2 0x0120 +#define PSP_C2PMSG_ATTR_3 0x0124 +#define PSP_P2CMSG_ATTR_0 0x0128 + +#define PSP_CMDRESP_CMD_SHIFT 16 +#define PSP_CMDRESP_IOC BIT(0) +#define PSP_CMDRESP_RESP BIT(31) +#define PSP_CMDRESP_ERR_MASK 0xffff + +#define MAX_PSP_NAME_LEN 16 + +struct psp_device { + struct list_head entry; + + struct psp_vdata *vdata; + char name[MAX_PSP_NAME_LEN]; + + struct device *dev; + struct sp_device *sp; + + void __iomem *io_regs; +}; + +#endif /* __PSP_DEV_H */ diff --git a/drivers/crypto/ccp/sp-dev.c b/drivers/crypto/ccp/sp-dev.c index bef387c8abfd75..cf101c039c8ff4 100644 --- a/drivers/crypto/ccp/sp-dev.c +++ b/drivers/crypto/ccp/sp-dev.c @@ -198,6 +198,8 @@ int sp_init(struct sp_device *sp) if (sp->dev_vdata->ccp_vdata) ccp_dev_init(sp); + if (sp->dev_vdata->psp_vdata) + psp_dev_init(sp); return 0; } @@ -206,6 +208,9 @@ void sp_destroy(struct sp_device *sp) if (sp->dev_vdata->ccp_vdata) ccp_dev_destroy(sp); + if (sp->dev_vdata->psp_vdata) + psp_dev_destroy(sp); + sp_del_device(sp); } @@ -237,6 +242,27 @@ int sp_resume(struct sp_device *sp) } #endif +struct sp_device *sp_get_psp_master_device(void) +{ + struct sp_device *i, *ret = NULL; + unsigned long flags; + + write_lock_irqsave(&sp_unit_lock, flags); + if (list_empty(&sp_units)) + goto unlock; + + list_for_each_entry(i, &sp_units, entry) { + if (i->psp_data) + break; + } + + if (i->get_psp_master_device) + ret = i->get_psp_master_device(); +unlock: + write_unlock_irqrestore(&sp_unit_lock, flags); + return ret; +} + static int __init sp_mod_init(void) { #ifdef CONFIG_X86 diff --git a/drivers/crypto/ccp/sp-dev.h b/drivers/crypto/ccp/sp-dev.h index 5ab486ade1ad93..909cf3e436b425 100644 --- a/drivers/crypto/ccp/sp-dev.h +++ b/drivers/crypto/ccp/sp-dev.h @@ -42,12 +42,17 @@ struct ccp_vdata { const unsigned int offset; const unsigned int rsamax; }; + +struct psp_vdata { + const unsigned int offset; +}; + /* Structure to hold SP device data */ struct sp_dev_vdata { const unsigned int bar; const struct ccp_vdata *ccp_vdata; - void *psp_vdata; + const struct psp_vdata *psp_vdata; }; struct sp_device { @@ -68,6 +73,10 @@ struct sp_device { /* DMA caching attribute support */ unsigned int axcache; + /* get and set master device */ + struct sp_device*(*get_psp_master_device)(void); + void (*set_psp_master_device)(struct sp_device *); + bool irq_registered; bool use_tasklet; @@ -103,6 +112,7 @@ void sp_free_ccp_irq(struct sp_device *sp, void *data); int sp_request_psp_irq(struct sp_device *sp, irq_handler_t handler, const char *name, void *data); void sp_free_psp_irq(struct sp_device *sp, void *data); +struct sp_device *sp_get_psp_master_device(void); #ifdef CONFIG_CRYPTO_DEV_SP_CCP @@ -130,4 +140,16 @@ static inline int ccp_dev_resume(struct sp_device *sp) } #endif /* CONFIG_CRYPTO_DEV_SP_CCP */ +#ifdef CONFIG_CRYPTO_DEV_SP_PSP + +int psp_dev_init(struct sp_device *sp); +void psp_dev_destroy(struct sp_device *sp); + +#else /* !CONFIG_CRYPTO_DEV_SP_PSP */ + +static inline int psp_dev_init(struct sp_device *sp) { return 0; } +static inline void psp_dev_destroy(struct sp_device *sp) { } + +#endif /* CONFIG_CRYPTO_DEV_SP_PSP */ + #endif diff --git a/drivers/crypto/ccp/sp-pci.c b/drivers/crypto/ccp/sp-pci.c index 9859aa683a2838..f5f43c50698ac4 100644 --- a/drivers/crypto/ccp/sp-pci.c +++ b/drivers/crypto/ccp/sp-pci.c @@ -25,6 +25,7 @@ #include #include "ccp-dev.h" +#include "psp-dev.h" #define MSIX_VECTORS 2 @@ -32,6 +33,7 @@ struct sp_pci { int msix_count; struct msix_entry msix_entry[MSIX_VECTORS]; }; +static struct sp_device *sp_dev_master; static int sp_get_msix_irqs(struct sp_device *sp) { @@ -108,6 +110,45 @@ static void sp_free_irqs(struct sp_device *sp) sp->psp_irq = 0; } +static bool sp_pci_is_master(struct sp_device *sp) +{ + struct device *dev_cur, *dev_new; + struct pci_dev *pdev_cur, *pdev_new; + + dev_new = sp->dev; + dev_cur = sp_dev_master->dev; + + pdev_new = to_pci_dev(dev_new); + pdev_cur = to_pci_dev(dev_cur); + + if (pdev_new->bus->number < pdev_cur->bus->number) + return true; + + if (PCI_SLOT(pdev_new->devfn) < PCI_SLOT(pdev_cur->devfn)) + return true; + + if (PCI_FUNC(pdev_new->devfn) < PCI_FUNC(pdev_cur->devfn)) + return true; + + return false; +} + +static void psp_set_master(struct sp_device *sp) +{ + if (!sp_dev_master) { + sp_dev_master = sp; + return; + } + + if (sp_pci_is_master(sp)) + sp_dev_master = sp; +} + +static struct sp_device *psp_get_master(void) +{ + return sp_dev_master; +} + static int sp_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct sp_device *sp; @@ -166,6 +207,8 @@ static int sp_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto e_err; pci_set_master(pdev); + sp->set_psp_master_device = psp_set_master; + sp->get_psp_master_device = psp_get_master; ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(48)); if (ret) { @@ -225,6 +268,12 @@ static int sp_pci_resume(struct pci_dev *pdev) } #endif +#ifdef CONFIG_CRYPTO_DEV_SP_PSP +static const struct psp_vdata psp_entry = { + .offset = 0x10500, +}; +#endif + static const struct sp_dev_vdata dev_vdata[] = { { .bar = 2, @@ -236,6 +285,9 @@ static const struct sp_dev_vdata dev_vdata[] = { .bar = 2, #ifdef CONFIG_CRYPTO_DEV_SP_CCP .ccp_vdata = &ccpv5a, +#endif +#ifdef CONFIG_CRYPTO_DEV_SP_PSP + .psp_vdata = &psp_entry #endif }, { From 200664d5237f3f8cd2a2f9f5c5dea08502336bd1 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:28 -0600 Subject: [PATCH 018/197] crypto: ccp: Add Secure Encrypted Virtualization (SEV) command support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AMD's new Secure Encrypted Virtualization (SEV) feature allows the memory contents of virtual machines to be transparently encrypted with a key unique to the VM. The programming and management of the encryption keys are handled by the AMD Secure Processor (AMD-SP) which exposes the commands for these tasks. The complete spec is available at: http://support.amd.com/TechDocs/55766_SEV-KM%20API_Specification.pdf Extend the AMD-SP driver to provide the following support: - an in-kernel API to communicate with the SEV firmware. The API can be used by the hypervisor to create encryption context for a SEV guest. - a userspace IOCTL to manage the platform certificates. Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Borislav Petkov Cc: Herbert Xu Cc: Gary Hook Cc: Tom Lendacky Cc: linux-crypto@vger.kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Improvements-by: Borislav Petkov Signed-off-by: Brijesh Singh --- drivers/crypto/ccp/psp-dev.c | 344 +++++++++++++++++++++++++++++++++++ drivers/crypto/ccp/psp-dev.h | 24 +++ drivers/crypto/ccp/sp-dev.c | 9 + drivers/crypto/ccp/sp-dev.h | 4 + include/linux/psp-sev.h | 137 ++++++++++++++ 5 files changed, 518 insertions(+) diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index b5789f87856038..9915a6c604a36c 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -26,6 +26,12 @@ #include "sp-dev.h" #include "psp-dev.h" +#define DEVICE_NAME "sev" + +static DEFINE_MUTEX(sev_cmd_mutex); +static struct sev_misc_dev *misc_dev; +static struct psp_device *psp_master; + static struct psp_device *psp_alloc_struct(struct sp_device *sp) { struct device *dev = sp->dev; @@ -45,9 +51,285 @@ static struct psp_device *psp_alloc_struct(struct sp_device *sp) static irqreturn_t psp_irq_handler(int irq, void *data) { + struct psp_device *psp = data; + unsigned int status; + int reg; + + /* Read the interrupt status: */ + status = ioread32(psp->io_regs + PSP_P2CMSG_INTSTS); + + /* Check if it is command completion: */ + if (!(status & BIT(PSP_CMD_COMPLETE_REG))) + goto done; + + /* Check if it is SEV command completion: */ + reg = ioread32(psp->io_regs + PSP_CMDRESP); + if (reg & PSP_CMDRESP_RESP) { + psp->sev_int_rcvd = 1; + wake_up(&psp->sev_int_queue); + } + +done: + /* Clear the interrupt status by writing the same value we read. */ + iowrite32(status, psp->io_regs + PSP_P2CMSG_INTSTS); + return IRQ_HANDLED; } +static void sev_wait_cmd_ioc(struct psp_device *psp, unsigned int *reg) +{ + psp->sev_int_rcvd = 0; + + wait_event(psp->sev_int_queue, psp->sev_int_rcvd); + *reg = ioread32(psp->io_regs + PSP_CMDRESP); +} + +static int sev_cmd_buffer_len(int cmd) +{ + switch (cmd) { + case SEV_CMD_INIT: return sizeof(struct sev_data_init); + case SEV_CMD_PLATFORM_STATUS: return sizeof(struct sev_user_data_status); + case SEV_CMD_PEK_CSR: return sizeof(struct sev_data_pek_csr); + case SEV_CMD_PEK_CERT_IMPORT: return sizeof(struct sev_data_pek_cert_import); + case SEV_CMD_PDH_CERT_EXPORT: return sizeof(struct sev_data_pdh_cert_export); + case SEV_CMD_LAUNCH_START: return sizeof(struct sev_data_launch_start); + case SEV_CMD_LAUNCH_UPDATE_DATA: return sizeof(struct sev_data_launch_update_data); + case SEV_CMD_LAUNCH_UPDATE_VMSA: return sizeof(struct sev_data_launch_update_vmsa); + case SEV_CMD_LAUNCH_FINISH: return sizeof(struct sev_data_launch_finish); + case SEV_CMD_LAUNCH_MEASURE: return sizeof(struct sev_data_launch_measure); + case SEV_CMD_ACTIVATE: return sizeof(struct sev_data_activate); + case SEV_CMD_DEACTIVATE: return sizeof(struct sev_data_deactivate); + case SEV_CMD_DECOMMISSION: return sizeof(struct sev_data_decommission); + case SEV_CMD_GUEST_STATUS: return sizeof(struct sev_data_guest_status); + case SEV_CMD_DBG_DECRYPT: return sizeof(struct sev_data_dbg); + case SEV_CMD_DBG_ENCRYPT: return sizeof(struct sev_data_dbg); + case SEV_CMD_SEND_START: return sizeof(struct sev_data_send_start); + case SEV_CMD_SEND_UPDATE_DATA: return sizeof(struct sev_data_send_update_data); + case SEV_CMD_SEND_UPDATE_VMSA: return sizeof(struct sev_data_send_update_vmsa); + case SEV_CMD_SEND_FINISH: return sizeof(struct sev_data_send_finish); + case SEV_CMD_RECEIVE_START: return sizeof(struct sev_data_receive_start); + case SEV_CMD_RECEIVE_FINISH: return sizeof(struct sev_data_receive_finish); + case SEV_CMD_RECEIVE_UPDATE_DATA: return sizeof(struct sev_data_receive_update_data); + case SEV_CMD_RECEIVE_UPDATE_VMSA: return sizeof(struct sev_data_receive_update_vmsa); + case SEV_CMD_LAUNCH_UPDATE_SECRET: return sizeof(struct sev_data_launch_secret); + default: return 0; + } + + return 0; +} + +static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret) +{ + struct psp_device *psp = psp_master; + unsigned int phys_lsb, phys_msb; + unsigned int reg, ret = 0; + + if (!psp) + return -ENODEV; + + /* Get the physical address of the command buffer */ + phys_lsb = data ? lower_32_bits(__psp_pa(data)) : 0; + phys_msb = data ? upper_32_bits(__psp_pa(data)) : 0; + + dev_dbg(psp->dev, "sev command id %#x buffer 0x%08x%08x\n", + cmd, phys_msb, phys_lsb); + + print_hex_dump_debug("(in): ", DUMP_PREFIX_OFFSET, 16, 2, data, + sev_cmd_buffer_len(cmd), false); + + iowrite32(phys_lsb, psp->io_regs + PSP_CMDBUFF_ADDR_LO); + iowrite32(phys_msb, psp->io_regs + PSP_CMDBUFF_ADDR_HI); + + reg = cmd; + reg <<= PSP_CMDRESP_CMD_SHIFT; + reg |= PSP_CMDRESP_IOC; + iowrite32(reg, psp->io_regs + PSP_CMDRESP); + + /* wait for command completion */ + sev_wait_cmd_ioc(psp, ®); + + if (psp_ret) + *psp_ret = reg & PSP_CMDRESP_ERR_MASK; + + if (reg & PSP_CMDRESP_ERR_MASK) { + dev_dbg(psp->dev, "sev command %#x failed (%#010x)\n", + cmd, reg & PSP_CMDRESP_ERR_MASK); + ret = -EIO; + } + + print_hex_dump_debug("(out): ", DUMP_PREFIX_OFFSET, 16, 2, data, + sev_cmd_buffer_len(cmd), false); + + return ret; +} + +static int sev_do_cmd(int cmd, void *data, int *psp_ret) +{ + int rc; + + mutex_lock(&sev_cmd_mutex); + rc = __sev_do_cmd_locked(cmd, data, psp_ret); + mutex_unlock(&sev_cmd_mutex); + + return rc; +} + +static int __sev_platform_init_locked(int *error) +{ + struct psp_device *psp = psp_master; + int rc = 0; + + if (!psp) + return -ENODEV; + + if (psp->sev_state == SEV_STATE_INIT) + return 0; + + rc = __sev_do_cmd_locked(SEV_CMD_INIT, &psp->init_cmd_buf, error); + if (rc) + return rc; + + psp->sev_state = SEV_STATE_INIT; + dev_dbg(psp->dev, "SEV firmware initialized\n"); + + return rc; +} + +int sev_platform_init(int *error) +{ + int rc; + + mutex_lock(&sev_cmd_mutex); + rc = __sev_platform_init_locked(error); + mutex_unlock(&sev_cmd_mutex); + + return rc; +} +EXPORT_SYMBOL_GPL(sev_platform_init); + +static int __sev_platform_shutdown_locked(int *error) +{ + int ret; + + ret = __sev_do_cmd_locked(SEV_CMD_SHUTDOWN, 0, error); + if (ret) + return ret; + + psp_master->sev_state = SEV_STATE_UNINIT; + dev_dbg(psp_master->dev, "SEV firmware shutdown\n"); + + return ret; +} + +static int sev_platform_shutdown(int *error) +{ + int rc; + + mutex_lock(&sev_cmd_mutex); + rc = __sev_platform_shutdown_locked(NULL); + mutex_unlock(&sev_cmd_mutex); + + return rc; +} + +static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg) +{ + return -ENOTTY; +} + +static const struct file_operations sev_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = sev_ioctl, +}; + +int sev_platform_status(struct sev_user_data_status *data, int *error) +{ + return sev_do_cmd(SEV_CMD_PLATFORM_STATUS, data, error); +} +EXPORT_SYMBOL_GPL(sev_platform_status); + +int sev_guest_deactivate(struct sev_data_deactivate *data, int *error) +{ + return sev_do_cmd(SEV_CMD_DEACTIVATE, data, error); +} +EXPORT_SYMBOL_GPL(sev_guest_deactivate); + +int sev_guest_activate(struct sev_data_activate *data, int *error) +{ + return sev_do_cmd(SEV_CMD_ACTIVATE, data, error); +} +EXPORT_SYMBOL_GPL(sev_guest_activate); + +int sev_guest_decommission(struct sev_data_decommission *data, int *error) +{ + return sev_do_cmd(SEV_CMD_DECOMMISSION, data, error); +} +EXPORT_SYMBOL_GPL(sev_guest_decommission); + +int sev_guest_df_flush(int *error) +{ + return sev_do_cmd(SEV_CMD_DF_FLUSH, 0, error); +} +EXPORT_SYMBOL_GPL(sev_guest_df_flush); + +static void sev_exit(struct kref *ref) +{ + struct sev_misc_dev *misc_dev = container_of(ref, struct sev_misc_dev, refcount); + + misc_deregister(&misc_dev->misc); +} + +static int sev_misc_init(struct psp_device *psp) +{ + struct device *dev = psp->dev; + int ret; + + /* + * SEV feature support can be detected on multiple devices but the SEV + * FW commands must be issued on the master. During probe, we do not + * know the master hence we create /dev/sev on the first device probe. + * sev_do_cmd() finds the right master device to which to issue the + * command to the firmware. + */ + if (!misc_dev) { + struct miscdevice *misc; + + misc_dev = devm_kzalloc(dev, sizeof(*misc_dev), GFP_KERNEL); + if (!misc_dev) + return -ENOMEM; + + misc = &misc_dev->misc; + misc->minor = MISC_DYNAMIC_MINOR; + misc->name = DEVICE_NAME; + misc->fops = &sev_fops; + + ret = misc_register(misc); + if (ret) + return ret; + + kref_init(&misc_dev->refcount); + } else { + kref_get(&misc_dev->refcount); + } + + init_waitqueue_head(&psp->sev_int_queue); + psp->sev_misc = misc_dev; + dev_dbg(dev, "registered SEV device\n"); + + return 0; +} + +static int sev_init(struct psp_device *psp) +{ + /* Check if device supports SEV feature */ + if (!(ioread32(psp->io_regs + PSP_FEATURE_REG) & 1)) { + dev_dbg(psp->dev, "device does not support SEV\n"); + return 1; + } + + return sev_misc_init(psp); +} + int psp_dev_init(struct sp_device *sp) { struct device *dev = sp->dev; @@ -81,6 +363,10 @@ int psp_dev_init(struct sp_device *sp) goto e_err; } + ret = sev_init(psp); + if (ret) + goto e_irq; + if (sp->set_psp_master_device) sp->set_psp_master_device(sp); @@ -89,6 +375,8 @@ int psp_dev_init(struct sp_device *sp) return 0; +e_irq: + sp_free_psp_irq(psp->sp, psp); e_err: sp->psp_data = NULL; @@ -101,5 +389,61 @@ void psp_dev_destroy(struct sp_device *sp) { struct psp_device *psp = sp->psp_data; + if (psp->sev_misc) + kref_put(&misc_dev->refcount, sev_exit); + sp_free_psp_irq(sp, psp); } + +int sev_issue_cmd_external_user(struct file *filep, unsigned int cmd, + void *data, int *error) +{ + if (!filep || filep->f_op != &sev_fops) + return -EBADF; + + return sev_do_cmd(cmd, data, error); +} +EXPORT_SYMBOL_GPL(sev_issue_cmd_external_user); + +void psp_pci_init(void) +{ + struct sev_user_data_status *status; + struct sp_device *sp; + int error, rc; + + sp = sp_get_psp_master_device(); + if (!sp) + return; + + psp_master = sp->psp_data; + + /* Initialize the platform */ + rc = sev_platform_init(&error); + if (rc) { + dev_err(sp->dev, "SEV: failed to INIT error %#x\n", error); + goto err; + } + + /* Display SEV firmware version */ + status = &psp_master->status_cmd_buf; + rc = sev_platform_status(status, &error); + if (rc) { + dev_err(sp->dev, "SEV: failed to get status error %#x\n", error); + goto err; + } + + dev_info(sp->dev, "SEV API:%d.%d build:%d\n", status->api_major, + status->api_minor, status->build); + return; + +err: + psp_master = NULL; +} + +void psp_pci_exit(void) +{ + if (!psp_master) + return; + + sev_platform_shutdown(NULL); +} diff --git a/drivers/crypto/ccp/psp-dev.h b/drivers/crypto/ccp/psp-dev.h index 55b7808367c3a3..c81f0b11287ae0 100644 --- a/drivers/crypto/ccp/psp-dev.h +++ b/drivers/crypto/ccp/psp-dev.h @@ -25,9 +25,21 @@ #include #include #include +#include +#include #include "sp-dev.h" +#define PSP_C2PMSG(_num) ((_num) << 2) +#define PSP_CMDRESP PSP_C2PMSG(32) +#define PSP_CMDBUFF_ADDR_LO PSP_C2PMSG(56) +#define PSP_CMDBUFF_ADDR_HI PSP_C2PMSG(57) +#define PSP_FEATURE_REG PSP_C2PMSG(63) + +#define PSP_P2CMSG(_num) ((_num) << 2) +#define PSP_CMD_COMPLETE_REG 1 +#define PSP_CMD_COMPLETE PSP_P2CMSG(PSP_CMD_COMPLETE_REG) + #define PSP_P2CMSG_INTEN 0x0110 #define PSP_P2CMSG_INTSTS 0x0114 @@ -44,6 +56,11 @@ #define MAX_PSP_NAME_LEN 16 +struct sev_misc_dev { + struct kref refcount; + struct miscdevice misc; +}; + struct psp_device { struct list_head entry; @@ -54,6 +71,13 @@ struct psp_device { struct sp_device *sp; void __iomem *io_regs; + + int sev_state; + unsigned int sev_int_rcvd; + wait_queue_head_t sev_int_queue; + struct sev_misc_dev *sev_misc; + struct sev_user_data_status status_cmd_buf; + struct sev_data_init init_cmd_buf; }; #endif /* __PSP_DEV_H */ diff --git a/drivers/crypto/ccp/sp-dev.c b/drivers/crypto/ccp/sp-dev.c index cf101c039c8ff4..eb0da657272043 100644 --- a/drivers/crypto/ccp/sp-dev.c +++ b/drivers/crypto/ccp/sp-dev.c @@ -272,6 +272,10 @@ static int __init sp_mod_init(void) if (ret) return ret; +#ifdef CONFIG_CRYPTO_DEV_SP_PSP + psp_pci_init(); +#endif + return 0; #endif @@ -291,6 +295,11 @@ static int __init sp_mod_init(void) static void __exit sp_mod_exit(void) { #ifdef CONFIG_X86 + +#ifdef CONFIG_CRYPTO_DEV_SP_PSP + psp_pci_exit(); +#endif + sp_pci_exit(); #endif diff --git a/drivers/crypto/ccp/sp-dev.h b/drivers/crypto/ccp/sp-dev.h index 909cf3e436b425..acb197b66ced97 100644 --- a/drivers/crypto/ccp/sp-dev.h +++ b/drivers/crypto/ccp/sp-dev.h @@ -143,12 +143,16 @@ static inline int ccp_dev_resume(struct sp_device *sp) #ifdef CONFIG_CRYPTO_DEV_SP_PSP int psp_dev_init(struct sp_device *sp); +void psp_pci_init(void); void psp_dev_destroy(struct sp_device *sp); +void psp_pci_exit(void); #else /* !CONFIG_CRYPTO_DEV_SP_PSP */ static inline int psp_dev_init(struct sp_device *sp) { return 0; } +static inline void psp_pci_init(void) { } static inline void psp_dev_destroy(struct sp_device *sp) { } +static inline void psp_pci_exit(void) { } #endif /* CONFIG_CRYPTO_DEV_SP_PSP */ diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index 4a150d17d537e0..0b6dd306d88bff 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -462,4 +462,141 @@ struct sev_data_dbg { u32 len; /* In */ } __packed; +#ifdef CONFIG_CRYPTO_DEV_SP_PSP + +/** + * sev_platform_init - perform SEV INIT command + * + * @error: SEV command return code + * + * Returns: + * 0 if the SEV successfully processed the command + * -%ENODEV if the SEV device is not available + * -%ENOTSUPP if the SEV does not support SEV + * -%ETIMEDOUT if the SEV command timed out + * -%EIO if the SEV returned a non-zero return code + */ +int sev_platform_init(int *error); + +/** + * sev_platform_status - perform SEV PLATFORM_STATUS command + * + * @status: sev_user_data_status structure to be processed + * @error: SEV command return code + * + * Returns: + * 0 if the SEV successfully processed the command + * -%ENODEV if the SEV device is not available + * -%ENOTSUPP if the SEV does not support SEV + * -%ETIMEDOUT if the SEV command timed out + * -%EIO if the SEV returned a non-zero return code + */ +int sev_platform_status(struct sev_user_data_status *status, int *error); + +/** + * sev_issue_cmd_external_user - issue SEV command by other driver with a file + * handle. + * + * This function can be used by other drivers to issue a SEV command on + * behalf of userspace. The caller must pass a valid SEV file descriptor + * so that we know that it has access to SEV device. + * + * @filep - SEV device file pointer + * @cmd - command to issue + * @data - command buffer + * @error: SEV command return code + * + * Returns: + * 0 if the SEV successfully processed the command + * -%ENODEV if the SEV device is not available + * -%ENOTSUPP if the SEV does not support SEV + * -%ETIMEDOUT if the SEV command timed out + * -%EIO if the SEV returned a non-zero return code + * -%EINVAL if the SEV file descriptor is not valid + */ +int sev_issue_cmd_external_user(struct file *filep, unsigned int id, + void *data, int *error); + +/** + * sev_guest_deactivate - perform SEV DEACTIVATE command + * + * @deactivate: sev_data_deactivate structure to be processed + * @sev_ret: sev command return code + * + * Returns: + * 0 if the sev successfully processed the command + * -%ENODEV if the sev device is not available + * -%ENOTSUPP if the sev does not support SEV + * -%ETIMEDOUT if the sev command timed out + * -%EIO if the sev returned a non-zero return code + */ +int sev_guest_deactivate(struct sev_data_deactivate *data, int *error); + +/** + * sev_guest_activate - perform SEV ACTIVATE command + * + * @activate: sev_data_activate structure to be processed + * @sev_ret: sev command return code + * + * Returns: + * 0 if the sev successfully processed the command + * -%ENODEV if the sev device is not available + * -%ENOTSUPP if the sev does not support SEV + * -%ETIMEDOUT if the sev command timed out + * -%EIO if the sev returned a non-zero return code + */ +int sev_guest_activate(struct sev_data_activate *data, int *error); + +/** + * sev_guest_df_flush - perform SEV DF_FLUSH command + * + * @sev_ret: sev command return code + * + * Returns: + * 0 if the sev successfully processed the command + * -%ENODEV if the sev device is not available + * -%ENOTSUPP if the sev does not support SEV + * -%ETIMEDOUT if the sev command timed out + * -%EIO if the sev returned a non-zero return code + */ +int sev_guest_df_flush(int *error); + +/** + * sev_guest_decommission - perform SEV DECOMMISSION command + * + * @decommission: sev_data_decommission structure to be processed + * @sev_ret: sev command return code + * + * Returns: + * 0 if the sev successfully processed the command + * -%ENODEV if the sev device is not available + * -%ENOTSUPP if the sev does not support SEV + * -%ETIMEDOUT if the sev command timed out + * -%EIO if the sev returned a non-zero return code + */ +int sev_guest_decommission(struct sev_data_decommission *data, int *error); + +#else /* !CONFIG_CRYPTO_DEV_SP_PSP */ + +static inline int +sev_platform_status(struct sev_user_data_status *status, int *error) { return -ENODEV; } + +static inline int sev_platform_init(int *error) { return -ENODEV; } + +static inline int +sev_guest_deactivate(struct sev_data_deactivate *data, int *error) { return -ENODEV; } + +static inline int +sev_guest_decommission(struct sev_data_decommission *data, int *error) { return -ENODEV; } + +static inline int +sev_guest_activate(struct sev_data_activate *data, int *error) { return -ENODEV; } + +static inline int sev_guest_df_flush(int *error) { return -ENODEV; } + +static inline int +sev_issue_cmd_external_user(struct file *filep, unsigned int id, void *data, int *error) { return -ENODEV; } + +#endif /* CONFIG_CRYPTO_DEV_SP_PSP */ + #endif /* __PSP_SEV_H__ */ From 2960f9a51556f79a1f9a70c5ed5a52476f62f1a1 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:29 -0600 Subject: [PATCH 019/197] crypto: ccp: Implement SEV_FACTORY_RESET ioctl command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SEV_FACTORY_RESET command can be used by the platform owner to reset the non-volatile SEV related data. The command is defined in SEV spec section 5.4 Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Borislav Petkov Cc: Herbert Xu Cc: Gary Hook Cc: Tom Lendacky Cc: linux-crypto@vger.kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Improvements-by: Borislav Petkov Signed-off-by: Brijesh Singh --- drivers/crypto/ccp/psp-dev.c | 77 +++++++++++++++++++++++++++++++++++- 1 file changed, 76 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index 9915a6c604a36c..b49583a45a55ea 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -232,9 +232,84 @@ static int sev_platform_shutdown(int *error) return rc; } +static int sev_get_platform_state(int *state, int *error) +{ + int rc; + + rc = __sev_do_cmd_locked(SEV_CMD_PLATFORM_STATUS, + &psp_master->status_cmd_buf, error); + if (rc) + return rc; + + *state = psp_master->status_cmd_buf.state; + return rc; +} + +static int sev_ioctl_do_reset(struct sev_issue_cmd *argp) +{ + int state, rc; + + /* + * The SEV spec requires that FACTORY_RESET must be issued in + * UNINIT state. Before we go further lets check if any guest is + * active. + * + * If FW is in WORKING state then deny the request otherwise issue + * SHUTDOWN command do INIT -> UNINIT before issuing the FACTORY_RESET. + * + */ + rc = sev_get_platform_state(&state, &argp->error); + if (rc) + return rc; + + if (state == SEV_STATE_WORKING) + return -EBUSY; + + if (state == SEV_STATE_INIT) { + rc = __sev_platform_shutdown_locked(&argp->error); + if (rc) + return rc; + } + + return __sev_do_cmd_locked(SEV_CMD_FACTORY_RESET, 0, &argp->error); +} + static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg) { - return -ENOTTY; + void __user *argp = (void __user *)arg; + struct sev_issue_cmd input; + int ret = -EFAULT; + + if (!psp_master) + return -ENODEV; + + if (ioctl != SEV_ISSUE_CMD) + return -EINVAL; + + if (copy_from_user(&input, argp, sizeof(struct sev_issue_cmd))) + return -EFAULT; + + if (input.cmd > SEV_MAX) + return -EINVAL; + + mutex_lock(&sev_cmd_mutex); + + switch (input.cmd) { + + case SEV_FACTORY_RESET: + ret = sev_ioctl_do_reset(&input); + break; + default: + ret = -EINVAL; + goto out; + } + + if (copy_to_user(argp, &input, sizeof(struct sev_issue_cmd))) + ret = -EFAULT; +out: + mutex_unlock(&sev_cmd_mutex); + + return ret; } static const struct file_operations sev_fops = { From efe1829b1a8fb7a8e69791141b8c8f6291708863 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:29 -0600 Subject: [PATCH 020/197] crypto: ccp: Implement SEV_PLATFORM_STATUS ioctl command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SEV_PLATFORM_STATUS command can be used by the platform owner to get the current status of the platform. The command is defined in SEV spec section 5.5. Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Borislav Petkov Cc: Herbert Xu Cc: Gary Hook Cc: Tom Lendacky Cc: linux-crypto@vger.kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Improvements-by: Borislav Petkov Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov Acked-by: Gary R Hook --- drivers/crypto/ccp/psp-dev.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index b49583a45a55ea..a5072b166ab84d 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -274,6 +274,21 @@ static int sev_ioctl_do_reset(struct sev_issue_cmd *argp) return __sev_do_cmd_locked(SEV_CMD_FACTORY_RESET, 0, &argp->error); } +static int sev_ioctl_do_platform_status(struct sev_issue_cmd *argp) +{ + struct sev_user_data_status *data = &psp_master->status_cmd_buf; + int ret; + + ret = __sev_do_cmd_locked(SEV_CMD_PLATFORM_STATUS, data, &argp->error); + if (ret) + return ret; + + if (copy_to_user((void __user *)argp->data, data, sizeof(*data))) + ret = -EFAULT; + + return ret; +} + static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg) { void __user *argp = (void __user *)arg; @@ -299,6 +314,9 @@ static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg) case SEV_FACTORY_RESET: ret = sev_ioctl_do_reset(&input); break; + case SEV_PLATFORM_STATUS: + ret = sev_ioctl_do_platform_status(&input); + break; default: ret = -EINVAL; goto out; From 4d84b726be834d4991b94fd51df66ec299b66d45 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:30 -0600 Subject: [PATCH 021/197] crypto: ccp: Implement SEV_PEK_GEN ioctl command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SEV_PEK_GEN command is used to generate a new Platform Endorsement Key (PEK). The command is defined in SEV spec section 5.6. Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Borislav Petkov Cc: Herbert Xu Cc: Gary Hook Cc: Tom Lendacky Cc: linux-crypto@vger.kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Borislav Petkov Improvements-by: Borislav Petkov Signed-off-by: Brijesh Singh Acked-by: Gary R Hook --- drivers/crypto/ccp/psp-dev.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index a5072b166ab84d..8aa8036023e0b5 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -289,6 +289,19 @@ static int sev_ioctl_do_platform_status(struct sev_issue_cmd *argp) return ret; } +static int sev_ioctl_do_pek_pdh_gen(int cmd, struct sev_issue_cmd *argp) +{ + int rc; + + if (psp_master->sev_state == SEV_STATE_UNINIT) { + rc = __sev_platform_init_locked(&argp->error); + if (rc) + return rc; + } + + return __sev_do_cmd_locked(cmd, 0, &argp->error); +} + static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg) { void __user *argp = (void __user *)arg; @@ -317,6 +330,9 @@ static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg) case SEV_PLATFORM_STATUS: ret = sev_ioctl_do_platform_status(&input); break; + case SEV_PEK_GEN: + ret = sev_ioctl_do_pek_pdh_gen(SEV_CMD_PEK_GEN, &input); + break; default: ret = -EINVAL; goto out; From 77f65327228f001bf1a43eb09dc96bbdba5b4eac Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:30 -0600 Subject: [PATCH 022/197] crypto: ccp: Implement SEV_PDH_GEN ioctl command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SEV_PDH_GEN command is used to re-generate the Platform Diffie-Hellman (PDH) key. The command is defined in SEV spec section 5.6. Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Borislav Petkov Cc: Herbert Xu Cc: Gary Hook Cc: Tom Lendacky Cc: linux-crypto@vger.kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov Acked-by: Gary R Hook --- drivers/crypto/ccp/psp-dev.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index 8aa8036023e0b5..fd3daf0a1176e8 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -333,6 +333,9 @@ static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg) case SEV_PEK_GEN: ret = sev_ioctl_do_pek_pdh_gen(SEV_CMD_PEK_GEN, &input); break; + case SEV_PDH_GEN: + ret = sev_ioctl_do_pek_pdh_gen(SEV_CMD_PDH_GEN, &input); + break; default: ret = -EINVAL; goto out; From e799035609e1526761aa2f896a974b233d04d36d Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:31 -0600 Subject: [PATCH 023/197] crypto: ccp: Implement SEV_PEK_CSR ioctl command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SEV_PEK_CSR command can be used to generate a PEK certificate signing request. The command is defined in SEV spec section 5.7. Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Borislav Petkov Cc: Herbert Xu Cc: Gary Hook Cc: Tom Lendacky Cc: linux-crypto@vger.kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Improvements-by: Borislav Petkov Signed-off-by: Brijesh Singh Acked-by: Gary R Hook --- drivers/crypto/ccp/psp-dev.c | 66 ++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index fd3daf0a1176e8..c3906bbdb69be3 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -302,6 +302,69 @@ static int sev_ioctl_do_pek_pdh_gen(int cmd, struct sev_issue_cmd *argp) return __sev_do_cmd_locked(cmd, 0, &argp->error); } +static int sev_ioctl_do_pek_csr(struct sev_issue_cmd *argp) +{ + struct sev_user_data_pek_csr input; + struct sev_data_pek_csr *data; + void *blob = NULL; + int ret; + + if (copy_from_user(&input, (void __user *)argp->data, sizeof(input))) + return -EFAULT; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + /* userspace wants to query CSR length */ + if (!input.address || !input.length) + goto cmd; + + /* allocate a physically contiguous buffer to store the CSR blob */ + if (!access_ok(VERIFY_WRITE, input.address, input.length) || + input.length > SEV_FW_BLOB_MAX_SIZE) { + ret = -EFAULT; + goto e_free; + } + + blob = kmalloc(input.length, GFP_KERNEL); + if (!blob) { + ret = -ENOMEM; + goto e_free; + } + + data->address = __psp_pa(blob); + data->len = input.length; + +cmd: + if (psp_master->sev_state == SEV_STATE_UNINIT) { + ret = __sev_platform_init_locked(&argp->error); + if (ret) + goto e_free_blob; + } + + ret = __sev_do_cmd_locked(SEV_CMD_PEK_CSR, data, &argp->error); + + /* If we query the CSR length, FW responded with expected data. */ + input.length = data->len; + + if (copy_to_user((void __user *)argp->data, &input, sizeof(input))) { + ret = -EFAULT; + goto e_free_blob; + } + + if (blob) { + if (copy_to_user((void __user *)input.address, blob, input.length)) + ret = -EFAULT; + } + +e_free_blob: + kfree(blob); +e_free: + kfree(data); + return ret; +} + static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg) { void __user *argp = (void __user *)arg; @@ -336,6 +399,9 @@ static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg) case SEV_PDH_GEN: ret = sev_ioctl_do_pek_pdh_gen(SEV_CMD_PDH_GEN, &input); break; + case SEV_PEK_CSR: + ret = sev_ioctl_do_pek_csr(&input); + break; default: ret = -EINVAL; goto out; From 7360e4b14350a3a461645e0413ce58fbc8785fd8 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:31 -0600 Subject: [PATCH 024/197] crypto: ccp: Implement SEV_PEK_CERT_IMPORT ioctl command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SEV_PEK_CERT_IMPORT command can be used to import the signed PEK certificate. The command is defined in SEV spec section 5.8. Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Borislav Petkov Cc: Herbert Xu Cc: Gary Hook Cc: Tom Lendacky Cc: linux-crypto@vger.kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Improvements-by: Borislav Petkov Signed-off-by: Brijesh Singh Acked-by: Gary R Hook Reviewed-by: Borislav Petkov --- drivers/crypto/ccp/psp-dev.c | 81 ++++++++++++++++++++++++++++++++++++ include/linux/psp-sev.h | 4 ++ 2 files changed, 85 insertions(+) diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index c3906bbdb69be3..9d1c4600db19da 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -365,6 +365,84 @@ static int sev_ioctl_do_pek_csr(struct sev_issue_cmd *argp) return ret; } +void *psp_copy_user_blob(u64 __user uaddr, u32 len) +{ + void *data; + + if (!uaddr || !len) + return ERR_PTR(-EINVAL); + + /* verify that blob length does not exceed our limit */ + if (len > SEV_FW_BLOB_MAX_SIZE) + return ERR_PTR(-EINVAL); + + data = kmalloc(len, GFP_KERNEL); + if (!data) + return ERR_PTR(-ENOMEM); + + if (copy_from_user(data, (void __user *)(uintptr_t)uaddr, len)) + goto e_free; + + return data; + +e_free: + kfree(data); + return ERR_PTR(-EFAULT); +} +EXPORT_SYMBOL_GPL(psp_copy_user_blob); + +static int sev_ioctl_do_pek_import(struct sev_issue_cmd *argp) +{ + struct sev_user_data_pek_cert_import input; + struct sev_data_pek_cert_import *data; + void *pek_blob, *oca_blob; + int ret; + + if (copy_from_user(&input, (void __user *)argp->data, sizeof(input))) + return -EFAULT; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + /* copy PEK certificate blobs from userspace */ + pek_blob = psp_copy_user_blob(input.pek_cert_address, input.pek_cert_len); + if (IS_ERR(pek_blob)) { + ret = PTR_ERR(pek_blob); + goto e_free; + } + + data->pek_cert_address = __psp_pa(pek_blob); + data->pek_cert_len = input.pek_cert_len; + + /* copy PEK certificate blobs from userspace */ + oca_blob = psp_copy_user_blob(input.oca_cert_address, input.oca_cert_len); + if (IS_ERR(oca_blob)) { + ret = PTR_ERR(oca_blob); + goto e_free_pek; + } + + data->oca_cert_address = __psp_pa(oca_blob); + data->oca_cert_len = input.oca_cert_len; + + /* If platform is not in INIT state then transition it to INIT */ + if (psp_master->sev_state != SEV_STATE_INIT) { + ret = __sev_platform_init_locked(&argp->error); + if (ret) + goto e_free_oca; + } + + ret = __sev_do_cmd_locked(SEV_CMD_PEK_CERT_IMPORT, data, &argp->error); + +e_free_oca: + kfree(oca_blob); +e_free_pek: + kfree(pek_blob); +e_free: + kfree(data); + return ret; +} + static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg) { void __user *argp = (void __user *)arg; @@ -402,6 +480,9 @@ static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg) case SEV_PEK_CSR: ret = sev_ioctl_do_pek_csr(&input); break; + case SEV_PEK_CERT_IMPORT: + ret = sev_ioctl_do_pek_import(&input); + break; default: ret = -EINVAL; goto out; diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index 0b6dd306d88bff..93addfa3406199 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -576,6 +576,8 @@ int sev_guest_df_flush(int *error); */ int sev_guest_decommission(struct sev_data_decommission *data, int *error); +void *psp_copy_user_blob(u64 __user uaddr, u32 len); + #else /* !CONFIG_CRYPTO_DEV_SP_PSP */ static inline int @@ -597,6 +599,8 @@ static inline int sev_guest_df_flush(int *error) { return -ENODEV; } static inline int sev_issue_cmd_external_user(struct file *filep, unsigned int id, void *data, int *error) { return -ENODEV; } +static inline void *psp_copy_user_blob(u64 __user uaddr, u32 len) { return ERR_PTR(-EINVAL); } + #endif /* CONFIG_CRYPTO_DEV_SP_PSP */ #endif /* __PSP_SEV_H__ */ From 76a2b524a4b1d6dc0f2421f9854a01d55d5e5436 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:31 -0600 Subject: [PATCH 025/197] crypto: ccp: Implement SEV_PDH_CERT_EXPORT ioctl command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SEV_PDH_CERT_EXPORT command can be used to export the PDH and its certificate chain. The command is defined in SEV spec section 5.10. Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Borislav Petkov Cc: Herbert Xu Cc: Gary Hook Cc: Tom Lendacky Cc: linux-crypto@vger.kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Improvements-by: Borislav Petkov Signed-off-by: Brijesh Singh Acked-by: Gary R Hook --- drivers/crypto/ccp/psp-dev.c | 97 ++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index 9d1c4600db19da..fcfa5b1eae6169 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -443,6 +443,100 @@ static int sev_ioctl_do_pek_import(struct sev_issue_cmd *argp) return ret; } +static int sev_ioctl_do_pdh_export(struct sev_issue_cmd *argp) +{ + struct sev_user_data_pdh_cert_export input; + void *pdh_blob = NULL, *cert_blob = NULL; + struct sev_data_pdh_cert_export *data; + int ret; + + if (copy_from_user(&input, (void __user *)argp->data, sizeof(input))) + return -EFAULT; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + /* Userspace wants to query the certificate length. */ + if (!input.pdh_cert_address || + !input.pdh_cert_len || + !input.cert_chain_address) + goto cmd; + + /* Allocate a physically contiguous buffer to store the PDH blob. */ + if ((input.pdh_cert_len > SEV_FW_BLOB_MAX_SIZE) || + !access_ok(VERIFY_WRITE, input.pdh_cert_address, input.pdh_cert_len)) { + ret = -EFAULT; + goto e_free; + } + + /* Allocate a physically contiguous buffer to store the cert chain blob. */ + if ((input.cert_chain_len > SEV_FW_BLOB_MAX_SIZE) || + !access_ok(VERIFY_WRITE, input.cert_chain_address, input.cert_chain_len)) { + ret = -EFAULT; + goto e_free; + } + + pdh_blob = kmalloc(input.pdh_cert_len, GFP_KERNEL); + if (!pdh_blob) { + ret = -ENOMEM; + goto e_free; + } + + data->pdh_cert_address = __psp_pa(pdh_blob); + data->pdh_cert_len = input.pdh_cert_len; + + cert_blob = kmalloc(input.cert_chain_len, GFP_KERNEL); + if (!cert_blob) { + ret = -ENOMEM; + goto e_free_pdh; + } + + data->cert_chain_address = __psp_pa(cert_blob); + data->cert_chain_len = input.cert_chain_len; + +cmd: + /* If platform is not in INIT state then transition it to INIT. */ + if (psp_master->sev_state != SEV_STATE_INIT) { + ret = __sev_platform_init_locked(&argp->error); + if (ret) + goto e_free_cert; + } + + ret = __sev_do_cmd_locked(SEV_CMD_PDH_CERT_EXPORT, data, &argp->error); + + /* If we query the length, FW responded with expected data. */ + input.cert_chain_len = data->cert_chain_len; + input.pdh_cert_len = data->pdh_cert_len; + + if (copy_to_user((void __user *)argp->data, &input, sizeof(input))) { + ret = -EFAULT; + goto e_free_cert; + } + + if (pdh_blob) { + if (copy_to_user((void __user *)input.pdh_cert_address, + pdh_blob, input.pdh_cert_len)) { + ret = -EFAULT; + goto e_free_cert; + } + } + + if (cert_blob) { + if (copy_to_user((void __user *)input.cert_chain_address, + cert_blob, input.cert_chain_len)) + ret = -EFAULT; + } + +e_free_cert: + kfree(cert_blob); +e_free_pdh: + kfree(pdh_blob); +e_free: + kfree(data); + return ret; +} + static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg) { void __user *argp = (void __user *)arg; @@ -483,6 +577,9 @@ static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg) case SEV_PEK_CERT_IMPORT: ret = sev_ioctl_do_pek_import(&input); break; + case SEV_PDH_CERT_EXPORT: + ret = sev_ioctl_do_pdh_export(&input); + break; default: ret = -EINVAL; goto out; From 5dd0a57cf38eeb8b6be1d9c3df9add2f5756d974 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:32 -0600 Subject: [PATCH 026/197] KVM: X86: Add CONFIG_KVM_AMD_SEV MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The config option can be used to enable SEV support on AMD Processors. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- arch/x86/kvm/Kconfig | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 3df51c28784428..148ea324b90c40 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -81,6 +81,16 @@ config KVM_AMD To compile this as a module, choose M here: the module will be called kvm-amd. +config KVM_AMD_SEV + def_bool y + bool "AMD Secure Encrypted Virtualization (SEV) support" + depends on KVM_AMD && X86_64 + select CRYPTO_DEV_CCP + select CRYPTO_DEV_CCP_DD + select CRYPTO_DEV_SP_PSP + ---help--- + Provides support for launching Encrypted VMs on AMD processors. + config KVM_MMU_AUDIT bool "Audit KVM MMU" depends on KVM && TRACEPOINTS From ed3cd233f8074015e164547abfcdd4678fd10bb4 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:32 -0600 Subject: [PATCH 027/197] KVM: SVM: Reserve ASID range for SEV guest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A SEV-enabled guest must use ASIDs from the defined subset, while non-SEV guests can use the remaining ASID range. The range of allowed SEV guest ASIDs is [1 - CPUID_8000_001F[ECX][31:0]]. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Improvements-by: Borislav Petkov Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- arch/x86/kvm/svm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 7e302a25bc4550..bff7ce727a782a 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -319,6 +319,8 @@ enum { #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL +static unsigned int max_sev_asid; + static inline void mark_all_dirty(struct vmcb *vmcb) { vmcb->control.clean = 0; @@ -783,7 +785,7 @@ static int svm_hardware_enable(void) sd->asid_generation = 1; sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; sd->next_asid = sd->max_asid + 1; - sd->min_asid = 1; + sd->min_asid = max_sev_asid + 1; gdt = get_current_gdt_rw(); sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); From e9df09428996fcdc43e2b0db2a0e8b38198931c4 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:33 -0600 Subject: [PATCH 028/197] KVM: SVM: Add sev module_param MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The module parameter can be used to control the SEV feature support. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- arch/x86/kvm/svm.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index bff7ce727a782a..ca63642517a707 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -284,6 +285,10 @@ module_param(vls, int, 0444); static int vgif = true; module_param(vgif, int, 0444); +/* enable/disable SEV support */ +static int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT); +module_param(sev, int, 0444); + static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); static void svm_flush_tlb(struct kvm_vcpu *vcpu); static void svm_complete_interrupts(struct vcpu_svm *svm); @@ -1049,6 +1054,39 @@ static int avic_ga_log_notifier(u32 ga_tag) return 0; } +static __init int sev_hardware_setup(void) +{ + struct sev_user_data_status *status; + int rc; + + /* Maximum number of encrypted guests supported simultaneously */ + max_sev_asid = cpuid_ecx(0x8000001F); + + if (!max_sev_asid) + return 1; + + status = kmalloc(sizeof(*status), GFP_KERNEL); + if (!status) + return 1; + + /* + * Check SEV platform status. + * + * PLATFORM_STATUS can be called in any state, if we failed to query + * the PLATFORM status then either PSP firmware does not support SEV + * feature or SEV firmware is dead. + */ + rc = sev_platform_status(status, NULL); + if (rc) + goto err; + + pr_info("SEV supported\n"); + +err: + kfree(status); + return rc; +} + static __init int svm_hardware_setup(void) { int cpu; @@ -1084,6 +1122,17 @@ static __init int svm_hardware_setup(void) kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); } + if (sev) { + if (boot_cpu_has(X86_FEATURE_SEV) && + IS_ENABLED(CONFIG_KVM_AMD_SEV)) { + r = sev_hardware_setup(); + if (r) + sev = false; + } else { + sev = false; + } + } + for_each_possible_cpu(cpu) { r = svm_cpu_init(cpu); if (r) From dc48bae01e5a23ae67758e8fe31cdc439202b190 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:33 -0600 Subject: [PATCH 029/197] KVM: Define SEV key management command id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define Secure Encrypted Virtualization (SEV) key management command id and structure. The command definition is available in SEV KM spec 0.14 (http://support.amd.com/TechDocs/55766_SEV-KM API_Specification.pdf) and Documentation/virtual/kvm/amd-memory-encryption.txt. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: Jonathan Corbet Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Improvements-by: Borislav Petkov Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- .../virtual/kvm/amd-memory-encryption.rst | 202 ++++++++++++++++++ include/uapi/linux/kvm.h | 80 +++++++ 2 files changed, 282 insertions(+) diff --git a/Documentation/virtual/kvm/amd-memory-encryption.rst b/Documentation/virtual/kvm/amd-memory-encryption.rst index a8ef21e737db69..71d6d257074ffd 100644 --- a/Documentation/virtual/kvm/amd-memory-encryption.rst +++ b/Documentation/virtual/kvm/amd-memory-encryption.rst @@ -43,3 +43,205 @@ setting the SEV bit before executing VMRUN.:: SEV hardware uses ASIDs to associate a memory encryption key with a VM. Hence, the ASID for the SEV-enabled guests must be from 1 to a maximum value defined in the CPUID 0x8000001f[ecx] field. + +SEV Key Management +================== + +The SEV guest key management is handled by a separate processor called the AMD +Secure Processor (AMD-SP). Firmware running inside the AMD-SP provides a secure +key management interface to perform common hypervisor activities such as +encrypting bootstrap code, snapshot, migrating and debugging the guest. For more +information, see the SEV Key Management spec [api-spec]_ + +KVM implements the following commands to support common lifecycle events of SEV +guests, such as launching, running, snapshotting, migrating and decommissioning. + +1. KVM_SEV_INIT +--------------- + +The KVM_SEV_INIT command is used by the hypervisor to initialize the SEV platform +context. In a typical workflow, this command should be the first command issued. + +Returns: 0 on success, -negative on error + +2. KVM_SEV_LAUNCH_START +----------------------- + +The KVM_SEV_LAUNCH_START command is used for creating the memory encryption +context. To create the encryption context, user must provide a guest policy, +the owner's public Diffie-Hellman (PDH) key and session information. + +Parameters: struct kvm_sev_launch_start (in/out) + +Returns: 0 on success, -negative on error + +:: + + struct kvm_sev_launch_start { + __u32 handle; /* if zero then firmware creates a new handle */ + __u32 policy; /* guest's policy */ + + __u64 dh_uaddr; /* userspace address pointing to the guest owner's PDH key */ + __u32 dh_len; + + __u64 session_addr; /* userspace address which points to the guest session information */ + __u32 session_len; + }; + +On success, the 'handle' field contains a new handle and on error, a negative value. + +For more details, see SEV spec Section 6.2. + +3. KVM_SEV_LAUNCH_UPDATE_DATA +----------------------------- + +The KVM_SEV_LAUNCH_UPDATE_DATA is used for encrypting a memory region. It also +calculates a measurement of the memory contents. The measurement is a signature +of the memory contents that can be sent to the guest owner as an attestation +that the memory was encrypted correctly by the firmware. + +Parameters (in): struct kvm_sev_launch_update_data + +Returns: 0 on success, -negative on error + +:: + + struct kvm_sev_launch_update { + __u64 uaddr; /* userspace address to be encrypted (must be 16-byte aligned) */ + __u32 len; /* length of the data to be encrypted (must be 16-byte aligned) */ + }; + +For more details, see SEV spec Section 6.3. + +4. KVM_SEV_LAUNCH_MEASURE +------------------------- + +The KVM_SEV_LAUNCH_MEASURE command is used to retrieve the measurement of the +data encrypted by the KVM_SEV_LAUNCH_UPDATE_DATA command. The guest owner may +wait to provide the guest with confidential information until it can verify the +measurement. Since the guest owner knows the initial contents of the guest at +boot, the measurement can be verified by comparing it to what the guest owner +expects. + +Parameters (in): struct kvm_sev_launch_measure + +Returns: 0 on success, -negative on error + +:: + + struct kvm_sev_launch_measure { + __u64 uaddr; /* where to copy the measurement */ + __u32 len; /* length of measurement blob */ + }; + +For more details on the measurement verification flow, see SEV spec Section 6.4. + +5. KVM_SEV_LAUNCH_FINISH +------------------------ + +After completion of the launch flow, the KVM_SEV_LAUNCH_FINISH command can be +issued to make the guest ready for the execution. + +Returns: 0 on success, -negative on error + +6. KVM_SEV_GUEST_STATUS +----------------------- + +The KVM_SEV_GUEST_STATUS command is used to retrieve status information about a +SEV-enabled guest. + +Parameters (out): struct kvm_sev_guest_status + +Returns: 0 on success, -negative on error + +:: + + struct kvm_sev_guest_status { + __u32 handle; /* guest handle */ + __u32 policy; /* guest policy */ + __u8 state; /* guest state (see enum below) */ + }; + +SEV guest state: + +:: + + enum { + SEV_STATE_INVALID = 0; + SEV_STATE_LAUNCHING, /* guest is currently being launched */ + SEV_STATE_SECRET, /* guest is being launched and ready to accept the ciphertext data */ + SEV_STATE_RUNNING, /* guest is fully launched and running */ + SEV_STATE_RECEIVING, /* guest is being migrated in from another SEV machine */ + SEV_STATE_SENDING /* guest is getting migrated out to another SEV machine */ + }; + +7. KVM_SEV_DBG_DECRYPT +---------------------- + +The KVM_SEV_DEBUG_DECRYPT command can be used by the hypervisor to request the +firmware to decrypt the data at the given memory region. + +Parameters (in): struct kvm_sev_dbg + +Returns: 0 on success, -negative on error + +:: + + struct kvm_sev_dbg { + __u64 src_uaddr; /* userspace address of data to decrypt */ + __u64 dst_uaddr; /* userspace address of destination */ + __u32 len; /* length of memory region to decrypt */ + }; + +The command returns an error if the guest policy does not allow debugging. + +8. KVM_SEV_DBG_ENCRYPT +---------------------- + +The KVM_SEV_DEBUG_ENCRYPT command can be used by the hypervisor to request the +firmware to encrypt the data at the given memory region. + +Parameters (in): struct kvm_sev_dbg + +Returns: 0 on success, -negative on error + +:: + + struct kvm_sev_dbg { + __u64 src_uaddr; /* userspace address of data to encrypt */ + __u64 dst_uaddr; /* userspace address of destination */ + __u32 len; /* length of memory region to encrypt */ + }; + +The command returns an error if the guest policy does not allow debugging. + +9. KVM_SEV_LAUNCH_SECRET +------------------------ + +The KVM_SEV_LAUNCH_SECRET command can be used by the hypervisor to inject secret +data after the measurement has been validated by the guest owner. + +Parameters (in): struct kvm_sev_launch_secret + +Returns: 0 on success, -negative on error + +:: + + struct kvm_sev_launch_secret { + __u64 hdr_uaddr; /* userspace address containing the packet header */ + __u32 hdr_len; + + __u64 guest_uaddr; /* the guest memory region where the secret should be injected */ + __u32 guest_len; + + __u64 trans_uaddr; /* the hypervisor memory region which contains the secret */ + __u32 trans_len; + }; + +References +========== + +.. [white-paper] http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2013/12/AMD_Memory_Encryption_Whitepaper_v7-Public.pdf +.. [api-spec] http://support.amd.com/TechDocs/55766_SEV-KM%20API_Specification.pdf +.. [amd-apm] http://support.amd.com/TechDocs/24593.pdf (section 15.34) +.. [kvm-forum] http://www.linux-kvm.org/images/7/74/02x08A-Thomas_Lendacky-AMDs_Virtualizatoin_Memory_Encryption_Technology.pdf diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index c8c65190907d61..571431d3384b40 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1369,6 +1369,86 @@ struct kvm_enc_region { #define KVM_MEMORY_ENCRYPT_REG_REGION _IOR(KVMIO, 0xbb, struct kvm_enc_region) #define KVM_MEMORY_ENCRYPT_UNREG_REGION _IOR(KVMIO, 0xbc, struct kvm_enc_region) +/* Secure Encrypted Virtualization command */ +enum sev_cmd_id { + /* Guest initialization commands */ + KVM_SEV_INIT = 0, + KVM_SEV_ES_INIT, + /* Guest launch commands */ + KVM_SEV_LAUNCH_START, + KVM_SEV_LAUNCH_UPDATE_DATA, + KVM_SEV_LAUNCH_UPDATE_VMSA, + KVM_SEV_LAUNCH_SECRET, + KVM_SEV_LAUNCH_MEASURE, + KVM_SEV_LAUNCH_FINISH, + /* Guest migration commands (outgoing) */ + KVM_SEV_SEND_START, + KVM_SEV_SEND_UPDATE_DATA, + KVM_SEV_SEND_UPDATE_VMSA, + KVM_SEV_SEND_FINISH, + /* Guest migration commands (incoming) */ + KVM_SEV_RECEIVE_START, + KVM_SEV_RECEIVE_UPDATE_DATA, + KVM_SEV_RECEIVE_UPDATE_VMSA, + KVM_SEV_RECEIVE_FINISH, + /* Guest status and debug commands */ + KVM_SEV_GUEST_STATUS, + KVM_SEV_DBG_DECRYPT, + KVM_SEV_DBG_ENCRYPT, + /* Guest certificates commands */ + KVM_SEV_CERT_EXPORT, + + KVM_SEV_NR_MAX, +}; + +struct kvm_sev_cmd { + __u32 id; + __u64 data; + __u32 error; + __u32 sev_fd; +}; + +struct kvm_sev_launch_start { + __u32 handle; + __u32 policy; + __u64 dh_uaddr; + __u32 dh_len; + __u64 session_uaddr; + __u32 session_len; +}; + +struct kvm_sev_launch_update_data { + __u64 uaddr; + __u32 len; +}; + + +struct kvm_sev_launch_secret { + __u64 hdr_uaddr; + __u32 hdr_len; + __u64 guest_uaddr; + __u32 guest_len; + __u64 trans_uaddr; + __u32 trans_len; +}; + +struct kvm_sev_launch_measure { + __u64 uaddr; + __u32 len; +}; + +struct kvm_sev_guest_status { + __u32 handle; + __u32 policy; + __u32 state; +}; + +struct kvm_sev_dbg { + __u64 src_uaddr; + __u64 dst_uaddr; + __u32 len; +}; + #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) #define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) From 1654efcbc431a369397a20bf85e45870d15c8689 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:34 -0600 Subject: [PATCH 030/197] KVM: SVM: Add KVM_SEV_INIT command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The command initializes the SEV platform context and allocates a new ASID for this guest from the SEV ASID pool. The firmware must be initialized before we issue any guest launch commands to create a new memory encryption context. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- arch/x86/include/asm/kvm_host.h | 7 ++ arch/x86/kvm/svm.c | 132 +++++++++++++++++++++++++++++++- 2 files changed, 138 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 58b7cc30466b42..384dcfda43cc75 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -747,6 +747,11 @@ enum kvm_irqchip_mode { KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */ }; +struct kvm_sev_info { + bool active; /* SEV enabled guest */ + unsigned int asid; /* ASID used for this guest */ +}; + struct kvm_arch { unsigned int n_used_mmu_pages; unsigned int n_requested_mmu_pages; @@ -834,6 +839,8 @@ struct kvm_arch { bool x2apic_format; bool x2apic_broadcast_quirk_disabled; + + struct kvm_sev_info sev_info; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ca63642517a707..51186107eb22ac 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -325,6 +326,20 @@ enum { #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL static unsigned int max_sev_asid; +static unsigned int min_sev_asid; +static unsigned long *sev_asid_bitmap; + +static inline bool svm_sev_enabled(void) +{ + return max_sev_asid; +} + +static inline bool sev_guest(struct kvm *kvm) +{ + struct kvm_sev_info *sev = &kvm->arch.sev_info; + + return sev->active; +} static inline void mark_all_dirty(struct vmcb *vmcb) { @@ -1065,6 +1080,15 @@ static __init int sev_hardware_setup(void) if (!max_sev_asid) return 1; + /* Minimum ASID value that should be used for SEV guest */ + min_sev_asid = cpuid_edx(0x8000001F); + + /* Initialize SEV ASID bitmap */ + sev_asid_bitmap = kcalloc(BITS_TO_LONGS(max_sev_asid), + sizeof(unsigned long), GFP_KERNEL); + if (!sev_asid_bitmap) + return 1; + status = kmalloc(sizeof(*status), GFP_KERNEL); if (!status) return 1; @@ -1194,6 +1218,9 @@ static __exit void svm_hardware_unsetup(void) { int cpu; + if (svm_sev_enabled()) + kfree(sev_asid_bitmap); + for_each_possible_cpu(cpu) svm_cpu_uninit(cpu); @@ -1384,6 +1411,9 @@ static void init_vmcb(struct vcpu_svm *svm) svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK; } + if (sev_guest(svm->vcpu.kvm)) + svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE; + mark_all_dirty(svm->vmcb); enable_gif(svm); @@ -1466,6 +1496,29 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) return 0; } +static void __sev_asid_free(int asid) +{ + int pos; + + pos = asid - 1; + clear_bit(pos, sev_asid_bitmap); +} + +static void sev_asid_free(struct kvm *kvm) +{ + struct kvm_sev_info *sev = &kvm->arch.sev_info; + + __sev_asid_free(sev->asid); +} + +static void sev_vm_destroy(struct kvm *kvm) +{ + if (!sev_guest(kvm)) + return; + + sev_asid_free(kvm); +} + static void avic_vm_destroy(struct kvm *kvm) { unsigned long flags; @@ -1484,6 +1537,12 @@ static void avic_vm_destroy(struct kvm *kvm) spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); } +static void svm_vm_destroy(struct kvm *kvm) +{ + avic_vm_destroy(kvm); + sev_vm_destroy(kvm); +} + static int avic_vm_init(struct kvm *kvm) { unsigned long flags; @@ -5556,6 +5615,75 @@ static int enable_smi_window(struct kvm_vcpu *vcpu) return 0; } +static int sev_asid_new(void) +{ + int pos; + + /* + * SEV-enabled guest must use asid from min_sev_asid to max_sev_asid. + */ + pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_sev_asid - 1); + if (pos >= max_sev_asid) + return -EBUSY; + + set_bit(pos, sev_asid_bitmap); + return pos + 1; +} + +static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &kvm->arch.sev_info; + int asid, ret; + + ret = -EBUSY; + asid = sev_asid_new(); + if (asid < 0) + return ret; + + ret = sev_platform_init(&argp->error); + if (ret) + goto e_free; + + sev->active = true; + sev->asid = asid; + + return 0; + +e_free: + __sev_asid_free(asid); + return ret; +} + +static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) +{ + struct kvm_sev_cmd sev_cmd; + int r; + + if (!svm_sev_enabled()) + return -ENOTTY; + + if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd))) + return -EFAULT; + + mutex_lock(&kvm->lock); + + switch (sev_cmd.id) { + case KVM_SEV_INIT: + r = sev_guest_init(kvm, &sev_cmd); + break; + default: + r = -EINVAL; + goto out; + } + + if (copy_to_user(argp, &sev_cmd, sizeof(struct kvm_sev_cmd))) + r = -EFAULT; + +out: + mutex_unlock(&kvm->lock); + return r; +} + static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -5572,7 +5700,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .vcpu_reset = svm_vcpu_reset, .vm_init = avic_vm_init, - .vm_destroy = avic_vm_destroy, + .vm_destroy = svm_vm_destroy, .prepare_guest_switch = svm_prepare_guest_switch, .vcpu_load = svm_vcpu_load, @@ -5671,6 +5799,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .pre_enter_smm = svm_pre_enter_smm, .pre_leave_smm = svm_pre_leave_smm, .enable_smi_window = enable_smi_window, + + .mem_enc_op = svm_mem_enc_op, }; static int __init svm_init(void) From 70cd94e60c733e3afc18b0e6aab789c13b5571da Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:34 -0600 Subject: [PATCH 031/197] KVM: SVM: VMRUN should use associated ASID when SEV is enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SEV hardware uses ASIDs to associate a memory encryption key with a guest VM. During guest creation, a SEV VM uses the SEV_CMD_ACTIVATE command to bind a particular ASID to the guest. Lets make sure that the VMCB is programmed with the bound ASID before a VMRUN. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- arch/x86/kvm/svm.c | 58 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 51186107eb22ac..cdbdc86d7aee94 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -213,6 +213,9 @@ struct vcpu_svm { */ struct list_head ir_list; spinlock_t ir_list_lock; + + /* which host CPU was used for running this vcpu */ + unsigned int last_cpu; }; /* @@ -341,6 +344,13 @@ static inline bool sev_guest(struct kvm *kvm) return sev->active; } +static inline int sev_get_asid(struct kvm *kvm) +{ + struct kvm_sev_info *sev = &kvm->arch.sev_info; + + return sev->asid; +} + static inline void mark_all_dirty(struct vmcb *vmcb) { vmcb->control.clean = 0; @@ -551,6 +561,9 @@ struct svm_cpu_data { struct kvm_ldttss_desc *tss_desc; struct page *save_area; + + /* index = sev_asid, value = vmcb pointer */ + struct vmcb **sev_vmcbs; }; static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); @@ -864,6 +877,7 @@ static void svm_cpu_uninit(int cpu) return; per_cpu(svm_data, raw_smp_processor_id()) = NULL; + kfree(sd->sev_vmcbs); __free_page(sd->save_area); kfree(sd); } @@ -877,11 +891,18 @@ static int svm_cpu_init(int cpu) if (!sd) return -ENOMEM; sd->cpu = cpu; - sd->save_area = alloc_page(GFP_KERNEL); r = -ENOMEM; + sd->save_area = alloc_page(GFP_KERNEL); if (!sd->save_area) goto err_1; + if (svm_sev_enabled()) { + r = -ENOMEM; + sd->sev_vmcbs = kmalloc((max_sev_asid + 1) * sizeof(void *), GFP_KERNEL); + if (!sd->sev_vmcbs) + goto err_1; + } + per_cpu(svm_data, cpu) = sd; return 0; @@ -1498,10 +1519,16 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) static void __sev_asid_free(int asid) { - int pos; + struct svm_cpu_data *sd; + int cpu, pos; pos = asid - 1; clear_bit(pos, sev_asid_bitmap); + + for_each_possible_cpu(cpu) { + sd = per_cpu(svm_data, cpu); + sd->sev_vmcbs[pos] = NULL; + } } static void sev_asid_free(struct kvm *kvm) @@ -4466,12 +4493,39 @@ static void reload_tss(struct kvm_vcpu *vcpu) load_TR_desc(); } +static void pre_sev_run(struct vcpu_svm *svm, int cpu) +{ + struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + int asid = sev_get_asid(svm->vcpu.kvm); + + /* Assign the asid allocated with this SEV guest */ + svm->vmcb->control.asid = asid; + + /* + * Flush guest TLB: + * + * 1) when different VMCB for the same ASID is to be run on the same host CPU. + * 2) or this VMCB was executed on different host CPU in previous VMRUNs. + */ + if (sd->sev_vmcbs[asid] == svm->vmcb && + svm->last_cpu == cpu) + return; + + svm->last_cpu = cpu; + sd->sev_vmcbs[asid] = svm->vmcb; + svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; + mark_dirty(svm->vmcb, VMCB_ASID); +} + static void pre_svm_run(struct vcpu_svm *svm) { int cpu = raw_smp_processor_id(); struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + if (sev_guest(svm->vcpu.kvm)) + return pre_sev_run(svm, cpu); + /* FIXME: handle wraparound of asid_generation */ if (svm->asid_generation != sd->asid_generation) new_asid(svm, sd); From 59414c989220825f970f38dbcbf11f18e817d73c Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:35 -0600 Subject: [PATCH 032/197] KVM: SVM: Add support for KVM_SEV_LAUNCH_START command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The KVM_SEV_LAUNCH_START command is used to create a memory encryption context within the SEV firmware. In order to do so, the guest owner should provide the guest's policy, its public Diffie-Hellman (PDH) key and session information. The command implements the LAUNCH_START flow defined in SEV spec Section 6.2. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Improvements-by: Borislav Petkov Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- arch/x86/include/asm/kvm_host.h | 2 + arch/x86/kvm/svm.c | 153 ++++++++++++++++++++++++++++++++ 2 files changed, 155 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 384dcfda43cc75..0ea890375532d1 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -750,6 +750,8 @@ enum kvm_irqchip_mode { struct kvm_sev_info { bool active; /* SEV enabled guest */ unsigned int asid; /* ASID used for this guest */ + unsigned int handle; /* SEV firmware handle */ + int fd; /* SEV device fd */ }; struct kvm_arch { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index cdbdc86d7aee94..e5b712e5518654 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1538,11 +1538,45 @@ static void sev_asid_free(struct kvm *kvm) __sev_asid_free(sev->asid); } +static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) +{ + struct sev_data_decommission *decommission; + struct sev_data_deactivate *data; + + if (!handle) + return; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return; + + /* deactivate handle */ + data->handle = handle; + sev_guest_deactivate(data, NULL); + + wbinvd_on_all_cpus(); + sev_guest_df_flush(NULL); + kfree(data); + + decommission = kzalloc(sizeof(*decommission), GFP_KERNEL); + if (!decommission) + return; + + /* decommission handle */ + decommission->handle = handle; + sev_guest_decommission(decommission, NULL); + + kfree(decommission); +} + static void sev_vm_destroy(struct kvm *kvm) { + struct kvm_sev_info *sev = &kvm->arch.sev_info; + if (!sev_guest(kvm)) return; + sev_unbind_asid(kvm, sev->handle); sev_asid_free(kvm); } @@ -5708,6 +5742,122 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) return ret; } +static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) +{ + struct sev_data_activate *data; + int asid = sev_get_asid(kvm); + int ret; + + wbinvd_on_all_cpus(); + + ret = sev_guest_df_flush(error); + if (ret) + return ret; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + /* activate ASID on the given handle */ + data->handle = handle; + data->asid = asid; + ret = sev_guest_activate(data, error); + kfree(data); + + return ret; +} + +static int sev_issue_cmd(int fd, int id, void *data, int *error) +{ + struct fd f; + int ret; + + f = fdget(fd); + if (!f.file) + return -EBADF; + + ret = sev_issue_cmd_external_user(f.file, id, data, error); + + fdput(f); + return ret; +} + +static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct sev_data_launch_start *start; + struct kvm_sev_launch_start params; + void *dh_blob, *session_blob; + int *error = &argp->error; + int ret; + + if (!sev_guest(kvm)) + return -ENOTTY; + + if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + return -EFAULT; + + start = kzalloc(sizeof(*start), GFP_KERNEL); + if (!start) + return -ENOMEM; + + dh_blob = NULL; + if (params.dh_uaddr) { + dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len); + if (IS_ERR(dh_blob)) { + ret = PTR_ERR(dh_blob); + goto e_free; + } + + start->dh_cert_address = __sme_set(__pa(dh_blob)); + start->dh_cert_len = params.dh_len; + } + + session_blob = NULL; + if (params.session_uaddr) { + session_blob = psp_copy_user_blob(params.session_uaddr, params.session_len); + if (IS_ERR(session_blob)) { + ret = PTR_ERR(session_blob); + goto e_free_dh; + } + + start->session_address = __sme_set(__pa(session_blob)); + start->session_len = params.session_len; + } + + start->handle = params.handle; + start->policy = params.policy; + + /* create memory encryption context */ + ret = sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, start, error); + if (ret) + goto e_free_session; + + /* Bind ASID to this guest */ + ret = sev_bind_asid(kvm, start->handle, error); + if (ret) + goto e_free_session; + + /* return handle to userspace */ + params.handle = start->handle; + if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms, sizeof(params))) { + sev_unbind_asid(kvm, start->handle); + ret = -EFAULT; + goto e_free_session; + } + + sev->handle = start->handle; + sev->fd = argp->sev_fd; + +e_free_session: + kfree(session_blob); +e_free_dh: + kfree(dh_blob); +e_free: + kfree(start); + return ret; +} + static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; @@ -5725,6 +5875,9 @@ static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) case KVM_SEV_INIT: r = sev_guest_init(kvm, &sev_cmd); break; + case KVM_SEV_LAUNCH_START: + r = sev_launch_start(kvm, &sev_cmd); + break; default: r = -EINVAL; goto out; From 89c5058090528026f6542e8b12f4262e492bd3a2 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:35 -0600 Subject: [PATCH 033/197] KVM: SVM: Add support for KVM_SEV_LAUNCH_UPDATE_DATA command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The command is used for encrypting the guest memory region using the VM encryption key (VEK) created during KVM_SEV_LAUNCH_START. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Improvements-by: Borislav Petkov Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 191 +++++++++++++++++++++++++++++++- 2 files changed, 190 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0ea890375532d1..a0b021f1fd0508 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -752,6 +752,7 @@ struct kvm_sev_info { unsigned int asid; /* ASID used for this guest */ unsigned int handle; /* SEV firmware handle */ int fd; /* SEV device fd */ + unsigned long pages_locked; /* Number of pages locked */ }; struct kvm_arch { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e5b712e5518654..88951cbef3ec14 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -39,6 +39,8 @@ #include #include #include +#include +#include #include #include @@ -331,6 +333,7 @@ enum { static unsigned int max_sev_asid; static unsigned int min_sev_asid; static unsigned long *sev_asid_bitmap; +#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT) static inline bool svm_sev_enabled(void) { @@ -1569,6 +1572,83 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) kfree(decommission); } +static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, + unsigned long ulen, unsigned long *n, + int write) +{ + struct kvm_sev_info *sev = &kvm->arch.sev_info; + unsigned long npages, npinned, size; + unsigned long locked, lock_limit; + struct page **pages; + int first, last; + + /* Calculate number of pages. */ + first = (uaddr & PAGE_MASK) >> PAGE_SHIFT; + last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT; + npages = (last - first + 1); + + locked = sev->pages_locked + npages; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { + pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit); + return NULL; + } + + /* Avoid using vmalloc for smaller buffers. */ + size = npages * sizeof(struct page *); + if (size > PAGE_SIZE) + pages = vmalloc(size); + else + pages = kmalloc(size, GFP_KERNEL); + + if (!pages) + return NULL; + + /* Pin the user virtual address. */ + npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages); + if (npinned != npages) { + pr_err("SEV: Failure locking %lu pages.\n", npages); + goto err; + } + + *n = npages; + sev->pages_locked = locked; + + return pages; + +err: + if (npinned > 0) + release_pages(pages, npinned); + + kvfree(pages); + return NULL; +} + +static void sev_unpin_memory(struct kvm *kvm, struct page **pages, + unsigned long npages) +{ + struct kvm_sev_info *sev = &kvm->arch.sev_info; + + release_pages(pages, npages); + kvfree(pages); + sev->pages_locked -= npages; +} + +static void sev_clflush_pages(struct page *pages[], unsigned long npages) +{ + uint8_t *page_virtual; + unsigned long i; + + if (npages == 0 || pages == NULL) + return; + + for (i = 0; i < npages; i++) { + page_virtual = kmap_atomic(pages[i]); + clflush_cache_range(page_virtual, PAGE_SIZE); + kunmap_atomic(page_virtual); + } +} + static void sev_vm_destroy(struct kvm *kvm) { struct kvm_sev_info *sev = &kvm->arch.sev_info; @@ -5767,7 +5847,7 @@ static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) return ret; } -static int sev_issue_cmd(int fd, int id, void *data, int *error) +static int __sev_issue_cmd(int fd, int id, void *data, int *error) { struct fd f; int ret; @@ -5782,6 +5862,13 @@ static int sev_issue_cmd(int fd, int id, void *data, int *error) return ret; } +static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error) +{ + struct kvm_sev_info *sev = &kvm->arch.sev_info; + + return __sev_issue_cmd(sev->fd, id, data, error); +} + static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) { struct kvm_sev_info *sev = &kvm->arch.sev_info; @@ -5829,7 +5916,7 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) start->policy = params.policy; /* create memory encryption context */ - ret = sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, start, error); + ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, start, error); if (ret) goto e_free_session; @@ -5858,6 +5945,103 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) return ret; } +static int get_num_contig_pages(int idx, struct page **inpages, + unsigned long npages) +{ + unsigned long paddr, next_paddr; + int i = idx + 1, pages = 1; + + /* find the number of contiguous pages starting from idx */ + paddr = __sme_page_pa(inpages[idx]); + while (i < npages) { + next_paddr = __sme_page_pa(inpages[i++]); + if ((paddr + PAGE_SIZE) == next_paddr) { + pages++; + paddr = next_paddr; + continue; + } + break; + } + + return pages; +} + +static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + unsigned long vaddr, vaddr_end, next_vaddr, npages, size; + struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct kvm_sev_launch_update_data params; + struct sev_data_launch_update_data *data; + struct page **inpages; + int i, ret, pages; + + if (!sev_guest(kvm)) + return -ENOTTY; + + if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + return -EFAULT; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + vaddr = params.uaddr; + size = params.len; + vaddr_end = vaddr + size; + + /* Lock the user memory. */ + inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1); + if (!inpages) { + ret = -ENOMEM; + goto e_free; + } + + /* + * The LAUNCH_UPDATE command will perform in-place encryption of the + * memory content (i.e it will write the same memory region with C=1). + * It's possible that the cache may contain the data with C=0, i.e., + * unencrypted so invalidate it first. + */ + sev_clflush_pages(inpages, npages); + + for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) { + int offset, len; + + /* + * If the user buffer is not page-aligned, calculate the offset + * within the page. + */ + offset = vaddr & (PAGE_SIZE - 1); + + /* Calculate the number of pages that can be encrypted in one go. */ + pages = get_num_contig_pages(i, inpages, npages); + + len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size); + + data->handle = sev->handle; + data->len = len; + data->address = __sme_page_pa(inpages[i]) + offset; + ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, data, &argp->error); + if (ret) + goto e_unpin; + + size -= len; + next_vaddr = vaddr + len; + } + +e_unpin: + /* content of memory is updated, mark pages dirty */ + for (i = 0; i < npages; i++) { + set_page_dirty_lock(inpages[i]); + mark_page_accessed(inpages[i]); + } + /* unlock the user pages */ + sev_unpin_memory(kvm, inpages, npages); +e_free: + kfree(data); + return ret; +} + static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; @@ -5878,6 +6062,9 @@ static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) case KVM_SEV_LAUNCH_START: r = sev_launch_start(kvm, &sev_cmd); break; + case KVM_SEV_LAUNCH_UPDATE_DATA: + r = sev_launch_update_data(kvm, &sev_cmd); + break; default: r = -EINVAL; goto out; From 0d0736f76347f7e14a3e5bb7a202a75a53ce1e3b Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:36 -0600 Subject: [PATCH 034/197] KVM: SVM: Add support for KVM_SEV_LAUNCH_MEASURE command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The command is used to retrieve the measurement of contents encrypted through the KVM_SEV_LAUNCH_UPDATE_DATA command. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- arch/x86/kvm/svm.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 88951cbef3ec14..74e010e6b5b912 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6042,6 +6042,77 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) return ret; } +static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct sev_data_launch_measure *data; + struct kvm_sev_launch_measure params; + void *blob = NULL; + int ret; + + if (!sev_guest(kvm)) + return -ENOTTY; + + if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + return -EFAULT; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + /* User wants to query the blob length */ + if (!params.len) + goto cmd; + + if (params.uaddr) { + if (params.len > SEV_FW_BLOB_MAX_SIZE) { + ret = -EINVAL; + goto e_free; + } + + if (!access_ok(VERIFY_WRITE, params.uaddr, params.len)) { + ret = -EFAULT; + goto e_free; + } + + ret = -ENOMEM; + blob = kmalloc(params.len, GFP_KERNEL); + if (!blob) + goto e_free; + + data->address = __psp_pa(blob); + data->len = params.len; + } + +cmd: + data->handle = sev->handle; + ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, data, &argp->error); + + /* + * If we query the session length, FW responded with expected data. + */ + if (!params.len) + goto done; + + if (ret) + goto e_free_blob; + + if (blob) { + if (copy_to_user((void __user *)(uintptr_t)params.uaddr, blob, params.len)) + ret = -EFAULT; + } + +done: + params.len = data->len; + if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms, sizeof(params))) + ret = -EFAULT; +e_free_blob: + kfree(blob); +e_free: + kfree(data); + return ret; +} + static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; @@ -6065,6 +6136,9 @@ static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) case KVM_SEV_LAUNCH_UPDATE_DATA: r = sev_launch_update_data(kvm, &sev_cmd); break; + case KVM_SEV_LAUNCH_MEASURE: + r = sev_launch_measure(kvm, &sev_cmd); + break; default: r = -EINVAL; goto out; From 5bdb0e2fa45eb007a1cc630c6b520a638e60c1f6 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:36 -0600 Subject: [PATCH 035/197] KVM: SVM: Add support for SEV LAUNCH_FINISH command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The command is used for finializing the SEV guest launch process. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- arch/x86/kvm/svm.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 74e010e6b5b912..6a8d81311e9dbc 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6113,6 +6113,26 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) return ret; } +static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct sev_data_launch_finish *data; + int ret; + + if (!sev_guest(kvm)) + return -ENOTTY; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->handle = sev->handle; + ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, data, &argp->error); + + kfree(data); + return ret; +} + static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; @@ -6139,6 +6159,9 @@ static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) case KVM_SEV_LAUNCH_MEASURE: r = sev_launch_measure(kvm, &sev_cmd); break; + case KVM_SEV_LAUNCH_FINISH: + r = sev_launch_finish(kvm, &sev_cmd); + break; default: r = -EINVAL; goto out; From 255d9e75e2549a5bc85c3606f5a80e8a0ef4d170 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:37 -0600 Subject: [PATCH 036/197] KVM: SVM: Add support for SEV GUEST_STATUS command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The command is used for querying the SEV guest information. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- arch/x86/kvm/svm.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 6a8d81311e9dbc..f5da2753790db4 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6133,6 +6133,36 @@ static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) return ret; } +static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct kvm_sev_guest_status params; + struct sev_data_guest_status *data; + int ret; + + if (!sev_guest(kvm)) + return -ENOTTY; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->handle = sev->handle; + ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, data, &argp->error); + if (ret) + goto e_free; + + params.policy = data->policy; + params.state = data->state; + params.handle = data->handle; + + if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms, sizeof(params))) + ret = -EFAULT; +e_free: + kfree(data); + return ret; +} + static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; @@ -6162,6 +6192,9 @@ static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) case KVM_SEV_LAUNCH_FINISH: r = sev_launch_finish(kvm, &sev_cmd); break; + case KVM_SEV_GUEST_STATUS: + r = sev_guest_status(kvm, &sev_cmd); + break; default: r = -EINVAL; goto out; From 24f41fb23a39bc2b6f190dcef35a5813a4bf183a Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:37 -0600 Subject: [PATCH 037/197] KVM: SVM: Add support for SEV DEBUG_DECRYPT command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The command is used for decrypting a guest memory region for debug purposes. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- arch/x86/kvm/svm.c | 152 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f5da2753790db4..ec2b864faec2b6 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6163,6 +6163,155 @@ static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) return ret; } +static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src, + unsigned long dst, int size, + int *error, bool enc) +{ + struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct sev_data_dbg *data; + int ret; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->handle = sev->handle; + data->dst_addr = dst; + data->src_addr = src; + data->len = size; + + ret = sev_issue_cmd(kvm, + enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT, + data, error); + kfree(data); + return ret; +} + +static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr, + unsigned long dst_paddr, int sz, int *err) +{ + int offset; + + /* + * Its safe to read more than we are asked, caller should ensure that + * destination has enough space. + */ + src_paddr = round_down(src_paddr, 16); + offset = src_paddr & 15; + sz = round_up(sz + offset, 16); + + return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false); +} + +static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr, + unsigned long __user dst_uaddr, + unsigned long dst_paddr, + int size, int *err) +{ + struct page *tpage = NULL; + int ret, offset; + + /* if inputs are not 16-byte then use intermediate buffer */ + if (!IS_ALIGNED(dst_paddr, 16) || + !IS_ALIGNED(paddr, 16) || + !IS_ALIGNED(size, 16)) { + tpage = (void *)alloc_page(GFP_KERNEL); + if (!tpage) + return -ENOMEM; + + dst_paddr = __sme_page_pa(tpage); + } + + ret = __sev_dbg_decrypt(kvm, paddr, dst_paddr, size, err); + if (ret) + goto e_free; + + if (tpage) { + offset = paddr & 15; + if (copy_to_user((void __user *)(uintptr_t)dst_uaddr, + page_address(tpage) + offset, size)) + ret = -EFAULT; + } + +e_free: + if (tpage) + __free_page(tpage); + + return ret; +} + +static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) +{ + unsigned long vaddr, vaddr_end, next_vaddr; + unsigned long dst_vaddr, dst_vaddr_end; + struct page **src_p, **dst_p; + struct kvm_sev_dbg debug; + unsigned long n; + int ret, size; + + if (!sev_guest(kvm)) + return -ENOTTY; + + if (copy_from_user(&debug, (void __user *)(uintptr_t)argp->data, sizeof(debug))) + return -EFAULT; + + vaddr = debug.src_uaddr; + size = debug.len; + vaddr_end = vaddr + size; + dst_vaddr = debug.dst_uaddr; + dst_vaddr_end = dst_vaddr + size; + + for (; vaddr < vaddr_end; vaddr = next_vaddr) { + int len, s_off, d_off; + + /* lock userspace source and destination page */ + src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0); + if (!src_p) + return -EFAULT; + + dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1); + if (!dst_p) { + sev_unpin_memory(kvm, src_p, n); + return -EFAULT; + } + + /* + * The DBG_{DE,EN}CRYPT commands will perform {dec,en}cryption of the + * memory content (i.e it will write the same memory region with C=1). + * It's possible that the cache may contain the data with C=0, i.e., + * unencrypted so invalidate it first. + */ + sev_clflush_pages(src_p, 1); + sev_clflush_pages(dst_p, 1); + + /* + * Since user buffer may not be page aligned, calculate the + * offset within the page. + */ + s_off = vaddr & ~PAGE_MASK; + d_off = dst_vaddr & ~PAGE_MASK; + len = min_t(size_t, (PAGE_SIZE - s_off), size); + + ret = __sev_dbg_decrypt_user(kvm, + __sme_page_pa(src_p[0]) + s_off, + dst_vaddr, + __sme_page_pa(dst_p[0]) + d_off, + len, &argp->error); + + sev_unpin_memory(kvm, src_p, 1); + sev_unpin_memory(kvm, dst_p, 1); + + if (ret) + goto err; + + next_vaddr = vaddr + len; + dst_vaddr = dst_vaddr + len; + size -= len; + } +err: + return ret; +} + static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; @@ -6195,6 +6344,9 @@ static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) case KVM_SEV_GUEST_STATUS: r = sev_guest_status(kvm, &sev_cmd); break; + case KVM_SEV_DBG_DECRYPT: + r = sev_dbg_crypt(kvm, &sev_cmd, true); + break; default: r = -EINVAL; goto out; From 7d1594f5d94bb9fa60dacca2390a6c58398f005a Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:37 -0600 Subject: [PATCH 038/197] KVM: SVM: Add support for SEV DEBUG_ENCRYPT command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The command copies a plaintext into guest memory and encrypts it using the VM encryption key. The command will be used for debug purposes (e.g setting breakpoints through gdbserver) Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh --- arch/x86/kvm/svm.c | 98 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 93 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ec2b864faec2b6..11d4860997d96e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6240,6 +6240,83 @@ static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr, return ret; } +static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr, + unsigned long __user vaddr, + unsigned long dst_paddr, + unsigned long __user dst_vaddr, + int size, int *error) +{ + struct page *src_tpage = NULL; + struct page *dst_tpage = NULL; + int ret, len = size; + + /* If source buffer is not aligned then use an intermediate buffer */ + if (!IS_ALIGNED(vaddr, 16)) { + src_tpage = alloc_page(GFP_KERNEL); + if (!src_tpage) + return -ENOMEM; + + if (copy_from_user(page_address(src_tpage), + (void __user *)(uintptr_t)vaddr, size)) { + __free_page(src_tpage); + return -EFAULT; + } + + paddr = __sme_page_pa(src_tpage); + } + + /* + * If destination buffer or length is not aligned then do read-modify-write: + * - decrypt destination in an intermediate buffer + * - copy the source buffer in an intermediate buffer + * - use the intermediate buffer as source buffer + */ + if (!IS_ALIGNED(dst_vaddr, 16) || !IS_ALIGNED(size, 16)) { + int dst_offset; + + dst_tpage = alloc_page(GFP_KERNEL); + if (!dst_tpage) { + ret = -ENOMEM; + goto e_free; + } + + ret = __sev_dbg_decrypt(kvm, dst_paddr, + __sme_page_pa(dst_tpage), size, error); + if (ret) + goto e_free; + + /* + * If source is kernel buffer then use memcpy() otherwise + * copy_from_user(). + */ + dst_offset = dst_paddr & 15; + + if (src_tpage) + memcpy(page_address(dst_tpage) + dst_offset, + page_address(src_tpage), size); + else { + if (copy_from_user(page_address(dst_tpage) + dst_offset, + (void __user *)(uintptr_t)vaddr, size)) { + ret = -EFAULT; + goto e_free; + } + } + + paddr = __sme_page_pa(dst_tpage); + dst_paddr = round_down(dst_paddr, 16); + len = round_up(size, 16); + } + + ret = __sev_issue_dbg_cmd(kvm, paddr, dst_paddr, len, error, true); + +e_free: + if (src_tpage) + __free_page(src_tpage); + if (dst_tpage) + __free_page(dst_tpage); + return ret; +} + static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) { unsigned long vaddr, vaddr_end, next_vaddr; @@ -6292,11 +6369,19 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) d_off = dst_vaddr & ~PAGE_MASK; len = min_t(size_t, (PAGE_SIZE - s_off), size); - ret = __sev_dbg_decrypt_user(kvm, - __sme_page_pa(src_p[0]) + s_off, - dst_vaddr, - __sme_page_pa(dst_p[0]) + d_off, - len, &argp->error); + if (dec) + ret = __sev_dbg_decrypt_user(kvm, + __sme_page_pa(src_p[0]) + s_off, + dst_vaddr, + __sme_page_pa(dst_p[0]) + d_off, + len, &argp->error); + else + ret = __sev_dbg_encrypt_user(kvm, + __sme_page_pa(src_p[0]) + s_off, + vaddr, + __sme_page_pa(dst_p[0]) + d_off, + dst_vaddr, + len, &argp->error); sev_unpin_memory(kvm, src_p, 1); sev_unpin_memory(kvm, dst_p, 1); @@ -6347,6 +6432,9 @@ static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) case KVM_SEV_DBG_DECRYPT: r = sev_dbg_crypt(kvm, &sev_cmd, true); break; + case KVM_SEV_DBG_ENCRYPT: + r = sev_dbg_crypt(kvm, &sev_cmd, false); + break; default: r = -EINVAL; goto out; From 9f5b5b950aa96b3d303d132e069f93c8bc4c9b58 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:38 -0600 Subject: [PATCH 039/197] KVM: SVM: Add support for SEV LAUNCH_SECRET command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The command is used for injecting a secret into the guest memory region. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- arch/x86/kvm/svm.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 11d4860997d96e..8a499425bf7ec6 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6397,6 +6397,71 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) return ret; } +static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct sev_data_launch_secret *data; + struct kvm_sev_launch_secret params; + struct page **pages; + void *blob, *hdr; + unsigned long n; + int ret; + + if (!sev_guest(kvm)) + return -ENOTTY; + + if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + return -EFAULT; + + pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1); + if (!pages) + return -ENOMEM; + + /* + * The secret must be copied into contiguous memory region, lets verify + * that userspace memory pages are contiguous before we issue command. + */ + if (get_num_contig_pages(0, pages, n) != n) { + ret = -EINVAL; + goto e_unpin_memory; + } + + ret = -ENOMEM; + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + goto e_unpin_memory; + + blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len); + if (IS_ERR(blob)) { + ret = PTR_ERR(blob); + goto e_free; + } + + data->trans_address = __psp_pa(blob); + data->trans_len = params.trans_len; + + hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len); + if (IS_ERR(hdr)) { + ret = PTR_ERR(hdr); + goto e_free_blob; + } + data->trans_address = __psp_pa(blob); + data->trans_len = params.trans_len; + + data->handle = sev->handle; + ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, data, &argp->error); + + kfree(hdr); + +e_free_blob: + kfree(blob); +e_free: + kfree(data); +e_unpin_memory: + sev_unpin_memory(kvm, pages, n); + return ret; +} + static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; @@ -6435,6 +6500,9 @@ static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) case KVM_SEV_DBG_ENCRYPT: r = sev_dbg_crypt(kvm, &sev_cmd, false); break; + case KVM_SEV_LAUNCH_SECRET: + r = sev_launch_secret(kvm, &sev_cmd); + break; default: r = -EINVAL; goto out; From 1e80fdc09d121d8327cdf62eefbb5abadddca792 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:38 -0600 Subject: [PATCH 040/197] KVM: SVM: Pin guest memory when SEV is active MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SEV memory encryption engine uses a tweak such that two identical plaintext pages at different location will have different ciphertext. So swapping or moving ciphertext of two pages will not result in plaintext being swapped. Relocating (or migrating) physical backing pages for a SEV guest will require some additional steps. The current SEV key management spec does not provide commands to swap or migrate (move) ciphertext pages. For now, we pin the guest memory registered through KVM_MEMORY_ENCRYPT_REG_REGION ioctl. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 132 ++++++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a0b021f1fd0508..262950f9f2d95e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -753,6 +753,7 @@ struct kvm_sev_info { unsigned int handle; /* SEV firmware handle */ int fd; /* SEV device fd */ unsigned long pages_locked; /* Number of pages locked */ + struct list_head regions_list; /* List of registered regions */ }; struct kvm_arch { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8a499425bf7ec6..bdce5b4e71ccba 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -335,6 +335,14 @@ static unsigned int min_sev_asid; static unsigned long *sev_asid_bitmap; #define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT) +struct enc_region { + struct list_head list; + unsigned long npages; + struct page **pages; + unsigned long uaddr; + unsigned long size; +}; + static inline bool svm_sev_enabled(void) { return max_sev_asid; @@ -1649,13 +1657,46 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages) } } +static void __unregister_enc_region_locked(struct kvm *kvm, + struct enc_region *region) +{ + /* + * The guest may change the memory encryption attribute from C=0 -> C=1 + * or vice versa for this memory range. Lets make sure caches are + * flushed to ensure that guest data gets written into memory with + * correct C-bit. + */ + sev_clflush_pages(region->pages, region->npages); + + sev_unpin_memory(kvm, region->pages, region->npages); + list_del(®ion->list); + kfree(region); +} + static void sev_vm_destroy(struct kvm *kvm) { struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct list_head *head = &sev->regions_list; + struct list_head *pos, *q; if (!sev_guest(kvm)) return; + mutex_lock(&kvm->lock); + + /* + * if userspace was terminated before unregistering the memory regions + * then lets unpin all the registered memory. + */ + if (!list_empty(head)) { + list_for_each_safe(pos, q, head) { + __unregister_enc_region_locked(kvm, + list_entry(pos, struct enc_region, list)); + } + } + + mutex_unlock(&kvm->lock); + sev_unbind_asid(kvm, sev->handle); sev_asid_free(kvm); } @@ -5814,6 +5855,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) sev->active = true; sev->asid = asid; + INIT_LIST_HEAD(&sev->regions_list); return 0; @@ -6516,6 +6558,94 @@ static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) return r; } +static int svm_register_enc_region(struct kvm *kvm, + struct kvm_enc_region *range) +{ + struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct enc_region *region; + int ret = 0; + + if (!sev_guest(kvm)) + return -ENOTTY; + + region = kzalloc(sizeof(*region), GFP_KERNEL); + if (!region) + return -ENOMEM; + + region->pages = sev_pin_memory(kvm, range->addr, range->size, ®ion->npages, 1); + if (!region->pages) { + ret = -ENOMEM; + goto e_free; + } + + /* + * The guest may change the memory encryption attribute from C=0 -> C=1 + * or vice versa for this memory range. Lets make sure caches are + * flushed to ensure that guest data gets written into memory with + * correct C-bit. + */ + sev_clflush_pages(region->pages, region->npages); + + region->uaddr = range->addr; + region->size = range->size; + + mutex_lock(&kvm->lock); + list_add_tail(®ion->list, &sev->regions_list); + mutex_unlock(&kvm->lock); + + return ret; + +e_free: + kfree(region); + return ret; +} + +static struct enc_region * +find_enc_region(struct kvm *kvm, struct kvm_enc_region *range) +{ + struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct list_head *head = &sev->regions_list; + struct enc_region *i; + + list_for_each_entry(i, head, list) { + if (i->uaddr == range->addr && + i->size == range->size) + return i; + } + + return NULL; +} + + +static int svm_unregister_enc_region(struct kvm *kvm, + struct kvm_enc_region *range) +{ + struct enc_region *region; + int ret; + + mutex_lock(&kvm->lock); + + if (!sev_guest(kvm)) { + ret = -ENOTTY; + goto failed; + } + + region = find_enc_region(kvm, range); + if (!region) { + ret = -EINVAL; + goto failed; + } + + __unregister_enc_region_locked(kvm, region); + + mutex_unlock(&kvm->lock); + return 0; + +failed: + mutex_unlock(&kvm->lock); + return ret; +} + static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -6633,6 +6763,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .enable_smi_window = enable_smi_window, .mem_enc_op = svm_mem_enc_op, + .mem_enc_reg_region = svm_register_enc_region, + .mem_enc_unreg_region = svm_unregister_enc_region, }; static int __init svm_init(void) From 0ede79e1322479221fb0d0a7d2828d8ed033f060 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:39 -0600 Subject: [PATCH 041/197] KVM: SVM: Clear C-bit from the page fault address MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When SEV is active, on #VMEXIT the page fault address will contain the C-bit. We must clear the C-bit before handling the fault. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Borislav Petkov Signed-off-by: Brijesh Singh --- arch/x86/kvm/svm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index bdce5b4e71ccba..f350e9bac9eb00 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2430,7 +2430,7 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) static int pf_interception(struct vcpu_svm *svm) { - u64 fault_address = svm->vmcb->control.exit_info_2; + u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2); u64 error_code = svm->vmcb->control.exit_info_1; return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address, @@ -2440,7 +2440,7 @@ static int pf_interception(struct vcpu_svm *svm) static int npf_interception(struct vcpu_svm *svm) { - u64 fault_address = svm->vmcb->control.exit_info_2; + u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2); u64 error_code = svm->vmcb->control.exit_info_1; trace_kvm_page_fault(fault_address, error_code); From 35c6f649bbb2e3f367116307273f998f0bf3e08e Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:39 -0600 Subject: [PATCH 042/197] KVM: SVM: Do not install #UD intercept when SEV is enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On #UD, x86_emulate_instruction() fetches the data from guest memory and decodes the instruction bytes to assist further. When SEV is enabled, the instruction bytes will be encrypted using the guest-specific key and the hypervisor will no longer able to fetch the instruction bytes to assist UD handling. By not installing intercept we let the guest receive and handle #UD. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Borislav Petkov Signed-off-by: Brijesh Singh --- arch/x86/kvm/svm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f350e9bac9eb00..3e848f952b4fde 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1443,8 +1443,10 @@ static void init_vmcb(struct vcpu_svm *svm) svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK; } - if (sev_guest(svm->vcpu.kvm)) + if (sev_guest(svm->vcpu.kvm)) { svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE; + clr_exception_intercept(svm, UD_VECTOR); + } mark_all_dirty(svm->vmcb); From 00b10fe1046c4b2232097a7ffaa9238c7e479388 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:40 -0600 Subject: [PATCH 043/197] KVM: X86: Restart the guest when insn_len is zero and SEV is enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On AMD platforms, under certain conditions insn_len may be zero on #NPF. This can happen if a guest gets a page-fault on data access but the HW table walker is not able to read the instruction page (e.g instruction page is not present in memory). Typically, when insn_len is zero, x86_emulate_instruction() walks the guest page table and fetches the instruction bytes from guest memory. When SEV is enabled, the guest memory is encrypted with guest-specific key hence hypervisor will not able to fetch the instruction bytes. In those cases we simply restart the guest. I have encountered this issue when running kernbench inside the guest. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh --- arch/x86/kvm/mmu.c | 10 ++++++++++ arch/x86/kvm/svm.c | 6 ++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e5e66e5c664057..d5e5dbd0e5ad9d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4950,6 +4950,16 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, if (mmio_info_in_cache(vcpu, cr2, direct)) emulation_type = 0; emulate: + /* + * On AMD platforms, under certain conditions insn_len may be zero on #NPF. + * This can happen if a guest gets a page-fault on data access but the HW + * table walker is not able to read the instruction page (e.g instruction + * page is not present in memory). In those cases we simply restart the + * guest. + */ + if (unlikely(insn && !insn_len)) + return 1; + er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len); switch (er) { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 3e848f952b4fde..ec5df575299525 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2436,7 +2436,8 @@ static int pf_interception(struct vcpu_svm *svm) u64 error_code = svm->vmcb->control.exit_info_1; return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address, - svm->vmcb->control.insn_bytes, + static_cpu_has(X86_FEATURE_DECODEASSISTS) ? + svm->vmcb->control.insn_bytes : NULL, svm->vmcb->control.insn_len); } @@ -2447,7 +2448,8 @@ static int npf_interception(struct vcpu_svm *svm) trace_kvm_page_fault(fault_address, error_code); return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code, - svm->vmcb->control.insn_bytes, + static_cpu_has(X86_FEATURE_DECODEASSISTS) ? + svm->vmcb->control.insn_bytes : NULL, svm->vmcb->control.insn_len); } From ae3e61e1c28338d077b704505570fa181df1e41f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 12 Jul 2016 10:36:41 +0200 Subject: [PATCH 044/197] KVM: x86: add support for UMIP Add the CPUID bits, make the CR4.UMIP bit not reserved anymore, and add UMIP support for instructions that are already emulated by KVM. Reviewed-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/cpuid.c | 4 ++-- arch/x86/kvm/emulate.c | 8 ++++++++ arch/x86/kvm/x86.c | 3 +++ 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 51679843132829..ff79134d1d71af 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -86,7 +86,7 @@ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \ | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \ - | X86_CR4_SMAP | X86_CR4_PKE)) + | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0099e10eb04525..77fb8732b47b30 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -387,8 +387,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ecx*/ const u32 kvm_cpuid_7_0_ecx_x86_features = - F(AVX512VBMI) | F(LA57) | F(PKU) | - 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ); + F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | + F(AVX512_VPOPCNTDQ) | F(UMIP); /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index abe74f779f9d79..5edb2526762881 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3720,6 +3720,10 @@ static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt, { struct desc_ptr desc_ptr; + if ((ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) && + ctxt->ops->cpl(ctxt) > 0) + return emulate_gp(ctxt, 0); + if (ctxt->mode == X86EMUL_MODE_PROT64) ctxt->op_bytes = 8; get(ctxt, &desc_ptr); @@ -3779,6 +3783,10 @@ static int em_lidt(struct x86_emulate_ctxt *ctxt) static int em_smsw(struct x86_emulate_ctxt *ctxt) { + if ((ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) && + ctxt->ops->cpl(ctxt) > 0) + return emulate_gp(ctxt, 0); + if (ctxt->dst.type == OP_MEM) ctxt->dst.bytes = 2; ctxt->dst.val = ctxt->ops->get_cr(ctxt, 0); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 56d036b9ad75d0..dd2e80ac49ffcd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -794,6 +794,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57)) return 1; + if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP)) + return 1; + if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) return 1; From dd307d017b445a3af4379c7ff548cb3da5ecde31 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 12 Jul 2016 10:35:51 +0200 Subject: [PATCH 045/197] KVM: x86: emulate sldt and str These are needed to handle the descriptor table vmexits when emulating UMIP. Reviewed-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5edb2526762881..fa703e3baa4bb8 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3633,17 +3633,27 @@ static int em_rdmsr(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt) +static int em_store_sreg(struct x86_emulate_ctxt *ctxt, int segment) { - if (ctxt->modrm_reg > VCPU_SREG_GS) - return emulate_ud(ctxt); + if (segment > VCPU_SREG_GS && + (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) && + ctxt->ops->cpl(ctxt) > 0) + return emulate_gp(ctxt, 0); - ctxt->dst.val = get_segment_selector(ctxt, ctxt->modrm_reg); + ctxt->dst.val = get_segment_selector(ctxt, segment); if (ctxt->dst.bytes == 4 && ctxt->dst.type == OP_MEM) ctxt->dst.bytes = 2; return X86EMUL_CONTINUE; } +static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt) +{ + if (ctxt->modrm_reg > VCPU_SREG_GS) + return emulate_ud(ctxt); + + return em_store_sreg(ctxt, ctxt->modrm_reg); +} + static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt) { u16 sel = ctxt->src.val; @@ -3659,6 +3669,11 @@ static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt) return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg); } +static int em_sldt(struct x86_emulate_ctxt *ctxt) +{ + return em_store_sreg(ctxt, VCPU_SREG_LDTR); +} + static int em_lldt(struct x86_emulate_ctxt *ctxt) { u16 sel = ctxt->src.val; @@ -3668,6 +3683,11 @@ static int em_lldt(struct x86_emulate_ctxt *ctxt) return load_segment_descriptor(ctxt, sel, VCPU_SREG_LDTR); } +static int em_str(struct x86_emulate_ctxt *ctxt) +{ + return em_store_sreg(ctxt, VCPU_SREG_TR); +} + static int em_ltr(struct x86_emulate_ctxt *ctxt) { u16 sel = ctxt->src.val; @@ -4372,8 +4392,8 @@ static const struct opcode group5[] = { }; static const struct opcode group6[] = { - DI(Prot | DstMem, sldt), - DI(Prot | DstMem, str), + II(Prot | DstMem, em_sldt, sldt), + II(Prot | DstMem, em_str, str), II(Prot | Priv | SrcMem16, em_lldt, lldt), II(Prot | Priv | SrcMem16, em_ltr, ltr), N, N, N, N, From 66336cab3531d3325ebde36a04725dddd0c42cb5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 12 Jul 2016 10:36:41 +0200 Subject: [PATCH 046/197] KVM: x86: add support for emulating UMIP The User-Mode Instruction Prevention feature present in recent Intel processor prevents a group of instructions (sgdt, sidt, sldt, smsw, and str) from being executed with CPL > 0. Otherwise, a general protection fault is issued. UMIP instructions in general are also able to trigger vmexits, so we can actually emulate UMIP on older processors. This commit sets up the infrastructure so that kvm-intel.ko and kvm-amd.ko can set the UMIP feature bit for CPUID even if the feature is not actually available in hardware. Reviewed-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/cpuid.c | 2 ++ arch/x86/kvm/svm.c | 6 ++++++ arch/x86/kvm/vmx.c | 6 ++++++ 4 files changed, 15 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ff79134d1d71af..515db75081d131 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1017,6 +1017,7 @@ struct kvm_x86_ops { void (*handle_external_intr)(struct kvm_vcpu *vcpu); bool (*mpx_supported)(void); bool (*xsaves_supported)(void); + bool (*umip_emulated)(void); int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 77fb8732b47b30..2b3b06458f6f67 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -327,6 +327,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0; unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; + unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0; /* cpuid 1.edx */ const u32 kvm_cpuid_1_edx_x86_features = @@ -473,6 +474,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ebx |= F(TSC_ADJUST); entry->ecx &= kvm_cpuid_7_0_ecx_x86_features; cpuid_mask(&entry->ecx, CPUID_7_ECX); + entry->ecx |= f_umip; /* PKU is not yet implemented for shadow paging. */ if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) entry->ecx &= ~F(PKU); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index eb714f1cdf7eee..1f3e7f210374a1 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5204,6 +5204,11 @@ static bool svm_xsaves_supported(void) return false; } +static bool svm_umip_emulated(void) +{ + return false; +} + static bool svm_has_wbinvd_exit(void) { return true; @@ -5597,6 +5602,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .invpcid_supported = svm_invpcid_supported, .mpx_supported = svm_mpx_supported, .xsaves_supported = svm_xsaves_supported, + .umip_emulated = svm_umip_emulated, .set_supported_cpuid = svm_set_supported_cpuid, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8eba631c4dbd50..b989cfe412b13b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9155,6 +9155,11 @@ static bool vmx_xsaves_supported(void) SECONDARY_EXEC_XSAVES; } +static bool vmx_umip_emulated(void) +{ + return false; +} + static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -12170,6 +12175,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .handle_external_intr = vmx_handle_external_intr, .mpx_supported = vmx_mpx_supported, .xsaves_supported = vmx_xsaves_supported, + .umip_emulated = vmx_umip_emulated, .check_nested_events = vmx_check_nested_events, From 0367f205a3b7c0efe774634eef1f4697c79a4132 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 12 Jul 2016 10:44:55 +0200 Subject: [PATCH 047/197] KVM: vmx: add support for emulating UMIP UMIP can be emulated almost perfectly on Intel processor by enabling descriptor-table exits. SMSW does not cause a vmexit and hence it cannot be changed into a #GP fault, but all in all it's the most "innocuous" of the unprivileged instructions that UMIP blocks. In fact, Linux is _also_ emulating SMSW instructions on behalf of the program that executes them, because some 16-bit programs expect to use SMSW to detect vm86 mode, so this is an even smaller issue. Reviewed-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b989cfe412b13b..ebd50de981ee67 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3662,6 +3662,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST | SECONDARY_EXEC_PAUSE_LOOP_EXITING | + SECONDARY_EXEC_DESC | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_APIC_REGISTER_VIRT | @@ -4369,6 +4370,14 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) (to_vmx(vcpu)->rmode.vm86_active ? KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); + if ((cr4 & X86_CR4_UMIP) && !boot_cpu_has(X86_FEATURE_UMIP)) { + vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_DESC); + hw_cr4 &= ~X86_CR4_UMIP; + } else + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_DESC); + if (cr4 & X86_CR4_VMXE) { /* * To use VMXON (and later other VMX instructions), a guest @@ -5308,6 +5317,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) struct kvm_vcpu *vcpu = &vmx->vcpu; u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; + if (!cpu_need_virtualize_apic_accesses(vcpu)) exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; if (vmx->vpid == 0) @@ -5326,6 +5336,11 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; + + /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP, + * in vmx_set_cr4. */ + exec_control &= ~SECONDARY_EXEC_DESC; + /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD (handle_vmptrld). We can NOT enable shadow_vmcs here because we don't have yet @@ -6101,6 +6116,12 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) return kvm_set_cr4(vcpu, val); } +static int handle_desc(struct kvm_vcpu *vcpu) +{ + WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP)); + return emulate_instruction(vcpu, 0) == EMULATE_DONE; +} + static int handle_cr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification, val; @@ -8193,6 +8214,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_XSETBV] = handle_xsetbv, [EXIT_REASON_TASK_SWITCH] = handle_task_switch, [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, + [EXIT_REASON_GDTR_IDTR] = handle_desc, + [EXIT_REASON_LDTR_TR] = handle_desc, [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, @@ -9157,7 +9180,8 @@ static bool vmx_xsaves_supported(void) static bool vmx_umip_emulated(void) { - return false; + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_DESC; } static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) @@ -9755,7 +9779,8 @@ static void vmcs_set_secondary_exec_control(u32 new_ctl) u32 mask = SECONDARY_EXEC_SHADOW_VMCS | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_DESC; u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); From fb6d4d340e0532032c808a9933eaaa7b8de435ab Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 12 Jul 2016 11:04:26 +0200 Subject: [PATCH 048/197] KVM: x86: emulate RDPID This is encoded as F3 0F C7 /7 with a register argument. The register argument is the second array in the group9 GroupDual, while F3 is the fourth element of a Prefix. Reviewed-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 7 ++++++- arch/x86/kvm/emulate.c | 22 +++++++++++++++++++++- arch/x86/kvm/vmx.c | 15 +++++++++++++++ 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 2b3b06458f6f67..b94371146533ff 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -293,13 +293,18 @@ static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry, { switch (func) { case 0: - entry->eax = 1; /* only one leaf currently */ + entry->eax = 7; ++*nent; break; case 1: entry->ecx = F(MOVBE); ++*nent; break; + case 7: + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + if (index == 0) + entry->ecx = F(RDPID); + ++*nent; default: break; } diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index fa703e3baa4bb8..cb929d0bb1bd11 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3514,6 +3514,16 @@ static int em_cwd(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_rdpid(struct x86_emulate_ctxt *ctxt) +{ + u64 tsc_aux = 0; + + if (ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux)) + return emulate_gp(ctxt, 0); + ctxt->dst.val = tsc_aux; + return X86EMUL_CONTINUE; +} + static int em_rdtsc(struct x86_emulate_ctxt *ctxt) { u64 tsc = 0; @@ -4424,10 +4434,20 @@ static const struct opcode group8[] = { F(DstMem | SrcImmByte | Lock | PageTable, em_btc), }; +/* + * The "memory" destination is actually always a register, since we come + * from the register case of group9. + */ +static const struct gprefix pfx_0f_c7_7 = { + N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdtscp), +}; + + static const struct group_dual group9 = { { N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N, }, { - N, N, N, N, N, N, N, N, + N, N, N, N, N, N, N, + GP(0, &pfx_0f_c7_7), } }; static const struct opcode group11[] = { diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ebd50de981ee67..8dafc5d7d4d666 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11687,6 +11687,21 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, enum x86_intercept_stage stage) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + + /* + * RDPID causes #UD if disabled through secondary execution controls. + * Because it is marked as EmulateOnUD, we need to intercept it here. + */ + if (info->intercept == x86_intercept_rdtscp && + !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) { + ctxt->exception.vector = UD_VECTOR; + ctxt->exception.error_code_valid = false; + return X86EMUL_PROPAGATE_FAULT; + } + + /* TODO: check more intercepts... */ return X86EMUL_CONTINUE; } From 858ac87fc2ff7be8fedfa8dd2ba47627068c2eae Mon Sep 17 00:00:00 2001 From: Gimcuan Hui Date: Fri, 3 Nov 2017 18:52:46 +0000 Subject: [PATCH 049/197] x86: kvm: mmu: make kvm_mmu_clear_all_pte_masks static The kvm_mmu_clear_all_pte_masks interface is only used by kvm_mmu_module_init locally, and does not need to be called by other module, make it static. This patch cleans up sparse warning: symbol 'kvm_mmu_clear_all_pte_masks' was not declared. Should it be static? Signed-off-by: Gimcuan Hui Reviewed-by: David Hildenbrand Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c4deb1f34faa6c..89da688784fa72 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -381,7 +381,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); -void kvm_mmu_clear_all_pte_masks(void) +static void kvm_mmu_clear_all_pte_masks(void) { shadow_user_mask = 0; shadow_accessed_mask = 0; From 0d37d26f11bc1c0085d0c89ba603e3bb2c7f7076 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 30 Nov 2017 12:04:22 +0000 Subject: [PATCH 050/197] KVM: x86: MMU: make array audit_point_name static The array audit_point_name is local to the source and does not need to be in global scope, so make it static. Cleans up sparse warning: arch/x86/kvm/mmu_audit.c:22:12: warning: symbol 'audit_point_name' was not declared. Should it be static? Signed-off-by: Colin Ian King Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu_audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index d22ddbdf5e6ed5..1272861e77b9ec 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -19,7 +19,7 @@ #include -char const *audit_point_name[] = { +static char const *audit_point_name[] = { "pre page fault", "post page fault", "pre pte write", From 2a140f3b6e23a309453b6f68709a50ece543f0f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Wed, 29 Nov 2017 22:23:41 +0100 Subject: [PATCH 051/197] KVM: x86: prevent MWAIT in guest with buggy MONITOR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bug prevents MWAIT from waking up after a write to the monitored cache line. KVM might emulate a CPU model that shouldn't have the bug, so the guest would not employ a workaround and possibly miss wakeups. Better to avoid the situation. Reviewed-by: Alexander Graf Acked-by: Borislav Petkov Acked-by: Michael S. Tsirkin Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index d0b95b7a90b4ec..81f5f50794f66c 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -281,6 +281,9 @@ static inline bool kvm_mwait_in_guest(void) return false; } + if (boot_cpu_has_bug(X86_BUG_MONITOR)) + return false; + /* * Intel CPUs without CPUID5_ECX_INTERRUPT_BREAK are problematic as * they would allow guest to stop the CPU completely by disabling From 346f48fa311ceee7478b5b810ba050a4a22e0b2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Wed, 29 Nov 2017 22:23:42 +0100 Subject: [PATCH 052/197] KVM: x86: drop bogus MWAIT check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The check was added in some iteration while trying to fix a reported OS X on Core 2 bug, but that bug is elsewhere. The comment is misleading because the guest can call MWAIT with ECX = 0 even if we enforce CPUID5_ECX_INTERRUPT_BREAK; the call would have the exactly the same effect as if the host didn't have the feature. A problem is that a QEMU feature exposes CPUID5_ECX_INTERRUPT_BREAK on CPUs that do not support it. Removing the check changes behavior on last Pentium 4 lines (Presler, Dempsey, and Tulsa, which had VMX and MONITOR while missing INTERRUPT_BREAK) when running a guest OS that uses MWAIT without checking for its presence (QEMU doesn't expose MONITOR). The only known OS that ignores the MONITOR flag is old Mac OS X and we allowed it to bug on Core 2 (MWAIT used to throw #UD and only that OS noticed), so we can save another 20 lines letting it bug on even older CPUs. Alternatively, we can return MWAIT exiting by default and let userspace toggle it. Reviewed-by: Alexander Graf Acked-by: Borislav Petkov Acked-by: Michael S. Tsirkin Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.h | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 81f5f50794f66c..d15859ec5e923d 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -265,8 +265,6 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) static inline bool kvm_mwait_in_guest(void) { - unsigned int eax, ebx, ecx, edx; - if (!cpu_has(&boot_cpu_data, X86_FEATURE_MWAIT)) return false; @@ -275,29 +273,10 @@ static inline bool kvm_mwait_in_guest(void) /* All AMD CPUs have a working MWAIT implementation */ return true; case X86_VENDOR_INTEL: - /* Handle Intel below */ - break; + return !boot_cpu_has_bug(X86_BUG_MONITOR); default: return false; } - - if (boot_cpu_has_bug(X86_BUG_MONITOR)) - return false; - - /* - * Intel CPUs without CPUID5_ECX_INTERRUPT_BREAK are problematic as - * they would allow guest to stop the CPU completely by disabling - * interrupts then invoking MWAIT. - */ - if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) - return false; - - cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); - - if (!(ecx & CPUID5_ECX_INTERRUPT_BREAK)) - return false; - - return true; } #endif From 431f5d4443964bcc58ac2e4c6fd8ace6dcd3a8d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Wed, 29 Nov 2017 22:23:43 +0100 Subject: [PATCH 053/197] KVM: x86: simplify kvm_mwait_in_guest() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If Intel/AMD implements MWAIT, we expect that it works well and only reject known bugs; no reason to do it the other way around for minor vendors. (Not that they are relevant ATM.) This allows further simplification of kvm_mwait_in_guest(). And use boot_cpu_has() instead of "cpu_has(&boot_cpu_data," while at it. Reviewed-by: Alexander Graf Acked-by: Borislav Petkov Acked-by: Michael S. Tsirkin Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.h | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index d15859ec5e923d..c69f973111cb26 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -265,18 +265,8 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) static inline bool kvm_mwait_in_guest(void) { - if (!cpu_has(&boot_cpu_data, X86_FEATURE_MWAIT)) - return false; - - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - /* All AMD CPUs have a working MWAIT implementation */ - return true; - case X86_VENDOR_INTEL: - return !boot_cpu_has_bug(X86_BUG_MONITOR); - default: - return false; - } + return boot_cpu_has(X86_FEATURE_MWAIT) && + !boot_cpu_has_bug(X86_BUG_MONITOR); } #endif From 52797bf9a875c4a30f846196386684e646e08a91 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Wed, 15 Nov 2017 13:43:14 +0200 Subject: [PATCH 054/197] KVM: x86: Add emulation of MSR_SMI_COUNT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This MSR returns the number of #SMIs that occurred on CPU since boot. It was seen to be used frequently by ESXi guest. Patch adds a new vcpu-arch specific var called smi_count to save the number of #SMIs which occurred on CPU since boot. It is exposed as a read-only MSR to guest (causing #GP on wrmsr) in RDMSR/WRMSR emulation code. MSR_SMI_COUNT is also added to emulated_msrs[] to make sure user-space can save/restore it for migration purposes. Signed-off-by: Liran Alon Suggested-by: Paolo Bonzini Reviewed-by: Nikita Leshenko Reviewed-by: Bhavesh Davda Signed-off-by: Konrad Rzeszutek Wilk Reviewed-by: Paolo Bonzini Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 515db75081d131..340e3604dcc77f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -504,6 +504,7 @@ struct kvm_vcpu_arch { int mp_state; u64 ia32_misc_enable_msr; u64 smbase; + u64 smi_count; bool tpr_access_reporting; u64 ia32_xss; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dd2e80ac49ffcd..7ea9a57b0684a4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1039,6 +1039,7 @@ static u32 emulated_msrs[] = { MSR_IA32_MCG_CTL, MSR_IA32_MCG_EXT_CTL, MSR_IA32_SMBASE, + MSR_SMI_COUNT, MSR_PLATFORM_INFO, MSR_MISC_FEATURES_ENABLES, }; @@ -2231,6 +2232,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.smbase = data; break; + case MSR_SMI_COUNT: + if (!msr_info->host_initiated) + return 1; + vcpu->arch.smi_count = data; + break; case MSR_KVM_WALL_CLOCK_NEW: case MSR_KVM_WALL_CLOCK: vcpu->kvm->arch.wall_clock = data; @@ -2505,6 +2511,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; msr_info->data = vcpu->arch.smbase; break; + case MSR_SMI_COUNT: + msr_info->data = vcpu->arch.smi_count; + break; case MSR_IA32_PERF_STATUS: /* TSC increment by tick */ msr_info->data = 1000ULL; @@ -6451,6 +6460,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) kvm_x86_ops->queue_exception(vcpu); } else if (vcpu->arch.smi_pending && !is_smm(vcpu) && kvm_x86_ops->smi_allowed(vcpu)) { vcpu->arch.smi_pending = false; + ++vcpu->arch.smi_count; enter_smm(vcpu); } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) { --vcpu->arch.nmi_pending; @@ -7808,6 +7818,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.hflags = 0; vcpu->arch.smi_pending = 0; + vcpu->arch.smi_count = 0; atomic_set(&vcpu->arch.nmi_queued, 0); vcpu->arch.nmi_pending = 0; vcpu->arch.nmi_injected = false; From 78588335065967993618106c664fe62d2e271435 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 21 Nov 2017 13:40:17 +0100 Subject: [PATCH 055/197] kvm_main: Use common error handling code in kvm_dev_ioctl_create_vm() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a jump target so that a bit of exception handling can be better reused at the end of this function. This issue was detected by using the Coccinelle software. Signed-off-by: Markus Elfring Signed-off-by: Radim Krčmář --- virt/kvm/kvm_main.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 210bf820385a70..bc092e3d1d7316 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3186,21 +3186,18 @@ static int kvm_dev_ioctl_create_vm(unsigned long type) return PTR_ERR(kvm); #ifdef CONFIG_KVM_MMIO r = kvm_coalesced_mmio_init(kvm); - if (r < 0) { - kvm_put_kvm(kvm); - return r; - } + if (r < 0) + goto put_kvm; #endif r = get_unused_fd_flags(O_CLOEXEC); - if (r < 0) { - kvm_put_kvm(kvm); - return r; - } + if (r < 0) + goto put_kvm; + file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); if (IS_ERR(file)) { put_unused_fd(r); - kvm_put_kvm(kvm); - return PTR_ERR(file); + r = PTR_ERR(file); + goto put_kvm; } /* @@ -3218,6 +3215,10 @@ static int kvm_dev_ioctl_create_vm(unsigned long type) fd_install(r, file); return r; + +put_kvm: + kvm_put_kvm(kvm); + return r; } static long kvm_dev_ioctl(struct file *filp, From 80fef315a74d79d765dbf58d9481843a364c50d6 Mon Sep 17 00:00:00 2001 From: Yang Zhong Date: Wed, 22 Nov 2017 15:27:29 +0800 Subject: [PATCH 056/197] KVM: Expose new cpu features to guest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Intel IceLake cpu has added new cpu features,AVX512_VBMI2/GFNI/ VAES/VPCLMULQDQ/AVX512_VNNI/AVX512_BITALG. Those new cpu features need expose to guest VM. The bit definition: CPUID.(EAX=7,ECX=0):ECX[bit 06] AVX512_VBMI2 CPUID.(EAX=7,ECX=0):ECX[bit 08] GFNI CPUID.(EAX=7,ECX=0):ECX[bit 09] VAES CPUID.(EAX=7,ECX=0):ECX[bit 10] VPCLMULQDQ CPUID.(EAX=7,ECX=0):ECX[bit 11] AVX512_VNNI CPUID.(EAX=7,ECX=0):ECX[bit 12] AVX512_BITALG The release document ref below link: https://software.intel.com/sites/default/files/managed/c5/15/\ architecture-instruction-set-extensions-programming-reference.pdf The kernel dependency commit in kvm.git: (c128dbfa0f879f8ce7b79054037889b0b2240728) Signed-off-by: Yang Zhong Signed-off-by: Radim Krčmář --- arch/x86/kvm/cpuid.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b94371146533ff..3db9e1cd4f63b7 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -394,7 +394,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ecx*/ const u32 kvm_cpuid_7_0_ecx_x86_features = F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | - F(AVX512_VPOPCNTDQ) | F(UMIP); + F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | + F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG); /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = From 00647b44944a2f7212ac2c3825a465153d6e438f Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Mon, 27 Nov 2017 17:22:25 -0600 Subject: [PATCH 057/197] KVM: nVMX: Eliminate vmcs02 pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The potential performance advantages of a vmcs02 pool have never been realized. To simplify the code, eliminate the pool. Instead, a single vmcs02 is allocated per VCPU when the VCPU enters VMX operation. Signed-off-by: Jim Mattson Signed-off-by: Mark Kanda Reviewed-by: Ameya More Reviewed-by: David Hildenbrand Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 146 +++++++-------------------------------------- 1 file changed, 23 insertions(+), 123 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8dafc5d7d4d666..d1870f3c8c699c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -184,7 +184,6 @@ module_param(ple_window_max, int, S_IRUGO); extern const ulong vmx_return; #define NR_AUTOLOAD_MSRS 8 -#define VMCS02_POOL_SIZE 1 struct vmcs { u32 revision_id; @@ -225,7 +224,7 @@ struct shared_msr_entry { * stored in guest memory specified by VMPTRLD, but is opaque to the guest, * which must access it using VMREAD/VMWRITE/VMCLEAR instructions. * More than one of these structures may exist, if L1 runs multiple L2 guests. - * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the + * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the * underlying hardware which will be used to run L2. * This structure is packed to ensure that its layout is identical across * machines (necessary for live migration). @@ -408,13 +407,6 @@ struct __packed vmcs12 { */ #define VMCS12_SIZE 0x1000 -/* Used to remember the last vmcs02 used for some recently used vmcs12s */ -struct vmcs02_list { - struct list_head list; - gpa_t vmptr; - struct loaded_vmcs vmcs02; -}; - /* * The nested_vmx structure is part of vcpu_vmx, and holds information we need * for correct emulation of VMX (i.e., nested VMX) on this vcpu. @@ -439,15 +431,15 @@ struct nested_vmx { */ bool sync_shadow_vmcs; - /* vmcs02_list cache of VMCSs recently used to run L2 guests */ - struct list_head vmcs02_pool; - int vmcs02_num; bool change_vmcs01_virtual_x2apic_mode; /* L2 must run next, and mustn't decide to exit to L1. */ bool nested_run_pending; + + struct loaded_vmcs vmcs02; + /* - * Guest pages referred to in vmcs02 with host-physical pointers, so - * we must keep them pinned while L2 runs. + * Guest pages referred to in the vmcs02 with host-physical + * pointers, so we must keep them pinned while L2 runs. */ struct page *apic_access_page; struct page *virtual_apic_page; @@ -6988,94 +6980,6 @@ static int handle_monitor(struct kvm_vcpu *vcpu) return handle_nop(vcpu); } -/* - * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12. - * We could reuse a single VMCS for all the L2 guests, but we also want the - * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this - * allows keeping them loaded on the processor, and in the future will allow - * optimizations where prepare_vmcs02 doesn't need to set all the fields on - * every entry if they never change. - * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE - * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first. - * - * The following functions allocate and free a vmcs02 in this pool. - */ - -/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */ -static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx) -{ - struct vmcs02_list *item; - list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) - if (item->vmptr == vmx->nested.current_vmptr) { - list_move(&item->list, &vmx->nested.vmcs02_pool); - return &item->vmcs02; - } - - if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) { - /* Recycle the least recently used VMCS. */ - item = list_last_entry(&vmx->nested.vmcs02_pool, - struct vmcs02_list, list); - item->vmptr = vmx->nested.current_vmptr; - list_move(&item->list, &vmx->nested.vmcs02_pool); - return &item->vmcs02; - } - - /* Create a new VMCS */ - item = kzalloc(sizeof(struct vmcs02_list), GFP_KERNEL); - if (!item) - return NULL; - item->vmcs02.vmcs = alloc_vmcs(); - item->vmcs02.shadow_vmcs = NULL; - if (!item->vmcs02.vmcs) { - kfree(item); - return NULL; - } - loaded_vmcs_init(&item->vmcs02); - item->vmptr = vmx->nested.current_vmptr; - list_add(&(item->list), &(vmx->nested.vmcs02_pool)); - vmx->nested.vmcs02_num++; - return &item->vmcs02; -} - -/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */ -static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr) -{ - struct vmcs02_list *item; - list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) - if (item->vmptr == vmptr) { - free_loaded_vmcs(&item->vmcs02); - list_del(&item->list); - kfree(item); - vmx->nested.vmcs02_num--; - return; - } -} - -/* - * Free all VMCSs saved for this vcpu, except the one pointed by - * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs - * must be &vmx->vmcs01. - */ -static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) -{ - struct vmcs02_list *item, *n; - - WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01); - list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) { - /* - * Something will leak if the above WARN triggers. Better than - * a use-after-free. - */ - if (vmx->loaded_vmcs == &item->vmcs02) - continue; - - free_loaded_vmcs(&item->vmcs02); - list_del(&item->list); - kfree(item); - vmx->nested.vmcs02_num--; - } -} - /* * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), * set the success or error code of an emulated VMX instruction, as specified @@ -7257,6 +7161,12 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs *shadow_vmcs; + vmx->nested.vmcs02.vmcs = alloc_vmcs(); + vmx->nested.vmcs02.shadow_vmcs = NULL; + if (!vmx->nested.vmcs02.vmcs) + goto out_vmcs02; + loaded_vmcs_init(&vmx->nested.vmcs02); + if (cpu_has_vmx_msr_bitmap()) { vmx->nested.msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); @@ -7279,9 +7189,6 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) vmx->vmcs01.shadow_vmcs = shadow_vmcs; } - INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); - vmx->nested.vmcs02_num = 0; - hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; @@ -7296,6 +7203,9 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) free_page((unsigned long)vmx->nested.msr_bitmap); out_msr_bitmap: + free_loaded_vmcs(&vmx->nested.vmcs02); + +out_vmcs02: return -ENOMEM; } @@ -7449,7 +7359,7 @@ static void free_nested(struct vcpu_vmx *vmx) vmx->vmcs01.shadow_vmcs = NULL; } kfree(vmx->nested.cached_vmcs12); - /* Unpin physical memory we referred to in current vmcs02 */ + /* Unpin physical memory we referred to in the vmcs02 */ if (vmx->nested.apic_access_page) { kvm_release_page_dirty(vmx->nested.apic_access_page); vmx->nested.apic_access_page = NULL; @@ -7465,7 +7375,7 @@ static void free_nested(struct vcpu_vmx *vmx) vmx->nested.pi_desc = NULL; } - nested_free_all_saved_vmcss(vmx); + free_loaded_vmcs(&vmx->nested.vmcs02); } /* Emulate the VMXOFF instruction */ @@ -7508,8 +7418,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) vmptr + offsetof(struct vmcs12, launch_state), &zero, sizeof(zero)); - nested_free_vmcs02(vmx, vmptr); - nested_vmx_succeed(vcpu); return kvm_skip_emulated_instruction(vcpu); } @@ -8423,10 +8331,11 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) /* * The host physical addresses of some pages of guest memory - * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU - * may write to these pages via their host physical address while - * L2 is running, bypassing any address-translation-based dirty - * tracking (e.g. EPT write protection). + * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC + * Page). The CPU may write to these pages via their host + * physical address while L2 is running, bypassing any + * address-translation-based dirty tracking (e.g. EPT write + * protection). * * Mark them dirty on every exit from L2 to prevent them from * getting out of sync with dirty tracking. @@ -10912,20 +10821,15 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - struct loaded_vmcs *vmcs02; u32 msr_entry_idx; u32 exit_qual; - vmcs02 = nested_get_current_vmcs02(vmx); - if (!vmcs02) - return -ENOMEM; - enter_guest_mode(vcpu); if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); - vmx_switch_vmcs(vcpu, vmcs02); + vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); vmx_segment_cache_clear(vmx); if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) { @@ -11543,10 +11447,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vm_exit_controls_reset_shadow(vmx); vmx_segment_cache_clear(vmx); - /* if no vmcs02 cache requested, remove the one we used */ - if (VMCS02_POOL_SIZE == 0) - nested_free_vmcs02(vmx, vmx->nested.current_vmptr); - /* Update any VMCS fields that might have changed while L2 ran */ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); From 276c796cfef5bdaf9aae055f520b8857eaa3fa19 Mon Sep 17 00:00:00 2001 From: Mark Kanda Date: Mon, 27 Nov 2017 17:22:26 -0600 Subject: [PATCH 058/197] KVM: nVMX: Add a WARN for freeing a loaded VMCS02 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When attempting to free a loaded VMCS02, add a WARN and avoid freeing it (to avoid use-after-free situations). Suggested-by: Paolo Bonzini Signed-off-by: Mark Kanda Reviewed-by: Ameya More Reviewed-by: Krish Sadhukhan Reviewed-by: David Hildenbrand Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d1870f3c8c699c..3b5f7028541409 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3846,6 +3846,19 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) WARN_ON(loaded_vmcs->shadow_vmcs != NULL); } +static void vmx_nested_free_vmcs02(struct vcpu_vmx *vmx) +{ + struct loaded_vmcs *loaded_vmcs = &vmx->nested.vmcs02; + + /* + * Just leak the VMCS02 if the WARN triggers. Better than + * a use-after-free. + */ + if (WARN_ON(vmx->loaded_vmcs == loaded_vmcs)) + return; + free_loaded_vmcs(loaded_vmcs); +} + static void free_kvm_area(void) { int cpu; @@ -7203,7 +7216,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) free_page((unsigned long)vmx->nested.msr_bitmap); out_msr_bitmap: - free_loaded_vmcs(&vmx->nested.vmcs02); + vmx_nested_free_vmcs02(vmx); out_vmcs02: return -ENOMEM; @@ -7375,7 +7388,7 @@ static void free_nested(struct vcpu_vmx *vmx) vmx->nested.pi_desc = NULL; } - free_loaded_vmcs(&vmx->nested.vmcs02); + vmx_nested_free_vmcs02(vmx); } /* Emulate the VMXOFF instruction */ From 74c55931c71352317ae0f5736ee9e4ca07ba4238 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 29 Nov 2017 01:31:20 -0800 Subject: [PATCH 059/197] KVM: VMX: Cache IA32_DEBUGCTL in memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MSR_IA32_DEBUGCTLMSR is zeroed on VMEXIT, so it is saved/restored each time during world switch. This patch caches the host IA32_DEBUGCTL MSR and saves/restores the host IA32_DEBUGCTL msr when guest/host switches to avoid to save/restore each time during world switch. This saves about 100 clock cycles per vmexit. Suggested-by: Jim Mattson Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Jim Mattson Signed-off-by: Wanpeng Li Reviewed-by: Jim Mattson Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3b5f7028541409..4e7da90b442604 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -650,6 +650,8 @@ struct vcpu_vmx { u32 host_pkru; + unsigned long host_debugctlmsr; + /* * Only bits masked by msr_ia32_feature_control_valid_bits can be set in * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included @@ -2318,6 +2320,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vmx_vcpu_pi_load(vcpu, cpu); vmx->host_pkru = read_pkru(); + vmx->host_debugctlmsr = get_debugctlmsr(); } static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) @@ -9261,7 +9264,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long debugctlmsr, cr3, cr4; + unsigned long cr3, cr4; /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!enable_vnmi && @@ -9314,7 +9317,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) __write_pkru(vcpu->arch.pkru); atomic_switch_perf_msrs(vmx); - debugctlmsr = get_debugctlmsr(); vmx_arm_hv_timer(vcpu); @@ -9425,8 +9427,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) ); /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ - if (debugctlmsr) - update_debugctlmsr(debugctlmsr); + if (vmx->host_debugctlmsr) + update_debugctlmsr(vmx->host_debugctlmsr); #ifndef CONFIG_X86_64 /* From 9c48d517ce6da398b8cff0603b75b366759023c4 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Fri, 1 Dec 2017 00:15:10 -0800 Subject: [PATCH 060/197] KVM: X86: Reduce the overhead when lapic_timer_advance is disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When I run ebizzy in a 32 vCPUs guest on a 32 pCPUs Xeon box, I can observe ~8000 kvm_wait_lapic_expire CurAvg/s through kvm_stat tool even if the advance tscdeadline hrtimer expiration is disabled. Each call to wait_lapic_expire() will consume ~70 cycles when a timer fires since apic_timer_expire() will set expired_tscdeadline and then wait_lapic_expire() will do some caculation before bailing out. So total ~175us per second is lost on this 3.2Ghz machine. This patch reduces the overhead by skipping the function wait_lapic_expire() when lapic_timer_advance is disabled. Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7ea9a57b0684a4..54d66f238e2030 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7018,7 +7018,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } trace_kvm_entry(vcpu->vcpu_id); - wait_lapic_expire(vcpu); + if (lapic_timer_advance_ns) + wait_lapic_expire(vcpu); guest_enter_irqoff(); if (unlikely(vcpu->arch.switch_db_regs)) { From 8eb73e2d410f00d383023fe41c0c25c6195b7389 Mon Sep 17 00:00:00 2001 From: Quan Xu Date: Tue, 12 Dec 2017 16:44:21 +0800 Subject: [PATCH 061/197] KVM: VMX: drop I/O permission bitmaps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since KVM removes the only I/O port 0x80 bypass on Intel hosts, clear CPU_BASED_USE_IO_BITMAPS and set CPU_BASED_UNCOND_IO_EXITING bit. Then these I/O permission bitmaps are not used at all, so drop I/O permission bitmaps. Signed-off-by: Jim Mattson Signed-off-by: Radim KrÄmář Signed-off-by: Quan Xu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4e7da90b442604..b016cf1b9f773f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -937,8 +937,6 @@ static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); enum { - VMX_IO_BITMAP_A, - VMX_IO_BITMAP_B, VMX_MSR_BITMAP_LEGACY, VMX_MSR_BITMAP_LONGMODE, VMX_MSR_BITMAP_LEGACY_X2APIC_APICV, @@ -952,8 +950,6 @@ enum { static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; -#define vmx_io_bitmap_a (vmx_bitmap[VMX_IO_BITMAP_A]) -#define vmx_io_bitmap_b (vmx_bitmap[VMX_IO_BITMAP_B]) #define vmx_msr_bitmap_legacy (vmx_bitmap[VMX_MSR_BITMAP_LEGACY]) #define vmx_msr_bitmap_longmode (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE]) #define vmx_msr_bitmap_legacy_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC_APICV]) @@ -3627,7 +3623,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) #endif CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING | - CPU_BASED_USE_IO_BITMAPS | + CPU_BASED_UNCOND_IO_EXITING | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING | CPU_BASED_INVLPG_EXITING | @@ -5468,10 +5464,6 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) #endif int i; - /* I/O */ - vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); - vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b)); - if (enable_shadow_vmcs) { vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); @@ -6783,10 +6775,6 @@ static __init int hardware_setup(void) memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); - memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE); - - memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); - memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); @@ -10563,8 +10551,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } /* - * Merging of IO bitmap not currently supported. - * Rather, exit every time. + * A vmexit (to either L1 hypervisor or L0 userspace) is always needed + * for I/O port accesses. */ exec_control &= ~CPU_BASED_USE_IO_BITMAPS; exec_control |= CPU_BASED_UNCOND_IO_EXITING; From ec7660ccdd2b71d8c7f0243f8590253713e9b75d Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 4 Dec 2017 21:35:23 +0100 Subject: [PATCH 062/197] KVM: Take vcpu->mutex outside vcpu_load As we're about to call vcpu_load() from architecture-specific implementations of the KVM vcpu ioctls, but yet we access data structures protected by the vcpu->mutex in the generic code, factor this logic out from vcpu_load(). x86 is the only architecture which calls vcpu_load() outside of the main vcpu ioctl function, and these calls will no longer take the vcpu mutex following this patch. However, with the exception of kvm_arch_vcpu_postcreate (see below), the callers are either in the creation or destruction path of the VCPU, which means there cannot be any concurrent access to the data structure, because the file descriptor is not yet accessible, or is already gone. kvm_arch_vcpu_postcreate makes the newly created vcpu potentially accessible by other in-kernel threads through the kvm->vcpus array, and we therefore take the vcpu mutex in this case directly. Signed-off-by: Christoffer Dall Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 4 +--- arch/x86/kvm/x86.c | 20 +++++++------------- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 17 ++++++----------- 4 files changed, 15 insertions(+), 28 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b016cf1b9f773f..ef7d13e536a4fd 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9496,10 +9496,8 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - int r; - r = vcpu_load(vcpu); - BUG_ON(r); + vcpu_load(vcpu); vmx_switch_vmcs(vcpu, &vmx->vmcs01); free_nested(vmx); vcpu_put(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 54d66f238e2030..3f2c78f5857027 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7767,16 +7767,12 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { - int r; - kvm_vcpu_mtrr_init(vcpu); - r = vcpu_load(vcpu); - if (r) - return r; + vcpu_load(vcpu); kvm_vcpu_reset(vcpu, false); kvm_mmu_setup(vcpu); vcpu_put(vcpu); - return r; + return 0; } void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) @@ -7786,13 +7782,15 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) kvm_hv_vcpu_postcreate(vcpu); - if (vcpu_load(vcpu)) + if (mutex_lock_killable(&vcpu->mutex)) return; + vcpu_load(vcpu); msr.data = 0x0; msr.index = MSR_IA32_TSC; msr.host_initiated = true; kvm_write_tsc(vcpu, &msr); vcpu_put(vcpu); + mutex_unlock(&vcpu->mutex); if (!kvmclock_periodic_sync) return; @@ -7803,11 +7801,9 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - int r; vcpu->arch.apf.msr_val = 0; - r = vcpu_load(vcpu); - BUG_ON(r); + vcpu_load(vcpu); kvm_mmu_unload(vcpu); vcpu_put(vcpu); @@ -8179,9 +8175,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) { - int r; - r = vcpu_load(vcpu); - BUG_ON(r); + vcpu_load(vcpu); kvm_mmu_unload(vcpu); vcpu_put(vcpu); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6bdd4b9f661154..09de0ff3d67735 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -533,7 +533,7 @@ static inline int kvm_vcpu_get_idx(struct kvm_vcpu *vcpu) int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); -int __must_check vcpu_load(struct kvm_vcpu *vcpu); +void vcpu_load(struct kvm_vcpu *vcpu); void vcpu_put(struct kvm_vcpu *vcpu); #ifdef __KVM_HAVE_IOAPIC diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index bc092e3d1d7316..c4d116b336f4ea 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -151,17 +151,12 @@ bool kvm_is_reserved_pfn(kvm_pfn_t pfn) /* * Switches to specified vcpu, until a matching vcpu_put() */ -int vcpu_load(struct kvm_vcpu *vcpu) +void vcpu_load(struct kvm_vcpu *vcpu) { - int cpu; - - if (mutex_lock_killable(&vcpu->mutex)) - return -EINTR; - cpu = get_cpu(); + int cpu = get_cpu(); preempt_notifier_register(&vcpu->preempt_notifier); kvm_arch_vcpu_load(vcpu, cpu); put_cpu(); - return 0; } EXPORT_SYMBOL_GPL(vcpu_load); @@ -171,7 +166,6 @@ void vcpu_put(struct kvm_vcpu *vcpu) kvm_arch_vcpu_put(vcpu); preempt_notifier_unregister(&vcpu->preempt_notifier); preempt_enable(); - mutex_unlock(&vcpu->mutex); } EXPORT_SYMBOL_GPL(vcpu_put); @@ -2560,9 +2554,9 @@ static long kvm_vcpu_ioctl(struct file *filp, #endif - r = vcpu_load(vcpu); - if (r) - return r; + if (mutex_lock_killable(&vcpu->mutex)) + return -EINTR; + vcpu_load(vcpu); switch (ioctl) { case KVM_RUN: { struct pid *oldpid; @@ -2735,6 +2729,7 @@ static long kvm_vcpu_ioctl(struct file *filp, } out: vcpu_put(vcpu); + mutex_unlock(&vcpu->mutex); kfree(fpu); kfree(kvm_sregs); return r; From 8a32dd60ec9488b73e04e5b7bc82b77a2580b1b7 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 4 Dec 2017 21:35:24 +0100 Subject: [PATCH 063/197] KVM: Prepare for moving vcpu_load/vcpu_put into arch specific code In preparation for moving calls to vcpu_load() and vcpu_put() into the architecture specific implementations of the KVM vcpu ioctls, move the calls in the main kvm_vcpu_ioctl() dispatcher function to each case of the ioctl select statement. This allows us to move the vcpu_load() and vcpu_put() calls into architecture specific implementations of vcpu ioctls, one by one. Reviewed-by: David Hildenbrand Signed-off-by: Christoffer Dall Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c4d116b336f4ea..7bbaad8717a2fe 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2556,13 +2556,13 @@ static long kvm_vcpu_ioctl(struct file *filp, if (mutex_lock_killable(&vcpu->mutex)) return -EINTR; - vcpu_load(vcpu); switch (ioctl) { case KVM_RUN: { struct pid *oldpid; r = -EINVAL; if (arg) goto out; + vcpu_load(vcpu); oldpid = rcu_access_pointer(vcpu->pid); if (unlikely(oldpid != current->pids[PIDTYPE_PID].pid)) { /* The thread running this VCPU changed. */ @@ -2574,6 +2574,7 @@ static long kvm_vcpu_ioctl(struct file *filp, put_pid(oldpid); } r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); + vcpu_put(vcpu); trace_kvm_userspace_exit(vcpu->run->exit_reason, r); break; } @@ -2584,7 +2585,9 @@ static long kvm_vcpu_ioctl(struct file *filp, kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); if (!kvm_regs) goto out; + vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); + vcpu_put(vcpu); if (r) goto out_free1; r = -EFAULT; @@ -2604,7 +2607,9 @@ static long kvm_vcpu_ioctl(struct file *filp, r = PTR_ERR(kvm_regs); goto out; } + vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); + vcpu_put(vcpu); kfree(kvm_regs); break; } @@ -2613,7 +2618,9 @@ static long kvm_vcpu_ioctl(struct file *filp, r = -ENOMEM; if (!kvm_sregs) goto out; + vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); + vcpu_put(vcpu); if (r) goto out; r = -EFAULT; @@ -2629,13 +2636,17 @@ static long kvm_vcpu_ioctl(struct file *filp, kvm_sregs = NULL; goto out; } + vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); + vcpu_put(vcpu); break; } case KVM_GET_MP_STATE: { struct kvm_mp_state mp_state; + vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); + vcpu_put(vcpu); if (r) goto out; r = -EFAULT; @@ -2650,7 +2661,9 @@ static long kvm_vcpu_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&mp_state, argp, sizeof(mp_state))) goto out; + vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); + vcpu_put(vcpu); break; } case KVM_TRANSLATE: { @@ -2659,7 +2672,9 @@ static long kvm_vcpu_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&tr, argp, sizeof(tr))) goto out; + vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); + vcpu_put(vcpu); if (r) goto out; r = -EFAULT; @@ -2674,7 +2689,9 @@ static long kvm_vcpu_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&dbg, argp, sizeof(dbg))) goto out; + vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); + vcpu_put(vcpu); break; } case KVM_SET_SIGNAL_MASK: { @@ -2705,7 +2722,9 @@ static long kvm_vcpu_ioctl(struct file *filp, r = -ENOMEM; if (!fpu) goto out; + vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); + vcpu_put(vcpu); if (r) goto out; r = -EFAULT; @@ -2721,14 +2740,17 @@ static long kvm_vcpu_ioctl(struct file *filp, fpu = NULL; goto out; } + vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); + vcpu_put(vcpu); break; } default: + vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); + vcpu_put(vcpu); } out: - vcpu_put(vcpu); mutex_unlock(&vcpu->mutex); kfree(fpu); kfree(kvm_sregs); From accb757d798c9b4d85cfe3e5972134c586525168 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 4 Dec 2017 21:35:25 +0100 Subject: [PATCH 064/197] KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl_run Move vcpu_load() and vcpu_put() into the architecture specific implementations of kvm_arch_vcpu_ioctl_run(). Signed-off-by: Christoffer Dall Reviewed-by: Christian Borntraeger # s390 parts Reviewed-by: Cornelia Huck [Rebased. - Paolo] Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 3 +++ arch/powerpc/kvm/powerpc.c | 6 +++++- arch/s390/kvm/kvm-s390.c | 10 ++++++++-- arch/x86/kvm/x86.c | 3 ++- virt/kvm/arm/arm.c | 20 ++++++++++++++------ virt/kvm/kvm_main.c | 2 -- 6 files changed, 32 insertions(+), 12 deletions(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 75fdeaa8c62f21..ba5ecf22bb96fc 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -446,6 +446,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) { int r = -EINTR; + vcpu_load(vcpu); + kvm_sigset_activate(vcpu); if (vcpu->mmio_needed) { @@ -480,6 +482,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) out: kvm_sigset_deactivate(vcpu); + vcpu_put(vcpu); return r; } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 1915e86cef6f8f..4e2167a7ae19ec 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -1408,6 +1408,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) { int r; + vcpu_load(vcpu); + if (vcpu->mmio_needed) { vcpu->mmio_needed = 0; if (!vcpu->mmio_is_write) @@ -1422,7 +1424,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) r = kvmppc_emulate_mmio_vsx_loadstore(vcpu, run); if (r == RESUME_HOST) { vcpu->mmio_needed = 1; - return r; + goto out; } } #endif @@ -1456,6 +1458,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) kvm_sigset_deactivate(vcpu); +out: + vcpu_put(vcpu); return r; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ec8b68e97d3cd4..7972598e60d03e 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3373,9 +3373,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (kvm_run->immediate_exit) return -EINTR; + vcpu_load(vcpu); + if (guestdbg_exit_pending(vcpu)) { kvm_s390_prepare_debug_exit(vcpu); - return 0; + rc = 0; + goto out; } kvm_sigset_activate(vcpu); @@ -3385,7 +3388,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } else if (is_vcpu_stopped(vcpu)) { pr_err_ratelimited("can't run stopped vcpu %d\n", vcpu->vcpu_id); - return -EINVAL; + rc = -EINVAL; + goto out; } sync_regs(vcpu, kvm_run); @@ -3415,6 +3419,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_sigset_deactivate(vcpu); vcpu->stat.exit_userspace++; +out: + vcpu_put(vcpu); return rc; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3f2c78f5857027..af9da75011bce1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7280,8 +7280,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; + vcpu_load(vcpu); kvm_sigset_activate(vcpu); - kvm_load_guest_fpu(vcpu); if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { @@ -7328,6 +7328,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) post_kvm_run_save(vcpu); kvm_sigset_deactivate(vcpu); + vcpu_put(vcpu); return r; } diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 6b60c98a6e2294..a1b2e8a43ca0b7 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -619,21 +619,27 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) if (unlikely(!kvm_vcpu_initialized(vcpu))) return -ENOEXEC; + vcpu_load(vcpu); + ret = kvm_vcpu_first_run_init(vcpu); if (ret) - return ret; + goto out; if (run->exit_reason == KVM_EXIT_MMIO) { ret = kvm_handle_mmio_return(vcpu, vcpu->run); if (ret) - return ret; - if (kvm_arm_handle_step_debug(vcpu, vcpu->run)) - return 0; + goto out; + if (kvm_arm_handle_step_debug(vcpu, vcpu->run)) { + ret = 0; + goto out; + } } - if (run->immediate_exit) - return -EINTR; + if (run->immediate_exit) { + ret = -EINTR; + goto out; + } kvm_sigset_activate(vcpu); @@ -772,6 +778,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) kvm_sigset_deactivate(vcpu); +out: + vcpu_put(vcpu); return ret; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7bbaad8717a2fe..0b149827570cc3 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2562,7 +2562,6 @@ static long kvm_vcpu_ioctl(struct file *filp, r = -EINVAL; if (arg) goto out; - vcpu_load(vcpu); oldpid = rcu_access_pointer(vcpu->pid); if (unlikely(oldpid != current->pids[PIDTYPE_PID].pid)) { /* The thread running this VCPU changed. */ @@ -2574,7 +2573,6 @@ static long kvm_vcpu_ioctl(struct file *filp, put_pid(oldpid); } r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); - vcpu_put(vcpu); trace_kvm_userspace_exit(vcpu->run->exit_reason, r); break; } From 1fc9b76b3dd2c57ca0fe42742043a5c3cbdc41c1 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 4 Dec 2017 21:35:26 +0100 Subject: [PATCH 065/197] KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl_get_regs Move vcpu_load() and vcpu_put() into the architecture specific implementations of kvm_arch_vcpu_ioctl_get_regs(). Signed-off-by: Christoffer Dall Reviewed-by: David Hildenbrand Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 3 +++ arch/powerpc/kvm/book3s.c | 3 +++ arch/powerpc/kvm/booke.c | 3 +++ arch/s390/kvm/kvm-s390.c | 2 ++ arch/x86/kvm/x86.c | 3 +++ virt/kvm/kvm_main.c | 2 -- 6 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index ba5ecf22bb96fc..6023b5f808c0a3 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1162,6 +1162,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { int i; + vcpu_load(vcpu); + for (i = 0; i < ARRAY_SIZE(vcpu->arch.gprs); i++) regs->gpr[i] = vcpu->arch.gprs[i]; @@ -1169,6 +1171,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->lo = vcpu->arch.lo; regs->pc = vcpu->arch.pc; + vcpu_put(vcpu); return 0; } diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 72d977e309523f..d85bfd733ccd42 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -497,6 +497,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { int i; + vcpu_load(vcpu); + regs->pc = kvmppc_get_pc(vcpu); regs->cr = kvmppc_get_cr(vcpu); regs->ctr = kvmppc_get_ctr(vcpu); @@ -518,6 +520,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) regs->gpr[i] = kvmppc_get_gpr(vcpu, i); + vcpu_put(vcpu); return 0; } diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 83b485810aea2f..e0e4f04c55354d 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1431,6 +1431,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { int i; + vcpu_load(vcpu); + regs->pc = vcpu->arch.pc; regs->cr = kvmppc_get_cr(vcpu); regs->ctr = vcpu->arch.ctr; @@ -1452,6 +1454,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) regs->gpr[i] = kvmppc_get_gpr(vcpu, i); + vcpu_put(vcpu); return 0; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 7972598e60d03e..44317813f49824 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2715,7 +2715,9 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { + vcpu_load(vcpu); memcpy(®s->gprs, &vcpu->run->s.regs.gprs, sizeof(regs->gprs)); + vcpu_put(vcpu); return 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index af9da75011bce1..3364c6d4f74311 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7334,6 +7334,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { + vcpu_load(vcpu); + if (vcpu->arch.emulate_regs_need_sync_to_vcpu) { /* * We are here if userspace calls get_regs() in the middle of @@ -7367,6 +7369,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->rip = kvm_rip_read(vcpu); regs->rflags = kvm_get_rflags(vcpu); + vcpu_put(vcpu); return 0; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0b149827570cc3..6dab2e6f83217b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2583,9 +2583,7 @@ static long kvm_vcpu_ioctl(struct file *filp, kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); if (!kvm_regs) goto out; - vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); - vcpu_put(vcpu); if (r) goto out_free1; r = -EFAULT; From 875656fe0c8473c544860d557ca1512753d6aeef Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 4 Dec 2017 21:35:27 +0100 Subject: [PATCH 066/197] KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl_set_regs Move vcpu_load() and vcpu_put() into the architecture specific implementations of kvm_arch_vcpu_ioctl_set_regs(). Signed-off-by: Christoffer Dall Reviewed-by: David Hildenbrand Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 3 +++ arch/powerpc/kvm/book3s.c | 3 +++ arch/powerpc/kvm/booke.c | 3 +++ arch/s390/kvm/kvm-s390.c | 2 ++ arch/x86/kvm/x86.c | 3 +++ virt/kvm/kvm_main.c | 2 -- 6 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 6023b5f808c0a3..dd5f463b0e7264 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1148,6 +1148,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { int i; + vcpu_load(vcpu); + for (i = 1; i < ARRAY_SIZE(vcpu->arch.gprs); i++) vcpu->arch.gprs[i] = regs->gpr[i]; vcpu->arch.gprs[0] = 0; /* zero is special, and cannot be set. */ @@ -1155,6 +1157,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.lo = regs->lo; vcpu->arch.pc = regs->pc; + vcpu_put(vcpu); return 0; } diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index d85bfd733ccd42..24bc7aabfc449d 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -528,6 +528,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { int i; + vcpu_load(vcpu); + kvmppc_set_pc(vcpu, regs->pc); kvmppc_set_cr(vcpu, regs->cr); kvmppc_set_ctr(vcpu, regs->ctr); @@ -548,6 +550,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) kvmppc_set_gpr(vcpu, i, regs->gpr[i]); + vcpu_put(vcpu); return 0; } diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index e0e4f04c55354d..bcbbeddc3430c0 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1462,6 +1462,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { int i; + vcpu_load(vcpu); + vcpu->arch.pc = regs->pc; kvmppc_set_cr(vcpu, regs->cr); vcpu->arch.ctr = regs->ctr; @@ -1483,6 +1485,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) kvmppc_set_gpr(vcpu, i, regs->gpr[i]); + vcpu_put(vcpu); return 0; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 44317813f49824..ab08823a854b21 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2709,7 +2709,9 @@ static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { + vcpu_load(vcpu); memcpy(&vcpu->run->s.regs.gprs, ®s->gprs, sizeof(regs->gprs)); + vcpu_put(vcpu); return 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3364c6d4f74311..f82e87bb3a51e4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7375,6 +7375,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { + vcpu_load(vcpu); + vcpu->arch.emulate_regs_need_sync_from_vcpu = true; vcpu->arch.emulate_regs_need_sync_to_vcpu = false; @@ -7404,6 +7406,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) kvm_make_request(KVM_REQ_EVENT, vcpu); + vcpu_put(vcpu); return 0; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6dab2e6f83217b..e25c1a1a412028 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2603,9 +2603,7 @@ static long kvm_vcpu_ioctl(struct file *filp, r = PTR_ERR(kvm_regs); goto out; } - vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); - vcpu_put(vcpu); kfree(kvm_regs); break; } From bcdec41cefbea525ad424050650acb0f2eed1378 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 4 Dec 2017 21:35:28 +0100 Subject: [PATCH 067/197] KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl_get_sregs Move vcpu_load() and vcpu_put() into the architecture specific implementations of kvm_arch_vcpu_ioctl_get_sregs(). Signed-off-by: Christoffer Dall Reviewed-by: David Hildenbrand Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s.c | 8 +++++++- arch/powerpc/kvm/booke.c | 9 ++++++++- arch/s390/kvm/kvm-s390.c | 4 ++++ arch/x86/kvm/x86.c | 3 +++ virt/kvm/kvm_main.c | 2 -- 5 files changed, 22 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 24bc7aabfc449d..6cc2377549f786 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -484,7 +484,13 @@ void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { - return vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs); + int ret; + + vcpu_load(vcpu); + ret = vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs); + vcpu_put(vcpu); + + return ret; } int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index bcbbeddc3430c0..f647e121070e1f 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1613,11 +1613,18 @@ int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { + int ret; + + vcpu_load(vcpu); + sregs->pvr = vcpu->arch.pvr; get_sregs_base(vcpu, sregs); get_sregs_arch206(vcpu, sregs); - return vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs); + ret = vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs); + + vcpu_put(vcpu); + return ret; } int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ab08823a854b21..87544beff50779 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2734,8 +2734,12 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { + vcpu_load(vcpu); + memcpy(&sregs->acrs, &vcpu->run->s.regs.acrs, sizeof(sregs->acrs)); memcpy(&sregs->crs, &vcpu->arch.sie_block->gcr, sizeof(sregs->crs)); + + vcpu_put(vcpu); return 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f82e87bb3a51e4..66a8d8cd00f6a9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7425,6 +7425,8 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, { struct desc_ptr dt; + vcpu_load(vcpu); + kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); @@ -7456,6 +7458,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, set_bit(vcpu->arch.interrupt.nr, (unsigned long *)sregs->interrupt_bitmap); + vcpu_put(vcpu); return 0; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e25c1a1a412028..4c2f6a4d185270 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2612,9 +2612,7 @@ static long kvm_vcpu_ioctl(struct file *filp, r = -ENOMEM; if (!kvm_sregs) goto out; - vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); - vcpu_put(vcpu); if (r) goto out; r = -EFAULT; From b4ef9d4e8cb8938e6c0aa3be672b0aeeb791ecf3 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 4 Dec 2017 21:35:29 +0100 Subject: [PATCH 068/197] KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl_set_sregs Move vcpu_load() and vcpu_put() into the architecture specific implementations of kvm_arch_vcpu_ioctl_set_sregs(). Signed-off-by: Christoffer Dall Reviewed-by: David Hildenbrand Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s.c | 8 +++++++- arch/powerpc/kvm/booke.c | 15 ++++++++++----- arch/s390/kvm/kvm-s390.c | 4 ++++ arch/x86/kvm/x86.c | 12 +++++++++--- virt/kvm/kvm_main.c | 2 -- 5 files changed, 30 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 6cc2377549f786..047651622cb861 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -496,7 +496,13 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { - return vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs); + int ret; + + vcpu_load(vcpu); + ret = vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs); + vcpu_put(vcpu); + + return ret; } int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index f647e121070e1f..38939a204e262e 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1630,20 +1630,25 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { - int ret; + int ret = -EINVAL; + vcpu_load(vcpu); if (vcpu->arch.pvr != sregs->pvr) - return -EINVAL; + goto out; ret = set_sregs_base(vcpu, sregs); if (ret < 0) - return ret; + goto out; ret = set_sregs_arch206(vcpu, sregs); if (ret < 0) - return ret; + goto out; + + ret = vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs); - return vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs); +out: + vcpu_put(vcpu); + return ret; } int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 87544beff50779..d27a1dd0986b91 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2726,8 +2726,12 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { + vcpu_load(vcpu); + memcpy(&vcpu->run->s.regs.acrs, &sregs->acrs, sizeof(sregs->acrs)); memcpy(&vcpu->arch.sie_block->gcr, &sregs->crs, sizeof(sregs->crs)); + + vcpu_put(vcpu); return 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 66a8d8cd00f6a9..f01a7d73dad784 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7525,15 +7525,18 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, int mmu_reset_needed = 0; int pending_vec, max_bits, idx; struct desc_ptr dt; + int ret = -EINVAL; + + vcpu_load(vcpu); if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (sregs->cr4 & X86_CR4_OSXSAVE)) - return -EINVAL; + goto out; apic_base_msr.data = sregs->apic_base; apic_base_msr.host_initiated = true; if (kvm_set_apic_base(vcpu, &apic_base_msr)) - return -EINVAL; + goto out; dt.size = sregs->idt.limit; dt.address = sregs->idt.base; @@ -7599,7 +7602,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_make_request(KVM_REQ_EVENT, vcpu); - return 0; + ret = 0; +out: + vcpu_put(vcpu); + return ret; } int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4c2f6a4d185270..e04a216bcd1498 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2628,9 +2628,7 @@ static long kvm_vcpu_ioctl(struct file *filp, kvm_sregs = NULL; goto out; } - vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); - vcpu_put(vcpu); break; } case KVM_GET_MP_STATE: { From fd2325612c1493c85cce89ea16b2396baca83311 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 4 Dec 2017 21:35:30 +0100 Subject: [PATCH 069/197] KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl_get_mpstate Move vcpu_load() and vcpu_put() into the architecture specific implementations of kvm_arch_vcpu_ioctl_get_mpstate(). Reviewed-by: David Hildenbrand Signed-off-by: Christoffer Dall Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/s390/kvm/kvm-s390.c | 11 +++++++++-- arch/x86/kvm/x86.c | 3 +++ virt/kvm/arm/arm.c | 3 +++ virt/kvm/kvm_main.c | 2 -- 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d27a1dd0986b91..4297f5094a8846 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2833,9 +2833,16 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { + int ret; + + vcpu_load(vcpu); + /* CHECK_STOP and LOAD are not supported yet */ - return is_vcpu_stopped(vcpu) ? KVM_MP_STATE_STOPPED : - KVM_MP_STATE_OPERATING; + ret = is_vcpu_stopped(vcpu) ? KVM_MP_STATE_STOPPED : + KVM_MP_STATE_OPERATING; + + vcpu_put(vcpu); + return ret; } int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f01a7d73dad784..80791939065fbf 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7465,6 +7465,8 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { + vcpu_load(vcpu); + kvm_apic_accept_events(vcpu); if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED && vcpu->arch.pv.pv_unhalted) @@ -7472,6 +7474,7 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, else mp_state->mp_state = vcpu->arch.mp_state; + vcpu_put(vcpu); return 0; } diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index a1b2e8a43ca0b7..65d50100ba3ca5 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -381,11 +381,14 @@ static void vcpu_power_off(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { + vcpu_load(vcpu); + if (vcpu->arch.power_off) mp_state->mp_state = KVM_MP_STATE_STOPPED; else mp_state->mp_state = KVM_MP_STATE_RUNNABLE; + vcpu_put(vcpu); return 0; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e04a216bcd1498..8e2f582417c38d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2634,9 +2634,7 @@ static long kvm_vcpu_ioctl(struct file *filp, case KVM_GET_MP_STATE: { struct kvm_mp_state mp_state; - vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); - vcpu_put(vcpu); if (r) goto out; r = -EFAULT; From e83dff5edf0c3f014e4b4ac5e1c86dbe797687c7 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 4 Dec 2017 21:35:31 +0100 Subject: [PATCH 070/197] KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl_set_mpstate Move vcpu_load() and vcpu_put() into the architecture specific implementations of kvm_arch_vcpu_ioctl_set_mpstate(). Reviewed-by: David Hildenbrand Signed-off-by: Christoffer Dall Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/s390/kvm/kvm-s390.c | 3 +++ arch/x86/kvm/x86.c | 14 +++++++++++--- virt/kvm/arm/arm.c | 9 +++++++-- virt/kvm/kvm_main.c | 2 -- 4 files changed, 21 insertions(+), 7 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 4297f5094a8846..4e6304d45ca554 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2850,6 +2850,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, { int rc = 0; + vcpu_load(vcpu); + /* user space knows about this interface - let it control the state */ vcpu->kvm->arch.user_cpu_state_ctrl = 1; @@ -2867,6 +2869,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, rc = -ENXIO; } + vcpu_put(vcpu); return rc; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 80791939065fbf..a9e6cca1f46ecc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7481,15 +7481,19 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { + int ret = -EINVAL; + + vcpu_load(vcpu); + if (!lapic_in_kernel(vcpu) && mp_state->mp_state != KVM_MP_STATE_RUNNABLE) - return -EINVAL; + goto out; /* INITs are latched while in SMM */ if ((is_smm(vcpu) || vcpu->arch.smi_pending) && (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED || mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED)) - return -EINVAL; + goto out; if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; @@ -7497,7 +7501,11 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, } else vcpu->arch.mp_state = mp_state->mp_state; kvm_make_request(KVM_REQ_EVENT, vcpu); - return 0; + + ret = 0; +out: + vcpu_put(vcpu); + return ret; } int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 65d50100ba3ca5..aa6167086211dd 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -395,6 +395,10 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { + int ret = 0; + + vcpu_load(vcpu); + switch (mp_state->mp_state) { case KVM_MP_STATE_RUNNABLE: vcpu->arch.power_off = false; @@ -403,10 +407,11 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, vcpu_power_off(vcpu); break; default: - return -EINVAL; + ret = -EINVAL; } - return 0; + vcpu_put(vcpu); + return ret; } /** diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8e2f582417c38d..4b5aeb4b8c5804 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2649,9 +2649,7 @@ static long kvm_vcpu_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&mp_state, argp, sizeof(mp_state))) goto out; - vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); - vcpu_put(vcpu); break; } case KVM_TRANSLATE: { From 1da5b61dac98360a7e50b1565f6d499c6fc8123a Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 4 Dec 2017 21:35:32 +0100 Subject: [PATCH 071/197] KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl_translate Move vcpu_load() and vcpu_put() into the architecture specific implementations of kvm_arch_vcpu_ioctl_translate(). Reviewed-by: David Hildenbrand Signed-off-by: Christoffer Dall Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/booke.c | 2 ++ arch/x86/kvm/x86.c | 3 +++ virt/kvm/kvm_main.c | 2 -- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 38939a204e262e..b5f46517c8396c 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1791,7 +1791,9 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, { int r; + vcpu_load(vcpu); r = kvmppc_core_vcpu_translate(vcpu, tr); + vcpu_put(vcpu); return r; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a9e6cca1f46ecc..080e00dff426d9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7684,6 +7684,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, gpa_t gpa; int idx; + vcpu_load(vcpu); + idx = srcu_read_lock(&vcpu->kvm->srcu); gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); srcu_read_unlock(&vcpu->kvm->srcu, idx); @@ -7692,6 +7694,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, tr->writeable = 1; tr->usermode = 0; + vcpu_put(vcpu); return 0; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4b5aeb4b8c5804..37217ec647b416 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2658,9 +2658,7 @@ static long kvm_vcpu_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&tr, argp, sizeof(tr))) goto out; - vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); - vcpu_put(vcpu); if (r) goto out; r = -EFAULT; From 66b5656222990f1a536f5900ccd98539f9cf231f Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 4 Dec 2017 21:35:33 +0100 Subject: [PATCH 072/197] KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl_set_guest_debug Move vcpu_load() and vcpu_put() into the architecture specific implementations of kvm_arch_vcpu_ioctl_set_guest_debug(). Reviewed-by: David Hildenbrand Signed-off-by: Christoffer Dall Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/guest.c | 15 ++++++++++++--- arch/powerpc/kvm/book3s.c | 2 ++ arch/powerpc/kvm/booke.c | 19 +++++++++++++------ arch/s390/kvm/kvm-s390.c | 16 ++++++++++++---- arch/x86/kvm/x86.c | 4 +++- virt/kvm/kvm_main.c | 2 -- 6 files changed, 42 insertions(+), 16 deletions(-) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 5c7f657dd20740..d7e3299a773460 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -361,10 +361,16 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { + int ret = 0; + + vcpu_load(vcpu); + trace_kvm_set_guest_debug(vcpu, dbg->control); - if (dbg->control & ~KVM_GUESTDBG_VALID_MASK) - return -EINVAL; + if (dbg->control & ~KVM_GUESTDBG_VALID_MASK) { + ret = -EINVAL; + goto out; + } if (dbg->control & KVM_GUESTDBG_ENABLE) { vcpu->guest_debug = dbg->control; @@ -378,7 +384,10 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, /* If not enabled clear all flags */ vcpu->guest_debug = 0; } - return 0; + +out: + vcpu_put(vcpu); + return ret; } int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 047651622cb861..234531d1bee1e2 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -755,7 +755,9 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { + vcpu_load(vcpu); vcpu->guest_debug = dbg->control; + vcpu_put(vcpu); return 0; } diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index b5f46517c8396c..6038e2e7aee03c 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -2016,12 +2016,15 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, { struct debug_reg *dbg_reg; int n, b = 0, w = 0; + int ret = 0; + + vcpu_load(vcpu); if (!(dbg->control & KVM_GUESTDBG_ENABLE)) { vcpu->arch.dbg_reg.dbcr0 = 0; vcpu->guest_debug = 0; kvm_guest_protect_msr(vcpu, MSR_DE, false); - return 0; + goto out; } kvm_guest_protect_msr(vcpu, MSR_DE, true); @@ -2053,8 +2056,9 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, #endif if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) - return 0; + goto out; + ret = -EINVAL; for (n = 0; n < (KVMPPC_BOOKE_IAC_NUM + KVMPPC_BOOKE_DAC_NUM); n++) { uint64_t addr = dbg->arch.bp[n].addr; uint32_t type = dbg->arch.bp[n].type; @@ -2065,21 +2069,24 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, if (type & ~(KVMPPC_DEBUG_WATCH_READ | KVMPPC_DEBUG_WATCH_WRITE | KVMPPC_DEBUG_BREAKPOINT)) - return -EINVAL; + goto out; if (type & KVMPPC_DEBUG_BREAKPOINT) { /* Setting H/W breakpoint */ if (kvmppc_booke_add_breakpoint(dbg_reg, addr, b++)) - return -EINVAL; + goto out; } else { /* Setting H/W watchpoint */ if (kvmppc_booke_add_watchpoint(dbg_reg, addr, type, w++)) - return -EINVAL; + goto out; } } - return 0; + ret = 0; +out: + vcpu_put(vcpu); + return ret; } void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 4e6304d45ca554..1c82dda07bc57c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2801,13 +2801,19 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, { int rc = 0; + vcpu_load(vcpu); + vcpu->guest_debug = 0; kvm_s390_clear_bp_data(vcpu); - if (dbg->control & ~VALID_GUESTDBG_FLAGS) - return -EINVAL; - if (!sclp.has_gpere) - return -EINVAL; + if (dbg->control & ~VALID_GUESTDBG_FLAGS) { + rc = -EINVAL; + goto out; + } + if (!sclp.has_gpere) { + rc = -EINVAL; + goto out; + } if (dbg->control & KVM_GUESTDBG_ENABLE) { vcpu->guest_debug = dbg->control; @@ -2827,6 +2833,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, atomic_andnot(CPUSTAT_P, &vcpu->arch.sie_block->cpuflags); } +out: + vcpu_put(vcpu); return rc; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 080e00dff426d9..9ef0b9299e93d0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7625,6 +7625,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, unsigned long rflags; int i, r; + vcpu_load(vcpu); + if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { r = -EBUSY; if (vcpu->arch.exception.pending) @@ -7670,7 +7672,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, r = 0; out: - + vcpu_put(vcpu); return r; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 37217ec647b416..b44762fcaf84dd 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2673,9 +2673,7 @@ static long kvm_vcpu_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&dbg, argp, sizeof(dbg))) goto out; - vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); - vcpu_put(vcpu); break; } case KVM_SET_SIGNAL_MASK: { From 1393123e1e24aba96413d351b9546086ea07504d Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 4 Dec 2017 21:35:34 +0100 Subject: [PATCH 073/197] KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl_get_fpu Move vcpu_load() and vcpu_put() into the architecture specific implementations of kvm_arch_vcpu_ioctl_get_fpu(). Reviewed-by: David Hildenbrand Signed-off-by: Christoffer Dall Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/s390/kvm/kvm-s390.c | 4 ++++ arch/x86/kvm/x86.c | 7 +++++-- virt/kvm/kvm_main.c | 2 -- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 1c82dda07bc57c..39327888bb0645 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2762,6 +2762,8 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { + vcpu_load(vcpu); + /* make sure we have the latest values */ save_fpu_regs(); if (MACHINE_HAS_VX) @@ -2770,6 +2772,8 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) else memcpy(fpu->fprs, vcpu->run->s.regs.fprs, sizeof(fpu->fprs)); fpu->fpc = vcpu->run->s.regs.fpc; + + vcpu_put(vcpu); return 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9ef0b9299e93d0..846d292ea33b3c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7702,9 +7702,11 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct fxregs_state *fxsave = - &vcpu->arch.guest_fpu.state.fxsave; + struct fxregs_state *fxsave; + vcpu_load(vcpu); + + fxsave = &vcpu->arch.guest_fpu.state.fxsave; memcpy(fpu->fpr, fxsave->st_space, 128); fpu->fcw = fxsave->cwd; fpu->fsw = fxsave->swd; @@ -7714,6 +7716,7 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) fpu->last_dp = fxsave->rdp; memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); + vcpu_put(vcpu); return 0; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b44762fcaf84dd..5791aa687233d0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2704,9 +2704,7 @@ static long kvm_vcpu_ioctl(struct file *filp, r = -ENOMEM; if (!fpu) goto out; - vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); - vcpu_put(vcpu); if (r) goto out; r = -EFAULT; From 6a96bc7fa0cdd96bac2b8298d708a94f8de6f6d4 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 4 Dec 2017 21:35:35 +0100 Subject: [PATCH 074/197] KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl_set_fpu Move vcpu_load() and vcpu_put() into the architecture specific implementations of kvm_arch_vcpu_ioctl_set_fpu(). Reviewed-by: David Hildenbrand Signed-off-by: Christoffer Dall Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/s390/kvm/kvm-s390.c | 15 ++++++++++++--- arch/x86/kvm/x86.c | 8 ++++++-- virt/kvm/kvm_main.c | 2 -- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 39327888bb0645..3bd2f83e1fa981 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2749,15 +2749,24 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - if (test_fp_ctl(fpu->fpc)) - return -EINVAL; + int ret = 0; + + vcpu_load(vcpu); + + if (test_fp_ctl(fpu->fpc)) { + ret = -EINVAL; + goto out; + } vcpu->run->s.regs.fpc = fpu->fpc; if (MACHINE_HAS_VX) convert_fp_to_vx((__vector128 *) vcpu->run->s.regs.vrs, (freg_t *) fpu->fprs); else memcpy(vcpu->run->s.regs.fprs, &fpu->fprs, sizeof(fpu->fprs)); - return 0; + +out: + vcpu_put(vcpu); + return ret; } int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 846d292ea33b3c..981b61531ab746 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7722,8 +7722,11 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct fxregs_state *fxsave = - &vcpu->arch.guest_fpu.state.fxsave; + struct fxregs_state *fxsave; + + vcpu_load(vcpu); + + fxsave = &vcpu->arch.guest_fpu.state.fxsave; memcpy(fxsave->st_space, fpu->fpr, 128); fxsave->cwd = fpu->fcw; @@ -7734,6 +7737,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) fxsave->rdp = fpu->last_dp; memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); + vcpu_put(vcpu); return 0; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5791aa687233d0..ca0ec9fb72ce9a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2720,9 +2720,7 @@ static long kvm_vcpu_ioctl(struct file *filp, fpu = NULL; goto out; } - vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); - vcpu_put(vcpu); break; } default: From 9b062471e52a1692c5563ba1535c84d708e2ff6f Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 4 Dec 2017 21:35:36 +0100 Subject: [PATCH 075/197] KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl Move the calls to vcpu_load() and vcpu_put() in to the architecture specific implementations of kvm_arch_vcpu_ioctl() which dispatches further architecture-specific ioctls on to other functions. Some architectures support asynchronous vcpu ioctls which cannot call vcpu_load() or take the vcpu->mutex, because that would prevent concurrent execution with a running VCPU, which is the intended purpose of these ioctls, for example because they inject interrupts. We repeat the separate checks for these specifics in the architecture code for MIPS, S390 and PPC, and avoid taking the vcpu->mutex and calling vcpu_load for these ioctls. Signed-off-by: Christoffer Dall Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 49 +++++++++++++++++++------------- arch/powerpc/kvm/powerpc.c | 13 +++++---- arch/s390/kvm/kvm-s390.c | 19 +++++++------ arch/x86/kvm/x86.c | 22 +++++++++++---- virt/kvm/arm/arm.c | 58 ++++++++++++++++++++++++++------------ virt/kvm/kvm_main.c | 2 -- 6 files changed, 103 insertions(+), 60 deletions(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index dd5f463b0e7264..9200b3def440a7 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -910,56 +910,65 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, void __user *argp = (void __user *)arg; long r; + if (ioctl == KVM_INTERRUPT) { + struct kvm_mips_interrupt irq; + + if (copy_from_user(&irq, argp, sizeof(irq))) + return -EFAULT; + kvm_debug("[%d] %s: irq: %d\n", vcpu->vcpu_id, __func__, + irq.irq); + + return kvm_vcpu_ioctl_interrupt(vcpu, &irq); + } + + vcpu_load(vcpu); + switch (ioctl) { case KVM_SET_ONE_REG: case KVM_GET_ONE_REG: { struct kvm_one_reg reg; + r = -EFAULT; if (copy_from_user(®, argp, sizeof(reg))) - return -EFAULT; + break; if (ioctl == KVM_SET_ONE_REG) - return kvm_mips_set_reg(vcpu, ®); + r = kvm_mips_set_reg(vcpu, ®); else - return kvm_mips_get_reg(vcpu, ®); + r = kvm_mips_get_reg(vcpu, ®); + break; } case KVM_GET_REG_LIST: { struct kvm_reg_list __user *user_list = argp; struct kvm_reg_list reg_list; unsigned n; + r = -EFAULT; if (copy_from_user(®_list, user_list, sizeof(reg_list))) - return -EFAULT; + break; n = reg_list.n; reg_list.n = kvm_mips_num_regs(vcpu); if (copy_to_user(user_list, ®_list, sizeof(reg_list))) - return -EFAULT; + break; + r = -E2BIG; if (n < reg_list.n) - return -E2BIG; - return kvm_mips_copy_reg_indices(vcpu, user_list->reg); - } - case KVM_INTERRUPT: - { - struct kvm_mips_interrupt irq; - - if (copy_from_user(&irq, argp, sizeof(irq))) - return -EFAULT; - kvm_debug("[%d] %s: irq: %d\n", vcpu->vcpu_id, __func__, - irq.irq); - - r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); break; - } + r = kvm_mips_copy_reg_indices(vcpu, user_list->reg); + break; + } case KVM_ENABLE_CAP: { struct kvm_enable_cap cap; + r = -EFAULT; if (copy_from_user(&cap, argp, sizeof(cap))) - return -EFAULT; + break; r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); break; } default: r = -ENOIOCTLCMD; } + + vcpu_put(vcpu); return r; } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 4e2167a7ae19ec..ba8134a989c1bb 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -1614,16 +1614,16 @@ long kvm_arch_vcpu_ioctl(struct file *filp, void __user *argp = (void __user *)arg; long r; - switch (ioctl) { - case KVM_INTERRUPT: { + if (ioctl == KVM_INTERRUPT) { struct kvm_interrupt irq; - r = -EFAULT; if (copy_from_user(&irq, argp, sizeof(irq))) - goto out; - r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); - goto out; + return -EFAULT; + return kvm_vcpu_ioctl_interrupt(vcpu, &irq); } + vcpu_load(vcpu); + + switch (ioctl) { case KVM_ENABLE_CAP: { struct kvm_enable_cap cap; @@ -1663,6 +1663,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } out: + vcpu_put(vcpu); return r; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 3bd2f83e1fa981..9700d71cb69116 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3737,24 +3737,25 @@ long kvm_arch_vcpu_ioctl(struct file *filp, case KVM_S390_IRQ: { struct kvm_s390_irq s390irq; - r = -EFAULT; if (copy_from_user(&s390irq, argp, sizeof(s390irq))) - break; - r = kvm_s390_inject_vcpu(vcpu, &s390irq); - break; + return -EFAULT; + return kvm_s390_inject_vcpu(vcpu, &s390irq); } case KVM_S390_INTERRUPT: { struct kvm_s390_interrupt s390int; struct kvm_s390_irq s390irq; - r = -EFAULT; if (copy_from_user(&s390int, argp, sizeof(s390int))) - break; + return -EFAULT; if (s390int_to_s390irq(&s390int, &s390irq)) return -EINVAL; - r = kvm_s390_inject_vcpu(vcpu, &s390irq); - break; + return kvm_s390_inject_vcpu(vcpu, &s390irq); } + } + + vcpu_load(vcpu); + + switch (ioctl) { case KVM_S390_STORE_STATUS: idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvm_s390_vcpu_store_status(vcpu, arg); @@ -3879,6 +3880,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, default: r = -ENOTTY; } + + vcpu_put(vcpu); return r; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 981b61531ab746..fafaef072f1b5d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3484,6 +3484,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, void *buffer; } u; + vcpu_load(vcpu); + u.buffer = NULL; switch (ioctl) { case KVM_GET_LAPIC: { @@ -3509,8 +3511,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (!lapic_in_kernel(vcpu)) goto out; u.lapic = memdup_user(argp, sizeof(*u.lapic)); - if (IS_ERR(u.lapic)) - return PTR_ERR(u.lapic); + if (IS_ERR(u.lapic)) { + r = PTR_ERR(u.lapic); + goto out_nofree; + } r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); break; @@ -3684,8 +3688,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } case KVM_SET_XSAVE: { u.xsave = memdup_user(argp, sizeof(*u.xsave)); - if (IS_ERR(u.xsave)) - return PTR_ERR(u.xsave); + if (IS_ERR(u.xsave)) { + r = PTR_ERR(u.xsave); + goto out_nofree; + } r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); break; @@ -3707,8 +3713,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } case KVM_SET_XCRS: { u.xcrs = memdup_user(argp, sizeof(*u.xcrs)); - if (IS_ERR(u.xcrs)) - return PTR_ERR(u.xcrs); + if (IS_ERR(u.xcrs)) { + r = PTR_ERR(u.xcrs); + goto out_nofree; + } r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); break; @@ -3752,6 +3760,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } out: kfree(u.buffer); +out_nofree: + vcpu_put(vcpu); return r; } diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index aa6167086211dd..cd7d90c9f644d1 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -1003,66 +1003,88 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; struct kvm_device_attr attr; + long r; + + vcpu_load(vcpu); switch (ioctl) { case KVM_ARM_VCPU_INIT: { struct kvm_vcpu_init init; + r = -EFAULT; if (copy_from_user(&init, argp, sizeof(init))) - return -EFAULT; + break; - return kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init); + r = kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init); + break; } case KVM_SET_ONE_REG: case KVM_GET_ONE_REG: { struct kvm_one_reg reg; + r = -ENOEXEC; if (unlikely(!kvm_vcpu_initialized(vcpu))) - return -ENOEXEC; + break; + r = -EFAULT; if (copy_from_user(®, argp, sizeof(reg))) - return -EFAULT; + break; + if (ioctl == KVM_SET_ONE_REG) - return kvm_arm_set_reg(vcpu, ®); + r = kvm_arm_set_reg(vcpu, ®); else - return kvm_arm_get_reg(vcpu, ®); + r = kvm_arm_get_reg(vcpu, ®); + break; } case KVM_GET_REG_LIST: { struct kvm_reg_list __user *user_list = argp; struct kvm_reg_list reg_list; unsigned n; + r = -ENOEXEC; if (unlikely(!kvm_vcpu_initialized(vcpu))) - return -ENOEXEC; + break; + r = -EFAULT; if (copy_from_user(®_list, user_list, sizeof(reg_list))) - return -EFAULT; + break; n = reg_list.n; reg_list.n = kvm_arm_num_regs(vcpu); if (copy_to_user(user_list, ®_list, sizeof(reg_list))) - return -EFAULT; + break; + r = -E2BIG; if (n < reg_list.n) - return -E2BIG; - return kvm_arm_copy_reg_indices(vcpu, user_list->reg); + break; + r = kvm_arm_copy_reg_indices(vcpu, user_list->reg); + break; } case KVM_SET_DEVICE_ATTR: { + r = -EFAULT; if (copy_from_user(&attr, argp, sizeof(attr))) - return -EFAULT; - return kvm_arm_vcpu_set_attr(vcpu, &attr); + break; + r = kvm_arm_vcpu_set_attr(vcpu, &attr); + break; } case KVM_GET_DEVICE_ATTR: { + r = -EFAULT; if (copy_from_user(&attr, argp, sizeof(attr))) - return -EFAULT; - return kvm_arm_vcpu_get_attr(vcpu, &attr); + break; + r = kvm_arm_vcpu_get_attr(vcpu, &attr); + break; } case KVM_HAS_DEVICE_ATTR: { + r = -EFAULT; if (copy_from_user(&attr, argp, sizeof(attr))) - return -EFAULT; - return kvm_arm_vcpu_has_attr(vcpu, &attr); + break; + r = kvm_arm_vcpu_has_attr(vcpu, &attr); + break; } default: - return -EINVAL; + r = -EINVAL; } + + vcpu_put(vcpu); + return r; } /** diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ca0ec9fb72ce9a..19c184fa1839ac 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2724,9 +2724,7 @@ static long kvm_vcpu_ioctl(struct file *filp, break; } default: - vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); - vcpu_put(vcpu); } out: mutex_unlock(&vcpu->mutex); From 5cb0944c0c66004c0d9006a7f0fba5782ae38f69 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 12 Dec 2017 17:41:34 +0100 Subject: [PATCH 076/197] KVM: introduce kvm_arch_vcpu_async_ioctl After the vcpu_load/vcpu_put pushdown, the handling of asynchronous VCPU ioctl is already much clearer in that it is obvious that they bypass vcpu_load and vcpu_put. However, it is still not perfect in that the different state of the VCPU mutex is still hidden in the caller. Separate those ioctls into a new function kvm_arch_vcpu_async_ioctl that returns -ENOIOCTLCMD for more "traditional" synchronous ioctls. Cc: James Hogan Cc: Paul Mackerras Cc: Christian Borntraeger Reviewed-by: Christoffer Dall Reviewed-by: Cornelia Huck Suggested-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/mips/kvm/Kconfig | 1 + arch/mips/kvm/mips.c | 15 ++++++++++++--- arch/powerpc/kvm/Kconfig | 1 + arch/powerpc/kvm/powerpc.c | 14 +++++++++++--- arch/s390/kvm/Kconfig | 1 + arch/s390/kvm/kvm-s390.c | 16 ++++++++++++---- include/linux/kvm_host.h | 12 ++++++++++++ virt/kvm/Kconfig | 3 +++ virt/kvm/kvm_main.c | 12 +++++------- 9 files changed, 58 insertions(+), 17 deletions(-) diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index b17447ce887314..76b93a9c8c9b25 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -22,6 +22,7 @@ config KVM select PREEMPT_NOTIFIERS select ANON_INODES select KVM_GENERIC_DIRTYLOG_READ_PROTECT + select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_MMIO select MMU_NOTIFIER select SRCU diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 9200b3def440a7..2549fdd27ee168 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -903,12 +903,11 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return r; } -long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, - unsigned long arg) +long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; - long r; if (ioctl == KVM_INTERRUPT) { struct kvm_mips_interrupt irq; @@ -921,6 +920,16 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, return kvm_vcpu_ioctl_interrupt(vcpu, &irq); } + return -ENOIOCTLCMD; +} + +long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + struct kvm_vcpu *vcpu = filp->private_data; + void __user *argp = (void __user *)arg; + long r; + vcpu_load(vcpu); switch (ioctl) { diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index b12b8eb39c2978..f884a0529dfeb0 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -22,6 +22,7 @@ config KVM select PREEMPT_NOTIFIERS select ANON_INODES select HAVE_KVM_EVENTFD + select HAVE_KVM_VCPU_ASYNC_IOCTL select SRCU select KVM_VFIO select IRQ_BYPASS_MANAGER diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index ba8134a989c1bb..66a310779de547 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -1607,12 +1607,11 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, return -EINVAL; } -long kvm_arch_vcpu_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_vcpu_async_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; - long r; if (ioctl == KVM_INTERRUPT) { struct kvm_interrupt irq; @@ -1620,6 +1619,15 @@ long kvm_arch_vcpu_ioctl(struct file *filp, return -EFAULT; return kvm_vcpu_ioctl_interrupt(vcpu, &irq); } + return -ENOIOCTLCMD; +} + +long kvm_arch_vcpu_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm_vcpu *vcpu = filp->private_data; + void __user *argp = (void __user *)arg; + long r; vcpu_load(vcpu); diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 9a4594e0a1ffe2..a3dbd459cce91b 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -23,6 +23,7 @@ config KVM select PREEMPT_NOTIFIERS select ANON_INODES select HAVE_KVM_CPU_RELAX_INTERCEPT + select HAVE_KVM_VCPU_ASYNC_IOCTL select HAVE_KVM_EVENTFD select KVM_ASYNC_PF select KVM_ASYNC_PF_SYNC diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 9700d71cb69116..40f0ae5a883f0f 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3725,13 +3725,11 @@ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu, return r; } -long kvm_arch_vcpu_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_vcpu_async_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; - int idx; - long r; switch (ioctl) { case KVM_S390_IRQ: { @@ -3752,6 +3750,16 @@ long kvm_arch_vcpu_ioctl(struct file *filp, return kvm_s390_inject_vcpu(vcpu, &s390irq); } } + return -ENOIOCTLCMD; +} + +long kvm_arch_vcpu_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm_vcpu *vcpu = filp->private_data; + void __user *argp = (void __user *)arg; + int idx; + long r; vcpu_load(vcpu); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 09de0ff3d67735..ac0062b74aed04 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1260,4 +1260,16 @@ static inline bool vcpu_valid_wakeup(struct kvm_vcpu *vcpu) } #endif /* CONFIG_HAVE_KVM_INVALID_WAKEUPS */ +#ifdef CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL +long kvm_arch_vcpu_async_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg); +#else +static inline long kvm_arch_vcpu_async_ioctl(struct file *filp, + unsigned int ioctl, + unsigned long arg) +{ + return -ENOIOCTLCMD; +} +#endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */ + #endif diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 70691c08e1edc4..cca7e065a075d8 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -51,3 +51,6 @@ config KVM_COMPAT config HAVE_KVM_IRQ_BYPASS bool + +config HAVE_KVM_VCPU_ASYNC_IOCTL + bool diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 19c184fa1839ac..b4414842b02392 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2544,15 +2544,13 @@ static long kvm_vcpu_ioctl(struct file *filp, if (unlikely(_IOC_TYPE(ioctl) != KVMIO)) return -EINVAL; -#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS) /* - * Special cases: vcpu ioctls that are asynchronous to vcpu execution, - * so vcpu_load() would break it. + * Some architectures have vcpu ioctls that are asynchronous to vcpu + * execution; mutex_lock() would break them. */ - if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_S390_IRQ || ioctl == KVM_INTERRUPT) - return kvm_arch_vcpu_ioctl(filp, ioctl, arg); -#endif - + r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg); + if (r != -ENOIOCTLCMD) + return r; if (mutex_lock_killable(&vcpu->mutex)) return -EINTR; From 0c0543a128bd1c6a4c8610d0d9d869053fa2fbf5 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Sat, 25 Nov 2017 18:40:31 +0100 Subject: [PATCH 077/197] arm64: KVM: Hide PMU from guests when disabled Since commit 93390c0a1b20 ("arm64: KVM: Hide unsupported AArch64 CPU features from guests") we can hide cpu features from guests. Apply this to a long standing issue where guests see a PMU available, but it's not, because it was not enabled by KVM's userspace. Signed-off-by: Andrew Jones Signed-off-by: Christoffer Dall --- arch/arm64/kvm/sys_regs.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 1830ebc227d18d..503144f13af022 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -881,18 +881,25 @@ static bool access_cntp_cval(struct kvm_vcpu *vcpu, } /* Read a sanitised cpufeature ID register by sys_reg_desc */ -static u64 read_id_reg(struct sys_reg_desc const *r, bool raz) +static u64 read_id_reg(struct kvm_vcpu *vcpu, + struct sys_reg_desc const *r, + bool raz) { u32 id = sys_reg((u32)r->Op0, (u32)r->Op1, (u32)r->CRn, (u32)r->CRm, (u32)r->Op2); u64 val = raz ? 0 : read_sanitised_ftr_reg(id); - if (id == SYS_ID_AA64PFR0_EL1) { + switch (id) { + case SYS_ID_AA64DFR0_EL1: + if (!kvm_arm_pmu_v3_ready(vcpu)) + val &= ~(0xfUL << ID_AA64DFR0_PMUVER_SHIFT); + break; + case SYS_ID_AA64PFR0_EL1: if (val & (0xfUL << ID_AA64PFR0_SVE_SHIFT)) pr_err_once("kvm [%i]: SVE unsupported for guests, suppressing\n", task_pid_nr(current)); - val &= ~(0xfUL << ID_AA64PFR0_SVE_SHIFT); + break; } return val; @@ -908,7 +915,7 @@ static bool __access_id_reg(struct kvm_vcpu *vcpu, if (p->is_write) return write_to_read_only(vcpu, p, r); - p->regval = read_id_reg(r, raz); + p->regval = read_id_reg(vcpu, r, raz); return true; } @@ -937,17 +944,17 @@ static u64 sys_reg_to_index(const struct sys_reg_desc *reg); * are stored, and for set_id_reg() we don't allow the effective value * to be changed. */ -static int __get_id_reg(const struct sys_reg_desc *rd, void __user *uaddr, - bool raz) +static int __get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + void __user *uaddr, bool raz) { const u64 id = sys_reg_to_index(rd); - const u64 val = read_id_reg(rd, raz); + const u64 val = read_id_reg(vcpu, rd, raz); return reg_to_user(uaddr, &val, id); } -static int __set_id_reg(const struct sys_reg_desc *rd, void __user *uaddr, - bool raz) +static int __set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + void __user *uaddr, bool raz) { const u64 id = sys_reg_to_index(rd); int err; @@ -958,7 +965,7 @@ static int __set_id_reg(const struct sys_reg_desc *rd, void __user *uaddr, return err; /* This is what we mean by invariant: you can't change it. */ - if (val != read_id_reg(rd, raz)) + if (val != read_id_reg(vcpu, rd, raz)) return -EINVAL; return 0; @@ -967,25 +974,25 @@ static int __set_id_reg(const struct sys_reg_desc *rd, void __user *uaddr, static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - return __get_id_reg(rd, uaddr, false); + return __get_id_reg(vcpu, rd, uaddr, false); } static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - return __set_id_reg(rd, uaddr, false); + return __set_id_reg(vcpu, rd, uaddr, false); } static int get_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - return __get_id_reg(rd, uaddr, true); + return __get_id_reg(vcpu, rd, uaddr, true); } static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - return __set_id_reg(rd, uaddr, true); + return __set_id_reg(vcpu, rd, uaddr, true); } /* sys_reg_desc initialiser for known cpufeature ID registers */ From 4404b336cf32bf709942067fe71d8b5cf70fb2b2 Mon Sep 17 00:00:00 2001 From: Vasyl Gomonovych Date: Tue, 28 Nov 2017 23:48:17 +0100 Subject: [PATCH 078/197] KVM: arm: Use PTR_ERR_OR_ZERO() Fix ptr_ret.cocci warnings: virt/kvm/arm/vgic/vgic-its.c:971:1-3: WARNING: PTR_ERR_OR_ZERO can be used Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR Generated by: scripts/coccinelle/api/ptr_ret.cocci Signed-off-by: Vasyl Gomonovych Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic/vgic-its.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 8e633bd9cc1e74..465095355666ec 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -1034,10 +1034,8 @@ static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its, device = vgic_its_alloc_device(its, device_id, itt_addr, num_eventid_bits); - if (IS_ERR(device)) - return PTR_ERR(device); - return 0; + return PTR_ERR_OR_ZERO(device); } /* From 5a24575032971c5a9a4580417a791c427ebdb8e5 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Fri, 8 Sep 2017 07:07:13 -0700 Subject: [PATCH 079/197] KVM: arm/arm64: Remove redundant preemptible checks The __this_cpu_read() and __this_cpu_write() functions already implement checks for the required preemption levels when using CONFIG_DEBUG_PREEMPT which gives you nice error messages and such. Therefore there is no need to explicitly check this using a BUG_ON() in the code (which we don't do for other uses of per cpu variables either). Acked-by: Marc Zyngier Reviewed-by: Andre Przywara Signed-off-by: Christoffer Dall --- virt/kvm/arm/arm.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 6b60c98a6e2294..3610e132df8bd9 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -71,7 +71,6 @@ static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled); static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu) { - BUG_ON(preemptible()); __this_cpu_write(kvm_arm_running_vcpu, vcpu); } @@ -81,7 +80,6 @@ static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu) */ struct kvm_vcpu *kvm_arm_get_running_vcpu(void) { - BUG_ON(preemptible()); return __this_cpu_read(kvm_arm_running_vcpu); } From 6c1b7521f4a07cc63bbe2dfe290efed47cdb780a Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Thu, 14 Sep 2017 11:08:45 -0700 Subject: [PATCH 080/197] KVM: arm/arm64: Factor out functionality to get vgic mmio requester_vcpu We are about to distinguish between userspace accesses and mmio traps for a number of the mmio handlers. When the requester vcpu is NULL, it means we are handling a userspace access. Factor out the functionality to get the request vcpu into its own function, mostly so we have a common place to document the semantics of the return value. Also take the chance to move the functionality outside of holding a spinlock and instead explicitly disable and enable preemption. This supports PREEMPT_RT kernels as well. Acked-by: Marc Zyngier Reviewed-by: Andre Przywara Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic/vgic-mmio.c | 44 ++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index deb51ee16a3da4..fdad95f62fa3bb 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -122,6 +122,27 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, return value; } +/* + * This function will return the VCPU that performed the MMIO access and + * trapped from within the VM, and will return NULL if this is a userspace + * access. + * + * We can disable preemption locally around accessing the per-CPU variable, + * and use the resolved vcpu pointer after enabling preemption again, because + * even if the current thread is migrated to another CPU, reading the per-CPU + * value later will give us the same value as we update the per-CPU variable + * in the preempt notifier handlers. + */ +static struct kvm_vcpu *vgic_get_mmio_requester_vcpu(void) +{ + struct kvm_vcpu *vcpu; + + preempt_disable(); + vcpu = kvm_arm_get_running_vcpu(); + preempt_enable(); + return vcpu; +} + void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) @@ -184,24 +205,10 @@ unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu, static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, bool new_active_state) { - struct kvm_vcpu *requester_vcpu; unsigned long flags; - spin_lock_irqsave(&irq->irq_lock, flags); + struct kvm_vcpu *requester_vcpu = vgic_get_mmio_requester_vcpu(); - /* - * The vcpu parameter here can mean multiple things depending on how - * this function is called; when handling a trap from the kernel it - * depends on the GIC version, and these functions are also called as - * part of save/restore from userspace. - * - * Therefore, we have to figure out the requester in a reliable way. - * - * When accessing VGIC state from user space, the requester_vcpu is - * NULL, which is fine, because we guarantee that no VCPUs are running - * when accessing VGIC state from user space so irq->vcpu->cpu is - * always -1. - */ - requester_vcpu = kvm_arm_get_running_vcpu(); + spin_lock_irqsave(&irq->irq_lock, flags); /* * If this virtual IRQ was written into a list register, we @@ -213,6 +220,11 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, * vgic_change_active_prepare) and still has to sync back this IRQ, * so we release and re-acquire the spin_lock to let the other thread * sync back the IRQ. + * + * When accessing VGIC state from user space, requester_vcpu is + * NULL, which is fine, because we guarantee that no VCPUs are running + * when accessing VGIC state from user space so irq->vcpu->cpu is + * always -1. */ while (irq->vcpu && /* IRQ may have state in an LR somewhere */ irq->vcpu != requester_vcpu && /* Current thread is not the VCPU thread */ From 70450a9fbe0658e864f882d4351e5bae018b2647 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 4 Sep 2017 11:56:37 +0200 Subject: [PATCH 081/197] KVM: arm/arm64: Don't cache the timer IRQ level The timer logic was designed after a strict idea of modeling an interrupt line level in software, meaning that only transitions in the level need to be reported to the VGIC. This works well for the timer, because the arch timer code is in complete control of the device and can track the transitions of the line. However, as we are about to support using the HW bit in the VGIC not just for the timer, but also for VFIO which cannot track transitions of the interrupt line, we have to decide on an interface between the GIC and other subsystems for level triggered mapped interrupts, which both the timer and VFIO can use. VFIO only sees an asserting transition of the physical interrupt line, and tells the VGIC when that happens. That means that part of the interrupt flow is offloaded to the hardware. To use the same interface for VFIO devices and the timer, we therefore have to change the timer (we cannot change VFIO because it doesn't know the details of the device it is assigning to a VM). Luckily, changing the timer is simple, we just need to stop 'caching' the line level, but instead let the VGIC know the state of the timer every time there is a potential change in the line level, and when the line level should be asserted from the timer ISR. The VGIC can ignore extra notifications using its validate mechanism. Reviewed-by: Marc Zyngier Reviewed-by: Andre Przywara Reviewed-by: Julien Thierry Signed-off-by: Christoffer Dall --- virt/kvm/arm/arch_timer.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index f9555b1e7f158f..9376fe03bf2e96 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -99,11 +99,9 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) } vtimer = vcpu_vtimer(vcpu); - if (!vtimer->irq.level) { - vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl); - if (kvm_timer_irq_can_fire(vtimer)) - kvm_timer_update_irq(vcpu, true, vtimer); - } + vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl); + if (kvm_timer_irq_can_fire(vtimer)) + kvm_timer_update_irq(vcpu, true, vtimer); if (unlikely(!irqchip_in_kernel(vcpu->kvm))) kvm_vtimer_update_mask_user(vcpu); @@ -324,12 +322,20 @@ static void kvm_timer_update_state(struct kvm_vcpu *vcpu) struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + bool level; if (unlikely(!timer->enabled)) return; - if (kvm_timer_should_fire(vtimer) != vtimer->irq.level) - kvm_timer_update_irq(vcpu, !vtimer->irq.level, vtimer); + /* + * The vtimer virtual interrupt is a 'mapped' interrupt, meaning part + * of its lifecycle is offloaded to the hardware, and we therefore may + * not have lowered the irq.level value before having to signal a new + * interrupt, but have to signal an interrupt every time the level is + * asserted. + */ + level = kvm_timer_should_fire(vtimer); + kvm_timer_update_irq(vcpu, level, vtimer); if (kvm_timer_should_fire(ptimer) != ptimer->irq.level) kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer); From e40cc57bac792713ff6a1b80a1f67b54c05e5e21 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 29 Aug 2017 10:40:44 +0200 Subject: [PATCH 082/197] KVM: arm/arm64: vgic: Support level-triggered mapped interrupts Level-triggered mapped IRQs are special because we only observe rising edges as input to the VGIC, and we don't set the EOI flag and therefore are not told when the level goes down, so that we can re-queue a new interrupt when the level goes up. One way to solve this problem is to side-step the logic of the VGIC and special case the validation in the injection path, but it has the unfortunate drawback of having to peak into the physical GIC state whenever we want to know if the interrupt is pending on the virtual distributor. Instead, we can maintain the current semantics of a level triggered interrupt by sort of treating it as an edge-triggered interrupt, following from the fact that we only observe an asserting edge. This requires us to be a bit careful when populating the LRs and when folding the state back in though: * We lower the line level when populating the LR, so that when subsequently observing an asserting edge, the VGIC will do the right thing. * If the guest never acked the interrupt while running (for example if it had masked interrupts at the CPU level while running), we have to preserve the pending state of the LR and move it back to the line_level field of the struct irq when folding LR state. If the guest never acked the interrupt while running, but changed the device state and lowered the line (again with interrupts masked) then we need to observe this change in the line_level. Both of the above situations are solved by sampling the physical line and set the line level when folding the LR back. * Finally, if the guest never acked the interrupt while running and sampling the line reveals that the device state has changed and the line has been lowered, we must clear the physical active state, since we will otherwise never be told when the interrupt becomes asserted again. This has the added benefit of making the timer optimization patches (https://lists.cs.columbia.edu/pipermail/kvmarm/2017-July/026343.html) a bit simpler, because the timer code doesn't have to clear the active state on the sync anymore. It also potentially improves the performance of the timer implementation because the GIC knows the state or the LR and only needs to clear the active state when the pending bit in the LR is still set, where the timer has to always clear it when returning from running the guest with an injected timer interrupt. Reviewed-by: Marc Zyngier Reviewed-by: Eric Auger Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic/vgic-v2.c | 29 +++++++++++++++++++++++++++++ virt/kvm/arm/vgic/vgic-v3.c | 29 +++++++++++++++++++++++++++++ virt/kvm/arm/vgic/vgic.c | 23 +++++++++++++++++++++++ virt/kvm/arm/vgic/vgic.h | 7 +++++++ 4 files changed, 88 insertions(+) diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c index 80897102da26ce..c32d7b93ffd194 100644 --- a/virt/kvm/arm/vgic/vgic-v2.c +++ b/virt/kvm/arm/vgic/vgic-v2.c @@ -105,6 +105,26 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) irq->pending_latch = false; } + /* + * Level-triggered mapped IRQs are special because we only + * observe rising edges as input to the VGIC. + * + * If the guest never acked the interrupt we have to sample + * the physical line and set the line level, because the + * device state could have changed or we simply need to + * process the still pending interrupt later. + * + * If this causes us to lower the level, we have to also clear + * the physical active state, since we will otherwise never be + * told when the interrupt becomes asserted again. + */ + if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT)) { + irq->line_level = vgic_get_phys_line_level(irq); + + if (!irq->line_level) + vgic_irq_set_phys_active(irq, false); + } + spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } @@ -162,6 +182,15 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) val |= GICH_LR_EOI; } + /* + * Level-triggered mapped IRQs are special because we only observe + * rising edges as input to the VGIC. We therefore lower the line + * level here, so that we can take new virtual IRQs. See + * vgic_v2_fold_lr_state for more info. + */ + if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT)) + irq->line_level = false; + /* The GICv2 LR only holds five bits of priority. */ val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT; diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index f47e8481fa452d..6b329414e57a3c 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -96,6 +96,26 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) irq->pending_latch = false; } + /* + * Level-triggered mapped IRQs are special because we only + * observe rising edges as input to the VGIC. + * + * If the guest never acked the interrupt we have to sample + * the physical line and set the line level, because the + * device state could have changed or we simply need to + * process the still pending interrupt later. + * + * If this causes us to lower the level, we have to also clear + * the physical active state, since we will otherwise never be + * told when the interrupt becomes asserted again. + */ + if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) { + irq->line_level = vgic_get_phys_line_level(irq); + + if (!irq->line_level) + vgic_irq_set_phys_active(irq, false); + } + spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } @@ -145,6 +165,15 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) val |= ICH_LR_EOI; } + /* + * Level-triggered mapped IRQs are special because we only observe + * rising edges as input to the VGIC. We therefore lower the line + * level here, so that we can take new virtual IRQs. See + * vgic_v3_fold_lr_state for more info. + */ + if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) + irq->line_level = false; + /* * We currently only support Group1 interrupts, which is a * known defect. This needs to be addressed at some point. diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index ecb8e25f5fe56d..4318e9edd075d4 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -144,6 +144,29 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) kfree(irq); } +/* Get the input level of a mapped IRQ directly from the physical GIC */ +bool vgic_get_phys_line_level(struct vgic_irq *irq) +{ + bool line_level; + + BUG_ON(!irq->hw); + + WARN_ON(irq_get_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + &line_level)); + return line_level; +} + +/* Set/Clear the physical active state */ +void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active) +{ + + BUG_ON(!irq->hw); + WARN_ON(irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_ACTIVE, + active)); +} + /** * kvm_vgic_target_oracle - compute the target vcpu for an irq * diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index efbcf8f96f9c1a..d0787983a35734 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -104,6 +104,11 @@ static inline bool irq_is_pending(struct vgic_irq *irq) return irq->pending_latch || irq->line_level; } +static inline bool vgic_irq_is_mapped_level(struct vgic_irq *irq) +{ + return irq->config == VGIC_CONFIG_LEVEL && irq->hw; +} + /* * This struct provides an intermediate representation of the fields contained * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC @@ -140,6 +145,8 @@ vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 intid); void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq); +bool vgic_get_phys_line_level(struct vgic_irq *irq); +void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active); bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, unsigned long flags); void vgic_kick_vcpus(struct kvm *kvm); From b6909a659f8d5de2df36cabb1c0505d262996a24 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Fri, 27 Oct 2017 19:30:09 +0200 Subject: [PATCH 083/197] KVM: arm/arm64: Support a vgic interrupt line level sample function The GIC sometimes need to sample the physical line of a mapped interrupt. As we know this to be notoriously slow, provide a callback function for devices (such as the timer) which can do this much faster than talking to the distributor, for example by comparing a few in-memory values. Fall back to the good old method of poking the physical GIC if no callback is provided. Reviewed-by: Marc Zyngier Reviewed-by: Eric Auger Signed-off-by: Christoffer Dall --- include/kvm/arm_vgic.h | 13 ++++++++++++- virt/kvm/arm/arch_timer.c | 3 ++- virt/kvm/arm/vgic/vgic.c | 13 +++++++++---- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 8c896540a72cf4..cdbd142ca7f2ea 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -130,6 +130,17 @@ struct vgic_irq { u8 priority; enum vgic_irq_config config; /* Level or edge */ + /* + * Callback function pointer to in-kernel devices that can tell us the + * state of the input level of mapped level-triggered IRQ faster than + * peaking into the physical GIC. + * + * Always called in non-preemptible section and the functions can use + * kvm_arm_get_running_vcpu() to get the vcpu pointer for private + * IRQs. + */ + bool (*get_input_level)(int vintid); + void *owner; /* Opaque pointer to reserve an interrupt for in-kernel devices. */ }; @@ -331,7 +342,7 @@ void kvm_vgic_init_cpu_hardware(void); int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid, bool level, void *owner); int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq, - u32 vintid); + u32 vintid, bool (*get_input_level)(int vindid)); int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid); bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid); diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 9376fe03bf2e96..fb0533ed903d40 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -834,7 +834,8 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu) return -EINVAL; } - ret = kvm_vgic_map_phys_irq(vcpu, host_vtimer_irq, vtimer->irq.irq); + ret = kvm_vgic_map_phys_irq(vcpu, host_vtimer_irq, vtimer->irq.irq, + NULL); if (ret) return ret; diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 4318e9edd075d4..d57b3bf8eb362f 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -144,13 +144,15 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) kfree(irq); } -/* Get the input level of a mapped IRQ directly from the physical GIC */ bool vgic_get_phys_line_level(struct vgic_irq *irq) { bool line_level; BUG_ON(!irq->hw); + if (irq->get_input_level) + return irq->get_input_level(irq->intid); + WARN_ON(irq_get_irqchip_state(irq->host_irq, IRQCHIP_STATE_PENDING, &line_level)); @@ -436,7 +438,8 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid, /* @irq->irq_lock must be held */ static int kvm_vgic_map_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq, - unsigned int host_irq) + unsigned int host_irq, + bool (*get_input_level)(int vindid)) { struct irq_desc *desc; struct irq_data *data; @@ -456,6 +459,7 @@ static int kvm_vgic_map_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq, irq->hw = true; irq->host_irq = host_irq; irq->hwintid = data->hwirq; + irq->get_input_level = get_input_level; return 0; } @@ -464,10 +468,11 @@ static inline void kvm_vgic_unmap_irq(struct vgic_irq *irq) { irq->hw = false; irq->hwintid = 0; + irq->get_input_level = NULL; } int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq, - u32 vintid) + u32 vintid, bool (*get_input_level)(int vindid)) { struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid); unsigned long flags; @@ -476,7 +481,7 @@ int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq, BUG_ON(!irq); spin_lock_irqsave(&irq->irq_lock, flags); - ret = kvm_vgic_map_irq(vcpu, irq, host_irq); + ret = kvm_vgic_map_irq(vcpu, irq, host_irq, get_input_level); spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); From df635c5b184da05145db1d63e1dcb1c853f425c2 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Fri, 1 Sep 2017 16:25:12 +0200 Subject: [PATCH 084/197] KVM: arm/arm64: Support VGIC dist pend/active changes for mapped IRQs For mapped IRQs (with the HW bit set in the LR) we have to follow some rules of the architecture. One of these rules is that VM must not be allowed to deactivate a virtual interrupt with the HW bit set unless the physical interrupt is also active. This works fine when injecting mapped interrupts, because we leave it up to the injector to either set EOImode==1 or manually set the active state of the physical interrupt. However, the guest can set virtual interrupt to be pending or active by writing to the virtual distributor, which could lead to deactivating a virtual interrupt with the HW bit set without the physical interrupt being active. We could set the physical interrupt to active whenever we are about to enter the VM with a HW interrupt either pending or active, but that would be really slow, especially on GICv2. So we take the long way around and do the hard work when needed, which is expected to be extremely rare. When the VM sets the pending state for a HW interrupt on the virtual distributor we set the active state on the physical distributor, because the virtual interrupt can become active and then the guest can deactivate it. When the VM clears the pending state we also clear it on the physical side, because the injector might otherwise raise the interrupt. We also clear the physical active state when the virtual interrupt is not active, since otherwise a SPEND/CPEND sequence from the guest would prevent signaling of future interrupts. Changing the state of mapped interrupts from userspace is not supported, and it's expected that userspace unmaps devices from VFIO before attempting to set the interrupt state, because the interrupt state is driven by hardware. Reviewed-by: Marc Zyngier Reviewed-by: Eric Auger Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic/vgic-mmio.c | 71 ++++++++++++++++++++++++++++++++--- virt/kvm/arm/vgic/vgic.c | 7 ++++ virt/kvm/arm/vgic/vgic.h | 1 + 3 files changed, 73 insertions(+), 6 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index fdad95f62fa3bb..83d82bd7dc4e71 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "vgic.h" @@ -143,10 +144,22 @@ static struct kvm_vcpu *vgic_get_mmio_requester_vcpu(void) return vcpu; } +/* Must be called with irq->irq_lock held */ +static void vgic_hw_irq_spending(struct kvm_vcpu *vcpu, struct vgic_irq *irq, + bool is_uaccess) +{ + if (is_uaccess) + return; + + irq->pending_latch = true; + vgic_irq_set_phys_active(irq, true); +} + void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { + bool is_uaccess = !vgic_get_mmio_requester_vcpu(); u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; unsigned long flags; @@ -155,17 +168,45 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); spin_lock_irqsave(&irq->irq_lock, flags); - irq->pending_latch = true; - + if (irq->hw) + vgic_hw_irq_spending(vcpu, irq, is_uaccess); + else + irq->pending_latch = true; vgic_queue_irq_unlock(vcpu->kvm, irq, flags); vgic_put_irq(vcpu->kvm, irq); } } +/* Must be called with irq->irq_lock held */ +static void vgic_hw_irq_cpending(struct kvm_vcpu *vcpu, struct vgic_irq *irq, + bool is_uaccess) +{ + if (is_uaccess) + return; + + irq->pending_latch = false; + + /* + * We don't want the guest to effectively mask the physical + * interrupt by doing a write to SPENDR followed by a write to + * CPENDR for HW interrupts, so we clear the active state on + * the physical side if the virtual interrupt is not active. + * This may lead to taking an additional interrupt on the + * host, but that should not be a problem as the worst that + * can happen is an additional vgic injection. We also clear + * the pending state to maintain proper semantics for edge HW + * interrupts. + */ + vgic_irq_set_phys_pending(irq, false); + if (!irq->active) + vgic_irq_set_phys_active(irq, false); +} + void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { + bool is_uaccess = !vgic_get_mmio_requester_vcpu(); u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; unsigned long flags; @@ -175,7 +216,10 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, spin_lock_irqsave(&irq->irq_lock, flags); - irq->pending_latch = false; + if (irq->hw) + vgic_hw_irq_cpending(vcpu, irq, is_uaccess); + else + irq->pending_latch = false; spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); @@ -202,8 +246,19 @@ unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu, return value; } +/* Must be called with irq->irq_lock held */ +static void vgic_hw_irq_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, + bool active, bool is_uaccess) +{ + if (is_uaccess) + return; + + irq->active = active; + vgic_irq_set_phys_active(irq, active); +} + static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, - bool new_active_state) + bool active) { unsigned long flags; struct kvm_vcpu *requester_vcpu = vgic_get_mmio_requester_vcpu(); @@ -231,8 +286,12 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, irq->vcpu->cpu != -1) /* VCPU thread is running */ cond_resched_lock(&irq->irq_lock); - irq->active = new_active_state; - if (new_active_state) + if (irq->hw) + vgic_hw_irq_change_active(vcpu, irq, active, !requester_vcpu); + else + irq->active = active; + + if (irq->active) vgic_queue_irq_unlock(vcpu->kvm, irq, flags); else spin_unlock_irqrestore(&irq->irq_lock, flags); diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index d57b3bf8eb362f..c7c5ef190afa0c 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -144,6 +144,13 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) kfree(irq); } +void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending) +{ + WARN_ON(irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + pending)); +} + bool vgic_get_phys_line_level(struct vgic_irq *irq) { bool line_level; diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index d0787983a35734..12c37b89f7a382 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -146,6 +146,7 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 intid); void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq); bool vgic_get_phys_line_level(struct vgic_irq *irq); +void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending); void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active); bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, unsigned long flags); From 4c60e360d6dfa4d9c3586b687f348eeb3fd675dd Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Fri, 27 Oct 2017 19:34:30 +0200 Subject: [PATCH 085/197] KVM: arm/arm64: Provide a get_input_level for the arch timer The VGIC can now support the life-cycle of mapped level-triggered interrupts, and we no longer have to read back the timer state on every exit from the VM if we had an asserted timer interrupt signal, because the VGIC already knows if we hit the unlikely case where the guest disables the timer without ACKing the virtual timer interrupt. This means we rework a bit of the code to factor out the functionality to snapshot the timer state from vtimer_save_state(), and we can reuse this functionality in the sync path when we have an irqchip in userspace, and also to support our implementation of the get_input_level() function for the timer. This change also means that we can no longer rely on the timer's view of the interrupt line to set the active state, because we no longer maintain this state for mapped interrupts when exiting from the guest. Instead, we only set the active state if the virtual interrupt is active, and otherwise we simply let the timer fire again and raise the virtual interrupt from the ISR. Reviewed-by: Eric Auger Reviewed-by: Marc Zyngier Signed-off-by: Christoffer Dall --- include/kvm/arm_arch_timer.h | 2 + virt/kvm/arm/arch_timer.c | 84 ++++++++++++++++-------------------- 2 files changed, 40 insertions(+), 46 deletions(-) diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index 6e45608b239981..b1dcfde0a3ef16 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -90,6 +90,8 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu); void kvm_timer_init_vhe(void); +bool kvm_arch_timer_get_input_level(int vintid); + #define vcpu_vtimer(v) (&(v)->arch.timer_cpu.vtimer) #define vcpu_ptimer(v) (&(v)->arch.timer_cpu.ptimer) diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index fb0533ed903d40..d845d67b7062f3 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -343,6 +343,12 @@ static void kvm_timer_update_state(struct kvm_vcpu *vcpu) phys_timer_emulate(vcpu); } +static void __timer_snapshot_state(struct arch_timer_context *timer) +{ + timer->cnt_ctl = read_sysreg_el0(cntv_ctl); + timer->cnt_cval = read_sysreg_el0(cntv_cval); +} + static void vtimer_save_state(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; @@ -354,10 +360,8 @@ static void vtimer_save_state(struct kvm_vcpu *vcpu) if (!vtimer->loaded) goto out; - if (timer->enabled) { - vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl); - vtimer->cnt_cval = read_sysreg_el0(cntv_cval); - } + if (timer->enabled) + __timer_snapshot_state(vtimer); /* Disable the virtual timer */ write_sysreg_el0(0, cntv_ctl); @@ -454,8 +458,7 @@ static void kvm_timer_vcpu_load_vgic(struct kvm_vcpu *vcpu) bool phys_active; int ret; - phys_active = vtimer->irq.level || - kvm_vgic_map_is_active(vcpu, vtimer->irq.irq); + phys_active = kvm_vgic_map_is_active(vcpu, vtimer->irq.irq); ret = irq_set_irqchip_state(host_vtimer_irq, IRQCHIP_STATE_ACTIVE, @@ -535,54 +538,27 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) set_cntvoff(0); } -static void unmask_vtimer_irq(struct kvm_vcpu *vcpu) +/* + * With a userspace irqchip we have to check if the guest de-asserted the + * timer and if so, unmask the timer irq signal on the host interrupt + * controller to ensure that we see future timer signals. + */ +static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu) { struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); if (unlikely(!irqchip_in_kernel(vcpu->kvm))) { - kvm_vtimer_update_mask_user(vcpu); - return; - } - - /* - * If the guest disabled the timer without acking the interrupt, then - * we must make sure the physical and virtual active states are in - * sync by deactivating the physical interrupt, because otherwise we - * wouldn't see the next timer interrupt in the host. - */ - if (!kvm_vgic_map_is_active(vcpu, vtimer->irq.irq)) { - int ret; - ret = irq_set_irqchip_state(host_vtimer_irq, - IRQCHIP_STATE_ACTIVE, - false); - WARN_ON(ret); + __timer_snapshot_state(vtimer); + if (!kvm_timer_should_fire(vtimer)) { + kvm_timer_update_irq(vcpu, false, vtimer); + kvm_vtimer_update_mask_user(vcpu); + } } } -/** - * kvm_timer_sync_hwstate - sync timer state from cpu - * @vcpu: The vcpu pointer - * - * Check if any of the timers have expired while we were running in the guest, - * and inject an interrupt if that was the case. - */ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) { - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - - /* - * If we entered the guest with the vtimer output asserted we have to - * check if the guest has modified the timer so that we should lower - * the line at this point. - */ - if (vtimer->irq.level) { - vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl); - vtimer->cnt_cval = read_sysreg_el0(cntv_cval); - if (!kvm_timer_should_fire(vtimer)) { - kvm_timer_update_irq(vcpu, false, vtimer); - unmask_vtimer_irq(vcpu); - } - } + unmask_vtimer_irq_user(vcpu); } int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) @@ -813,6 +789,22 @@ static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu) return true; } +bool kvm_arch_timer_get_input_level(int vintid) +{ + struct kvm_vcpu *vcpu = kvm_arm_get_running_vcpu(); + struct arch_timer_context *timer; + + if (vintid == vcpu_vtimer(vcpu)->irq.irq) + timer = vcpu_vtimer(vcpu); + else + BUG(); /* We only map the vtimer so far */ + + if (timer->loaded) + __timer_snapshot_state(timer); + + return kvm_timer_should_fire(timer); +} + int kvm_timer_enable(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; @@ -835,7 +827,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu) } ret = kvm_vgic_map_phys_irq(vcpu, host_vtimer_irq, vtimer->irq.irq, - NULL); + kvm_arch_timer_get_input_level); if (ret) return ret; From 61bbe38027334780964e4b2658f722ed0b3cb2f9 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Fri, 27 Oct 2017 19:57:51 +0200 Subject: [PATCH 086/197] KVM: arm/arm64: Avoid work when userspace iqchips are not used We currently check if the VM has a userspace irqchip in several places along the critical path, and if so, we do some work which is only required for having an irqchip in userspace. This is unfortunate, as we could avoid doing any work entirely, if we didn't have to support irqchip in userspace. Realizing the userspace irqchip on ARM is mostly a developer or hobby feature, and is unlikely to be used in servers or other scenarios where performance is a priority, we can use a refcounted static key to only check the irqchip configuration when we have at least one VM that uses an irqchip in userspace. Reviewed-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_host.h | 2 ++ arch/arm64/include/asm/kvm_host.h | 2 ++ virt/kvm/arm/arch_timer.c | 6 ++-- virt/kvm/arm/arm.c | 59 ++++++++++++++++++++++--------- 4 files changed, 50 insertions(+), 19 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index a9f7d3f47134a9..6394fb99da7f0b 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -48,6 +48,8 @@ KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_IRQ_PENDING KVM_ARCH_REQ(1) +DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); + u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode); int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index ea6cb5b24258be..e7218cf7df2a2c 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -47,6 +47,8 @@ KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_IRQ_PENDING KVM_ARCH_REQ(1) +DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); + int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext); diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index d845d67b7062f3..cfcd0323deabed 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -103,7 +103,8 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) if (kvm_timer_irq_can_fire(vtimer)) kvm_timer_update_irq(vcpu, true, vtimer); - if (unlikely(!irqchip_in_kernel(vcpu->kvm))) + if (static_branch_unlikely(&userspace_irqchip_in_use) && + unlikely(!irqchip_in_kernel(vcpu->kvm))) kvm_vtimer_update_mask_user(vcpu); return IRQ_HANDLED; @@ -284,7 +285,8 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq, timer_ctx->irq.level); - if (likely(irqchip_in_kernel(vcpu->kvm))) { + if (!static_branch_unlikely(&userspace_irqchip_in_use) && + likely(irqchip_in_kernel(vcpu->kvm))) { ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, timer_ctx->irq.irq, timer_ctx->irq.level, diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 3610e132df8bd9..59d8e04c19fa28 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -74,6 +74,8 @@ static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu) __this_cpu_write(kvm_arm_running_vcpu, vcpu); } +DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use); + /** * kvm_arm_get_running_vcpu - get the vcpu running on the current CPU. * Must be called from non-preemptible context @@ -302,6 +304,8 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { + if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm))) + static_branch_dec(&userspace_irqchip_in_use); kvm_arch_vcpu_free(vcpu); } @@ -522,14 +526,22 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) vcpu->arch.has_run_once = true; - /* - * Map the VGIC hardware resources before running a vcpu the first - * time on this VM. - */ - if (unlikely(irqchip_in_kernel(kvm) && !vgic_ready(kvm))) { - ret = kvm_vgic_map_resources(kvm); - if (ret) - return ret; + if (likely(irqchip_in_kernel(kvm))) { + /* + * Map the VGIC hardware resources before running a vcpu the + * first time on this VM. + */ + if (unlikely(!vgic_ready(kvm))) { + ret = kvm_vgic_map_resources(kvm); + if (ret) + return ret; + } + } else { + /* + * Tell the rest of the code that there are userspace irqchip + * VMs in the wild. + */ + static_branch_inc(&userspace_irqchip_in_use); } ret = kvm_timer_enable(vcpu); @@ -664,18 +676,29 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) kvm_vgic_flush_hwstate(vcpu); /* - * If we have a singal pending, or need to notify a userspace - * irqchip about timer or PMU level changes, then we exit (and - * update the timer level state in kvm_timer_update_run - * below). + * Exit if we have a signal pending so that we can deliver the + * signal to user space. */ - if (signal_pending(current) || - kvm_timer_should_notify_user(vcpu) || - kvm_pmu_should_notify_user(vcpu)) { + if (signal_pending(current)) { ret = -EINTR; run->exit_reason = KVM_EXIT_INTR; } + /* + * If we're using a userspace irqchip, then check if we need + * to tell a userspace irqchip about timer or PMU level + * changes and if so, exit to userspace (the actual level + * state gets updated in kvm_timer_update_run and + * kvm_pmu_update_run below). + */ + if (static_branch_unlikely(&userspace_irqchip_in_use)) { + if (kvm_timer_should_notify_user(vcpu) || + kvm_pmu_should_notify_user(vcpu)) { + ret = -EINTR; + run->exit_reason = KVM_EXIT_INTR; + } + } + /* * Ensure we set mode to IN_GUEST_MODE after we disable * interrupts and before the final VCPU requests check. @@ -688,7 +711,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) kvm_request_pending(vcpu)) { vcpu->mode = OUTSIDE_GUEST_MODE; kvm_pmu_sync_hwstate(vcpu); - kvm_timer_sync_hwstate(vcpu); + if (static_branch_unlikely(&userspace_irqchip_in_use)) + kvm_timer_sync_hwstate(vcpu); kvm_vgic_sync_hwstate(vcpu); local_irq_enable(); preempt_enable(); @@ -732,7 +756,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) * we don't want vtimer interrupts to race with syncing the * timer virtual interrupt state. */ - kvm_timer_sync_hwstate(vcpu); + if (static_branch_unlikely(&userspace_irqchip_in_use)) + kvm_timer_sync_hwstate(vcpu); /* * We may have taken a host interrupt in HYP mode (ie From e5a2cfb492d5f25f96fa06170dba7494355c5969 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 12 Dec 2017 21:37:21 +0100 Subject: [PATCH 087/197] KVM: arm/arm64: Delete outdated forwarded irq documentation The reason I added this documentation originally was that the concept of "never taking the interrupt", but just use the timer to generate an exit from the guest, was confusing to most, and we had to explain it several times over. But as we can clearly see, we've failed to update the documentation as the code has evolved, and people who need to understand these details are probably better off reading the code. Let's lighten our maintenance burden slightly and just get rid of this. Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- .../virtual/kvm/arm/vgic-mapped-irqs.txt | 187 ------------------ 1 file changed, 187 deletions(-) delete mode 100644 Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt diff --git a/Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt b/Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt deleted file mode 100644 index 38bca2835278c1..00000000000000 --- a/Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt +++ /dev/null @@ -1,187 +0,0 @@ -KVM/ARM VGIC Forwarded Physical Interrupts -========================================== - -The KVM/ARM code implements software support for the ARM Generic -Interrupt Controller's (GIC's) hardware support for virtualization by -allowing software to inject virtual interrupts to a VM, which the guest -OS sees as regular interrupts. The code is famously known as the VGIC. - -Some of these virtual interrupts, however, correspond to physical -interrupts from real physical devices. One example could be the -architected timer, which itself supports virtualization, and therefore -lets a guest OS program the hardware device directly to raise an -interrupt at some point in time. When such an interrupt is raised, the -host OS initially handles the interrupt and must somehow signal this -event as a virtual interrupt to the guest. Another example could be a -passthrough device, where the physical interrupts are initially handled -by the host, but the device driver for the device lives in the guest OS -and KVM must therefore somehow inject a virtual interrupt on behalf of -the physical one to the guest OS. - -These virtual interrupts corresponding to a physical interrupt on the -host are called forwarded physical interrupts, but are also sometimes -referred to as 'virtualized physical interrupts' and 'mapped interrupts'. - -Forwarded physical interrupts are handled slightly differently compared -to virtual interrupts generated purely by a software emulated device. - - -The HW bit ----------- -Virtual interrupts are signalled to the guest by programming the List -Registers (LRs) on the GIC before running a VCPU. The LR is programmed -with the virtual IRQ number and the state of the interrupt (Pending, -Active, or Pending+Active). When the guest ACKs and EOIs a virtual -interrupt, the LR state moves from Pending to Active, and finally to -inactive. - -The LRs include an extra bit, called the HW bit. When this bit is set, -KVM must also program an additional field in the LR, the physical IRQ -number, to link the virtual with the physical IRQ. - -When the HW bit is set, KVM must EITHER set the Pending OR the Active -bit, never both at the same time. - -Setting the HW bit causes the hardware to deactivate the physical -interrupt on the physical distributor when the guest deactivates the -corresponding virtual interrupt. - - -Forwarded Physical Interrupts Life Cycle ----------------------------------------- - -The state of forwarded physical interrupts is managed in the following way: - - - The physical interrupt is acked by the host, and becomes active on - the physical distributor (*). - - KVM sets the LR.Pending bit, because this is the only way the GICV - interface is going to present it to the guest. - - LR.Pending will stay set as long as the guest has not acked the interrupt. - - LR.Pending transitions to LR.Active on the guest read of the IAR, as - expected. - - On guest EOI, the *physical distributor* active bit gets cleared, - but the LR.Active is left untouched (set). - - KVM clears the LR on VM exits when the physical distributor - active state has been cleared. - -(*): The host handling is slightly more complicated. For some forwarded -interrupts (shared), KVM directly sets the active state on the physical -distributor before entering the guest, because the interrupt is never actually -handled on the host (see details on the timer as an example below). For other -forwarded interrupts (non-shared) the host does not deactivate the interrupt -when the host ISR completes, but leaves the interrupt active until the guest -deactivates it. Leaving the interrupt active is allowed, because Linux -configures the physical GIC with EOIMode=1, which causes EOI operations to -perform a priority drop allowing the GIC to receive other interrupts of the -default priority. - - -Forwarded Edge and Level Triggered PPIs and SPIs ------------------------------------------------- -Forwarded physical interrupts injected should always be active on the -physical distributor when injected to a guest. - -Level-triggered interrupts will keep the interrupt line to the GIC -asserted, typically until the guest programs the device to deassert the -line. This means that the interrupt will remain pending on the physical -distributor until the guest has reprogrammed the device. Since we -always run the VM with interrupts enabled on the CPU, a pending -interrupt will exit the guest as soon as we switch into the guest, -preventing the guest from ever making progress as the process repeats -over and over. Therefore, the active state on the physical distributor -must be set when entering the guest, preventing the GIC from forwarding -the pending interrupt to the CPU. As soon as the guest deactivates the -interrupt, the physical line is sampled by the hardware again and the host -takes a new interrupt if and only if the physical line is still asserted. - -Edge-triggered interrupts do not exhibit the same problem with -preventing guest execution that level-triggered interrupts do. One -option is to not use HW bit at all, and inject edge-triggered interrupts -from a physical device as pure virtual interrupts. But that would -potentially slow down handling of the interrupt in the guest, because a -physical interrupt occurring in the middle of the guest ISR would -preempt the guest for the host to handle the interrupt. Additionally, -if you configure the system to handle interrupts on a separate physical -core from that running your VCPU, you still have to interrupt the VCPU -to queue the pending state onto the LR, even though the guest won't use -this information until the guest ISR completes. Therefore, the HW -bit should always be set for forwarded edge-triggered interrupts. With -the HW bit set, the virtual interrupt is injected and additional -physical interrupts occurring before the guest deactivates the interrupt -simply mark the state on the physical distributor as Pending+Active. As -soon as the guest deactivates the interrupt, the host takes another -interrupt if and only if there was a physical interrupt between injecting -the forwarded interrupt to the guest and the guest deactivating the -interrupt. - -Consequently, whenever we schedule a VCPU with one or more LRs with the -HW bit set, the interrupt must also be active on the physical -distributor. - - -Forwarded LPIs --------------- -LPIs, introduced in GICv3, are always edge-triggered and do not have an -active state. They become pending when a device signal them, and as -soon as they are acked by the CPU, they are inactive again. - -It therefore doesn't make sense, and is not supported, to set the HW bit -for physical LPIs that are forwarded to a VM as virtual interrupts, -typically virtual SPIs. - -For LPIs, there is no other choice than to preempt the VCPU thread if -necessary, and queue the pending state onto the LR. - - -Putting It Together: The Architected Timer ------------------------------------------- -The architected timer is a device that signals interrupts with level -triggered semantics. The timer hardware is directly accessed by VCPUs -which program the timer to fire at some point in time. Each VCPU on a -system programs the timer to fire at different times, and therefore the -hardware is multiplexed between multiple VCPUs. This is implemented by -context-switching the timer state along with each VCPU thread. - -However, this means that a scenario like the following is entirely -possible, and in fact, typical: - -1. KVM runs the VCPU -2. The guest programs the time to fire in T+100 -3. The guest is idle and calls WFI (wait-for-interrupts) -4. The hardware traps to the host -5. KVM stores the timer state to memory and disables the hardware timer -6. KVM schedules a soft timer to fire in T+(100 - time since step 2) -7. KVM puts the VCPU thread to sleep (on a waitqueue) -8. The soft timer fires, waking up the VCPU thread -9. KVM reprograms the timer hardware with the VCPU's values -10. KVM marks the timer interrupt as active on the physical distributor -11. KVM injects a forwarded physical interrupt to the guest -12. KVM runs the VCPU - -Notice that KVM injects a forwarded physical interrupt in step 11 without -the corresponding interrupt having actually fired on the host. That is -exactly why we mark the timer interrupt as active in step 10, because -the active state on the physical distributor is part of the state -belonging to the timer hardware, which is context-switched along with -the VCPU thread. - -If the guest does not idle because it is busy, the flow looks like this -instead: - -1. KVM runs the VCPU -2. The guest programs the time to fire in T+100 -4. At T+100 the timer fires and a physical IRQ causes the VM to exit - (note that this initially only traps to EL2 and does not run the host ISR - until KVM has returned to the host). -5. With interrupts still disabled on the CPU coming back from the guest, KVM - stores the virtual timer state to memory and disables the virtual hw timer. -6. KVM looks at the timer state (in memory) and injects a forwarded physical - interrupt because it concludes the timer has expired. -7. KVM marks the timer interrupt as active on the physical distributor -7. KVM enables the timer, enables interrupts, and runs the VCPU - -Notice that again the forwarded physical interrupt is injected to the -guest without having actually been handled on the host. In this case it -is because the physical interrupt is never actually seen by the host because the -timer is disabled upon guest return, and the virtual forwarded interrupt is -injected on the KVM guest entry path. From f3721c70fc2b1216db26ce8aabf23b4d7cc4a0a6 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 8 Jan 2018 15:19:31 +0100 Subject: [PATCH 088/197] Revert "arm64: KVM: Hide PMU from guests when disabled" Commit 0c0543a128bd1c6a4c8610d0d9d869053fa2fbf5 breaks migration and introduces a regression with existing userspace because it introduces an ordering requirement of setting up all VCPU features before writing ID registers which we didn't have before. Revert this commit for now until we have a proper fix. Signed-off-by: Christoffer Dall --- arch/arm64/kvm/sys_regs.c | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 503144f13af022..1830ebc227d18d 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -881,25 +881,18 @@ static bool access_cntp_cval(struct kvm_vcpu *vcpu, } /* Read a sanitised cpufeature ID register by sys_reg_desc */ -static u64 read_id_reg(struct kvm_vcpu *vcpu, - struct sys_reg_desc const *r, - bool raz) +static u64 read_id_reg(struct sys_reg_desc const *r, bool raz) { u32 id = sys_reg((u32)r->Op0, (u32)r->Op1, (u32)r->CRn, (u32)r->CRm, (u32)r->Op2); u64 val = raz ? 0 : read_sanitised_ftr_reg(id); - switch (id) { - case SYS_ID_AA64DFR0_EL1: - if (!kvm_arm_pmu_v3_ready(vcpu)) - val &= ~(0xfUL << ID_AA64DFR0_PMUVER_SHIFT); - break; - case SYS_ID_AA64PFR0_EL1: + if (id == SYS_ID_AA64PFR0_EL1) { if (val & (0xfUL << ID_AA64PFR0_SVE_SHIFT)) pr_err_once("kvm [%i]: SVE unsupported for guests, suppressing\n", task_pid_nr(current)); + val &= ~(0xfUL << ID_AA64PFR0_SVE_SHIFT); - break; } return val; @@ -915,7 +908,7 @@ static bool __access_id_reg(struct kvm_vcpu *vcpu, if (p->is_write) return write_to_read_only(vcpu, p, r); - p->regval = read_id_reg(vcpu, r, raz); + p->regval = read_id_reg(r, raz); return true; } @@ -944,17 +937,17 @@ static u64 sys_reg_to_index(const struct sys_reg_desc *reg); * are stored, and for set_id_reg() we don't allow the effective value * to be changed. */ -static int __get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - void __user *uaddr, bool raz) +static int __get_id_reg(const struct sys_reg_desc *rd, void __user *uaddr, + bool raz) { const u64 id = sys_reg_to_index(rd); - const u64 val = read_id_reg(vcpu, rd, raz); + const u64 val = read_id_reg(rd, raz); return reg_to_user(uaddr, &val, id); } -static int __set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - void __user *uaddr, bool raz) +static int __set_id_reg(const struct sys_reg_desc *rd, void __user *uaddr, + bool raz) { const u64 id = sys_reg_to_index(rd); int err; @@ -965,7 +958,7 @@ static int __set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, return err; /* This is what we mean by invariant: you can't change it. */ - if (val != read_id_reg(vcpu, rd, raz)) + if (val != read_id_reg(rd, raz)) return -EINVAL; return 0; @@ -974,25 +967,25 @@ static int __set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - return __get_id_reg(vcpu, rd, uaddr, false); + return __get_id_reg(rd, uaddr, false); } static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - return __set_id_reg(vcpu, rd, uaddr, false); + return __set_id_reg(rd, uaddr, false); } static int get_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - return __get_id_reg(vcpu, rd, uaddr, true); + return __get_id_reg(rd, uaddr, true); } static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - return __set_id_reg(vcpu, rd, uaddr, true); + return __set_id_reg(rd, uaddr, true); } /* sys_reg_desc initialiser for known cpufeature ID registers */ From d68119864ef4b253a585a1c897cda6936d4b5de9 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 23 Oct 2017 17:11:14 +0100 Subject: [PATCH 089/197] KVM: arm/arm64: Detangle kvm_mmu.h from kvm_hyp.h kvm_hyp.h has an odd dependency on kvm_mmu.h, which makes the opposite inclusion impossible. Let's start with breaking that useless dependency. Acked-by: Christoffer Dall Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_hyp.h | 1 - arch/arm/kvm/hyp/switch.c | 1 + arch/arm/kvm/hyp/tlb.c | 1 + arch/arm64/include/asm/kvm_hyp.h | 1 - arch/arm64/kvm/hyp/debug-sr.c | 1 + arch/arm64/kvm/hyp/switch.c | 1 + arch/arm64/kvm/hyp/tlb.c | 1 + virt/kvm/arm/hyp/vgic-v2-sr.c | 1 + 8 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h index ab20ffa8b9e765..76368de7237baf 100644 --- a/arch/arm/include/asm/kvm_hyp.h +++ b/arch/arm/include/asm/kvm_hyp.h @@ -21,7 +21,6 @@ #include #include #include -#include #include #define __hyp_text __section(.hyp.text) notrace diff --git a/arch/arm/kvm/hyp/switch.c b/arch/arm/kvm/hyp/switch.c index 330c9ce34ba5f6..ae45ae96aac28b 100644 --- a/arch/arm/kvm/hyp/switch.c +++ b/arch/arm/kvm/hyp/switch.c @@ -18,6 +18,7 @@ #include #include +#include __asm__(".arch_extension virt"); diff --git a/arch/arm/kvm/hyp/tlb.c b/arch/arm/kvm/hyp/tlb.c index 6d810af2d9fd7c..c0edd450e10459 100644 --- a/arch/arm/kvm/hyp/tlb.c +++ b/arch/arm/kvm/hyp/tlb.c @@ -19,6 +19,7 @@ */ #include +#include /** * Flush per-VMID TLBs diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 08d3bb66c8b75b..f26f9cd70c721a 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -20,7 +20,6 @@ #include #include -#include #include #define __hyp_text __section(.hyp.text) notrace diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c index 321c9c05dd9e09..360455f863461a 100644 --- a/arch/arm64/kvm/hyp/debug-sr.c +++ b/arch/arm64/kvm/hyp/debug-sr.c @@ -21,6 +21,7 @@ #include #include #include +#include #define read_debug(r,n) read_sysreg(r##n##_el1) #define write_debug(v,r,n) write_sysreg(v, r##n##_el1) diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index f7c651f3a8c0e8..f3d8bed096f50a 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c index 73464a96c3657e..131c7772703c29 100644 --- a/arch/arm64/kvm/hyp/tlb.c +++ b/arch/arm64/kvm/hyp/tlb.c @@ -16,6 +16,7 @@ */ #include +#include #include static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm) diff --git a/virt/kvm/arm/hyp/vgic-v2-sr.c b/virt/kvm/arm/hyp/vgic-v2-sr.c index d7fd46fe9efb35..4fe6e797e8b3c5 100644 --- a/virt/kvm/arm/hyp/vgic-v2-sr.c +++ b/virt/kvm/arm/hyp/vgic-v2-sr.c @@ -21,6 +21,7 @@ #include #include +#include static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base) { From a15f693935a9f1fec8241cafaca27be4483d4464 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 23 Oct 2017 17:11:15 +0100 Subject: [PATCH 090/197] KVM: arm/arm64: Split dcache/icache flushing As we're about to introduce opportunistic invalidation of the icache, let's split dcache and icache flushing. Acked-by: Christoffer Dall Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_mmu.h | 60 ++++++++++++++++++++++---------- arch/arm64/include/asm/kvm_mmu.h | 13 +++++-- virt/kvm/arm/mmu.c | 20 ++++++++--- 3 files changed, 67 insertions(+), 26 deletions(-) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index fa6f2174276bdd..9fa4b2520974d4 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -126,21 +126,12 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu) return (vcpu_cp15(vcpu, c1_SCTLR) & 0b101) == 0b101; } -static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu, - kvm_pfn_t pfn, - unsigned long size) +static inline void __clean_dcache_guest_page(struct kvm_vcpu *vcpu, + kvm_pfn_t pfn, + unsigned long size) { /* - * If we are going to insert an instruction page and the icache is - * either VIPT or PIPT, there is a potential problem where the host - * (or another VM) may have used the same page as this guest, and we - * read incorrect data from the icache. If we're using a PIPT cache, - * we can invalidate just that page, but if we are using a VIPT cache - * we need to invalidate the entire icache - damn shame - as written - * in the ARM ARM (DDI 0406C.b - Page B3-1393). - * - * VIVT caches are tagged using both the ASID and the VMID and doesn't - * need any kind of flushing (DDI 0406C.b - Page B3-1392). + * Clean the dcache to the Point of Coherency. * * We need to do this through a kernel mapping (using the * user-space mapping has proved to be the wrong @@ -155,19 +146,52 @@ static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu, kvm_flush_dcache_to_poc(va, PAGE_SIZE); - if (icache_is_pipt()) - __cpuc_coherent_user_range((unsigned long)va, - (unsigned long)va + PAGE_SIZE); - size -= PAGE_SIZE; pfn++; kunmap_atomic(va); } +} - if (!icache_is_pipt() && !icache_is_vivt_asid_tagged()) { +static inline void __invalidate_icache_guest_page(struct kvm_vcpu *vcpu, + kvm_pfn_t pfn, + unsigned long size) +{ + /* + * If we are going to insert an instruction page and the icache is + * either VIPT or PIPT, there is a potential problem where the host + * (or another VM) may have used the same page as this guest, and we + * read incorrect data from the icache. If we're using a PIPT cache, + * we can invalidate just that page, but if we are using a VIPT cache + * we need to invalidate the entire icache - damn shame - as written + * in the ARM ARM (DDI 0406C.b - Page B3-1393). + * + * VIVT caches are tagged using both the ASID and the VMID and doesn't + * need any kind of flushing (DDI 0406C.b - Page B3-1392). + */ + + VM_BUG_ON(size & ~PAGE_MASK); + + if (icache_is_vivt_asid_tagged()) + return; + + if (!icache_is_pipt()) { /* any kind of VIPT cache */ __flush_icache_all(); + return; + } + + /* PIPT cache. As for the d-side, use a temporary kernel mapping. */ + while (size) { + void *va = kmap_atomic_pfn(pfn); + + __cpuc_coherent_user_range((unsigned long)va, + (unsigned long)va + PAGE_SIZE); + + size -= PAGE_SIZE; + pfn++; + + kunmap_atomic(va); } } diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 672c8684d5c2a7..8034b96fb3a412 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -230,19 +230,26 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu) return (vcpu_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101; } -static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu, - kvm_pfn_t pfn, - unsigned long size) +static inline void __clean_dcache_guest_page(struct kvm_vcpu *vcpu, + kvm_pfn_t pfn, + unsigned long size) { void *va = page_address(pfn_to_page(pfn)); kvm_flush_dcache_to_poc(va, size); +} +static inline void __invalidate_icache_guest_page(struct kvm_vcpu *vcpu, + kvm_pfn_t pfn, + unsigned long size) +{ if (icache_is_aliasing()) { /* any kind of VIPT cache */ __flush_icache_all(); } else if (is_kernel_in_hyp_mode() || !icache_is_vpipt()) { /* PIPT or VPIPT at EL2 (see comment in __kvm_tlb_flush_vmid_ipa) */ + void *va = page_address(pfn_to_page(pfn)); + flush_icache_range((unsigned long)va, (unsigned long)va + size); } diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index b36945d49986dd..2174244f631777 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1257,10 +1257,16 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } -static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, kvm_pfn_t pfn, - unsigned long size) +static void clean_dcache_guest_page(struct kvm_vcpu *vcpu, kvm_pfn_t pfn, + unsigned long size) { - __coherent_cache_guest_page(vcpu, pfn, size); + __clean_dcache_guest_page(vcpu, pfn, size); +} + +static void invalidate_icache_guest_page(struct kvm_vcpu *vcpu, kvm_pfn_t pfn, + unsigned long size) +{ + __invalidate_icache_guest_page(vcpu, pfn, size); } static void kvm_send_hwpoison_signal(unsigned long address, @@ -1391,7 +1397,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, new_pmd = kvm_s2pmd_mkwrite(new_pmd); kvm_set_pfn_dirty(pfn); } - coherent_cache_guest_page(vcpu, pfn, PMD_SIZE); + clean_dcache_guest_page(vcpu, pfn, PMD_SIZE); + invalidate_icache_guest_page(vcpu, pfn, PMD_SIZE); + ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); } else { pte_t new_pte = pfn_pte(pfn, mem_type); @@ -1401,7 +1409,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, kvm_set_pfn_dirty(pfn); mark_page_dirty(kvm, gfn); } - coherent_cache_guest_page(vcpu, pfn, PAGE_SIZE); + clean_dcache_guest_page(vcpu, pfn, PAGE_SIZE); + invalidate_icache_guest_page(vcpu, pfn, PAGE_SIZE); + ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags); } From 4fee94736603cd6fd83c1ea1ee0388d1d2dbe11b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 23 Oct 2017 17:11:16 +0100 Subject: [PATCH 091/197] arm64: KVM: Add invalidate_icache_range helper We currently tightly couple dcache clean with icache invalidation, but KVM could do without the initial flush to PoU, as we've already flushed things to PoC. Let's introduce invalidate_icache_range which is limited to invalidating the icache from the linear mapping (and thus has none of the userspace fault handling complexity), and wire it in KVM instead of flush_icache_range. Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/include/asm/assembler.h | 21 +++++++++++++++++++ arch/arm64/include/asm/cacheflush.h | 7 +++++++ arch/arm64/include/asm/kvm_mmu.h | 4 ++-- arch/arm64/mm/cache.S | 32 ++++++++++++++++++++--------- 4 files changed, 52 insertions(+), 12 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index aef72d88667775..0884e1fdfd3032 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -387,6 +387,27 @@ alternative_endif dsb \domain .endm +/* + * Macro to perform an instruction cache maintenance for the interval + * [start, end) + * + * start, end: virtual addresses describing the region + * label: A label to branch to on user fault. + * Corrupts: tmp1, tmp2 + */ + .macro invalidate_icache_by_line start, end, tmp1, tmp2, label + icache_line_size \tmp1, \tmp2 + sub \tmp2, \tmp1, #1 + bic \tmp2, \start, \tmp2 +9997: +USER(\label, ic ivau, \tmp2) // invalidate I line PoU + add \tmp2, \tmp2, \tmp1 + cmp \tmp2, \end + b.lo 9997b + dsb ish + isb + .endm + /* * reset_pmuserenr_el0 - reset PMUSERENR_EL0 if PMUv3 present */ diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 955130762a3c6a..bef9f418f08986 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -52,6 +52,12 @@ * - start - virtual start address * - end - virtual end address * + * invalidate_icache_range(start, end) + * + * Invalidate the I-cache in the region described by start, end. + * - start - virtual start address + * - end - virtual end address + * * __flush_cache_user_range(start, end) * * Ensure coherency between the I-cache and the D-cache in the @@ -66,6 +72,7 @@ * - size - region size */ extern void flush_icache_range(unsigned long start, unsigned long end); +extern int invalidate_icache_range(unsigned long start, unsigned long end); extern void __flush_dcache_area(void *addr, size_t len); extern void __inval_dcache_area(void *addr, size_t len); extern void __clean_dcache_area_poc(void *addr, size_t len); diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 8034b96fb3a412..56b3e03c85e747 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -250,8 +250,8 @@ static inline void __invalidate_icache_guest_page(struct kvm_vcpu *vcpu, /* PIPT or VPIPT at EL2 (see comment in __kvm_tlb_flush_vmid_ipa) */ void *va = page_address(pfn_to_page(pfn)); - flush_icache_range((unsigned long)va, - (unsigned long)va + size); + invalidate_icache_range((unsigned long)va, + (unsigned long)va + size); } } diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index 7f1dbe962cf581..bedd23da83f4ed 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -60,16 +60,7 @@ user_alt 9f, "dc cvau, x4", "dc civac, x4", ARM64_WORKAROUND_CLEAN_CACHE b.lo 1b dsb ish - icache_line_size x2, x3 - sub x3, x2, #1 - bic x4, x0, x3 -1: -USER(9f, ic ivau, x4 ) // invalidate I line PoU - add x4, x4, x2 - cmp x4, x1 - b.lo 1b - dsb ish - isb + invalidate_icache_by_line x0, x1, x2, x3, 9f mov x0, #0 1: uaccess_ttbr0_disable x1 @@ -80,6 +71,27 @@ USER(9f, ic ivau, x4 ) // invalidate I line PoU ENDPROC(flush_icache_range) ENDPROC(__flush_cache_user_range) +/* + * invalidate_icache_range(start,end) + * + * Ensure that the I cache is invalid within specified region. + * + * - start - virtual start address of region + * - end - virtual end address of region + */ +ENTRY(invalidate_icache_range) + uaccess_ttbr0_enable x2, x3 + + invalidate_icache_by_line x0, x1, x2, x3, 2f + mov x0, xzr +1: + uaccess_ttbr0_disable x1 + ret +2: + mov x0, #-EFAULT + b 1b +ENDPROC(invalidate_icache_range) + /* * __flush_dcache_area(kaddr, size) * From 91c703e0382a1212d249adf34af4943a5da90d54 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 23 Oct 2017 17:11:17 +0100 Subject: [PATCH 092/197] arm: KVM: Add optimized PIPT icache flushing Calling __cpuc_coherent_user_range to invalidate the icache on a PIPT icache machine has some pointless overhead, as it starts by cleaning the dcache to the PoU, while we're guaranteed to have already cleaned it to the PoC. As KVM is the only user of such a feature, let's implement some ad-hoc cache flushing in kvm_mmu.h. Should it become useful to other subsystems, it can be moved to a more global location. Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_hyp.h | 2 ++ arch/arm/include/asm/kvm_mmu.h | 32 +++++++++++++++++++++++++++++--- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h index 76368de7237baf..1ab8329e9ff75c 100644 --- a/arch/arm/include/asm/kvm_hyp.h +++ b/arch/arm/include/asm/kvm_hyp.h @@ -68,6 +68,8 @@ #define HIFAR __ACCESS_CP15(c6, 4, c0, 2) #define HPFAR __ACCESS_CP15(c6, 4, c0, 4) #define ICIALLUIS __ACCESS_CP15(c7, 0, c1, 0) +#define BPIALLIS __ACCESS_CP15(c7, 0, c1, 6) +#define ICIMVAU __ACCESS_CP15(c7, 0, c5, 1) #define ATS1CPR __ACCESS_CP15(c7, 0, c8, 0) #define TLBIALLIS __ACCESS_CP15(c8, 0, c3, 0) #define TLBIALL __ACCESS_CP15(c8, 0, c7, 0) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 9fa4b2520974d4..bc8d21e76637f2 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -37,6 +37,8 @@ #include #include +#include +#include #include #include @@ -157,6 +159,8 @@ static inline void __invalidate_icache_guest_page(struct kvm_vcpu *vcpu, kvm_pfn_t pfn, unsigned long size) { + u32 iclsz; + /* * If we are going to insert an instruction page and the icache is * either VIPT or PIPT, there is a potential problem where the host @@ -181,18 +185,40 @@ static inline void __invalidate_icache_guest_page(struct kvm_vcpu *vcpu, return; } - /* PIPT cache. As for the d-side, use a temporary kernel mapping. */ + /* + * CTR IminLine contains Log2 of the number of words in the + * cache line, so we can get the number of words as + * 2 << (IminLine - 1). To get the number of bytes, we + * multiply by 4 (the number of bytes in a 32-bit word), and + * get 4 << (IminLine). + */ + iclsz = 4 << (read_cpuid(CPUID_CACHETYPE) & 0xf); + while (size) { void *va = kmap_atomic_pfn(pfn); + void *end = va + PAGE_SIZE; + void *addr = va; - __cpuc_coherent_user_range((unsigned long)va, - (unsigned long)va + PAGE_SIZE); + do { + write_sysreg(addr, ICIMVAU); + addr += iclsz; + } while (addr < end); + + dsb(ishst); + isb(); size -= PAGE_SIZE; pfn++; kunmap_atomic(va); } + + /* Check if we need to invalidate the BTB */ + if ((read_cpuid_ext(CPUID_EXT_MMFR1) >> 28) != 4) { + write_sysreg(0, BPIALLIS); + dsb(ishst); + isb(); + } } static inline void __kvm_flush_dcache_pte(pte_t pte) From fefb876b9b96fa7e4ed3d906979ea45b4cf07349 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 23 Oct 2017 17:11:18 +0100 Subject: [PATCH 093/197] arm64: KVM: PTE/PMD S2 XN bit definition As we're about to make S2 page-tables eXecute Never by default, add the required bits for both PMDs and PTEs. Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/include/asm/pgtable-hwdef.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index eb0c2bd90de903..af035331fb096a 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -177,9 +177,11 @@ */ #define PTE_S2_RDONLY (_AT(pteval_t, 1) << 6) /* HAP[2:1] */ #define PTE_S2_RDWR (_AT(pteval_t, 3) << 6) /* HAP[2:1] */ +#define PTE_S2_XN (_AT(pteval_t, 2) << 53) /* XN[1:0] */ #define PMD_S2_RDONLY (_AT(pmdval_t, 1) << 6) /* HAP[2:1] */ #define PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */ +#define PMD_S2_XN (_AT(pmdval_t, 2) << 53) /* XN[1:0] */ /* * Memory Attribute override for Stage-2 (MemAttr[3:0]) From d0e22b4ac3ba23c611739f554392bf5e217df49f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 23 Oct 2017 17:11:19 +0100 Subject: [PATCH 094/197] KVM: arm/arm64: Limit icache invalidation to prefetch aborts We've so far eagerly invalidated the icache, no matter how the page was faulted in (data or prefetch abort). But we can easily track execution by setting the XN bits in the S2 page tables, get the prefetch abort at HYP and perform the icache invalidation at that time only. As for most VMs, the instruction working set is pretty small compared to the data set, this is likely to save some traffic (specially as the invalidation is broadcast). Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_mmu.h | 12 ++++++++++++ arch/arm/include/asm/pgtable.h | 4 ++-- arch/arm64/include/asm/kvm_mmu.h | 12 ++++++++++++ arch/arm64/include/asm/pgtable-prot.h | 4 ++-- virt/kvm/arm/mmu.c | 19 +++++++++++++++---- 5 files changed, 43 insertions(+), 8 deletions(-) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index bc8d21e76637f2..4d7a54cbb3ab56 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -85,6 +85,18 @@ static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd) return pmd; } +static inline pte_t kvm_s2pte_mkexec(pte_t pte) +{ + pte_val(pte) &= ~L_PTE_XN; + return pte; +} + +static inline pmd_t kvm_s2pmd_mkexec(pmd_t pmd) +{ + pmd_val(pmd) &= ~PMD_SECT_XN; + return pmd; +} + static inline void kvm_set_s2pte_readonly(pte_t *pte) { pte_val(*pte) = (pte_val(*pte) & ~L_PTE_S2_RDWR) | L_PTE_S2_RDONLY; diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index 150ece66ddf345..a757401129f956 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -102,8 +102,8 @@ extern pgprot_t pgprot_s2_device; #define PAGE_HYP_EXEC _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_RDONLY) #define PAGE_HYP_RO _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_RDONLY | L_PTE_XN) #define PAGE_HYP_DEVICE _MOD_PROT(pgprot_hyp_device, L_PTE_HYP) -#define PAGE_S2 _MOD_PROT(pgprot_s2, L_PTE_S2_RDONLY) -#define PAGE_S2_DEVICE _MOD_PROT(pgprot_s2_device, L_PTE_S2_RDONLY) +#define PAGE_S2 _MOD_PROT(pgprot_s2, L_PTE_S2_RDONLY | L_PTE_XN) +#define PAGE_S2_DEVICE _MOD_PROT(pgprot_s2_device, L_PTE_S2_RDONLY | L_PTE_XN) #define __PAGE_NONE __pgprot(_L_PTE_DEFAULT | L_PTE_RDONLY | L_PTE_XN | L_PTE_NONE) #define __PAGE_SHARED __pgprot(_L_PTE_DEFAULT | L_PTE_USER | L_PTE_XN) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 56b3e03c85e747..1e1b20cb348f28 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -173,6 +173,18 @@ static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd) return pmd; } +static inline pte_t kvm_s2pte_mkexec(pte_t pte) +{ + pte_val(pte) &= ~PTE_S2_XN; + return pte; +} + +static inline pmd_t kvm_s2pmd_mkexec(pmd_t pmd) +{ + pmd_val(pmd) &= ~PMD_S2_XN; + return pmd; +} + static inline void kvm_set_s2pte_readonly(pte_t *pte) { pteval_t old_pteval, pteval; diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 0a5635fb0ef984..4e12dabd342b06 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -60,8 +60,8 @@ #define PAGE_HYP_RO __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN) #define PAGE_HYP_DEVICE __pgprot(PROT_DEVICE_nGnRE | PTE_HYP) -#define PAGE_S2 __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY) -#define PAGE_S2_DEVICE __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_UXN) +#define PAGE_S2 __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY | PTE_S2_XN) +#define PAGE_S2_DEVICE __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_S2_XN) #define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_PXN | PTE_UXN) #define PAGE_SHARED __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 2174244f631777..0417c8e2a81cb2 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1292,7 +1292,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, unsigned long fault_status) { int ret; - bool write_fault, writable, hugetlb = false, force_pte = false; + bool write_fault, exec_fault, writable, hugetlb = false, force_pte = false; unsigned long mmu_seq; gfn_t gfn = fault_ipa >> PAGE_SHIFT; struct kvm *kvm = vcpu->kvm; @@ -1304,7 +1304,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, unsigned long flags = 0; write_fault = kvm_is_write_fault(vcpu); - if (fault_status == FSC_PERM && !write_fault) { + exec_fault = kvm_vcpu_trap_is_iabt(vcpu); + VM_BUG_ON(write_fault && exec_fault); + + if (fault_status == FSC_PERM && !write_fault && !exec_fault) { kvm_err("Unexpected L2 read permission error\n"); return -EFAULT; } @@ -1398,7 +1401,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, kvm_set_pfn_dirty(pfn); } clean_dcache_guest_page(vcpu, pfn, PMD_SIZE); - invalidate_icache_guest_page(vcpu, pfn, PMD_SIZE); + + if (exec_fault) { + new_pmd = kvm_s2pmd_mkexec(new_pmd); + invalidate_icache_guest_page(vcpu, pfn, PMD_SIZE); + } ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); } else { @@ -1410,7 +1417,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, mark_page_dirty(kvm, gfn); } clean_dcache_guest_page(vcpu, pfn, PAGE_SIZE); - invalidate_icache_guest_page(vcpu, pfn, PAGE_SIZE); + + if (exec_fault) { + new_pte = kvm_s2pte_mkexec(new_pte); + invalidate_icache_guest_page(vcpu, pfn, PAGE_SIZE); + } ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags); } From a9c0e12ebee56ef06b7eccdbc73bab71d0018df8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 23 Oct 2017 17:11:20 +0100 Subject: [PATCH 095/197] KVM: arm/arm64: Only clean the dcache on translation fault The only case where we actually need to perform a dcache maintenance is when we map the page for the first time, and subsequent permission faults do not require cache maintenance. Let's make it conditional on not being a permission fault (and thus a translation fault). Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- virt/kvm/arm/mmu.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 0417c8e2a81cb2..f956efbd933d26 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1400,7 +1400,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, new_pmd = kvm_s2pmd_mkwrite(new_pmd); kvm_set_pfn_dirty(pfn); } - clean_dcache_guest_page(vcpu, pfn, PMD_SIZE); + + if (fault_status != FSC_PERM) + clean_dcache_guest_page(vcpu, pfn, PMD_SIZE); if (exec_fault) { new_pmd = kvm_s2pmd_mkexec(new_pmd); @@ -1416,7 +1418,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, kvm_set_pfn_dirty(pfn); mark_page_dirty(kvm, gfn); } - clean_dcache_guest_page(vcpu, pfn, PAGE_SIZE); + + if (fault_status != FSC_PERM) + clean_dcache_guest_page(vcpu, pfn, PAGE_SIZE); if (exec_fault) { new_pte = kvm_s2pte_mkexec(new_pte); From 7a3796d2ef5bb948f709467eef1bf96edbfc67a0 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 23 Oct 2017 17:11:21 +0100 Subject: [PATCH 096/197] KVM: arm/arm64: Preserve Exec permission across R/W permission faults So far, we loose the Exec property whenever we take permission faults, as we always reconstruct the PTE/PMD from scratch. This can be counter productive as we can end-up with the following fault sequence: X -> RO -> ROX -> RW -> RWX Instead, we can lookup the existing PTE/PMD and clear the XN bit in the new entry if it was already cleared in the old one, leadig to a much nicer fault sequence: X -> ROX -> RWX Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_mmu.h | 10 ++++++++++ arch/arm64/include/asm/kvm_mmu.h | 10 ++++++++++ virt/kvm/arm/mmu.c | 27 +++++++++++++++++++++++++++ 3 files changed, 47 insertions(+) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 4d7a54cbb3ab56..aab64fe5214656 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -107,6 +107,11 @@ static inline bool kvm_s2pte_readonly(pte_t *pte) return (pte_val(*pte) & L_PTE_S2_RDWR) == L_PTE_S2_RDONLY; } +static inline bool kvm_s2pte_exec(pte_t *pte) +{ + return !(pte_val(*pte) & L_PTE_XN); +} + static inline void kvm_set_s2pmd_readonly(pmd_t *pmd) { pmd_val(*pmd) = (pmd_val(*pmd) & ~L_PMD_S2_RDWR) | L_PMD_S2_RDONLY; @@ -117,6 +122,11 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd) return (pmd_val(*pmd) & L_PMD_S2_RDWR) == L_PMD_S2_RDONLY; } +static inline bool kvm_s2pmd_exec(pmd_t *pmd) +{ + return !(pmd_val(*pmd) & PMD_SECT_XN); +} + static inline bool kvm_page_empty(void *ptr) { struct page *ptr_page = virt_to_page(ptr); diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 1e1b20cb348f28..126abefffe7f68 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -203,6 +203,11 @@ static inline bool kvm_s2pte_readonly(pte_t *pte) return (pte_val(*pte) & PTE_S2_RDWR) == PTE_S2_RDONLY; } +static inline bool kvm_s2pte_exec(pte_t *pte) +{ + return !(pte_val(*pte) & PTE_S2_XN); +} + static inline void kvm_set_s2pmd_readonly(pmd_t *pmd) { kvm_set_s2pte_readonly((pte_t *)pmd); @@ -213,6 +218,11 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd) return kvm_s2pte_readonly((pte_t *)pmd); } +static inline bool kvm_s2pmd_exec(pmd_t *pmd) +{ + return !(pmd_val(*pmd) & PMD_S2_XN); +} + static inline bool kvm_page_empty(void *ptr) { struct page *ptr_page = virt_to_page(ptr); diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index f956efbd933d26..b83b5a8442bb60 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -926,6 +926,25 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache return 0; } +static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr) +{ + pmd_t *pmdp; + pte_t *ptep; + + pmdp = stage2_get_pmd(kvm, NULL, addr); + if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp)) + return false; + + if (pmd_thp_or_huge(*pmdp)) + return kvm_s2pmd_exec(pmdp); + + ptep = pte_offset_kernel(pmdp, addr); + if (!ptep || pte_none(*ptep) || !pte_present(*ptep)) + return false; + + return kvm_s2pte_exec(ptep); +} + static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, phys_addr_t addr, const pte_t *new_pte, unsigned long flags) @@ -1407,6 +1426,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (exec_fault) { new_pmd = kvm_s2pmd_mkexec(new_pmd); invalidate_icache_guest_page(vcpu, pfn, PMD_SIZE); + } else if (fault_status == FSC_PERM) { + /* Preserve execute if XN was already cleared */ + if (stage2_is_exec(kvm, fault_ipa)) + new_pmd = kvm_s2pmd_mkexec(new_pmd); } ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); @@ -1425,6 +1448,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (exec_fault) { new_pte = kvm_s2pte_mkexec(new_pte); invalidate_icache_guest_page(vcpu, pfn, PAGE_SIZE); + } else if (fault_status == FSC_PERM) { + /* Preserve execute if XN was already cleared */ + if (stage2_is_exec(kvm, fault_ipa)) + new_pte = kvm_s2pte_mkexec(new_pte); } ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags); From 17ab9d57debaa53d665651e425a0efc4a893c039 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 23 Oct 2017 17:11:22 +0100 Subject: [PATCH 097/197] KVM: arm/arm64: Drop vcpu parameter from guest cache maintenance operartions The vcpu parameter isn't used for anything, and gets in the way of further cleanups. Let's get rid of it. Acked-by: Christoffer Dall Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_mmu.h | 7 ++----- arch/arm64/include/asm/kvm_mmu.h | 7 ++----- virt/kvm/arm/mmu.c | 18 ++++++++---------- 3 files changed, 12 insertions(+), 20 deletions(-) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index aab64fe5214656..bc70a1f0f42d4c 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -150,9 +150,7 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu) return (vcpu_cp15(vcpu, c1_SCTLR) & 0b101) == 0b101; } -static inline void __clean_dcache_guest_page(struct kvm_vcpu *vcpu, - kvm_pfn_t pfn, - unsigned long size) +static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size) { /* * Clean the dcache to the Point of Coherency. @@ -177,8 +175,7 @@ static inline void __clean_dcache_guest_page(struct kvm_vcpu *vcpu, } } -static inline void __invalidate_icache_guest_page(struct kvm_vcpu *vcpu, - kvm_pfn_t pfn, +static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size) { u32 iclsz; diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 126abefffe7f68..06f1f979467996 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -252,17 +252,14 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu) return (vcpu_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101; } -static inline void __clean_dcache_guest_page(struct kvm_vcpu *vcpu, - kvm_pfn_t pfn, - unsigned long size) +static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size) { void *va = page_address(pfn_to_page(pfn)); kvm_flush_dcache_to_poc(va, size); } -static inline void __invalidate_icache_guest_page(struct kvm_vcpu *vcpu, - kvm_pfn_t pfn, +static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size) { if (icache_is_aliasing()) { diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index b83b5a8442bb60..a1ea43fa75cfd4 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1276,16 +1276,14 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } -static void clean_dcache_guest_page(struct kvm_vcpu *vcpu, kvm_pfn_t pfn, - unsigned long size) +static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size) { - __clean_dcache_guest_page(vcpu, pfn, size); + __clean_dcache_guest_page(pfn, size); } -static void invalidate_icache_guest_page(struct kvm_vcpu *vcpu, kvm_pfn_t pfn, - unsigned long size) +static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size) { - __invalidate_icache_guest_page(vcpu, pfn, size); + __invalidate_icache_guest_page(pfn, size); } static void kvm_send_hwpoison_signal(unsigned long address, @@ -1421,11 +1419,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } if (fault_status != FSC_PERM) - clean_dcache_guest_page(vcpu, pfn, PMD_SIZE); + clean_dcache_guest_page(pfn, PMD_SIZE); if (exec_fault) { new_pmd = kvm_s2pmd_mkexec(new_pmd); - invalidate_icache_guest_page(vcpu, pfn, PMD_SIZE); + invalidate_icache_guest_page(pfn, PMD_SIZE); } else if (fault_status == FSC_PERM) { /* Preserve execute if XN was already cleared */ if (stage2_is_exec(kvm, fault_ipa)) @@ -1443,11 +1441,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } if (fault_status != FSC_PERM) - clean_dcache_guest_page(vcpu, pfn, PAGE_SIZE); + clean_dcache_guest_page(pfn, PAGE_SIZE); if (exec_fault) { new_pte = kvm_s2pte_mkexec(new_pte); - invalidate_icache_guest_page(vcpu, pfn, PAGE_SIZE); + invalidate_icache_guest_page(pfn, PAGE_SIZE); } else if (fault_status == FSC_PERM) { /* Preserve execute if XN was already cleared */ if (stage2_is_exec(kvm, fault_ipa)) From 448fadc8a4a36cad560e07b4fbc94cba49526956 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 9 Jan 2018 11:51:58 +0100 Subject: [PATCH 098/197] arm64: mm: Add additional parameter to uaccess_ttbr0_enable Add an extra temporary register parameter to uaccess_ttbr0_enable which is about to be required for arm64 PAN support. This patch doesn't introduce any functional change but ensures that the kernel compiles once the KVM/ARM tree is merged with the arm64 tree by ensuring a trivially mergable conflict with commit 27a921e75711d924617269e0ba4adb8bae9fd0d1 ("arm64: mm: Fix and re-enable ARM64_SW_TTBR0_PAN"). Cc: Will Deacon Cc: Catalin Marinas Reviewed-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/include/asm/asm-uaccess.h | 4 ++-- arch/arm64/mm/cache.S | 4 ++-- arch/arm64/xen/hypercall.S | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h index b3da6c88683596..b67563d2024c7e 100644 --- a/arch/arm64/include/asm/asm-uaccess.h +++ b/arch/arm64/include/asm/asm-uaccess.h @@ -31,7 +31,7 @@ alternative_if_not ARM64_HAS_PAN alternative_else_nop_endif .endm - .macro uaccess_ttbr0_enable, tmp1, tmp2 + .macro uaccess_ttbr0_enable, tmp1, tmp2, tmp3 alternative_if_not ARM64_HAS_PAN save_and_disable_irq \tmp2 // avoid preemption __uaccess_ttbr0_enable \tmp1 @@ -42,7 +42,7 @@ alternative_else_nop_endif .macro uaccess_ttbr0_disable, tmp1 .endm - .macro uaccess_ttbr0_enable, tmp1, tmp2 + .macro uaccess_ttbr0_enable, tmp1, tmp2, tmp3 .endm #endif diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index bedd23da83f4ed..5a52811f47e9dc 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -49,7 +49,7 @@ ENTRY(flush_icache_range) * - end - virtual end address of region */ ENTRY(__flush_cache_user_range) - uaccess_ttbr0_enable x2, x3 + uaccess_ttbr0_enable x2, x3, x4 dcache_line_size x2, x3 sub x3, x2, #1 bic x4, x0, x3 @@ -80,7 +80,7 @@ ENDPROC(__flush_cache_user_range) * - end - virtual end address of region */ ENTRY(invalidate_icache_range) - uaccess_ttbr0_enable x2, x3 + uaccess_ttbr0_enable x2, x3, x4 invalidate_icache_by_line x0, x1, x2, x3, 2f mov x0, xzr diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S index 401ceb71540c74..acdbd2c9e899c1 100644 --- a/arch/arm64/xen/hypercall.S +++ b/arch/arm64/xen/hypercall.S @@ -101,7 +101,7 @@ ENTRY(privcmd_call) * need the explicit uaccess_enable/disable if the TTBR0 PAN emulation * is enabled (it implies that hardware UAO and PAN disabled). */ - uaccess_ttbr0_enable x6, x7 + uaccess_ttbr0_enable x6, x7, x8 hvc XEN_IMM /* From 81ceca05a42e24768c73d4d6dbd3701a60f1ed85 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 19 Dec 2017 15:56:24 +0100 Subject: [PATCH 099/197] KVM: PPC: Book3S HV: Remove vcpu->arch.dec usage On Book3S in HV mode, we don't use the vcpu->arch.dec field at all. Instead, all logic is built around vcpu->arch.dec_expires. So let's remove the one remaining piece of code that was setting it. Signed-off-by: Alexander Graf Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 2659844784b817..c8ffd69adfec7a 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -957,7 +957,6 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) mftb r7 subf r3,r7,r8 mtspr SPRN_DEC,r3 - std r3,VCPU_DEC(r4) ld r5, VCPU_SPRG0(r4) ld r6, VCPU_SPRG1(r4) From 1627301020cb460f5a74e13c291f8db3b2a8062e Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sun, 7 Jan 2018 10:07:36 +0100 Subject: [PATCH 100/197] KVM: PPC: Use seq_puts() in kvmppc_exit_timing_show() A headline should be quickly put into a sequence. Thus use the function "seq_puts" instead of "seq_printf" for this purpose. This issue was detected by using the Coccinelle software. Signed-off-by: Markus Elfring Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/timing.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/timing.c b/arch/powerpc/kvm/timing.c index e44d2b2ea97e34..1c03c978eb184c 100644 --- a/arch/powerpc/kvm/timing.c +++ b/arch/powerpc/kvm/timing.c @@ -143,8 +143,7 @@ static int kvmppc_exit_timing_show(struct seq_file *m, void *private) int i; u64 min, max, sum, sum_quad; - seq_printf(m, "%s", "type count min max sum sum_squared\n"); - + seq_puts(m, "type count min max sum sum_squared\n"); for (i = 0; i < __NUMBER_OF_KVM_EXIT_TYPES; i++) { From 5855564c8ab2d9cefca7b2933bd19818eb795e40 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 12 Jan 2018 20:55:20 +1100 Subject: [PATCH 101/197] KVM: PPC: Book3S HV: Enable migration of decrementer register This adds a register identifier for use with the one_reg interface to allow the decrementer expiry time to be read and written by userspace. The decrementer expiry time is in guest timebase units and is equal to the sum of the decrementer and the guest timebase. (The expiry time is used rather than the decrementer value itself because the expiry time is not constantly changing, though the decrementer value is, while the guest vcpu is not running.) Without this, a guest vcpu migrated to a new host will see its decrementer set to some random value. On POWER8 and earlier, the decrementer is 32 bits wide and counts down at 512MHz, so the guest vcpu will potentially see no decrementer interrupts for up to about 4 seconds, which will lead to a stall. With POWER9, the decrementer is now 56 bits side, so the stall can be much longer (up to 2.23 years) and more noticeable. To help work around the problem in cases where userspace has not been updated to migrate the decrementer expiry time, we now set the default decrementer expiry at vcpu creation time to the current time rather than the maximum possible value. This should mean an immediate decrementer interrupt when a migrated vcpu starts running. In cases where the decrementer is 32 bits wide and more than 4 seconds elapse between the creation of the vcpu and when it first runs, the decrementer would have wrapped around to positive values and there may still be a stall - but this is no worse than the current situation. In the large-decrementer case, we are sure to get an immediate decrementer interrupt (assuming the time from vcpu creation to first run is less than 2.23 years) and we thus avoid a very long stall. Signed-off-by: Paul Mackerras --- Documentation/virtual/kvm/api.txt | 1 + arch/powerpc/include/uapi/asm/kvm.h | 2 ++ arch/powerpc/kvm/book3s_hv.c | 8 ++++++++ arch/powerpc/kvm/powerpc.c | 2 +- 4 files changed, 12 insertions(+), 1 deletion(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index f670e4b9e7f33f..c6f9eebb79f2c0 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1841,6 +1841,7 @@ registers, find a list below: PPC | KVM_REG_PPC_DBSR | 32 PPC | KVM_REG_PPC_TIDR | 64 PPC | KVM_REG_PPC_PSSCR | 64 + PPC | KVM_REG_PPC_DEC_EXPIRY | 64 PPC | KVM_REG_PPC_TM_GPR0 | 64 ... PPC | KVM_REG_PPC_TM_GPR31 | 64 diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 61d6049f4c1ece..8aaec831053af0 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -607,6 +607,8 @@ struct kvm_ppc_rmmu_info { #define KVM_REG_PPC_TIDR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbc) #define KVM_REG_PPC_PSSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbd) +#define KVM_REG_PPC_DEC_EXPIRY (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbe) + /* Transactional Memory checkpointed state: * This is all GPRs, all VSX regs and a subset of SPRs */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index c4f0bebfc5ba48..b2d448c750086b 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1497,6 +1497,10 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_ARCH_COMPAT: *val = get_reg_val(id, vcpu->arch.vcore->arch_compat); break; + case KVM_REG_PPC_DEC_EXPIRY: + *val = get_reg_val(id, vcpu->arch.dec_expires + + vcpu->arch.vcore->tb_offset); + break; default: r = -EINVAL; break; @@ -1724,6 +1728,10 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_ARCH_COMPAT: r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val)); break; + case KVM_REG_PPC_DEC_EXPIRY: + vcpu->arch.dec_expires = set_reg_val(id, *val) - + vcpu->arch.vcore->tb_offset; + break; default: r = -EINVAL; break; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index c2c7ef33055325..7c9e45f54186dc 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -758,7 +758,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; - vcpu->arch.dec_expires = ~(u64)0; + vcpu->arch.dec_expires = get_tb(); #ifdef CONFIG_KVM_EXIT_TIMING mutex_init(&vcpu->arch.exit_timing_lock); From 9696594158ec43d210137c1b93313efde8f7315a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 10 Nov 2017 16:18:05 +0100 Subject: [PATCH 102/197] s390x/mm: cleanup gmap_pte_op_walk() gmap_mprotect_notify() refuses shadow gmaps. Turns out that a) gmap_protect_range() b) gmap_read_table() c) gmap_pte_op_walk() Are never called for gmap shadows. And never should be. This dates back to gmap shadow prototypes where we allowed to call mprotect_notify() on the gmap shadow (to get notified about the prefix pages getting removed). This is avoided by always getting notified about any change on the gmap shadow. The only real function for walking page tables on shadow gmaps is gmap_table_walk(). So, essentially, these functions should never get called and gmap_pte_op_walk() can be cleaned up. Add some checks to callers of gmap_pte_op_walk(). Signed-off-by: David Hildenbrand Message-Id: <20171110151805.7541-1-david@redhat.com> Reviewed-by: Janosch Frank Acked-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/mm/gmap.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 05d459b638f55d..54cfd51a5a27e4 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -815,27 +815,17 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, * @ptl: pointer to the spinlock pointer * * Returns a pointer to the locked pte for a guest address, or NULL - * - * Note: Can also be called for shadow gmaps. */ static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr, spinlock_t **ptl) { unsigned long *table; - if (gmap_is_shadow(gmap)) - spin_lock(&gmap->guest_table_lock); + BUG_ON(gmap_is_shadow(gmap)); /* Walk the gmap page table, lock and get pte pointer */ table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */ - if (!table || *table & _SEGMENT_ENTRY_INVALID) { - if (gmap_is_shadow(gmap)) - spin_unlock(&gmap->guest_table_lock); + if (!table || *table & _SEGMENT_ENTRY_INVALID) return NULL; - } - if (gmap_is_shadow(gmap)) { - *ptl = &gmap->guest_table_lock; - return pte_offset_map((pmd_t *) table, gaddr); - } return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl); } @@ -889,8 +879,6 @@ static void gmap_pte_op_end(spinlock_t *ptl) * -EFAULT if gaddr is invalid (or mapping for shadows is missing). * * Called with sg->mm->mmap_sem in read. - * - * Note: Can also be called for shadow gmaps. */ static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, unsigned long len, int prot, unsigned long bits) @@ -900,6 +888,7 @@ static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, pte_t *ptep; int rc; + BUG_ON(gmap_is_shadow(gmap)); while (len) { rc = -EAGAIN; ptep = gmap_pte_op_walk(gmap, gaddr, &ptl); @@ -960,7 +949,8 @@ EXPORT_SYMBOL_GPL(gmap_mprotect_notify); * @val: pointer to the unsigned long value to return * * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT - * if reading using the virtual address failed. + * if reading using the virtual address failed. -EINVAL if called on a gmap + * shadow. * * Called with gmap->mm->mmap_sem in read. */ @@ -971,6 +961,9 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) pte_t *ptep, pte; int rc; + if (gmap_is_shadow(gmap)) + return -EINVAL; + while (1) { rc = -EAGAIN; ptep = gmap_pte_op_walk(gmap, gaddr, &ptl); From 241e3ec0faf5ab1a0d9b1f6c43eefa919fb9c112 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 16 Nov 2017 15:12:52 +0100 Subject: [PATCH 103/197] KVM: s390: use created_vcpus in more places commit a03825bbd0c3 ("KVM: s390: use kvm->created_vcpus") introduced kvm->created_vcpus to avoid races with the existing kvm->online_vcpus scheme. One place was "forgotten" and one new place was "added". Let's fix those. Reported-by: Halil Pasic Signed-off-by: Christian Borntraeger Reviewed-by: Halil Pasic Reviewed-by: Cornelia Huck Reviewed-by: David Hildenbrand Fixes: 4e0b1ab72b8a ("KVM: s390: gs support for kvm guests") Fixes: a03825bbd0c3 ("KVM: s390: use kvm->created_vcpus") --- arch/s390/kvm/kvm-s390.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ec8b68e97d3cd4..f579bf5c262036 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -573,7 +573,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) case KVM_CAP_S390_GS: r = -EINVAL; mutex_lock(&kvm->lock); - if (atomic_read(&kvm->online_vcpus)) { + if (kvm->created_vcpus) { r = -EBUSY; } else if (test_facility(133)) { set_kvm_facility(kvm->arch.model.fac_mask, 133); @@ -1094,7 +1094,7 @@ static int kvm_s390_set_processor_feat(struct kvm *kvm, return -EINVAL; mutex_lock(&kvm->lock); - if (!atomic_read(&kvm->online_vcpus)) { + if (!kvm->created_vcpus) { bitmap_copy(kvm->arch.cpu_feat, (unsigned long *) data.feat, KVM_S390_VM_CPU_FEAT_NR_BITS); ret = 0; From 2f8311c912ab67084ce3657096df601c87f49a58 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 16 Nov 2017 12:30:15 +0100 Subject: [PATCH 104/197] KVM: s390: add debug tracing for cpu features of CPU model The cpu model already traces the cpu facilities, the ibc and guest CPU ids. We should do the same for the cpu features (on success only). Signed-off-by: Christian Borntraeger Reviewed-by: Halil Pasic Reviewed-by: Cornelia Huck Reviewed-by: David Hildenbrand --- arch/s390/kvm/kvm-s390.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index f579bf5c262036..d9f94bf241487c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1084,7 +1084,6 @@ static int kvm_s390_set_processor_feat(struct kvm *kvm, struct kvm_device_attr *attr) { struct kvm_s390_vm_cpu_feat data; - int ret = -EBUSY; if (copy_from_user(&data, (void __user *)attr->addr, sizeof(data))) return -EFAULT; @@ -1094,13 +1093,18 @@ static int kvm_s390_set_processor_feat(struct kvm *kvm, return -EINVAL; mutex_lock(&kvm->lock); - if (!kvm->created_vcpus) { - bitmap_copy(kvm->arch.cpu_feat, (unsigned long *) data.feat, - KVM_S390_VM_CPU_FEAT_NR_BITS); - ret = 0; + if (kvm->created_vcpus) { + mutex_unlock(&kvm->lock); + return -EBUSY; } + bitmap_copy(kvm->arch.cpu_feat, (unsigned long *) data.feat, + KVM_S390_VM_CPU_FEAT_NR_BITS); mutex_unlock(&kvm->lock); - return ret; + VM_EVENT(kvm, 3, "SET: guest feat: 0x%16.16llx.0x%16.16llx.0x%16.16llx", + data.feat[0], + data.feat[1], + data.feat[2]); + return 0; } static int kvm_s390_set_processor_subfunc(struct kvm *kvm, @@ -1202,6 +1206,10 @@ static int kvm_s390_get_processor_feat(struct kvm *kvm, KVM_S390_VM_CPU_FEAT_NR_BITS); if (copy_to_user((void __user *)attr->addr, &data, sizeof(data))) return -EFAULT; + VM_EVENT(kvm, 3, "GET: guest feat: 0x%16.16llx.0x%16.16llx.0x%16.16llx", + data.feat[0], + data.feat[1], + data.feat[2]); return 0; } @@ -1215,6 +1223,10 @@ static int kvm_s390_get_machine_feat(struct kvm *kvm, KVM_S390_VM_CPU_FEAT_NR_BITS); if (copy_to_user((void __user *)attr->addr, &data, sizeof(data))) return -EFAULT; + VM_EVENT(kvm, 3, "GET: host feat: 0x%16.16llx.0x%16.16llx.0x%16.16llx", + data.feat[0], + data.feat[1], + data.feat[2]); return 0; } From 588629385c4b96c436cef55dff75eef7f90cd5f0 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Mon, 20 Nov 2017 10:37:30 +0100 Subject: [PATCH 105/197] KVM: s390: drop use of spin lock in __floating_irq_kick It is not required to take to a lock to protect access to the cpuflags of the local interrupt structure of a vcpu as the performed operation is an atomic_or. Signed-off-by: Michael Mueller Reviewed-by: Christian Borntraeger Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 024ad8bcc51655..818aa4248b0f36 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -1569,7 +1569,6 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type) /* make the VCPU drop out of the SIE, or wake it up if sleeping */ li = &dst_vcpu->arch.local_int; - spin_lock(&li->lock); switch (type) { case KVM_S390_MCHK: atomic_or(CPUSTAT_STOP_INT, li->cpuflags); @@ -1581,7 +1580,6 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type) atomic_or(CPUSTAT_EXT_INT, li->cpuflags); break; } - spin_unlock(&li->lock); kvm_s390_vcpu_wakeup(dst_vcpu); } From 1a5c79125afdc16e999c1700b0d3dfef1bef1c84 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 12 Dec 2017 09:29:09 +0100 Subject: [PATCH 106/197] kvm_config: add CONFIG_S390_GUEST make kvmconfig currently does not select CONFIG_S390_GUEST. Since the virtio-ccw transport depends on CONFIG_S390_GUEST, we want to add CONFIG_S390_GUEST to kvmconfig. Signed-off-by: Christian Borntraeger Acked-by: Cornelia Huck --- kernel/configs/kvm_guest.config | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config index 8d964376714293..108fecc20fc148 100644 --- a/kernel/configs/kvm_guest.config +++ b/kernel/configs/kvm_guest.config @@ -18,6 +18,7 @@ CONFIG_VIRTUALIZATION=y CONFIG_HYPERVISOR_GUEST=y CONFIG_PARAVIRT=y CONFIG_KVM_GUEST=y +CONFIG_S390_GUEST=y CONFIG_VIRTIO=y CONFIG_VIRTIO_PCI=y CONFIG_VIRTIO_BLK=y From a9f6c9a92f3771000493f6bbacbd7677b46d8706 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 8 Jan 2018 20:37:47 +0100 Subject: [PATCH 107/197] KVM: s390: cleanup struct kvm_s390_float_interrupt "wq" is not used at all. "cpuflags" can be access directly via the vcpu, just as "float_int" via vcpu->kvm. While at it, reuse _set_cpuflag() to make the code look nicer. Signed-off-by: David Hildenbrand Message-Id: <20180108193747.10818-1-david@redhat.com> Reviewed-by: Cornelia Huck Reviewed-by: Christian Borntraeger Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 3 --- arch/s390/kvm/interrupt.c | 25 +++++++++++-------------- arch/s390/kvm/kvm-s390.c | 3 --- arch/s390/kvm/kvm-s390.h | 2 +- arch/s390/kvm/sigp.c | 12 ++++-------- 5 files changed, 16 insertions(+), 29 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index e14f381757f67b..e16a9f2a44ad69 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -515,9 +515,6 @@ struct kvm_s390_irq_payload { struct kvm_s390_local_interrupt { spinlock_t lock; - struct kvm_s390_float_interrupt *float_int; - struct swait_queue_head *wq; - atomic_t *cpuflags; DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS); struct kvm_s390_irq_payload irq; unsigned long pending_irqs; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 818aa4248b0f36..f8eb2cfa763acd 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -107,12 +107,11 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id) static void sca_clear_ext_call(struct kvm_vcpu *vcpu) { - struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; int rc, expect; if (!kvm_s390_use_sca_entries()) return; - atomic_andnot(CPUSTAT_ECALL_PEND, li->cpuflags); + atomic_andnot(CPUSTAT_ECALL_PEND, &vcpu->arch.sie_block->cpuflags); read_lock(&vcpu->kvm->arch.sca_lock); if (vcpu->kvm->arch.use_esca) { struct esca_block *sca = vcpu->kvm->arch.sca; @@ -279,13 +278,13 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu) static void __set_cpu_idle(struct kvm_vcpu *vcpu) { atomic_or(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags); - set_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask); + set_bit(vcpu->vcpu_id, vcpu->kvm->arch.float_int.idle_mask); } static void __unset_cpu_idle(struct kvm_vcpu *vcpu) { atomic_andnot(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags); - clear_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask); + clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.float_int.idle_mask); } static void __reset_intercept_indicators(struct kvm_vcpu *vcpu) @@ -1228,7 +1227,7 @@ static int __inject_pfault_init(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) li->irq.ext = irq->u.ext; set_bit(IRQ_PEND_PFAULT_INIT, &li->pending_irqs); - atomic_or(CPUSTAT_EXT_INT, li->cpuflags); + __set_cpuflag(vcpu, CPUSTAT_EXT_INT); return 0; } @@ -1253,7 +1252,7 @@ static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) if (test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs)) return -EBUSY; *extcall = irq->u.extcall; - atomic_or(CPUSTAT_EXT_INT, li->cpuflags); + __set_cpuflag(vcpu, CPUSTAT_EXT_INT); return 0; } @@ -1329,7 +1328,7 @@ static int __inject_sigp_emergency(struct kvm_vcpu *vcpu, set_bit(irq->u.emerg.code, li->sigp_emerg_pending); set_bit(IRQ_PEND_EXT_EMERGENCY, &li->pending_irqs); - atomic_or(CPUSTAT_EXT_INT, li->cpuflags); + __set_cpuflag(vcpu, CPUSTAT_EXT_INT); return 0; } @@ -1373,7 +1372,7 @@ static int __inject_ckc(struct kvm_vcpu *vcpu) 0, 0); set_bit(IRQ_PEND_EXT_CLOCK_COMP, &li->pending_irqs); - atomic_or(CPUSTAT_EXT_INT, li->cpuflags); + __set_cpuflag(vcpu, CPUSTAT_EXT_INT); return 0; } @@ -1386,7 +1385,7 @@ static int __inject_cpu_timer(struct kvm_vcpu *vcpu) 0, 0); set_bit(IRQ_PEND_EXT_CPU_TIMER, &li->pending_irqs); - atomic_or(CPUSTAT_EXT_INT, li->cpuflags); + __set_cpuflag(vcpu, CPUSTAT_EXT_INT); return 0; } @@ -1546,7 +1545,6 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) static void __floating_irq_kick(struct kvm *kvm, u64 type) { struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; - struct kvm_s390_local_interrupt *li; struct kvm_vcpu *dst_vcpu; int sigcpu, online_vcpus, nr_tries = 0; @@ -1568,16 +1566,15 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type) dst_vcpu = kvm_get_vcpu(kvm, sigcpu); /* make the VCPU drop out of the SIE, or wake it up if sleeping */ - li = &dst_vcpu->arch.local_int; switch (type) { case KVM_S390_MCHK: - atomic_or(CPUSTAT_STOP_INT, li->cpuflags); + __set_cpuflag(dst_vcpu, CPUSTAT_STOP_INT); break; case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: - atomic_or(CPUSTAT_IO_INT, li->cpuflags); + __set_cpuflag(dst_vcpu, CPUSTAT_IO_INT); break; default: - atomic_or(CPUSTAT_EXT_INT, li->cpuflags); + __set_cpuflag(dst_vcpu, CPUSTAT_EXT_INT); break; } kvm_s390_vcpu_wakeup(dst_vcpu); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d9f94bf241487c..cc5a1b4d74b7da 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2509,9 +2509,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, vcpu->arch.sie_block->icpua = id; spin_lock_init(&vcpu->arch.local_int.lock); - vcpu->arch.local_int.float_int = &kvm->arch.float_int; - vcpu->arch.local_int.wq = &vcpu->wq; - vcpu->arch.local_int.cpuflags = &vcpu->arch.sie_block->cpuflags; seqcount_init(&vcpu->arch.cputm_seqcount); rc = kvm_vcpu_init(vcpu, kvm, id); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 5e46ba429bcb4d..8877116f015955 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -54,7 +54,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu) static inline int is_vcpu_idle(struct kvm_vcpu *vcpu) { - return test_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask); + return test_bit(vcpu->vcpu_id, vcpu->kvm->arch.float_int.idle_mask); } static inline int kvm_is_ucontrol(struct kvm *kvm) diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index c1f5cde2c878e6..5cafd1e2651bc3 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -20,14 +20,11 @@ static int __sigp_sense(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu, u64 *reg) { - struct kvm_s390_local_interrupt *li; int cpuflags; int rc; int ext_call_pending; - li = &dst_vcpu->arch.local_int; - - cpuflags = atomic_read(li->cpuflags); + cpuflags = atomic_read(&dst_vcpu->arch.sie_block->cpuflags); ext_call_pending = kvm_s390_ext_call_pending(dst_vcpu); if (!(cpuflags & CPUSTAT_STOPPED) && !ext_call_pending) rc = SIGP_CC_ORDER_CODE_ACCEPTED; @@ -211,7 +208,7 @@ static int __sigp_store_status_at_addr(struct kvm_vcpu *vcpu, int flags; int rc; - flags = atomic_read(dst_vcpu->arch.local_int.cpuflags); + flags = atomic_read(&dst_vcpu->arch.sie_block->cpuflags); if (!(flags & CPUSTAT_STOPPED)) { *reg &= 0xffffffff00000000UL; *reg |= SIGP_STATUS_INCORRECT_STATE; @@ -231,7 +228,6 @@ static int __sigp_store_status_at_addr(struct kvm_vcpu *vcpu, static int __sigp_sense_running(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu, u64 *reg) { - struct kvm_s390_local_interrupt *li; int rc; if (!test_kvm_facility(vcpu->kvm, 9)) { @@ -240,8 +236,8 @@ static int __sigp_sense_running(struct kvm_vcpu *vcpu, return SIGP_CC_STATUS_STORED; } - li = &dst_vcpu->arch.local_int; - if (atomic_read(li->cpuflags) & CPUSTAT_RUNNING) { + if (atomic_read(&dst_vcpu->arch.sie_block->cpuflags) & + CPUSTAT_RUNNING) { /* running */ rc = SIGP_CC_ORDER_CODE_ACCEPTED; } else { From fa55eedd6328d3072e82218a2346b8752253af2d Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 12 Dec 2017 17:33:01 -0800 Subject: [PATCH 108/197] KVM: X86: Add KVM_VCPU_PREEMPTED MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The next patch will add another bit to the preempted field in kvm_steal_time. Define a constant for bit 0 (the only one that is currently used). Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Peter Zijlstra Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/include/uapi/asm/kvm_para.h | 2 ++ arch/x86/kernel/kvm.c | 2 +- arch/x86/kvm/x86.c | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 09cc06483bed42..15685bd22faa48 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -51,6 +51,8 @@ struct kvm_steal_time { __u32 pad[11]; }; +#define KVM_VCPU_PREEMPTED (1 << 0) + #define KVM_CLOCK_PAIRING_WALLCLOCK 0 struct kvm_clock_pairing { __s64 sec; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index b40ffbf156c181..6610b92fc6a59d 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -643,7 +643,7 @@ __visible bool __kvm_vcpu_is_preempted(long cpu) { struct kvm_steal_time *src = &per_cpu(steal_time, cpu); - return !!src->preempted; + return !!(src->preempted & KVM_VCPU_PREEMPTED); } PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fafaef072f1b5d..897f4795513fb8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2916,7 +2916,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; - vcpu->arch.st.steal.preempted = 1; + vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED; kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal.preempted, From 858a43aae23672d46fe802a41f4748f322965182 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 12 Dec 2017 17:33:02 -0800 Subject: [PATCH 109/197] KVM: X86: use paravirtualized TLB Shootdown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remote TLB flush does a busy wait which is fine in bare-metal scenario. But with-in the guest, the vcpus might have been pre-empted or blocked. In this scenario, the initator vcpu would end up busy-waiting for a long amount of time; it also consumes CPU unnecessarily to wake up the target of the shootdown. This patch set adds support for KVM's new paravirtualized TLB flush; remote TLB flush does not wait for vcpus that are sleeping, instead KVM will flush the TLB as soon as the vCPU starts running again. The improvement is clearly visible when the host is overcommitted; in this case, the PV TLB flush (in addition to avoiding the wait on the main CPU) prevents preempted vCPUs from stealing precious execution time from the running ones. Testing on a Xeon Gold 6142 2.6GHz 2 sockets, 32 cores, 64 threads, so 64 pCPUs, and each VM is 64 vCPUs. ebizzy -M vanilla optimized boost 1VM 46799 48670 4% 2VM 23962 42691 78% 3VM 16152 37539 132% Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Peter Zijlstra Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- Documentation/virtual/kvm/cpuid.txt | 4 +++ arch/x86/include/uapi/asm/kvm_para.h | 2 ++ arch/x86/kernel/kvm.c | 47 ++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+) diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt index 3c65feb8301013..dcab6dc11e3b08 100644 --- a/Documentation/virtual/kvm/cpuid.txt +++ b/Documentation/virtual/kvm/cpuid.txt @@ -54,6 +54,10 @@ KVM_FEATURE_PV_UNHALT || 7 || guest checks this feature bit || || before enabling paravirtualized || || spinlock support. ------------------------------------------------------------------------------ +KVM_FEATURE_PV_TLB_FLUSH || 9 || guest checks this feature bit + || || before enabling paravirtualized + || || tlb flush. +------------------------------------------------------------------------------ KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side || || per-cpu warps are expected in || || kvmclock. diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 15685bd22faa48..7a2ade4aa23538 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -25,6 +25,7 @@ #define KVM_FEATURE_STEAL_TIME 5 #define KVM_FEATURE_PV_EOI 6 #define KVM_FEATURE_PV_UNHALT 7 +#define KVM_FEATURE_PV_TLB_FLUSH 9 /* The last 8 bits are used to indicate how to interpret the flags field * in pvclock structure. If no bits are set, all flags are ignored. @@ -52,6 +53,7 @@ struct kvm_steal_time { }; #define KVM_VCPU_PREEMPTED (1 << 0) +#define KVM_VCPU_FLUSH_TLB (1 << 1) #define KVM_CLOCK_PAIRING_WALLCLOCK 0 struct kvm_clock_pairing { diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 6610b92fc6a59d..4e37d1a851a62d 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -498,6 +498,34 @@ static void __init kvm_apf_trap_init(void) update_intr_gate(X86_TRAP_PF, async_page_fault); } +static DEFINE_PER_CPU(cpumask_var_t, __pv_tlb_mask); + +static void kvm_flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info) +{ + u8 state; + int cpu; + struct kvm_steal_time *src; + struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_tlb_mask); + + cpumask_copy(flushmask, cpumask); + /* + * We have to call flush only on online vCPUs. And + * queue flush_on_enter for pre-empted vCPUs + */ + for_each_cpu(cpu, flushmask) { + src = &per_cpu(steal_time, cpu); + state = READ_ONCE(src->preempted); + if ((state & KVM_VCPU_PREEMPTED)) { + if (try_cmpxchg(&src->preempted, &state, + state | KVM_VCPU_FLUSH_TLB)) + __cpumask_clear_cpu(cpu, flushmask); + } + } + + native_flush_tlb_others(flushmask, info); +} + static void __init kvm_guest_init(void) { int i; @@ -517,6 +545,9 @@ static void __init kvm_guest_init(void) pv_time_ops.steal_clock = kvm_steal_clock; } + if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH)) + pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others; + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) apic_set_eoi_write(kvm_guest_apic_eoi_write); @@ -598,6 +629,22 @@ static __init int activate_jump_labels(void) } arch_initcall(activate_jump_labels); +static __init int kvm_setup_pv_tlb_flush(void) +{ + int cpu; + + if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH)) { + for_each_possible_cpu(cpu) { + zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, cpu), + GFP_KERNEL, cpu_to_node(cpu)); + } + pr_info("KVM setup pv remote TLB flush\n"); + } + + return 0; +} +arch_initcall(kvm_setup_pv_tlb_flush); + #ifdef CONFIG_PARAVIRT_SPINLOCKS /* Kick a cpu by its apicid. Used to wake up a halted vcpu */ From c2ba05ccfde2f069a66c0462e5b5ef8a517dcc9c Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 12 Dec 2017 17:33:03 -0800 Subject: [PATCH 110/197] KVM: X86: introduce invalidate_gpa argument to tlb flush MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a new bool invalidate_gpa argument to kvm_x86_ops->tlb_flush, it will be used by later patches to just flush guest tlb. For VMX, this will use INVVPID instead of INVEPT, which will invalidate combined mappings while keeping guest-physical mappings. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Peter Zijlstra Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm.c | 14 +++++++------- arch/x86/kvm/vmx.c | 21 +++++++++++---------- arch/x86/kvm/x86.c | 6 +++--- 4 files changed, 22 insertions(+), 21 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 340e3604dcc77f..44de261e9223da 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -966,7 +966,7 @@ struct kvm_x86_ops { unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); - void (*tlb_flush)(struct kvm_vcpu *vcpu); + void (*tlb_flush)(struct kvm_vcpu *vcpu, bool invalidate_gpa); void (*run)(struct kvm_vcpu *vcpu); int (*handle_exit)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1f3e7f210374a1..14cca8c601a912 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -285,7 +285,7 @@ static int vgif = true; module_param(vgif, int, 0444); static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); -static void svm_flush_tlb(struct kvm_vcpu *vcpu); +static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa); static void svm_complete_interrupts(struct vcpu_svm *svm); static int nested_svm_exit_handled(struct vcpu_svm *svm); @@ -2035,7 +2035,7 @@ static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) - svm_flush_tlb(vcpu); + svm_flush_tlb(vcpu, true); vcpu->arch.cr4 = cr4; if (!npt_enabled) @@ -2385,7 +2385,7 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, svm->vmcb->control.nested_cr3 = __sme_set(root); mark_dirty(svm->vmcb, VMCB_NPT); - svm_flush_tlb(vcpu); + svm_flush_tlb(vcpu, true); } static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, @@ -2989,7 +2989,7 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; svm->nested.intercept = nested_vmcb->control.intercept; - svm_flush_tlb(&svm->vcpu); + svm_flush_tlb(&svm->vcpu, true); svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) svm->vcpu.arch.hflags |= HF_VINTR_MASK; @@ -4785,7 +4785,7 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) return 0; } -static void svm_flush_tlb(struct kvm_vcpu *vcpu) +static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) { struct vcpu_svm *svm = to_svm(vcpu); @@ -5076,7 +5076,7 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) svm->vmcb->save.cr3 = __sme_set(root); mark_dirty(svm->vmcb, VMCB_CR); - svm_flush_tlb(vcpu); + svm_flush_tlb(vcpu, true); } static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) @@ -5090,7 +5090,7 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) svm->vmcb->save.cr3 = kvm_read_cr3(vcpu); mark_dirty(svm->vmcb, VMCB_CR); - svm_flush_tlb(vcpu); + svm_flush_tlb(vcpu, true); } static int is_disabled(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ef7d13e536a4fd..c1791759f1e644 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4140,9 +4140,10 @@ static void exit_lmode(struct kvm_vcpu *vcpu) #endif -static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid) +static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid, + bool invalidate_gpa) { - if (enable_ept) { + if (enable_ept && (invalidate_gpa || !enable_vpid)) { if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa)); @@ -4151,15 +4152,15 @@ static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid) } } -static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) { - __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid); + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa); } static void vmx_flush_tlb_ept_only(struct kvm_vcpu *vcpu) { if (enable_ept) - vmx_flush_tlb(vcpu); + vmx_flush_tlb(vcpu, true); } static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) @@ -4357,7 +4358,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) ept_load_pdptrs(vcpu); } - vmx_flush_tlb(vcpu); + vmx_flush_tlb(vcpu, true); vmcs_writel(GUEST_CR3, guest_cr3); } @@ -7932,7 +7933,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } - __vmx_flush_tlb(vcpu, vmx->nested.vpid02); + __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true); nested_vmx_succeed(vcpu); return kvm_skip_emulated_instruction(vcpu); @@ -10614,11 +10615,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { vmx->nested.last_vpid = vmcs12->virtual_processor_id; - __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02, true); } } else { vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - vmx_flush_tlb(vcpu); + vmx_flush_tlb(vcpu, true); } } @@ -11323,7 +11324,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * L1's vpid. TODO: move to a more elaborate solution, giving * each L2 its own vpid and exposing the vpid feature to L1. */ - vmx_flush_tlb(vcpu); + vmx_flush_tlb(vcpu, true); } /* Restore posted intr vector. */ if (nested_cpu_has_posted_intr(vmcs12)) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 897f4795513fb8..1e6d9f2d1f0bc5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6781,10 +6781,10 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); } -static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu) +static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) { ++vcpu->stat.tlb_flush; - kvm_x86_ops->tlb_flush(vcpu); + kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa); } void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, @@ -6855,7 +6855,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) kvm_mmu_sync_roots(vcpu); if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) - kvm_vcpu_flush_tlb(vcpu); + kvm_vcpu_flush_tlb(vcpu, true); if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; r = 0; From f38a7b75267f1fb240a8178cbcb16d66dd37aac8 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 12 Dec 2017 17:33:04 -0800 Subject: [PATCH 111/197] KVM: X86: support paravirtualized help for TLB shootdowns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When running on a virtual machine, IPIs are expensive when the target CPU is sleeping. Thus, it is nice to be able to avoid them for TLB shootdowns. KVM can just do the flush via INVVPID on the guest's behalf the next time the CPU is scheduled. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Peter Zijlstra Signed-off-by: Wanpeng Li [Use "&" to test the bit instead of "==". - Paolo] Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/cpuid.c | 3 ++- arch/x86/kvm/x86.c | 19 ++++++++++++------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 3db9e1cd4f63b7..a0e6c975f3a8bb 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -602,7 +602,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, (1 << KVM_FEATURE_ASYNC_PF) | (1 << KVM_FEATURE_PV_EOI) | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) | - (1 << KVM_FEATURE_PV_UNHALT); + (1 << KVM_FEATURE_PV_UNHALT) | + (1 << KVM_FEATURE_PV_TLB_FLUSH); if (sched_info_on()) entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1e6d9f2d1f0bc5..35d0b3de697bc5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2122,6 +2122,12 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu) vcpu->arch.pv_time_enabled = false; } +static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) +{ + ++vcpu->stat.tlb_flush; + kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa); +} + static void record_steal_time(struct kvm_vcpu *vcpu) { if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) @@ -2131,7 +2137,12 @@ static void record_steal_time(struct kvm_vcpu *vcpu) &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) return; - vcpu->arch.st.steal.preempted = 0; + /* + * Doing a TLB flush here, on the guest's behalf, can avoid + * expensive IPIs. + */ + if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB) + kvm_vcpu_flush_tlb(vcpu, false); if (vcpu->arch.st.steal.version & 1) vcpu->arch.st.steal.version += 1; /* first time write, random junk */ @@ -6781,12 +6792,6 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); } -static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) -{ - ++vcpu->stat.tlb_flush; - kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa); -} - void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, unsigned long start, unsigned long end) { From efdab992813fb2ed825745625b83c05032e9cda2 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 13 Dec 2017 10:46:40 +0100 Subject: [PATCH 112/197] KVM: x86: fix escape of guest dr6 to the host MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit syzkaller reported: WARNING: CPU: 0 PID: 12927 at arch/x86/kernel/traps.c:780 do_debug+0x222/0x250 CPU: 0 PID: 12927 Comm: syz-executor Tainted: G OE 4.15.0-rc2+ #16 RIP: 0010:do_debug+0x222/0x250 Call Trace: <#DB> debug+0x3e/0x70 RIP: 0010:copy_user_enhanced_fast_string+0x10/0x20 _copy_from_user+0x5b/0x90 SyS_timer_create+0x33/0x80 entry_SYSCALL_64_fastpath+0x23/0x9a The testcase sets a watchpoint (with perf_event_open) on a buffer that is passed to timer_create() as the struct sigevent argument. In timer_create(), copy_from_user()'s rep movsb triggers the BP. The testcase also sets the debug registers for the guest. However, KVM only restores host debug registers when the host has active watchpoints, which triggers a race condition when running the testcase with multiple threads. The guest's DR6.BS bit can escape to the host before another thread invokes timer_create(), and do_debug() complains. The fix is to respect do_debug()'s dr6 invariant when leaving KVM. Reported-by: Dmitry Vyukov Cc: Paolo Bonzini Cc: Radim Krčmář Cc: David Hildenbrand Cc: Dmitry Vyukov Reviewed-by: David Hildenbrand Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 35d0b3de697bc5..a3dd44bb6f1e61 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2961,6 +2961,12 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) pagefault_enable(); kvm_x86_ops->vcpu_put(vcpu); vcpu->arch.last_host_tsc = rdtsc(); + /* + * If userspace has set any breakpoints or watchpoints, dr6 is restored + * on every vmexit, but if not, we might have a stale dr6 from the + * guest. do_debug expects dr6 to be cleared after it runs, do the same. + */ + set_debugreg(0, 6); } static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, From 476b7adaa3272557168b287175b1e9e943913404 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 13 Dec 2017 13:51:32 +0100 Subject: [PATCH 113/197] KVM: x86: avoid unnecessary XSETBV on guest entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit xsetbv can be expensive when running on nested virtualization, try to avoid it. Reviewed-by: Jim Mattson Reviewed-by: Wanpeng Li Reviewed-by: Quan Xu Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a3dd44bb6f1e61..56d8a1e11e50b0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -702,7 +702,8 @@ static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && !vcpu->guest_xcr0_loaded) { /* kvm_set_xcr() also depends on this */ - xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); + if (vcpu->arch.xcr0 != host_xcr0) + xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); vcpu->guest_xcr0_loaded = 1; } } From 505c9e94d8329dc215617cf99505629e924f001e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Jan 2018 13:10:38 +0100 Subject: [PATCH 114/197] KVM: x86: prefer "depends on" to "select" for SEV MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid reverse dependencies. Instead, SEV will only be enabled if the PSP driver is available. Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/Kconfig | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 148ea324b90c40..92fd433c50b9b5 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -85,9 +85,7 @@ config KVM_AMD_SEV def_bool y bool "AMD Secure Encrypted Virtualization (SEV) support" depends on KVM_AMD && X86_64 - select CRYPTO_DEV_CCP - select CRYPTO_DEV_CCP_DD - select CRYPTO_DEV_SP_PSP + depends on CRYPTO_DEV_CCP && CRYPTO_DEV_CCP_DD && CRYPTO_DEV_SP_PSP ---help--- Provides support for launching Encrypted VMs on AMD processors. From b8d7044bcff7a955257b242515bcf1e5045edd9b Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Wed, 20 Dec 2017 15:29:28 +0800 Subject: [PATCH 115/197] x86/mm: add a function to check if a pfn is UC/UC-/WC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Check whether the PAT memory type of a pfn cannot be overridden by MTRR UC memory type, i.e. the PAT memory type is UC, UC- or WC. This function will be used by KVM to distinguish MMIO pfns and give them UC memory type in the EPT page tables (on Intel processors, EPT memory types work like MTRRs). Signed-off-by: Haozhong Zhang Reviewed-by: Xiao Guangrong Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/include/asm/pat.h | 2 ++ arch/x86/mm/pat.c | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index 8a3ee355b4222e..92015c65fa2ac9 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -22,4 +22,6 @@ int io_reserve_memtype(resource_size_t start, resource_size_t end, void io_free_memtype(resource_size_t start, resource_size_t end); +bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn); + #endif /* _ASM_X86_PAT_H */ diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index fe7d57a8fb6003..1555bd7d34493f 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -677,6 +677,25 @@ static enum page_cache_mode lookup_memtype(u64 paddr) return rettype; } +/** + * pat_pfn_immune_to_uc_mtrr - Check whether the PAT memory type + * of @pfn cannot be overridden by UC MTRR memory type. + * + * Only to be called when PAT is enabled. + * + * Returns true, if the PAT memory type of @pfn is UC, UC-, or WC. + * Returns false in other cases. + */ +bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn) +{ + enum page_cache_mode cm = lookup_memtype(PFN_PHYS(pfn)); + + return cm == _PAGE_CACHE_MODE_UC || + cm == _PAGE_CACHE_MODE_UC_MINUS || + cm == _PAGE_CACHE_MODE_WC; +} +EXPORT_SYMBOL_GPL(pat_pfn_immune_to_uc_mtrr); + /** * io_reserve_memtype - Request a memory type mapping for a region of memory * @start: start (physical address) of the region From aa2e063aea7961fb58eafecb924f8818e3d4ecfc Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Wed, 20 Dec 2017 15:29:29 +0800 Subject: [PATCH 116/197] KVM: MMU: consider host cache mode in MMIO page check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some reserved pages, such as those from NVDIMM DAX devices, are not for MMIO, and can be mapped with cached memory type for better performance. However, the above check misconceives those pages as MMIO. Because KVM maps MMIO pages with UC memory type, the performance of guest accesses to those pages would be harmed. Therefore, we check the host memory type in addition and only treat UC/UC-/WC pages as MMIO. Signed-off-by: Haozhong Zhang Reported-by: Cuevas Escareno, Ivan D Reported-by: Kumar, Karthik Reviewed-by: Xiao Guangrong Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/mmu.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ff1e9ee259cf7c..1f1da400fcdeff 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -42,6 +42,7 @@ #include #include +#include #include #include #include @@ -2708,7 +2709,18 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) { if (pfn_valid(pfn)) - return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)); + return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) && + /* + * Some reserved pages, such as those from NVDIMM + * DAX devices, are not for MMIO, and can be mapped + * with cached memory type for better performance. + * However, the above check misconceives those pages + * as MMIO, and results in KVM mapping them with UC + * memory type, which would hurt the performance. + * Therefore, we check the host memory type in addition + * and only treat UC/UC-/WC pages as MMIO. + */ + (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn)); return true; } From a6cb099a43314ddb2b0214c122f61288d3d6c0ba Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 20 Dec 2017 12:50:28 +0100 Subject: [PATCH 117/197] kvm/vmx: Use local vmx variable in vmx_get_msr() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ... just like in vmx_set_msr(). No functionality change. Signed-off-by: Borislav Petkov Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7b1d71665bc2fb..14976cd4e0c102 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3248,6 +3248,7 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, */ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { + struct vcpu_vmx *vmx = to_vmx(vcpu); struct shared_msr_entry *msr; switch (msr_info->index) { @@ -3259,8 +3260,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vmcs_readl(GUEST_GS_BASE); break; case MSR_KERNEL_GS_BASE: - vmx_load_host_state(to_vmx(vcpu)); - msr_info->data = to_vmx(vcpu)->msr_guest_kernel_gs_base; + vmx_load_host_state(vmx); + msr_info->data = vmx->msr_guest_kernel_gs_base; break; #endif case MSR_EFER: @@ -3286,13 +3287,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_MCG_EXT_CTL: if (!msr_info->host_initiated && - !(to_vmx(vcpu)->msr_ia32_feature_control & + !(vmx->msr_ia32_feature_control & FEATURE_CONTROL_LMCE)) return 1; msr_info->data = vcpu->arch.mcg_ext_ctl; break; case MSR_IA32_FEATURE_CONTROL: - msr_info->data = to_vmx(vcpu)->msr_ia32_feature_control; + msr_info->data = vmx->msr_ia32_feature_control; break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: if (!nested_vmx_allowed(vcpu)) @@ -3309,7 +3310,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; /* Otherwise falls through */ default: - msr = find_msr_entry(to_vmx(vcpu), msr_info->index); + msr = find_msr_entry(vmx, msr_info->index); if (msr) { msr_info->data = msr->data; break; From 5c7d4f9ad39d980728b39752304ce10bb2960cbf Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sun, 19 Nov 2017 18:25:43 +0200 Subject: [PATCH 118/197] KVM: nVMX: Fix bug of injecting L2 exception into L1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kvm_clear_exception_queue() should clear pending exception. This also includes exceptions which were only marked pending but not yet injected. This is because exception.pending is used for both L1 and L2 to determine if an exception should be raised to guest. Note that an exception which is pending but not yet injected will be raised again once the guest will be resumed. Consider the following scenario: 1) L0 KVM with ignore_msrs=false. 2) L1 prepare vmcs12 with the following: a) No intercepts on MSR (MSR_BITMAP exist and is filled with 0). b) No intercept for #GP. c) vmx-preemption-timer is configured. 3) L1 enters into L2. 4) L2 reads an unhandled MSR that exists in MSR_BITMAP (such as 0x1fff). L2 RDMSR could be handled as described below: 1) L2 exits to L0 on RDMSR and calls handle_rdmsr(). 2) handle_rdmsr() calls kvm_inject_gp() which sets KVM_REQ_EVENT, exception.pending=true and exception.injected=false. 3) vcpu_enter_guest() consumes KVM_REQ_EVENT and calls inject_pending_event() which calls vmx_check_nested_events() which sees that exception.pending=true but nested_vmx_check_exception() returns 0 and therefore does nothing at this point. However let's assume it later sees vmx-preemption-timer expired and therefore exits from L2 to L1 by calling nested_vmx_vmexit(). 4) nested_vmx_vmexit() calls prepare_vmcs12() which calls vmcs12_save_pending_event() but it does nothing as exception.injected is false. Also prepare_vmcs12() calls kvm_clear_exception_queue() which does nothing as exception.injected is already false. 5) We now return from vmx_check_nested_events() with 0 while still having exception.pending=true! 6) Therefore inject_pending_event() continues and we inject L2 exception to L1!... This commit will fix above issue by changing step (4) to clear exception.pending in kvm_clear_exception_queue(). Fixes: 664f8e26b00c ("KVM: X86: Fix loss of exception which has not yet been injected") Signed-off-by: Liran Alon Reviewed-by: Nikita Leshenko Reviewed-by: Krish Sadhukhan Signed-off-by: Krish Sadhukhan Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 1 - arch/x86/kvm/x86.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 14976cd4e0c102..866c867b558a6e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11052,7 +11052,6 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) if (block_nested_events) return -EBUSY; nested_vmx_inject_exception_vmexit(vcpu, exit_qual); - vcpu->arch.exception.pending = false; return 0; } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index c69f973111cb26..b91215d1fd80d6 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -12,6 +12,7 @@ static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) { + vcpu->arch.exception.pending = false; vcpu->arch.exception.injected = false; } From fa59cc003804f7606b3d0aa8e16509f9103f7377 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sun, 24 Dec 2017 18:12:53 +0200 Subject: [PATCH 119/197] KVM: x86: Optimization: Create SVM stubs for sync_pir_to_irr() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sync_pir_to_irr() is only called if vcpu->arch.apicv_active()==true. In case it is false, VMX code make sure to set sync_pir_to_irr to NULL. Therefore, having SVM stubs allows to remove check for if sync_pir_to_irr != NULL from all calling sites. Signed-off-by: Liran Alon Suggested-by: Paolo Bonzini Reviewed-by: Nikita Leshenko Reviewed-by: Liam Merwick [Return highest IRR in the SVM case. - Paolo] Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/svm.c | 1 + arch/x86/kvm/x86.c | 10 ++++------ 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e2c1fb8d35cea2..0928608750e371 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -581,7 +581,7 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr) { int highest_irr; - if (kvm_x86_ops->sync_pir_to_irr && apic->vcpu->arch.apicv_active) + if (apic->vcpu->arch.apicv_active) highest_irr = kvm_x86_ops->sync_pir_to_irr(apic->vcpu); else highest_irr = apic_find_highest_irr(apic); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 5d83f0474020c5..b613d331d0310e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6740,6 +6740,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .load_eoi_exitmap = svm_load_eoi_exitmap, .hwapic_irr_update = svm_hwapic_irr_update, .hwapic_isr_update = svm_hwapic_isr_update, + .sync_pir_to_irr = kvm_lapic_find_highest_irr, .apicv_post_state_restore = avic_post_state_restore, .set_tss_addr = svm_set_tss_addr, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6d2e1459adc979..0204b2b8a293f1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2973,7 +2973,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active) + if (vcpu->arch.apicv_active) kvm_x86_ops->sync_pir_to_irr(vcpu); return kvm_apic_get_state(vcpu, s); @@ -6820,7 +6820,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) if (irqchip_split(vcpu->kvm)) kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); else { - if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active) + if (vcpu->arch.apicv_active) kvm_x86_ops->sync_pir_to_irr(vcpu); kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); } @@ -7046,10 +7046,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * This handles the case where a posted interrupt was * notified with kvm_vcpu_kick. */ - if (kvm_lapic_enabled(vcpu)) { - if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active) - kvm_x86_ops->sync_pir_to_irr(vcpu); - } + if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active) + kvm_x86_ops->sync_pir_to_irr(vcpu); if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) || need_resched() || signal_pending(current)) { From e7387b0e27ec3a203bda4910dc4a107f6c5f912f Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sun, 24 Dec 2017 18:12:54 +0200 Subject: [PATCH 120/197] KVM: x86: Change __kvm_apic_update_irr() to also return if max IRR updated MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit doesn't change semantics. It is done as a preparation for future commits. Signed-off-by: Liran Alon Reviewed-by: Nikita Leshenko Reviewed-by: Liam Merwick Signed-off-by: Liam Merwick Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/lapic.c | 23 ++++++++++++++++------- arch/x86/kvm/lapic.h | 4 ++-- arch/x86/kvm/vmx.c | 5 +++-- 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0928608750e371..924ac8ce9d5004 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -364,32 +364,41 @@ static u8 count_vectors(void *bitmap) return count; } -int __kvm_apic_update_irr(u32 *pir, void *regs) +bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr) { u32 i, vec; - u32 pir_val, irr_val; - int max_irr = -1; + u32 pir_val, irr_val, prev_irr_val; + int max_updated_irr; + + max_updated_irr = -1; + *max_irr = -1; for (i = vec = 0; i <= 7; i++, vec += 32) { pir_val = READ_ONCE(pir[i]); irr_val = *((u32 *)(regs + APIC_IRR + i * 0x10)); if (pir_val) { + prev_irr_val = irr_val; irr_val |= xchg(&pir[i], 0); *((u32 *)(regs + APIC_IRR + i * 0x10)) = irr_val; + if (prev_irr_val != irr_val) { + max_updated_irr = + __fls(irr_val ^ prev_irr_val) + vec; + } } if (irr_val) - max_irr = __fls(irr_val) + vec; + *max_irr = __fls(irr_val) + vec; } - return max_irr; + return ((max_updated_irr != -1) && + (max_updated_irr == *max_irr)); } EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); -int kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) +bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr) { struct kvm_lapic *apic = vcpu->arch.apic; - return __kvm_apic_update_irr(pir, apic->regs); + return __kvm_apic_update_irr(pir, apic->regs, max_irr); } EXPORT_SYMBOL_GPL(kvm_apic_update_irr); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 4b9935a3834792..56c36014f7b760 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -75,8 +75,8 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode); -int __kvm_apic_update_irr(u32 *pir, void *regs); -int kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); +bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr); +bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr); void kvm_apic_update_ppr(struct kvm_vcpu *vcpu); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, struct dest_map *dest_map); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 866c867b558a6e..5ea482bb1b9c10 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5085,7 +5085,8 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); if (max_irr != 256) { vapic_page = kmap(vmx->nested.virtual_apic_page); - __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); + __kvm_apic_update_irr(vmx->nested.pi_desc->pir, + vapic_page, &max_irr); kunmap(vmx->nested.virtual_apic_page); status = vmcs_read16(GUEST_INTR_STATUS); @@ -8986,7 +8987,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) * But on x86 this is just a compiler barrier anyway. */ smp_mb__after_atomic(); - max_irr = kvm_apic_update_irr(vcpu, vmx->pi_desc.pir); + kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); } else { max_irr = kvm_lapic_find_highest_irr(vcpu); } From f27a85c4988d408a2918a3bcbc3d7fe4fb2dc2b3 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sun, 24 Dec 2017 18:12:55 +0200 Subject: [PATCH 121/197] KVM: nVMX: Re-evaluate L1 pending events when running L2 and L1 got posted-interrupt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In case posted-interrupt was delivered to CPU while it is in host (outside guest), then posted-interrupt delivery will be done by calling sync_pir_to_irr() at vmentry after interrupts are disabled. sync_pir_to_irr() will check vmx->pi_desc.control ON bit and if set, it will sync vmx->pi_desc.pir to IRR and afterwards update RVI to ensure virtual-interrupt-delivery will dispatch interrupt to guest. However, it is possible that L1 will receive a posted-interrupt while CPU runs at host and is about to enter L2. In this case, the call to sync_pir_to_irr() will indeed update the L1's APIC IRR but vcpu_enter_guest() will then just resume into L2 guest without re-evaluating if it should exit from L2 to L1 as a result of this new pending L1 event. To address this case, if sync_pir_to_irr() has a new L1 injectable interrupt and CPU is running L2, we force exit GUEST_MODE which will result in another iteration of vcpu_run() run loop which will call kvm_vcpu_running() which will call check_nested_events() which will handle the pending L1 event properly. Signed-off-by: Liran Alon Reviewed-by: Nikita Leshenko Reviewed-by: Krish Sadhukhan Reviewed-by: Liam Merwick Signed-off-by: Liam Merwick Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5ea482bb1b9c10..5fe94e375d2d5d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8978,6 +8978,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); int max_irr; + bool max_irr_updated; WARN_ON(!vcpu->arch.apicv_active); if (pi_test_on(&vmx->pi_desc)) { @@ -8987,7 +8988,16 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) * But on x86 this is just a compiler barrier anyway. */ smp_mb__after_atomic(); - kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); + max_irr_updated = + kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); + + /* + * If we are running L2 and L1 has a new pending interrupt + * which can be injected, we should re-evaluate + * what should be done with this new L1 interrupt. + */ + if (is_guest_mode(vcpu) && max_irr_updated) + kvm_vcpu_exiting_guest_mode(vcpu); } else { max_irr = kvm_lapic_find_highest_irr(vcpu); } From 851c1a18c5412fd321e387cfe60739387cdbf37d Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sun, 24 Dec 2017 18:12:56 +0200 Subject: [PATCH 122/197] KVM: nVMX: Fix injection to L2 when L1 don't intercept external-interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before each vmentry to guest, vcpu_enter_guest() calls sync_pir_to_irr() which calls vmx_hwapic_irr_update() to update RVI. Currently, vmx_hwapic_irr_update() contains a tweak in case it is called when CPU is running L2 and L1 don't intercept external-interrupts. In that case, code injects interrupt directly into L2 instead of updating RVI. Besides being hacky (wouldn't expect function updating RVI to also inject interrupt), it also doesn't handle this case correctly. The code contains several issues: 1. When code calls kvm_queue_interrupt() it just passes it max_irr which represents the highest IRR currently pending in L1 LAPIC. This is problematic as interrupt was injected to guest but it's bit is still set in LAPIC IRR instead of being cleared from IRR and set in ISR. 2. Code doesn't check if LAPIC PPR is set to accept an interrupt of max_irr priority. It just checks if interrupts are enabled in guest with vmx_interrupt_allowed(). To fix the above issues: 1. Simplify vmx_hwapic_irr_update() to just update RVI. Note that this shouldn't happen when CPU is running L2 (See comment in code). 2. Since now vmx_hwapic_irr_update() only does logic for L1 virtual-interrupt-delivery, inject_pending_event() should be the one responsible for injecting the interrupt directly into L2. Therefore, change kvm_cpu_has_injectable_intr() to check L1 LAPIC when CPU is running L2. 3. Change vmx_sync_pir_to_irr() to set KVM_REQ_EVENT when L1 has a pending injectable interrupt. Fixes: 963fee165660 ("KVM: nVMX: Fix virtual interrupt delivery injection") Signed-off-by: Liran Alon Reviewed-by: Nikita Leshenko Reviewed-by: Krish Sadhukhan Reviewed-by: Liam Merwick Signed-off-by: Liam Merwick Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/irq.c | 2 +- arch/x86/kvm/vmx.c | 41 +++++++++++++++++------------------------ 2 files changed, 18 insertions(+), 25 deletions(-) diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 5c24811e8b0bca..f171051eecf347 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -79,7 +79,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) if (kvm_cpu_has_extint(v)) return 1; - if (kvm_vcpu_apicv_active(v)) + if (!is_guest_mode(v) && kvm_vcpu_apicv_active(v)) return 0; return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5fe94e375d2d5d..2a96b1abe77c8d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8948,30 +8948,16 @@ static void vmx_set_rvi(int vector) static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) { - if (!is_guest_mode(vcpu)) { - vmx_set_rvi(max_irr); - return; - } - - if (max_irr == -1) - return; - /* - * In guest mode. If a vmexit is needed, vmx_check_nested_events - * handles it. + * When running L2, updating RVI is only relevant when + * vmcs12 virtual-interrupt-delivery enabled. + * However, it can be enabled only when L1 also + * intercepts external-interrupts and in that case + * we should not update vmcs02 RVI but instead intercept + * interrupt. Therefore, do nothing when running L2. */ - if (nested_exit_on_intr(vcpu)) - return; - - /* - * Else, fall back to pre-APICv interrupt injection since L2 - * is run without virtual interrupt delivery. - */ - if (!kvm_event_needs_reinjection(vcpu) && - vmx_interrupt_allowed(vcpu)) { - kvm_queue_interrupt(vcpu, max_irr, false); - vmx_inject_irq(vcpu); - } + if (!is_guest_mode(vcpu)) + vmx_set_rvi(max_irr); } static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) @@ -8995,9 +8981,16 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) * If we are running L2 and L1 has a new pending interrupt * which can be injected, we should re-evaluate * what should be done with this new L1 interrupt. + * If L1 intercepts external-interrupts, we should + * exit from L2 to L1. Otherwise, interrupt should be + * delivered directly to L2. */ - if (is_guest_mode(vcpu) && max_irr_updated) - kvm_vcpu_exiting_guest_mode(vcpu); + if (is_guest_mode(vcpu) && max_irr_updated) { + if (nested_exit_on_intr(vcpu)) + kvm_vcpu_exiting_guest_mode(vcpu); + else + kvm_make_request(KVM_REQ_EVENT, vcpu); + } } else { max_irr = kvm_lapic_find_highest_irr(vcpu); } From 6b6977117f50d60455ace86b2d256f6fb4f3de05 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Thu, 9 Nov 2017 20:27:20 +0200 Subject: [PATCH 123/197] KVM: nVMX: Fix races when sending nested PI while dest enters/leaves L2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Consider the following scenario: 1. CPU A calls vmx_deliver_nested_posted_interrupt() to send an IPI to CPU B via virtual posted-interrupt mechanism. 2. CPU B is currently executing L2 guest. 3. vmx_deliver_nested_posted_interrupt() calls kvm_vcpu_trigger_posted_interrupt() which will note that vcpu->mode == IN_GUEST_MODE. 4. Assume that before CPU A sends the physical POSTED_INTR_NESTED_VECTOR IPI, CPU B exits from L2 to L0 during event-delivery (valid IDT-vectoring-info). 5. CPU A now sends the physical IPI. The IPI is received in host and it's handler (smp_kvm_posted_intr_nested_ipi()) does nothing. 6. Assume that before CPU A sets pi_pending=true and KVM_REQ_EVENT, CPU B continues to run in L0 and reach vcpu_enter_guest(). As KVM_REQ_EVENT is not set yet, vcpu_enter_guest() will continue and resume L2 guest. 7. At this point, CPU A sets pi_pending=true and KVM_REQ_EVENT but it's too late! CPU B already entered L2 and KVM_REQ_EVENT will only be consumed at next L2 entry! Another scenario to consider: 1. CPU A calls vmx_deliver_nested_posted_interrupt() to send an IPI to CPU B via virtual posted-interrupt mechanism. 2. Assume that before CPU A calls kvm_vcpu_trigger_posted_interrupt(), CPU B is at L0 and is about to resume into L2. Further assume that it is in vcpu_enter_guest() after check for KVM_REQ_EVENT. 3. At this point, CPU A calls kvm_vcpu_trigger_posted_interrupt() which will note that vcpu->mode != IN_GUEST_MODE. Therefore, do nothing and return false. Then, will set pi_pending=true and KVM_REQ_EVENT. 4. Now CPU B continue and resumes into L2 guest without processing the posted-interrupt until next L2 entry! To fix both issues, we just need to change vmx_deliver_nested_posted_interrupt() to set pi_pending=true and KVM_REQ_EVENT before calling kvm_vcpu_trigger_posted_interrupt(). It will fix the first scenario by chaging step (6) to note that KVM_REQ_EVENT and pi_pending=true and therefore process nested posted-interrupt. It will fix the second scenario by two possible ways: 1. If kvm_vcpu_trigger_posted_interrupt() is called while CPU B has changed vcpu->mode to IN_GUEST_MODE, physical IPI will be sent and will be received when CPU resumes into L2. 2. If kvm_vcpu_trigger_posted_interrupt() is called while CPU B hasn't yet changed vcpu->mode to IN_GUEST_MODE, then after CPU B will change vcpu->mode it will call kvm_request_pending() which will return true and therefore force another round of vcpu_enter_guest() which will note that KVM_REQ_EVENT and pi_pending=true and therefore process nested posted-interrupt. Cc: stable@vger.kernel.org Fixes: 705699a13994 ("KVM: nVMX: Enable nested posted interrupt processing") Signed-off-by: Liran Alon Reviewed-by: Nikita Leshenko Reviewed-by: Krish Sadhukhan [Add kvm_vcpu_kick to also handle the case where L1 doesn't intercept L2 HLT and L2 executes HLT instruction. - Paolo] Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2a96b1abe77c8d..2c8749ee3221de 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5146,14 +5146,15 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, if (is_guest_mode(vcpu) && vector == vmx->nested.posted_intr_nv) { - /* the PIR and ON have been set by L1. */ - kvm_vcpu_trigger_posted_interrupt(vcpu, true); /* * If a posted intr is not recognized by hardware, * we will accomplish it in the next vmentry. */ vmx->nested.pi_pending = true; kvm_make_request(KVM_REQ_EVENT, vcpu); + /* the PIR and ON have been set by L1. */ + if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) + kvm_vcpu_kick(vcpu); return 0; } return -1; From c5d167b27e00026711ad19a33a23d5d3d562148a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 13 Dec 2017 11:05:19 +0100 Subject: [PATCH 124/197] KVM: vmx: shadow more fields that are read/written on every vmexits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compared to when VMCS shadowing was added to KVM, we are reading/writing a few more fields: the PML index, the interrupt status and the preemption timer value. The first two are because we are exposing more features to nested guests, the preemption timer is simply because we have grown a new optimization. Adding them to the shadow VMCS field lists reduces the cost of a vmexit by about 1000 clock cycles for each field that exists on bare metal. On the other hand, the guest BNDCFGS and TSC offset are not written on fast paths, so remove them. Suggested-by: Jim Mattson Cc: Jim Mattson Cc: Wanpeng Li Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2c8749ee3221de..7b114a293c6633 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -726,11 +726,12 @@ static unsigned long shadow_read_write_fields[] = { GUEST_CS_LIMIT, GUEST_CS_BASE, GUEST_ES_BASE, - GUEST_BNDCFGS, + GUEST_PML_INDEX, + GUEST_INTR_STATUS, + VMX_PREEMPTION_TIMER_VALUE, CR0_GUEST_HOST_MASK, CR0_READ_SHADOW, CR4_READ_SHADOW, - TSC_OFFSET, EXCEPTION_BITMAP, CPU_BASED_VM_EXEC_CONTROL, VM_ENTRY_EXCEPTION_ERROR_CODE, @@ -3903,9 +3904,22 @@ static void init_vmcs_shadow_fields(void) /* No checks for read only fields yet */ for (i = j = 0; i < max_shadow_read_write_fields; i++) { + /* + * PML and the preemption timer can be emulated, but the + * processor cannot vmwrite to fields that don't exist + * on bare metal. + */ switch (shadow_read_write_fields[i]) { - case GUEST_BNDCFGS: - if (!kvm_mpx_supported()) + case GUEST_PML_INDEX: + if (!cpu_has_vmx_pml()) + continue; + break; + case VMX_PREEMPTION_TIMER_VALUE: + if (!cpu_has_vmx_preemption_timer()) + continue; + break; + case GUEST_INTR_STATUS: + if (!cpu_has_vmx_apicv()) continue; break; default: @@ -6802,11 +6816,6 @@ static __init int hardware_setup(void) !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) enable_vpid = 0; - if (!cpu_has_vmx_shadow_vmcs()) - enable_shadow_vmcs = 0; - if (enable_shadow_vmcs) - init_vmcs_shadow_fields(); - if (!cpu_has_vmx_ept() || !cpu_has_vmx_ept_4levels() || !cpu_has_vmx_ept_mt_wb() || @@ -6926,6 +6935,11 @@ static __init int hardware_setup(void) kvm_x86_ops->cancel_hv_timer = NULL; } + if (!cpu_has_vmx_shadow_vmcs()) + enable_shadow_vmcs = 0; + if (enable_shadow_vmcs) + init_vmcs_shadow_fields(); + kvm_set_posted_intr_wakeup_handler(wakeup_handler); kvm_mce_cap_supported |= MCG_LMCE_P; From 44900ba65e16ab3c6608e105654f38f54d030caa Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 13 Dec 2017 12:58:02 +0100 Subject: [PATCH 125/197] KVM: VMX: optimize shadow VMCS copying MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Because all fields can be read/written with a single vmread/vmwrite on 64-bit kernels, the switch statements in copy_vmcs12_to_shadow and copy_shadow_to_vmcs12 are unnecessary. What I did in this patch is to copy the two parts of 64-bit fields separately on 32-bit kernels, to keep all complicated #ifdef-ery in init_vmcs_shadow_fields. The disadvantage is that 64-bit fields have to be listed separately in shadow_read_only/read_write_fields, but those are few and we can validate the arrays when building the VMREAD and VMWRITE bitmaps. This saves a few hundred clock cycles per nested vmexit. However there is still a "switch" in vmcs_read_any and vmcs_write_any. So, while at it, this patch reorders the fields by type, hoping that the branch predictor appreciates it. Cc: Jim Mattson Cc: Wanpeng Li Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 143 +++++++++++++++++++++------------------------ 1 file changed, 65 insertions(+), 78 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7b114a293c6633..bdd3f155efae0c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -686,7 +686,7 @@ static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) [number##_HIGH] = VMCS12_OFFSET(name)+4 -static unsigned long shadow_read_only_fields[] = { +static u16 shadow_read_only_fields[] = { /* * We do NOT shadow fields that are modified when L0 * traps and emulates any vmx instruction (e.g. VMPTRLD, @@ -699,49 +699,59 @@ static unsigned long shadow_read_only_fields[] = { * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified * by nested_vmx_failValid) */ + /* 32-bits */ VM_EXIT_REASON, VM_EXIT_INTR_INFO, VM_EXIT_INSTRUCTION_LEN, IDT_VECTORING_INFO_FIELD, IDT_VECTORING_ERROR_CODE, VM_EXIT_INTR_ERROR_CODE, + + /* Natural width */ EXIT_QUALIFICATION, GUEST_LINEAR_ADDRESS, - GUEST_PHYSICAL_ADDRESS + + /* 64-bit */ + GUEST_PHYSICAL_ADDRESS, + GUEST_PHYSICAL_ADDRESS_HIGH, }; static int max_shadow_read_only_fields = ARRAY_SIZE(shadow_read_only_fields); -static unsigned long shadow_read_write_fields[] = { +static u16 shadow_read_write_fields[] = { + /* 16-bits */ + GUEST_CS_SELECTOR, + GUEST_INTR_STATUS, + GUEST_PML_INDEX, + HOST_FS_SELECTOR, + HOST_GS_SELECTOR, + + /* 32-bits */ + CPU_BASED_VM_EXEC_CONTROL, + EXCEPTION_BITMAP, + VM_ENTRY_EXCEPTION_ERROR_CODE, + VM_ENTRY_INTR_INFO_FIELD, + VM_ENTRY_INSTRUCTION_LEN, TPR_THRESHOLD, + GUEST_CS_LIMIT, + GUEST_CS_AR_BYTES, + GUEST_INTERRUPTIBILITY_INFO, + VMX_PREEMPTION_TIMER_VALUE, + + /* Natural width */ GUEST_RIP, GUEST_RSP, GUEST_CR0, GUEST_CR3, GUEST_CR4, - GUEST_INTERRUPTIBILITY_INFO, GUEST_RFLAGS, - GUEST_CS_SELECTOR, - GUEST_CS_AR_BYTES, - GUEST_CS_LIMIT, GUEST_CS_BASE, GUEST_ES_BASE, - GUEST_PML_INDEX, - GUEST_INTR_STATUS, - VMX_PREEMPTION_TIMER_VALUE, CR0_GUEST_HOST_MASK, CR0_READ_SHADOW, CR4_READ_SHADOW, - EXCEPTION_BITMAP, - CPU_BASED_VM_EXEC_CONTROL, - VM_ENTRY_EXCEPTION_ERROR_CODE, - VM_ENTRY_INTR_INFO_FIELD, - VM_ENTRY_INSTRUCTION_LEN, - VM_ENTRY_EXCEPTION_ERROR_CODE, HOST_FS_BASE, HOST_GS_BASE, - HOST_FS_SELECTOR, - HOST_GS_SELECTOR }; static int max_shadow_read_write_fields = ARRAY_SIZE(shadow_read_write_fields); @@ -3901,15 +3911,39 @@ static void init_vmcs_shadow_fields(void) { int i, j; - /* No checks for read only fields yet */ + for (i = j = 0; i < max_shadow_read_only_fields; i++) { + u16 field = shadow_read_only_fields[i]; + if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64 && + (i + 1 == max_shadow_read_only_fields || + shadow_read_only_fields[i + 1] != field + 1)) + pr_err("Missing field from shadow_read_only_field %x\n", + field + 1); + + clear_bit(field, vmx_vmread_bitmap); +#ifdef CONFIG_X86_64 + if (field & 1) + continue; +#endif + if (j < i) + shadow_read_only_fields[j] = field; + j++; + } + max_shadow_read_only_fields = j; for (i = j = 0; i < max_shadow_read_write_fields; i++) { + u16 field = shadow_read_write_fields[i]; + if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64 && + (i + 1 == max_shadow_read_write_fields || + shadow_read_write_fields[i + 1] != field + 1)) + pr_err("Missing field from shadow_read_write_field %x\n", + field + 1); + /* * PML and the preemption timer can be emulated, but the * processor cannot vmwrite to fields that don't exist * on bare metal. */ - switch (shadow_read_write_fields[i]) { + switch (field) { case GUEST_PML_INDEX: if (!cpu_has_vmx_pml()) continue; @@ -3926,31 +3960,17 @@ static void init_vmcs_shadow_fields(void) break; } + clear_bit(field, vmx_vmwrite_bitmap); + clear_bit(field, vmx_vmread_bitmap); +#ifdef CONFIG_X86_64 + if (field & 1) + continue; +#endif if (j < i) - shadow_read_write_fields[j] = - shadow_read_write_fields[i]; + shadow_read_write_fields[j] = field; j++; } max_shadow_read_write_fields = j; - - /* shadowed fields guest access without vmexit */ - for (i = 0; i < max_shadow_read_write_fields; i++) { - unsigned long field = shadow_read_write_fields[i]; - - clear_bit(field, vmx_vmwrite_bitmap); - clear_bit(field, vmx_vmread_bitmap); - if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64) { - clear_bit(field + 1, vmx_vmwrite_bitmap); - clear_bit(field + 1, vmx_vmread_bitmap); - } - } - for (i = 0; i < max_shadow_read_only_fields; i++) { - unsigned long field = shadow_read_only_fields[i]; - - clear_bit(field, vmx_vmread_bitmap); - if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64) - clear_bit(field + 1, vmx_vmread_bitmap); - } } static __init int alloc_kvm_area(void) @@ -7538,7 +7558,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) unsigned long field; u64 field_value; struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; - const unsigned long *fields = shadow_read_write_fields; + const u16 *fields = shadow_read_write_fields; const int num_fields = max_shadow_read_write_fields; preempt_disable(); @@ -7547,23 +7567,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) for (i = 0; i < num_fields; i++) { field = fields[i]; - switch (vmcs_field_type(field)) { - case VMCS_FIELD_TYPE_U16: - field_value = vmcs_read16(field); - break; - case VMCS_FIELD_TYPE_U32: - field_value = vmcs_read32(field); - break; - case VMCS_FIELD_TYPE_U64: - field_value = vmcs_read64(field); - break; - case VMCS_FIELD_TYPE_NATURAL_WIDTH: - field_value = vmcs_readl(field); - break; - default: - WARN_ON(1); - continue; - } + field_value = __vmcs_readl(field); vmcs12_write_any(&vmx->vcpu, field, field_value); } @@ -7575,7 +7579,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) { - const unsigned long *fields[] = { + const u16 *fields[] = { shadow_read_write_fields, shadow_read_only_fields }; @@ -7594,24 +7598,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) for (i = 0; i < max_fields[q]; i++) { field = fields[q][i]; vmcs12_read_any(&vmx->vcpu, field, &field_value); - - switch (vmcs_field_type(field)) { - case VMCS_FIELD_TYPE_U16: - vmcs_write16(field, (u16)field_value); - break; - case VMCS_FIELD_TYPE_U32: - vmcs_write32(field, (u32)field_value); - break; - case VMCS_FIELD_TYPE_U64: - vmcs_write64(field, (u64)field_value); - break; - case VMCS_FIELD_TYPE_NATURAL_WIDTH: - vmcs_writel(field, (long)field_value); - break; - default: - WARN_ON(1); - break; - } + __vmcs_writel(field, field_value); } } From 5b15706dbf5bdf0c9708ca9bb1f37e95e315daf2 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 22 Dec 2017 12:11:12 -0800 Subject: [PATCH 126/197] kvm: vmx: Introduce VMCS12_MAX_FIELD_INDEX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is the highest index value used in any supported VMCS12 field encoding. It is used to populate the IA32_VMX_VMCS_ENUM MSR. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bdd3f155efae0c..bbfbed714decdf 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -407,6 +407,12 @@ struct __packed vmcs12 { */ #define VMCS12_SIZE 0x1000 +/* + * VMCS12_MAX_FIELD_INDEX is the highest index value used in any + * supported VMCS12 field encoding. + */ +#define VMCS12_MAX_FIELD_INDEX 0x17 + /* * The nested_vmx structure is part of vcpu_vmx, and holds information we need * for correct emulation of VMX (i.e., nested VMX) on this vcpu. @@ -2923,7 +2929,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) rdmsrl(MSR_IA32_VMX_CR4_FIXED1, vmx->nested.nested_vmx_cr4_fixed1); /* highest index: VMX_PREEMPTION_TIMER_VALUE */ - vmx->nested.nested_vmx_vmcs_enum = 0x2e; + vmx->nested.nested_vmx_vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1; } /* From d37f4267a79f7d423103cb9fdd67a4a3a8194335 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 22 Dec 2017 12:12:16 -0800 Subject: [PATCH 127/197] kvm: vmx: Change vmcs_field_type to vmcs_field_width MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per the SDM, "[VMCS] Fields are grouped by width (16-bit, 32-bit, etc.) and type (guest-state, host-state, etc.)." Previously, the width was indicated by vmcs_field_type. To avoid confusion when we start dealing with both field width and field type, change vmcs_field_type to vmcs_field_width. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bbfbed714decdf..f1d41dd3744c19 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3894,17 +3894,17 @@ static void free_kvm_area(void) } } -enum vmcs_field_type { - VMCS_FIELD_TYPE_U16 = 0, - VMCS_FIELD_TYPE_U64 = 1, - VMCS_FIELD_TYPE_U32 = 2, - VMCS_FIELD_TYPE_NATURAL_WIDTH = 3 +enum vmcs_field_width { + VMCS_FIELD_WIDTH_U16 = 0, + VMCS_FIELD_WIDTH_U64 = 1, + VMCS_FIELD_WIDTH_U32 = 2, + VMCS_FIELD_WIDTH_NATURAL_WIDTH = 3 }; -static inline int vmcs_field_type(unsigned long field) +static inline int vmcs_field_width(unsigned long field) { if (0x1 & field) /* the *_HIGH fields are all 32 bit */ - return VMCS_FIELD_TYPE_U32; + return VMCS_FIELD_WIDTH_U32; return (field >> 13) & 0x3 ; } @@ -3919,7 +3919,7 @@ static void init_vmcs_shadow_fields(void) for (i = j = 0; i < max_shadow_read_only_fields; i++) { u16 field = shadow_read_only_fields[i]; - if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64 && + if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && (i + 1 == max_shadow_read_only_fields || shadow_read_only_fields[i + 1] != field + 1)) pr_err("Missing field from shadow_read_only_field %x\n", @@ -3938,7 +3938,7 @@ static void init_vmcs_shadow_fields(void) for (i = j = 0; i < max_shadow_read_write_fields; i++) { u16 field = shadow_read_write_fields[i]; - if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64 && + if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && (i + 1 == max_shadow_read_write_fields || shadow_read_write_fields[i + 1] != field + 1)) pr_err("Missing field from shadow_read_write_field %x\n", @@ -7511,17 +7511,17 @@ static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, p = ((char *)(get_vmcs12(vcpu))) + offset; - switch (vmcs_field_type(field)) { - case VMCS_FIELD_TYPE_NATURAL_WIDTH: + switch (vmcs_field_width(field)) { + case VMCS_FIELD_WIDTH_NATURAL_WIDTH: *ret = *((natural_width *)p); return 0; - case VMCS_FIELD_TYPE_U16: + case VMCS_FIELD_WIDTH_U16: *ret = *((u16 *)p); return 0; - case VMCS_FIELD_TYPE_U32: + case VMCS_FIELD_WIDTH_U32: *ret = *((u32 *)p); return 0; - case VMCS_FIELD_TYPE_U64: + case VMCS_FIELD_WIDTH_U64: *ret = *((u64 *)p); return 0; default: @@ -7538,17 +7538,17 @@ static inline int vmcs12_write_any(struct kvm_vcpu *vcpu, if (offset < 0) return offset; - switch (vmcs_field_type(field)) { - case VMCS_FIELD_TYPE_U16: + switch (vmcs_field_width(field)) { + case VMCS_FIELD_WIDTH_U16: *(u16 *)p = field_value; return 0; - case VMCS_FIELD_TYPE_U32: + case VMCS_FIELD_WIDTH_U32: *(u32 *)p = field_value; return 0; - case VMCS_FIELD_TYPE_U64: + case VMCS_FIELD_WIDTH_U64: *(u64 *)p = field_value; return 0; - case VMCS_FIELD_TYPE_NATURAL_WIDTH: + case VMCS_FIELD_WIDTH_NATURAL_WIDTH: *(natural_width *)p = field_value; return 0; default: From 58e9ffae5efd3ee9b655ee36702f02c36d51ead9 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 22 Dec 2017 12:13:13 -0800 Subject: [PATCH 128/197] kvm: vmx: Reduce size of vmcs_field_to_offset_table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The vmcs_field_to_offset_table was a rather sparse table of short integers with a maximum index of 0x6c16, amounting to 55342 bytes. Now that we are considering support for multiple VMCS12 formats, it would be unfortunate to replicate that large, sparse table. Rotating the field encoding (as a 16-bit integer) left by 6 reduces that table to 5926 bytes. Suggested-by: Paolo Bonzini Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f1d41dd3744c19..d49bb747ebea53 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -686,10 +686,12 @@ static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) return &(to_vmx(vcpu)->pi_desc); } +#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) -#define FIELD(number, name) [number] = VMCS12_OFFSET(name) -#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ - [number##_HIGH] = VMCS12_OFFSET(name)+4 +#define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name) +#define FIELD64(number, name) \ + FIELD(number, name), \ + [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32) static u16 shadow_read_only_fields[] = { @@ -908,9 +910,13 @@ static const unsigned short vmcs_field_to_offset_table[] = { static inline short vmcs_field_to_offset(unsigned long field) { - BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX); + unsigned index; + + if (field >> 15) + return -ENOENT; - if (field >= ARRAY_SIZE(vmcs_field_to_offset_table)) + index = ROL16(field, 6); + if (index >= ARRAY_SIZE(vmcs_field_to_offset_table)) return -ENOENT; /* @@ -919,10 +925,10 @@ static inline short vmcs_field_to_offset(unsigned long field) */ asm("lfence"); - if (vmcs_field_to_offset_table[field] == 0) + if (vmcs_field_to_offset_table[index] == 0) return -ENOENT; - return vmcs_field_to_offset_table[field]; + return vmcs_field_to_offset_table[index]; } static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) From c9e9deae76b8fbbd48e7357247689016bb0d6242 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 20 Dec 2017 13:16:29 +0100 Subject: [PATCH 129/197] KVM: VMX: split list of shadowed VMCS field to a separate file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prepare for multiple inclusions of the list. Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 64 ++-------------------------- arch/x86/kvm/vmx_shadow_fields.h | 71 ++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 60 deletions(-) create mode 100644 arch/x86/kvm/vmx_shadow_fields.h diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d49bb747ebea53..4530b2ba63b984 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -695,71 +695,15 @@ static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) static u16 shadow_read_only_fields[] = { - /* - * We do NOT shadow fields that are modified when L0 - * traps and emulates any vmx instruction (e.g. VMPTRLD, - * VMXON...) executed by L1. - * For example, VM_INSTRUCTION_ERROR is read - * by L1 if a vmx instruction fails (part of the error path). - * Note the code assumes this logic. If for some reason - * we start shadowing these fields then we need to - * force a shadow sync when L0 emulates vmx instructions - * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified - * by nested_vmx_failValid) - */ - /* 32-bits */ - VM_EXIT_REASON, - VM_EXIT_INTR_INFO, - VM_EXIT_INSTRUCTION_LEN, - IDT_VECTORING_INFO_FIELD, - IDT_VECTORING_ERROR_CODE, - VM_EXIT_INTR_ERROR_CODE, - - /* Natural width */ - EXIT_QUALIFICATION, - GUEST_LINEAR_ADDRESS, - - /* 64-bit */ - GUEST_PHYSICAL_ADDRESS, - GUEST_PHYSICAL_ADDRESS_HIGH, +#define SHADOW_FIELD_RO(x) x, +#include "vmx_shadow_fields.h" }; static int max_shadow_read_only_fields = ARRAY_SIZE(shadow_read_only_fields); static u16 shadow_read_write_fields[] = { - /* 16-bits */ - GUEST_CS_SELECTOR, - GUEST_INTR_STATUS, - GUEST_PML_INDEX, - HOST_FS_SELECTOR, - HOST_GS_SELECTOR, - - /* 32-bits */ - CPU_BASED_VM_EXEC_CONTROL, - EXCEPTION_BITMAP, - VM_ENTRY_EXCEPTION_ERROR_CODE, - VM_ENTRY_INTR_INFO_FIELD, - VM_ENTRY_INSTRUCTION_LEN, - TPR_THRESHOLD, - GUEST_CS_LIMIT, - GUEST_CS_AR_BYTES, - GUEST_INTERRUPTIBILITY_INFO, - VMX_PREEMPTION_TIMER_VALUE, - - /* Natural width */ - GUEST_RIP, - GUEST_RSP, - GUEST_CR0, - GUEST_CR3, - GUEST_CR4, - GUEST_RFLAGS, - GUEST_CS_BASE, - GUEST_ES_BASE, - CR0_GUEST_HOST_MASK, - CR0_READ_SHADOW, - CR4_READ_SHADOW, - HOST_FS_BASE, - HOST_GS_BASE, +#define SHADOW_FIELD_RW(x) x, +#include "vmx_shadow_fields.h" }; static int max_shadow_read_write_fields = ARRAY_SIZE(shadow_read_write_fields); diff --git a/arch/x86/kvm/vmx_shadow_fields.h b/arch/x86/kvm/vmx_shadow_fields.h new file mode 100644 index 00000000000000..31d7a15338acd5 --- /dev/null +++ b/arch/x86/kvm/vmx_shadow_fields.h @@ -0,0 +1,71 @@ +#ifndef SHADOW_FIELD_RO +#define SHADOW_FIELD_RO(x) +#endif +#ifndef SHADOW_FIELD_RW +#define SHADOW_FIELD_RW(x) +#endif + +/* + * We do NOT shadow fields that are modified when L0 + * traps and emulates any vmx instruction (e.g. VMPTRLD, + * VMXON...) executed by L1. + * For example, VM_INSTRUCTION_ERROR is read + * by L1 if a vmx instruction fails (part of the error path). + * Note the code assumes this logic. If for some reason + * we start shadowing these fields then we need to + * force a shadow sync when L0 emulates vmx instructions + * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified + * by nested_vmx_failValid) + * + * Keeping the fields ordered by size is an attempt at improving + * branch prediction in vmcs_read_any and vmcs_write_any. + */ + +/* 16-bits */ +SHADOW_FIELD_RW(GUEST_CS_SELECTOR) +SHADOW_FIELD_RW(GUEST_INTR_STATUS) +SHADOW_FIELD_RW(GUEST_PML_INDEX) +SHADOW_FIELD_RW(HOST_FS_SELECTOR) +SHADOW_FIELD_RW(HOST_GS_SELECTOR) + +/* 32-bits */ +SHADOW_FIELD_RO(VM_EXIT_REASON) +SHADOW_FIELD_RO(VM_EXIT_INTR_INFO) +SHADOW_FIELD_RO(VM_EXIT_INSTRUCTION_LEN) +SHADOW_FIELD_RO(IDT_VECTORING_INFO_FIELD) +SHADOW_FIELD_RO(IDT_VECTORING_ERROR_CODE) +SHADOW_FIELD_RO(VM_EXIT_INTR_ERROR_CODE) +SHADOW_FIELD_RW(CPU_BASED_VM_EXEC_CONTROL) +SHADOW_FIELD_RW(EXCEPTION_BITMAP) +SHADOW_FIELD_RW(VM_ENTRY_EXCEPTION_ERROR_CODE) +SHADOW_FIELD_RW(VM_ENTRY_INTR_INFO_FIELD) +SHADOW_FIELD_RW(VM_ENTRY_INSTRUCTION_LEN) +SHADOW_FIELD_RW(TPR_THRESHOLD) +SHADOW_FIELD_RW(GUEST_CS_LIMIT) +SHADOW_FIELD_RW(GUEST_CS_AR_BYTES) +SHADOW_FIELD_RW(GUEST_INTERRUPTIBILITY_INFO) +SHADOW_FIELD_RW(VMX_PREEMPTION_TIMER_VALUE) + +/* Natural width */ +SHADOW_FIELD_RO(EXIT_QUALIFICATION) +SHADOW_FIELD_RO(GUEST_LINEAR_ADDRESS) +SHADOW_FIELD_RW(GUEST_RIP) +SHADOW_FIELD_RW(GUEST_RSP) +SHADOW_FIELD_RW(GUEST_CR0) +SHADOW_FIELD_RW(GUEST_CR3) +SHADOW_FIELD_RW(GUEST_CR4) +SHADOW_FIELD_RW(GUEST_RFLAGS) +SHADOW_FIELD_RW(GUEST_CS_BASE) +SHADOW_FIELD_RW(GUEST_ES_BASE) +SHADOW_FIELD_RW(CR0_GUEST_HOST_MASK) +SHADOW_FIELD_RW(CR0_READ_SHADOW) +SHADOW_FIELD_RW(CR4_READ_SHADOW) +SHADOW_FIELD_RW(HOST_FS_BASE) +SHADOW_FIELD_RW(HOST_GS_BASE) + +/* 64-bit */ +SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS) +SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS_HIGH) + +#undef SHADOW_FIELD_RO +#undef SHADOW_FIELD_RW From 74a497fae754bd970ad0573c26d0a3c5e784fa0c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 20 Dec 2017 13:55:39 +0100 Subject: [PATCH 130/197] KVM: nVMX: track dirty state of non-shadowed VMCS fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VMCS12 fields that are not handled through shadow VMCS are rarely written, and thus they are also almost constant in the vmcs02. We can thus optimize prepare_vmcs02 by skipping all the work for non-shadowed fields in the common case. This patch introduces the (pretty simple) tracking infrastructure; the next patches will move work to prepare_vmcs02_full and save a few hundred clock cycles per VMRESUME on a Haswell Xeon E5 system: before after cpuid 14159 13869 vmcall 15290 14951 inl_from_kernel 17703 17447 outl_to_kernel 16011 14692 self_ipi_sti_nop 16763 15825 self_ipi_tpr_sti_nop 17341 15935 wr_tsc_adjust_msr 14510 14264 rd_tsc_adjust_msr 15018 14311 mmio-wildcard-eventfd:pci-mem 16381 14947 mmio-datamatch-eventfd:pci-mem 18620 17858 portio-wildcard-eventfd:pci-io 15121 14769 portio-datamatch-eventfd:pci-io 15761 14831 (average savings 748, stdev 460). Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 29 ++++++++++++++++++++++++++++- arch/x86/kvm/vmx_shadow_fields.h | 6 ++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4530b2ba63b984..1cc787f639726e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -436,6 +436,7 @@ struct nested_vmx { * data hold by vmcs12 */ bool sync_shadow_vmcs; + bool dirty_vmcs12; bool change_vmcs01_virtual_x2apic_mode; /* L2 must run next, and mustn't decide to exit to L1. */ @@ -7623,8 +7624,10 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) { unsigned long field; gva_t gva; + struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + /* The value to write might be 32 or 64 bits, depending on L1's long * mode, and eventually we need to write that into a field of several * possible lengths. The code below first zero-extends the value to 64 @@ -7667,6 +7670,20 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } + switch (field) { +#define SHADOW_FIELD_RW(x) case x: +#include "vmx_shadow_fields.h" + /* + * The fields that can be updated by L1 without a vmexit are + * always updated in the vmcs02, the others go down the slow + * path of prepare_vmcs02. + */ + break; + default: + vmx->nested.dirty_vmcs12 = true; + break; + } + nested_vmx_succeed(vcpu); return kvm_skip_emulated_instruction(vcpu); } @@ -7681,6 +7698,7 @@ static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) __pa(vmx->vmcs01.shadow_vmcs)); vmx->nested.sync_shadow_vmcs = true; } + vmx->nested.dirty_vmcs12 = true; } /* Emulate the VMPTRLD instruction */ @@ -10297,6 +10315,11 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne return 0; } +static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + bool from_vmentry) +{ +} + /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it @@ -10592,7 +10615,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); vmx_flush_tlb(vcpu, true); } - } if (enable_pml) { @@ -10641,6 +10663,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ vmx_set_efer(vcpu, vcpu->arch.efer); + if (vmx->nested.dirty_vmcs12) { + prepare_vmcs02_full(vcpu, vmcs12, from_vmentry); + vmx->nested.dirty_vmcs12 = false; + } + /* Shadow page tables on either EPT or shadow page tables. */ if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), entry_failure_code)) diff --git a/arch/x86/kvm/vmx_shadow_fields.h b/arch/x86/kvm/vmx_shadow_fields.h index 31d7a15338acd5..cd0c75f6d037a9 100644 --- a/arch/x86/kvm/vmx_shadow_fields.h +++ b/arch/x86/kvm/vmx_shadow_fields.h @@ -17,6 +17,12 @@ * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified * by nested_vmx_failValid) * + * When adding or removing fields here, note that shadowed + * fields must always be synced by prepare_vmcs02, not just + * prepare_vmcs02_full. + */ + +/* * Keeping the fields ordered by size is an attempt at improving * branch prediction in vmcs_read_any and vmcs_write_any. */ From 8665c3f973203269fdc799da833f28c5dc217523 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 20 Dec 2017 13:56:53 +0100 Subject: [PATCH 131/197] KVM: nVMX: initialize descriptor cache fields in prepare_vmcs02_full MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This part is separate for ease of review, because git prefers to move prepare_vmcs02 below the initial long sequence of vmcs_write* operations. Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 56 ++++++++++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1cc787f639726e..f83f5e7bf87a43 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10317,28 +10317,10 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, bool from_vmentry) -{ -} - -/* - * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested - * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it - * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 - * guest in a way that will both be appropriate to L1's requests, and our - * needs. In addition to modifying the active vmcs (which is vmcs02), this - * function also has additional necessary side-effects, like setting various - * vcpu->arch fields. - * Returns 0 on success, 1 on failure. Invalid state exit qualification code - * is assigned to entry_failure_code on failure. - */ -static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - bool from_vmentry, u32 *entry_failure_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 exec_control, vmcs12_exec_ctrl; vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); - vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); @@ -10346,7 +10328,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); - vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); @@ -10356,15 +10337,12 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); - vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); - vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); - vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); @@ -10373,6 +10351,40 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); +} + +/* + * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested + * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it + * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 + * guest in a way that will both be appropriate to L1's requests, and our + * needs. In addition to modifying the active vmcs (which is vmcs02), this + * function also has additional necessary side-effects, like setting various + * vcpu->arch fields. + * Returns 0 on success, 1 on failure. Invalid state exit qualification code + * is assigned to entry_failure_code on failure. + */ +static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + bool from_vmentry, u32 *entry_failure_code) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 exec_control, vmcs12_exec_ctrl; + + /* + * First, the fields that are shadowed. This must be kept in sync + * with vmx_shadow_fields.h. + */ + + vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); + vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); + vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); + vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); + vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); + + /* + * Not in vmcs02: GUEST_PML_INDEX, HOST_FS_SELECTOR, HOST_GS_SELECTOR, + * HOST_FS_BASE, HOST_GS_BASE. + */ if (from_vmentry && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { From 25a2e4fe8ef719de564370349dc408ac7263f455 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 20 Dec 2017 14:05:21 +0100 Subject: [PATCH 132/197] KVM: nVMX: initialize more non-shadowed fields in prepare_vmcs02_full MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These fields are also simple copies of the data in the vmcs12 struct. For some of them, prepare_vmcs02 was skipping the copy when the field was unused. In prepare_vmcs02_full, we copy them always as long as the field exists on the host, because the corresponding execution control might be one of the shadowed fields. Optimization opportunities remain for MSRs that, depending on the entry/exit controls, have to be copied from either the vmcs01 or the vmcs12: EFER (whose value is partly stored in the entry controls too), PAT, DEBUGCTL (and also DR7). Before moving these three and the entry/exit controls to prepare_vmcs02_full, KVM would have to set dirty_vmcs12 on writes to the L1 MSRs. Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 162 +++++++++++++++++++++++---------------------- 1 file changed, 83 insertions(+), 79 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f83f5e7bf87a43..9b9e02c0f25e28 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10351,6 +10351,88 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); + + vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); + vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, + vmcs12->guest_pending_dbg_exceptions); + vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); + vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); + + if (nested_cpu_has_xsaves(vmcs12)) + vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); + vmcs_write64(VMCS_LINK_POINTER, -1ull); + + if (cpu_has_vmx_posted_intr()) + vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); + + /* + * Whether page-faults are trapped is determined by a combination of + * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. + * If enable_ept, L0 doesn't care about page faults and we should + * set all of these to L1's desires. However, if !enable_ept, L0 does + * care about (at least some) page faults, and because it is not easy + * (if at all possible?) to merge L0 and L1's desires, we simply ask + * to exit on each and every L2 page fault. This is done by setting + * MASK=MATCH=0 and (see below) EB.PF=1. + * Note that below we don't need special code to set EB.PF beyond the + * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, + * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when + * !enable_ept, EB.PF is 1, so the "or" will always be 1. + */ + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, + enable_ept ? vmcs12->page_fault_error_code_mask : 0); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, + enable_ept ? vmcs12->page_fault_error_code_match : 0); + + /* All VMFUNCs are currently emulated through L0 vmexits. */ + if (cpu_has_vmx_vmfunc()) + vmcs_write64(VM_FUNCTION_CONTROL, 0); + + if (cpu_has_vmx_apicv()) { + vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); + vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); + vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); + vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); + } + + /* + * Set host-state according to L0's settings (vmcs12 is irrelevant here) + * Some constant fields are set here by vmx_set_constant_host_state(). + * Other fields are different per CPU, and will be set later when + * vmx_vcpu_load() is called, and when vmx_save_host_state() is called. + */ + vmx_set_constant_host_state(vmx); + + /* + * Set the MSR load/store lists to match L0's settings. + */ + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); + vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); + vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); + + set_cr4_guest_host_mask(vmx); + + if (vmx_mpx_supported()) + vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); + + if (enable_vpid) { + if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); + else + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); + } + + /* + * L1 may access the L2's PDPTR, so save them to construct vmcs12 + */ + if (enable_ept) { + vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); + vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); + vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); + vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); + } } /* @@ -10408,16 +10490,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } else { vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); } - vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); vmx_set_rflags(vcpu, vmcs12->guest_rflags); - vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, - vmcs12->guest_pending_dbg_exceptions); - vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); - vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); - - if (nested_cpu_has_xsaves(vmcs12)) - vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); - vmcs_write64(VMCS_LINK_POINTER, -1ull); exec_control = vmcs12->pin_based_vm_exec_control; @@ -10431,7 +10504,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (nested_cpu_has_posted_intr(vmcs12)) { vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; vmx->nested.pi_pending = false; - vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); } else { exec_control &= ~PIN_BASED_POSTED_INTR; } @@ -10442,25 +10514,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (nested_cpu_has_preemption_timer(vmcs12)) vmx_start_preemption_timer(vcpu); - /* - * Whether page-faults are trapped is determined by a combination of - * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. - * If enable_ept, L0 doesn't care about page faults and we should - * set all of these to L1's desires. However, if !enable_ept, L0 does - * care about (at least some) page faults, and because it is not easy - * (if at all possible?) to merge L0 and L1's desires, we simply ask - * to exit on each and every L2 page fault. This is done by setting - * MASK=MATCH=0 and (see below) EB.PF=1. - * Note that below we don't need special code to set EB.PF beyond the - * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, - * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when - * !enable_ept, EB.PF is 1, so the "or" will always be 1. - */ - vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, - enable_ept ? vmcs12->page_fault_error_code_mask : 0); - vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, - enable_ept ? vmcs12->page_fault_error_code_match : 0); - if (cpu_has_secondary_exec_ctrls()) { exec_control = vmx->secondary_exec_control; @@ -10479,22 +10532,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, exec_control |= vmcs12_exec_ctrl; } - /* All VMFUNCs are currently emulated through L0 vmexits. */ - if (exec_control & SECONDARY_EXEC_ENABLE_VMFUNC) - vmcs_write64(VM_FUNCTION_CONTROL, 0); - - if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { - vmcs_write64(EOI_EXIT_BITMAP0, - vmcs12->eoi_exit_bitmap0); - vmcs_write64(EOI_EXIT_BITMAP1, - vmcs12->eoi_exit_bitmap1); - vmcs_write64(EOI_EXIT_BITMAP2, - vmcs12->eoi_exit_bitmap2); - vmcs_write64(EOI_EXIT_BITMAP3, - vmcs12->eoi_exit_bitmap3); + if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) vmcs_write16(GUEST_INTR_STATUS, vmcs12->guest_intr_status); - } /* * Write an illegal value to APIC_ACCESS_ADDR. Later, @@ -10507,24 +10547,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } - - /* - * Set host-state according to L0's settings (vmcs12 is irrelevant here) - * Some constant fields are set here by vmx_set_constant_host_state(). - * Other fields are different per CPU, and will be set later when - * vmx_vcpu_load() is called, and when vmx_save_host_state() is called. - */ - vmx_set_constant_host_state(vmx); - - /* - * Set the MSR load/store lists to match L0's settings. - */ - vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); - vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); - vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); - /* * HOST_RSP is normally set correctly in vmx_vcpu_run() just before * entry, but only if the current (host) sp changed from the value @@ -10594,12 +10616,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); } - set_cr4_guest_host_mask(vmx); - - if (from_vmentry && - vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) - vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset + vmcs12->tsc_offset); @@ -10618,13 +10634,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * even if spawn a lot of nested vCPUs. */ if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) { - vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { vmx->nested.last_vpid = vmcs12->virtual_processor_id; __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02, true); } } else { - vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); vmx_flush_tlb(vcpu, true); } } @@ -10688,16 +10702,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (!enable_ept) vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; - /* - * L1 may access the L2's PDPTR, so save them to construct vmcs12 - */ - if (enable_ept) { - vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); - vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); - vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); - vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); - } - kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); return 0; From 07f36616cde482a9fd65da9a64f504190c7a0edb Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 1 Jan 2018 22:53:42 +0100 Subject: [PATCH 133/197] KVM: nVMX: remove unnecessary vmwrite from L2->L1 vmexit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The POSTED_INTR_NV field is constant (though it differs between the vmcs01 and vmcs02), there is no need to reload it on vmexit to L1. Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9b9e02c0f25e28..bbadd8c7e59229 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11339,9 +11339,6 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, */ vmx_flush_tlb(vcpu, true); } - /* Restore posted intr vector. */ - if (nested_cpu_has_posted_intr(vmcs12)) - vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); From 1f6e5b25643e539e55a4c7553e4be6562c88fb76 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 20 Dec 2017 11:32:16 +0100 Subject: [PATCH 134/197] KVM: vmx: simplify MSR bitmap setup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The APICv-enabled MSR bitmap is a superset of the APICv-disabled bitmap. Make that obvious in vmx_disable_intercept_msr_x2apic. Signed-off-by: Paolo Bonzini [Resolved rebase conflict after removing Intel PT. - Radim] Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bbadd8c7e59229..1f4a3037b99e09 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5017,14 +5017,13 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) msr, MSR_TYPE_R | MSR_TYPE_W); } -static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_active) +static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_only) { - if (apicv_active) { - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv, - msr, type); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv, - msr, type); - } else { + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv, + msr, type); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv, + msr, type); + if (!apicv_only) { __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, msr, type); __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, @@ -6872,7 +6871,6 @@ static __init int hardware_setup(void) * TPR reads and writes can be virtualized even if virtual interrupt * delivery is not in use. */ - vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_W, true); vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_R | MSR_TYPE_W, false); /* EOI */ From c992384bde84fb2f0318bdb4f6c710605dd7a217 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 13 Dec 2017 14:16:30 +0100 Subject: [PATCH 135/197] KVM: vmx: speed up MSR bitmap merge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bulk of the MSR bitmap is either immutable, or can be copied from the L1 bitmap. By initializing it at VMXON time, and copying the mutable parts one long at a time on vmentry (rather than one bit), about 4000 clock cycles (30%) can be saved on a nested VMLAUNCH/VMRESUME. The resulting for loop only has four iterations, so it is cheap enough to reinitialize the MSR write bitmaps on every iteration, and it makes the code simpler. Suggested-by: Jim Mattson Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 78 +++++++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 36 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1f4a3037b99e09..71b2c935442367 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4972,11 +4972,6 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, { int f = sizeof(unsigned long); - if (!cpu_has_vmx_msr_bitmap()) { - WARN_ON(1); - return; - } - /* * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals * have the write-low and read-high bitmap offsets the wrong way round. @@ -7177,6 +7172,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx->nested.msr_bitmap) goto out_msr_bitmap; + memset(vmx->nested.msr_bitmap, 0xff, PAGE_SIZE); } vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); @@ -9844,8 +9840,8 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, } } -static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12); +static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12); static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) @@ -9934,11 +9930,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, (unsigned long)(vmcs12->posted_intr_desc_addr & (PAGE_SIZE - 1))); } - if (cpu_has_vmx_msr_bitmap() && - nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS) && - nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) - ; - else + if (!nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_USE_MSR_BITMAPS); } @@ -10006,14 +9998,19 @@ static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, * Merge L0's and L1's MSR bitmap, return false to indicate that * we do not use the hardware. */ -static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) +static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) { int msr; struct page *page; unsigned long *msr_bitmap_l1; unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap; + /* Nothing to do if the MSR bitmap is not in use. */ + if (!cpu_has_vmx_msr_bitmap() || + !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) + return false; + /* This shortcut is ok because we support only x2APIC MSRs so far. */ if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) return false; @@ -10021,32 +10018,41 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap); if (is_error_page(page)) return false; - msr_bitmap_l1 = (unsigned long *)kmap(page); - memset(msr_bitmap_l0, 0xff, PAGE_SIZE); + msr_bitmap_l1 = (unsigned long *)kmap(page); + if (nested_cpu_has_apic_reg_virt(vmcs12)) { + /* + * L0 need not intercept reads for MSRs between 0x800 and 0x8ff, it + * just lets the processor take the value from the virtual-APIC page; + * take those 256 bits directly from the L1 bitmap. + */ + for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { + unsigned word = msr / BITS_PER_LONG; + msr_bitmap_l0[word] = msr_bitmap_l1[word]; + msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0; + } + } else { + for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { + unsigned word = msr / BITS_PER_LONG; + msr_bitmap_l0[word] = ~0; + msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0; + } + } - if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { - if (nested_cpu_has_apic_reg_virt(vmcs12)) - for (msr = 0x800; msr <= 0x8ff; msr++) - nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - msr, MSR_TYPE_R); + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, + APIC_BASE_MSR + (APIC_TASKPRI >> 4), + MSR_TYPE_W); + if (nested_cpu_has_vid(vmcs12)) { nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - APIC_BASE_MSR + (APIC_TASKPRI >> 4), - MSR_TYPE_R | MSR_TYPE_W); - - if (nested_cpu_has_vid(vmcs12)) { - nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - APIC_BASE_MSR + (APIC_EOI >> 4), - MSR_TYPE_W); - nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - APIC_BASE_MSR + (APIC_SELF_IPI >> 4), - MSR_TYPE_W); - } + msr_bitmap_l1, msr_bitmap_l0, + APIC_BASE_MSR + (APIC_EOI >> 4), + MSR_TYPE_W); + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, + APIC_BASE_MSR + (APIC_SELF_IPI >> 4), + MSR_TYPE_W); } kunmap(page); kvm_release_page_clean(page); From d7231e75f73f424068ad205358bd8ba6a7a2e113 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 21 Dec 2017 00:47:55 +0100 Subject: [PATCH 136/197] KVM: VMX: introduce X2APIC_MSR macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove duplicate expression in nested_vmx_prepare_msr_bitmap, and make the register names clearer in hardware_setup. Suggested-by: Jim Mattson Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini [Resolved rebase conflict after removing Intel PT. - Radim] Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 71b2c935442367..1e2ca9e8662ff2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5012,6 +5012,8 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) msr, MSR_TYPE_R | MSR_TYPE_W); } +#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) + static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_only) { __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv, @@ -6857,7 +6859,7 @@ static __init int hardware_setup(void) set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ for (msr = 0x800; msr <= 0x8ff; msr++) { - if (msr == 0x839 /* TMCCT */) + if (msr == X2APIC_MSR(APIC_TMCCT)) continue; vmx_disable_intercept_msr_x2apic(msr, MSR_TYPE_R, true); } @@ -6866,12 +6868,9 @@ static __init int hardware_setup(void) * TPR reads and writes can be virtualized even if virtual interrupt * delivery is not in use. */ - vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_R | MSR_TYPE_W, false); - - /* EOI */ - vmx_disable_intercept_msr_x2apic(0x80b, MSR_TYPE_W, true); - /* SELF-IPI */ - vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true); + vmx_disable_intercept_msr_x2apic(X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_R | MSR_TYPE_W, false); + vmx_disable_intercept_msr_x2apic(X2APIC_MSR(APIC_EOI), MSR_TYPE_W, true); + vmx_disable_intercept_msr_x2apic(X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W, true); if (enable_ept) vmx_enable_tdp(); @@ -10041,17 +10040,17 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, nested_vmx_disable_intercept_for_msr( msr_bitmap_l1, msr_bitmap_l0, - APIC_BASE_MSR + (APIC_TASKPRI >> 4), + X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_W); if (nested_cpu_has_vid(vmcs12)) { nested_vmx_disable_intercept_for_msr( msr_bitmap_l1, msr_bitmap_l0, - APIC_BASE_MSR + (APIC_EOI >> 4), + X2APIC_MSR(APIC_EOI), MSR_TYPE_W); nested_vmx_disable_intercept_for_msr( msr_bitmap_l1, msr_bitmap_l0, - APIC_BASE_MSR + (APIC_SELF_IPI >> 4), + X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); } kunmap(page); From 43ff3f65234061e08d234bdef5a9aadc19832b74 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 11 Jan 2018 14:31:43 +1100 Subject: [PATCH 137/197] KVM: PPC: Book3S HV: Make sure we don't re-enter guest without XIVE loaded This fixes a bug where it is possible to enter a guest on a POWER9 system without having the XIVE (interrupt controller) context loaded. This can happen because we unload the XIVE context from the CPU before doing the real-mode handling for machine checks. After the real-mode handler runs, it is possible that we re-enter the guest via a fast path which does not load the XIVE context. To fix this, we move the unloading of the XIVE context to come after the real-mode machine check handler is called. Fixes: 5af50993850a ("KVM: PPC: Book3S HV: Native usage of the XIVE interrupt controller") Cc: stable@vger.kernel.org # v4.11+ Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 40 ++++++++++++------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index c8ffd69adfec7a..76332a3f6c0dcd 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1423,6 +1423,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) blt deliver_guest_interrupt guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ + /* Save more register state */ + mfdar r6 + mfdsisr r7 + std r6, VCPU_DAR(r9) + stw r7, VCPU_DSISR(r9) + /* don't overwrite fault_dar/fault_dsisr if HDSI */ + cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE + beq mc_cont + std r6, VCPU_FAULT_DAR(r9) + stw r7, VCPU_FAULT_DSISR(r9) + + /* See if it is a machine check */ + cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK + beq machine_check_realmode +mc_cont: +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + addi r3, r9, VCPU_TB_RMEXIT + mr r4, r9 + bl kvmhv_accumulate_time +#endif #ifdef CONFIG_KVM_XICS /* We are exiting, pull the VP from the XIVE */ lwz r0, VCPU_XIVE_PUSHED(r9) @@ -1460,26 +1480,6 @@ guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ eieio 1: #endif /* CONFIG_KVM_XICS */ - /* Save more register state */ - mfdar r6 - mfdsisr r7 - std r6, VCPU_DAR(r9) - stw r7, VCPU_DSISR(r9) - /* don't overwrite fault_dar/fault_dsisr if HDSI */ - cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE - beq mc_cont - std r6, VCPU_FAULT_DAR(r9) - stw r7, VCPU_FAULT_DSISR(r9) - - /* See if it is a machine check */ - cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK - beq machine_check_realmode -mc_cont: -#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING - addi r3, r9, VCPU_TB_RMEXIT - mr r4, r9 - bl kvmhv_accumulate_time -#endif mr r3, r12 /* Increment exit count, poke other threads to exit */ From 6964e6a4e4894c707e42d51d9d30683c57f43201 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 11 Jan 2018 14:51:02 +1100 Subject: [PATCH 138/197] KVM: PPC: Book3S HV: Do SLB load/unload with guest LPCR value loaded This moves the code that loads and unloads the guest SLB values so that it is done while the guest LPCR value is loaded in the LPCR register. The reason for doing this is that on POWER9, the behaviour of the slbmte instruction depends on the LPCR[UPRT] bit. If UPRT is 1, as it is for a radix host (or guest), the SLB index is truncated to 2 bits. This means that for a HPT guest on a radix host, the SLB was not being loaded correctly, causing the guest to crash. The SLB is now loaded much later in the guest entry path, after the LPCR is loaded, which for a secondary thread is after it sees that the primary thread has switched the MMU to the guest. The loop that waits for the primary thread has a branch out to the exit code that is taken if it sees that other threads have commenced exiting the guest. Since we have now not loaded the SLB at this point, we make this path branch to a new label 'guest_bypass' and we move the SLB unload code to before this label. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 109 ++++++++++++------------ 1 file changed, 55 insertions(+), 54 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 76332a3f6c0dcd..30ece4cebaf533 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -617,13 +617,6 @@ kvmppc_hv_entry: lbz r0, KVM_RADIX(r9) cmpwi cr7, r0, 0 - /* Clear out SLB if hash */ - bne cr7, 2f - li r6,0 - slbmte r6,r6 - slbia - ptesync -2: /* * POWER7/POWER8 host -> guest partition switch code. * We don't have to lock against concurrent tlbies, @@ -738,19 +731,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 10: cmpdi r4, 0 beq kvmppc_primary_no_guest kvmppc_got_guest: - - /* Load up guest SLB entries (N.B. slb_max will be 0 for radix) */ - lwz r5,VCPU_SLB_MAX(r4) - cmpwi r5,0 - beq 9f - mtctr r5 - addi r6,r4,VCPU_SLB -1: ld r8,VCPU_SLB_E(r6) - ld r9,VCPU_SLB_V(r6) - slbmte r9,r8 - addi r6,r6,VCPU_SLB_SIZE - bdnz 1b -9: /* Increment yield count if they have a VPA */ ld r3, VCPU_VPA(r4) cmpdi r3, 0 @@ -1017,6 +997,29 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) cmpdi r3, 512 /* 1 microsecond */ blt hdec_soon + /* For hash guest, clear out and reload the SLB */ + ld r6, VCPU_KVM(r4) + lbz r0, KVM_RADIX(r6) + cmpwi r0, 0 + bne 9f + li r6, 0 + slbmte r6, r6 + slbia + ptesync + + /* Load up guest SLB entries (N.B. slb_max will be 0 for radix) */ + lwz r5,VCPU_SLB_MAX(r4) + cmpwi r5,0 + beq 9f + mtctr r5 + addi r6,r4,VCPU_SLB +1: ld r8,VCPU_SLB_E(r6) + ld r9,VCPU_SLB_V(r6) + slbmte r9,r8 + addi r6,r6,VCPU_SLB_SIZE + bdnz 1b +9: + #ifdef CONFIG_KVM_XICS /* We are entering the guest on that thread, push VCPU to XIVE */ ld r10, HSTATE_XIVE_TIMA_PHYS(r13) @@ -1193,7 +1196,7 @@ hdec_soon: addi r3, r4, VCPU_TB_RMEXIT bl kvmhv_accumulate_time #endif - b guest_exit_cont + b guest_bypass /****************************************************************************** * * @@ -1481,34 +1484,12 @@ mc_cont: 1: #endif /* CONFIG_KVM_XICS */ - mr r3, r12 - /* Increment exit count, poke other threads to exit */ - bl kvmhv_commence_exit - nop - ld r9, HSTATE_KVM_VCPU(r13) - lwz r12, VCPU_TRAP(r9) - - /* Stop others sending VCPU interrupts to this physical CPU */ - li r0, -1 - stw r0, VCPU_CPU(r9) - stw r0, VCPU_THREAD_CPU(r9) - - /* Save guest CTRL register, set runlatch to 1 */ - mfspr r6,SPRN_CTRLF - stw r6,VCPU_CTRL(r9) - andi. r0,r6,1 - bne 4f - ori r6,r6,1 - mtspr SPRN_CTRLT,r6 -4: - /* Check if we are running hash or radix and store it in cr2 */ + /* For hash guest, read the guest SLB and save it away */ ld r5, VCPU_KVM(r9) lbz r0, KVM_RADIX(r5) - cmpwi cr2,r0,0 - - /* Read the guest SLB and save it away */ li r5, 0 - bne cr2, 3f /* for radix, save 0 entries */ + cmpwi r0, 0 + bne 3f /* for radix, save 0 entries */ lwz r0,VCPU_SLB_NR(r9) /* number of entries in SLB */ mtctr r0 li r6,0 @@ -1524,8 +1505,34 @@ mc_cont: addi r5,r5,1 2: addi r6,r6,1 bdnz 1b + /* Finally clear out the SLB */ + li r0,0 + slbmte r0,r0 + slbia + ptesync 3: stw r5,VCPU_SLB_MAX(r9) +guest_bypass: + mr r3, r12 + /* Increment exit count, poke other threads to exit */ + bl kvmhv_commence_exit + nop + ld r9, HSTATE_KVM_VCPU(r13) + lwz r12, VCPU_TRAP(r9) + + /* Stop others sending VCPU interrupts to this physical CPU */ + li r0, -1 + stw r0, VCPU_CPU(r9) + stw r0, VCPU_THREAD_CPU(r9) + + /* Save guest CTRL register, set runlatch to 1 */ + mfspr r6,SPRN_CTRLF + stw r6,VCPU_CTRL(r9) + andi. r0,r6,1 + bne 4f + ori r6,r6,1 + mtspr SPRN_CTRLT,r6 +4: /* * Save the guest PURR/SPURR */ @@ -1803,7 +1810,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) ld r5, VCPU_KVM(r9) lbz r0, KVM_RADIX(r5) cmpwi cr2, r0, 0 - beq cr2, 3f + beq cr2, 4f /* Radix: Handle the case where the guest used an illegal PID */ LOAD_REG_ADDR(r4, mmu_base_pid) @@ -1839,15 +1846,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) BEGIN_FTR_SECTION PPC_INVALIDATE_ERAT END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1) - b 4f +4: #endif /* CONFIG_PPC_RADIX_MMU */ - /* Hash: clear out SLB */ -3: li r5,0 - slbmte r5,r5 - slbia - ptesync -4: /* * POWER7/POWER8 guest -> host partition switch code. * We don't have to lock against tlbies but we do From 0482b50546b12e82346903500af081ed329f915c Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Wed, 17 Jan 2018 12:35:27 +0100 Subject: [PATCH 139/197] arm64: mm: Add additional parameter to uaccess_ttbr0_disable Add an extra temporary register parameter to uaccess_ttbr0_disable which is about to be required for arm64 PAN support. This patch doesn't introduce any functional change but ensures that the kernel compiles once the KVM/ARM tree is merged with the arm64 tree by ensuring a trivially mergable conflict with commit 6b88a32c7af68895134872cdec3b6bfdb532d94e ("arm64: kpti: Fix the interaction between ASID switching and software PAN"). Cc: Will Deacon Acked-by: Catalin Marinas Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/include/asm/asm-uaccess.h | 8 ++++---- arch/arm64/lib/clear_user.S | 2 +- arch/arm64/lib/copy_from_user.S | 2 +- arch/arm64/lib/copy_in_user.S | 2 +- arch/arm64/lib/copy_to_user.S | 2 +- arch/arm64/mm/cache.S | 4 ++-- arch/arm64/xen/hypercall.S | 2 +- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h index b67563d2024c7e..03064261ee0bb6 100644 --- a/arch/arm64/include/asm/asm-uaccess.h +++ b/arch/arm64/include/asm/asm-uaccess.h @@ -25,7 +25,7 @@ isb .endm - .macro uaccess_ttbr0_disable, tmp1 + .macro uaccess_ttbr0_disable, tmp1, tmp2 alternative_if_not ARM64_HAS_PAN __uaccess_ttbr0_disable \tmp1 alternative_else_nop_endif @@ -39,7 +39,7 @@ alternative_if_not ARM64_HAS_PAN alternative_else_nop_endif .endm #else - .macro uaccess_ttbr0_disable, tmp1 + .macro uaccess_ttbr0_disable, tmp1, tmp2 .endm .macro uaccess_ttbr0_enable, tmp1, tmp2, tmp3 @@ -49,8 +49,8 @@ alternative_else_nop_endif /* * These macros are no-ops when UAO is present. */ - .macro uaccess_disable_not_uao, tmp1 - uaccess_ttbr0_disable \tmp1 + .macro uaccess_disable_not_uao, tmp1, tmp2 + uaccess_ttbr0_disable \tmp1, \tmp2 alternative_if ARM64_ALT_PAN_NOT_UAO SET_PSTATE_PAN(1) alternative_else_nop_endif diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index e88fb99c156163..8932e5f7a6f396 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -50,7 +50,7 @@ uao_user_alternative 9f, strh, sttrh, wzr, x0, 2 b.mi 5f uao_user_alternative 9f, strb, sttrb, wzr, x0, 0 5: mov x0, #0 - uaccess_disable_not_uao x2 + uaccess_disable_not_uao x2, x3 ret ENDPROC(__clear_user) diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 4b5d826895ff16..bc108634992c1f 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -67,7 +67,7 @@ ENTRY(__arch_copy_from_user) uaccess_enable_not_uao x3, x4 add end, x0, x2 #include "copy_template.S" - uaccess_disable_not_uao x3 + uaccess_disable_not_uao x3, x4 mov x0, #0 // Nothing to copy ret ENDPROC(__arch_copy_from_user) diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index b24a830419ad95..e6dd59dd40534a 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -68,7 +68,7 @@ ENTRY(raw_copy_in_user) uaccess_enable_not_uao x3, x4 add end, x0, x2 #include "copy_template.S" - uaccess_disable_not_uao x3 + uaccess_disable_not_uao x3, x4 mov x0, #0 ret ENDPROC(raw_copy_in_user) diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 351f0766f7a61c..bd20f9f7dd8425 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -66,7 +66,7 @@ ENTRY(__arch_copy_to_user) uaccess_enable_not_uao x3, x4 add end, x0, x2 #include "copy_template.S" - uaccess_disable_not_uao x3 + uaccess_disable_not_uao x3, x4 mov x0, #0 ret ENDPROC(__arch_copy_to_user) diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index 5a52811f47e9dc..758bde7e2fa68a 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -63,7 +63,7 @@ user_alt 9f, "dc cvau, x4", "dc civac, x4", ARM64_WORKAROUND_CLEAN_CACHE invalidate_icache_by_line x0, x1, x2, x3, 9f mov x0, #0 1: - uaccess_ttbr0_disable x1 + uaccess_ttbr0_disable x1, x2 ret 9: mov x0, #-EFAULT @@ -85,7 +85,7 @@ ENTRY(invalidate_icache_range) invalidate_icache_by_line x0, x1, x2, x3, 2f mov x0, xzr 1: - uaccess_ttbr0_disable x1 + uaccess_ttbr0_disable x1, x2 ret 2: mov x0, #-EFAULT diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S index acdbd2c9e899c1..c5f05c4a4d0088 100644 --- a/arch/arm64/xen/hypercall.S +++ b/arch/arm64/xen/hypercall.S @@ -107,6 +107,6 @@ ENTRY(privcmd_call) /* * Disable userspace access from kernel once the hyp call completed. */ - uaccess_ttbr0_disable x6 + uaccess_ttbr0_disable x6, x7 ret ENDPROC(privcmd_call); From 00608e1f007e4cf6031485c5630e0e504bceef9b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 11 Jan 2018 16:54:26 +1100 Subject: [PATCH 140/197] KVM: PPC: Book3S HV: Allow HPT and radix on the same core for POWER9 v2.2 POWER9 chip versions starting with "Nimbus" v2.2 can support running with some threads of a core in HPT mode and others in radix mode. This means that we don't have to prohibit independent-threads mode when running a HPT guest on a radix host, and we don't have to do any of the synchronization between threads that was introduced in commit c01015091a77 ("KVM: PPC: Book3S HV: Run HPT guests on POWER9 radix hosts", 2017-10-19). Rather than using up another CPU feature bit, we just do an explicit test on the PVR (processor version register) at module startup time to determine whether we have to take steps to avoid having some threads in HPT mode and some in radix mode (so-called "mixed mode"). We test for "Nimbus" (indicated by 0 or 1 in the top nibble of the lower 16 bits) v2.2 or later, or "Cumulus" (indicated by 2 or 3 in that nibble) v1.1 or later. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index b2d448c750086b..76cf48051eb3e3 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -118,6 +118,9 @@ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core"); #endif +/* If set, the threads on each CPU core have to be in the same MMU mode */ +static bool no_mixing_hpt_and_radix; + static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); @@ -2386,8 +2389,8 @@ static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc) static bool subcore_config_ok(int n_subcores, int n_threads) { /* - * POWER9 "SMT4" cores are permanently in what is effectively a 4-way split-core - * mode, with one thread per subcore. + * POWER9 "SMT4" cores are permanently in what is effectively a 4-way + * split-core mode, with one thread per subcore. */ if (cpu_has_feature(CPU_FTR_ARCH_300)) return n_subcores <= 4 && n_threads == 1; @@ -2423,8 +2426,8 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip) if (!cpu_has_feature(CPU_FTR_ARCH_207S)) return false; - /* POWER9 currently requires all threads to be in the same MMU mode */ - if (cpu_has_feature(CPU_FTR_ARCH_300) && + /* Some POWER9 chips require all threads to be in the same MMU mode */ + if (no_mixing_hpt_and_radix && kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm)) return false; @@ -2687,9 +2690,11 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) * threads are offline. Also check if the number of threads in this * guest are greater than the current system threads per guest. * On POWER9, we need to be not in independent-threads mode if - * this is a HPT guest on a radix host. + * this is a HPT guest on a radix host machine where the + * CPU threads may not be in different MMU modes. */ - hpt_on_radix = radix_enabled() && !kvm_is_radix(vc->kvm); + hpt_on_radix = no_mixing_hpt_and_radix && radix_enabled() && + !kvm_is_radix(vc->kvm); if (((controlled_threads > 1) && ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) || (hpt_on_radix && vc->kvm->arch.threads_indep)) { @@ -4446,6 +4451,19 @@ static int kvmppc_book3s_init_hv(void) if (kvmppc_radix_possible()) r = kvmppc_radix_init(); + + /* + * POWER9 chips before version 2.02 can't have some threads in + * HPT mode and some in radix mode on the same core. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + unsigned int pvr = mfspr(SPRN_PVR); + if ((pvr >> 16) == PVR_POWER9 && + (((pvr & 0xe000) == 0 && (pvr & 0xfff) < 0x202) || + ((pvr & 0xe000) == 0x2000 && (pvr & 0xfff) < 0x101))) + no_mixing_hpt_and_radix = true; + } + return r; } From c424c108233dc422a9a29ee833154006a5bdf9fc Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 12 Jan 2018 13:37:11 +1100 Subject: [PATCH 141/197] KVM: PPC: Book3S HV: Add more info about XIVE queues in debugfs Add details about enabled queues and escalation interrupts. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_xive.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index bf457843e03217..6cff5bdfd6b7ef 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -1794,6 +1794,7 @@ static int xive_debug_show(struct seq_file *m, void *private) kvm_for_each_vcpu(i, vcpu, kvm) { struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + unsigned int i; if (!xc) continue; @@ -1803,6 +1804,33 @@ static int xive_debug_show(struct seq_file *m, void *private) xc->server_num, xc->cppr, xc->hw_cppr, xc->mfrr, xc->pending, xc->stat_rm_h_xirr, xc->stat_vm_h_xirr); + for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { + struct xive_q *q = &xc->queues[i]; + u32 i0, i1, idx; + + if (!q->qpage && !xc->esc_virq[i]) + continue; + + seq_printf(m, " [q%d]: ", i); + + if (q->qpage) { + idx = q->idx; + i0 = be32_to_cpup(q->qpage + idx); + idx = (idx + 1) & q->msk; + i1 = be32_to_cpup(q->qpage + idx); + seq_printf(m, "T=%d %08x %08x... \n", q->toggle, i0, i1); + } + if (xc->esc_virq[i]) { + struct irq_data *d = irq_get_irq_data(xc->esc_virq[i]); + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + u64 pq = xive_vm_esb_load(xd, XIVE_ESB_GET); + seq_printf(m, "E:%c%c I(%d:%llx:%llx)", + (pq & XIVE_ESB_VAL_P) ? 'P' : 'p', + (pq & XIVE_ESB_VAL_Q) ? 'Q' : 'q', + xc->esc_virq[i], pq, xd->eoi_page); + seq_printf(m, "\n"); + } + } t_rm_h_xirr += xc->stat_rm_h_xirr; t_rm_h_ipoll += xc->stat_rm_h_ipoll; From bf4159da4751ab8eea43ca6e7c49193dbce8398c Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 12 Jan 2018 13:37:12 +1100 Subject: [PATCH 142/197] KVM: PPC: Book3S HV: Enable use of the new XIVE "single escalation" feature That feature, provided by Power9 DD2.0 and later, when supported by newer OPAL versions, allows us to sacrifice a queue (priority 7) in favor of merging all the escalation interrupts of the queues of a single VP into a single interrupt. This reduces the number of host interrupts used up by KVM guests especially when those guests use multiple priorities. It will also enable a future change to control the masking of the escalation interrupts more precisely to avoid spurious ones. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/opal-api.h | 1 + arch/powerpc/include/asm/xive.h | 3 +- arch/powerpc/kvm/book3s_xive.c | 48 +++++++++++++++++++---------- arch/powerpc/kvm/book3s_xive.h | 15 ++++----- arch/powerpc/sysdev/xive/native.c | 18 +++++++++-- 5 files changed, 57 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 233c7504b1f20b..fc926743647ec6 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -1073,6 +1073,7 @@ enum { /* Flags for OPAL_XIVE_GET/SET_VP_INFO */ enum { OPAL_XIVE_VP_ENABLED = 0x00000001, + OPAL_XIVE_VP_SINGLE_ESCALATION = 0x00000002, }; /* "Any chip" replacement for chip ID for allocation functions */ diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index b619a5585cd68b..e602903c3029e8 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -111,9 +111,10 @@ extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio); extern void xive_native_sync_source(u32 hw_irq); extern bool is_xive_irq(struct irq_chip *chip); -extern int xive_native_enable_vp(u32 vp_id); +extern int xive_native_enable_vp(u32 vp_id, bool single_escalation); extern int xive_native_disable_vp(u32 vp_id); extern int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id); +extern bool xive_native_has_single_escalation(void); #else diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 6cff5bdfd6b7ef..a102efeabf05a1 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -112,19 +112,21 @@ static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio) return -EIO; } - /* - * Future improvement: start with them disabled - * and handle DD2 and later scheme of merged escalation - * interrupts - */ - name = kasprintf(GFP_KERNEL, "kvm-%d-%d-%d", - vcpu->kvm->arch.lpid, xc->server_num, prio); + if (xc->xive->single_escalation) + name = kasprintf(GFP_KERNEL, "kvm-%d-%d", + vcpu->kvm->arch.lpid, xc->server_num); + else + name = kasprintf(GFP_KERNEL, "kvm-%d-%d-%d", + vcpu->kvm->arch.lpid, xc->server_num, prio); if (!name) { pr_err("Failed to allocate escalation irq name for queue %d of VCPU %d\n", prio, xc->server_num); rc = -ENOMEM; goto error; } + + pr_devel("Escalation %s irq %d (prio %d)\n", name, xc->esc_virq[prio], prio); + rc = request_irq(xc->esc_virq[prio], xive_esc_irq, IRQF_NO_THREAD, name, vcpu); if (rc) { @@ -191,12 +193,12 @@ static int xive_check_provisioning(struct kvm *kvm, u8 prio) pr_devel("Provisioning prio... %d\n", prio); - /* Provision each VCPU and enable escalations */ + /* Provision each VCPU and enable escalations if needed */ kvm_for_each_vcpu(i, vcpu, kvm) { if (!vcpu->arch.xive_vcpu) continue; rc = xive_provision_queue(vcpu, prio); - if (rc == 0) + if (rc == 0 && !xive->single_escalation) xive_attach_escalation(vcpu, prio); if (rc) return rc; @@ -1081,6 +1083,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, /* Allocate IPI */ xc->vp_ipi = xive_native_alloc_irq(); if (!xc->vp_ipi) { + pr_err("Failed to allocate xive irq for VCPU IPI\n"); r = -EIO; goto bail; } @@ -1090,19 +1093,34 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, if (r) goto bail; + /* + * Enable the VP first as the single escalation mode will + * affect escalation interrupts numbering + */ + r = xive_native_enable_vp(xc->vp_id, xive->single_escalation); + if (r) { + pr_err("Failed to enable VP in OPAL, err %d\n", r); + goto bail; + } + /* * Initialize queues. Initially we set them all for no queueing * and we enable escalation for queue 0 only which we'll use for * our mfrr change notifications. If the VCPU is hot-plugged, we - * do handle provisioning however. + * do handle provisioning however based on the existing "map" + * of enabled queues. */ for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { struct xive_q *q = &xc->queues[i]; + /* Single escalation, no queue 7 */ + if (i == 7 && xive->single_escalation) + break; + /* Is queue already enabled ? Provision it */ if (xive->qmap & (1 << i)) { r = xive_provision_queue(vcpu, i); - if (r == 0) + if (r == 0 && !xive->single_escalation) xive_attach_escalation(vcpu, i); if (r) goto bail; @@ -1122,11 +1140,6 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, if (r) goto bail; - /* Enable the VP */ - r = xive_native_enable_vp(xc->vp_id); - if (r) - goto bail; - /* Route the IPI */ r = xive_native_configure_irq(xc->vp_ipi, xc->vp_id, 0, XICS_IPI); if (!r) @@ -1473,6 +1486,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr) pr_devel(" val=0x016%llx (server=0x%x, guest_prio=%d)\n", val, server, guest_prio); + /* * If the source doesn't already have an IPI, allocate * one and get the corresponding data @@ -1761,6 +1775,8 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type) if (xive->vp_base == XIVE_INVALID_VP) ret = -ENOMEM; + xive->single_escalation = xive_native_has_single_escalation(); + if (ret) { kfree(xive); return ret; diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h index 6ba63f8e8a614e..a08ae6fd4c51fc 100644 --- a/arch/powerpc/kvm/book3s_xive.h +++ b/arch/powerpc/kvm/book3s_xive.h @@ -120,6 +120,8 @@ struct kvmppc_xive { u32 q_order; u32 q_page_order; + /* Flags */ + u8 single_escalation; }; #define KVMPPC_XIVE_Q_COUNT 8 @@ -201,25 +203,20 @@ static inline struct kvmppc_xive_src_block *kvmppc_xive_find_source(struct kvmpp * is as follow. * * Guest request for 0...6 are honored. Guest request for anything - * higher results in a priority of 7 being applied. - * - * However, when XIRR is returned via H_XIRR, 7 is translated to 0xb - * in order to match AIX expectations + * higher results in a priority of 6 being applied. * * Similar mapping is done for CPPR values */ static inline u8 xive_prio_from_guest(u8 prio) { - if (prio == 0xff || prio < 8) + if (prio == 0xff || prio < 6) return prio; - return 7; + return 6; } static inline u8 xive_prio_to_guest(u8 prio) { - if (prio == 0xff || prio < 7) - return prio; - return 0xb; + return prio; } static inline u32 __xive_read_eq(__be32 *qpage, u32 msk, u32 *idx, u32 *toggle) diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index ebc244b08d6748..d22aeb0b69e107 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -42,6 +42,7 @@ static u32 xive_provision_chip_count; static u32 xive_queue_shift; static u32 xive_pool_vps = XIVE_INVALID_VP; static struct kmem_cache *xive_provision_cache; +static bool xive_has_single_esc; int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data) { @@ -571,6 +572,10 @@ bool __init xive_native_init(void) break; } + /* Do we support single escalation */ + if (of_get_property(np, "single-escalation-support", NULL) != NULL) + xive_has_single_esc = true; + /* Configure Thread Management areas for KVM */ for_each_possible_cpu(cpu) kvmppc_set_xive_tima(cpu, r.start, tima); @@ -667,12 +672,15 @@ void xive_native_free_vp_block(u32 vp_base) } EXPORT_SYMBOL_GPL(xive_native_free_vp_block); -int xive_native_enable_vp(u32 vp_id) +int xive_native_enable_vp(u32 vp_id, bool single_escalation) { s64 rc; + u64 flags = OPAL_XIVE_VP_ENABLED; + if (single_escalation) + flags |= OPAL_XIVE_VP_SINGLE_ESCALATION; for (;;) { - rc = opal_xive_set_vp_info(vp_id, OPAL_XIVE_VP_ENABLED, 0); + rc = opal_xive_set_vp_info(vp_id, flags, 0); if (rc != OPAL_BUSY) break; msleep(1); @@ -710,3 +718,9 @@ int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id) return 0; } EXPORT_SYMBOL_GPL(xive_native_get_vp_info); + +bool xive_native_has_single_escalation(void) +{ + return xive_has_single_esc; +} +EXPORT_SYMBOL_GPL(xive_native_has_single_escalation); From 2267ea7661798a42f0da648a2970e2a03f4bc370 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 12 Jan 2018 13:37:13 +1100 Subject: [PATCH 143/197] KVM: PPC: Book3S HV: Don't use existing "prodded" flag for XIVE escalations The prodded flag is only cleared at the beginning of H_CEDE, so every time we have an escalation, we will cause the *next* H_CEDE to return immediately. Instead use a dedicated "irq_pending" flag to indicate that a guest interrupt is pending for the VCPU. We don't reuse the existing exception bitmap so as to avoid expensive atomic ops. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kernel/asm-offsets.c | 1 + arch/powerpc/kvm/book3s_hv.c | 2 +- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 10 ++++++++++ arch/powerpc/kvm/book3s_xive.c | 3 +-- 5 files changed, 14 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 3aa5b577cd609c..bfe51356af5e16 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -709,6 +709,7 @@ struct kvm_vcpu_arch { u8 ceded; u8 prodded; u8 doorbell_request; + u8 irq_pending; /* Used by XIVE to signal pending guest irqs */ u32 last_inst; struct swait_queue_head *wqp; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 6b958414b4e036..825089cf3e2359 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -514,6 +514,7 @@ int main(void) OFFSET(VCPU_PENDING_EXC, kvm_vcpu, arch.pending_exceptions); OFFSET(VCPU_CEDED, kvm_vcpu, arch.ceded); OFFSET(VCPU_PRODDED, kvm_vcpu, arch.prodded); + OFFSET(VCPU_IRQ_PENDING, kvm_vcpu, arch.irq_pending); OFFSET(VCPU_DBELL_REQ, kvm_vcpu, arch.doorbell_request); OFFSET(VCPU_MMCR, kvm_vcpu, arch.mmcr); OFFSET(VCPU_PMC, kvm_vcpu, arch.pmc); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 76cf48051eb3e3..e5f81fc108e094 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2999,7 +2999,7 @@ static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu) { if (!xive_enabled()) return false; - return vcpu->arch.xive_saved_state.pipr < + return vcpu->arch.irq_pending || vcpu->arch.xive_saved_state.pipr < vcpu->arch.xive_saved_state.cppr; } #else diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 7daf21be33d0b9..34dbab7deb39e5 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1035,6 +1035,16 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) li r9, 1 stw r9, VCPU_XIVE_PUSHED(r4) eieio + + /* + * We clear the irq_pending flag. There is a small chance of a + * race vs. the escalation interrupt happening on another + * processor setting it again, but the only consequence is to + * cause a spurrious wakeup on the next H_CEDE which is not an + * issue. + */ + li r0,0 + stb r0, VCPU_IRQ_PENDING(r4) no_xive: #endif /* CONFIG_KVM_XICS */ diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index a102efeabf05a1..eef9ccafdc098f 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -84,8 +84,7 @@ static irqreturn_t xive_esc_irq(int irq, void *data) { struct kvm_vcpu *vcpu = data; - /* We use the existing H_PROD mechanism to wake up the target */ - vcpu->arch.prodded = 1; + vcpu->arch.irq_pending = 1; smp_mb(); if (vcpu->arch.ceded) kvmppc_fast_vcpu_kick(vcpu); From 2662efd050953824de5c9b24449d6b5b342db10b Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 12 Jan 2018 13:37:14 +1100 Subject: [PATCH 144/197] KVM: PPC: Book3S HV: Check DR not IR to chose real vs virt mode MMIOs Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 34dbab7deb39e5..948f21cf84d569 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1464,7 +1464,7 @@ mc_cont: li r7, TM_SPC_PULL_OS_CTX li r6, TM_QW1_OS mfmsr r0 - andi. r0, r0, MSR_IR /* in real mode? */ + andi. r0, r0, MSR_DR /* in real mode? */ beq 2f ld r10, HSTATE_XIVE_TIMA_VIRT(r13) cmpldi cr0, r10, 0 From 35c2405efc0142860c4b698f4c6331567c4ca1ef Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 12 Jan 2018 13:37:15 +1100 Subject: [PATCH 145/197] KVM: PPC: Book3S HV: Make xive_pushed a byte, not a word Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 2 +- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index bfe51356af5e16..0c44fa67608d95 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -739,7 +739,7 @@ struct kvm_vcpu_arch { struct kvmppc_icp *icp; /* XICS presentation controller */ struct kvmppc_xive_vcpu *xive_vcpu; /* XIVE virtual CPU data */ __be32 xive_cam_word; /* Cooked W2 in proper endian with valid bit */ - u32 xive_pushed; /* Is the VP pushed on the physical CPU ? */ + u8 xive_pushed; /* Is the VP pushed on the physical CPU ? */ union xive_tma_w01 xive_saved_state; /* W0..1 of XIVE thread state */ #endif diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 948f21cf84d569..a7f429bc6de077 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1033,7 +1033,7 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) li r9, TM_QW1_OS + TM_WORD2 stwcix r11,r9,r10 li r9, 1 - stw r9, VCPU_XIVE_PUSHED(r4) + stb r9, VCPU_XIVE_PUSHED(r4) eieio /* @@ -1458,7 +1458,7 @@ mc_cont: #endif #ifdef CONFIG_KVM_XICS /* We are exiting, pull the VP from the XIVE */ - lwz r0, VCPU_XIVE_PUSHED(r9) + lbz r0, VCPU_XIVE_PUSHED(r9) cmpwi cr0, r0, 0 beq 1f li r7, TM_SPC_PULL_OS_CTX @@ -1487,7 +1487,7 @@ mc_cont: /* Fixup some of the state for the next load */ li r10, 0 li r0, 0xff - stw r10, VCPU_XIVE_PUSHED(r9) + stb r10, VCPU_XIVE_PUSHED(r9) stb r10, (VCPU_XIVE_SAVED_STATE+3)(r9) stb r0, (VCPU_XIVE_SAVED_STATE+4)(r9) eieio From 9b9b13a6d1537ddc4caccd6f1c41b78edbc08437 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 12 Jan 2018 13:37:16 +1100 Subject: [PATCH 146/197] KVM: PPC: Book3S HV: Keep XIVE escalation interrupt masked unless ceded This works on top of the single escalation support. When in single escalation, with this change, we will keep the escalation interrupt disabled unless the VCPU is in H_CEDE (idle). In any other case, we know the VCPU will be rescheduled and thus there is no need to take escalation interrupts in the host whenever a guest interrupt fires. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 3 ++ arch/powerpc/kernel/asm-offsets.c | 3 ++ arch/powerpc/kvm/book3s_hv_rmhandlers.S | 62 ++++++++++++++++++++++++- arch/powerpc/kvm/book3s_xive.c | 30 ++++++++++++ 4 files changed, 97 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 0c44fa67608d95..fef8133becc85c 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -740,7 +740,10 @@ struct kvm_vcpu_arch { struct kvmppc_xive_vcpu *xive_vcpu; /* XIVE virtual CPU data */ __be32 xive_cam_word; /* Cooked W2 in proper endian with valid bit */ u8 xive_pushed; /* Is the VP pushed on the physical CPU ? */ + u8 xive_esc_on; /* Is the escalation irq enabled ? */ union xive_tma_w01 xive_saved_state; /* W0..1 of XIVE thread state */ + u64 xive_esc_raddr; /* Escalation interrupt ESB real addr */ + u64 xive_esc_vaddr; /* Escalation interrupt ESB virt addr */ #endif #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 825089cf3e2359..1672dffd94e2e8 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -734,6 +734,9 @@ int main(void) DEFINE(VCPU_XIVE_CAM_WORD, offsetof(struct kvm_vcpu, arch.xive_cam_word)); DEFINE(VCPU_XIVE_PUSHED, offsetof(struct kvm_vcpu, arch.xive_pushed)); + DEFINE(VCPU_XIVE_ESC_ON, offsetof(struct kvm_vcpu, arch.xive_esc_on)); + DEFINE(VCPU_XIVE_ESC_RADDR, offsetof(struct kvm_vcpu, arch.xive_esc_raddr)); + DEFINE(VCPU_XIVE_ESC_VADDR, offsetof(struct kvm_vcpu, arch.xive_esc_vaddr)); #endif #ifdef CONFIG_KVM_EXIT_TIMING diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index a7f429bc6de077..a7a20b85d8eb8d 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1045,6 +1045,41 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) */ li r0,0 stb r0, VCPU_IRQ_PENDING(r4) + + /* + * In single escalation mode, if the escalation interrupt is + * on, we mask it. + */ + lbz r0, VCPU_XIVE_ESC_ON(r4) + cmpwi r0,0 + beq 1f + ld r10, VCPU_XIVE_ESC_RADDR(r4) + li r9, XIVE_ESB_SET_PQ_01 + ldcix r0, r10, r9 + sync + + /* We have a possible subtle race here: The escalation interrupt might + * have fired and be on its way to the host queue while we mask it, + * and if we unmask it early enough (re-cede right away), there is + * a theorical possibility that it fires again, thus landing in the + * target queue more than once which is a big no-no. + * + * Fortunately, solving this is rather easy. If the above load setting + * PQ to 01 returns a previous value where P is set, then we know the + * escalation interrupt is somewhere on its way to the host. In that + * case we simply don't clear the xive_esc_on flag below. It will be + * eventually cleared by the handler for the escalation interrupt. + * + * Then, when doing a cede, we check that flag again before re-enabling + * the escalation interrupt, and if set, we abort the cede. + */ + andi. r0, r0, XIVE_ESB_VAL_P + bne- 1f + + /* Now P is 0, we can clear the flag */ + li r0, 0 + stb r0, VCPU_XIVE_ESC_ON(r4) +1: no_xive: #endif /* CONFIG_KVM_XICS */ @@ -2756,7 +2791,32 @@ kvm_cede_prodded: /* we've ceded but we want to give control to the host */ kvm_cede_exit: ld r9, HSTATE_KVM_VCPU(r13) - b guest_exit_cont +#ifdef CONFIG_KVM_XICS + /* Abort if we still have a pending escalation */ + lbz r5, VCPU_XIVE_ESC_ON(r9) + cmpwi r5, 0 + beq 1f + li r0, 0 + stb r0, VCPU_CEDED(r9) +1: /* Enable XIVE escalation */ + li r5, XIVE_ESB_SET_PQ_00 + mfmsr r0 + andi. r0, r0, MSR_DR /* in real mode? */ + beq 1f + ld r10, VCPU_XIVE_ESC_VADDR(r9) + cmpdi r10, 0 + beq 3f + ldx r0, r10, r5 + b 2f +1: ld r10, VCPU_XIVE_ESC_RADDR(r9) + cmpdi r10, 0 + beq 3f + ldcix r0, r10, r5 +2: sync + li r0, 1 + stb r0, VCPU_XIVE_ESC_ON(r9) +#endif /* CONFIG_KVM_XICS */ +3: b guest_exit_cont /* Try to handle a machine check in real mode */ machine_check_realmode: diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index eef9ccafdc098f..7a047bc88f1103 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -89,6 +89,17 @@ static irqreturn_t xive_esc_irq(int irq, void *data) if (vcpu->arch.ceded) kvmppc_fast_vcpu_kick(vcpu); + /* Since we have the no-EOI flag, the interrupt is effectively + * disabled now. Clearing xive_esc_on means we won't bother + * doing so on the next entry. + * + * This also allows the entry code to know that if a PQ combination + * of 10 is observed while xive_esc_on is true, it means the queue + * contains an unprocessed escalation interrupt. We don't make use of + * that knowledge today but might (see comment in book3s_hv_rmhandler.S) + */ + vcpu->arch.xive_esc_on = false; + return IRQ_HANDLED; } @@ -134,6 +145,25 @@ static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio) goto error; } xc->esc_virq_names[prio] = name; + + /* In single escalation mode, we grab the ESB MMIO of the + * interrupt and mask it. Also populate the VCPU v/raddr + * of the ESB page for use by asm entry/exit code. Finally + * set the XIVE_IRQ_NO_EOI flag which will prevent the + * core code from performing an EOI on the escalation + * interrupt, thus leaving it effectively masked after + * it fires once. + */ + if (xc->xive->single_escalation) { + struct irq_data *d = irq_get_irq_data(xc->esc_virq[prio]); + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + + xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_01); + vcpu->arch.xive_esc_raddr = xd->eoi_page; + vcpu->arch.xive_esc_vaddr = (__force u64)xd->eoi_mmio; + xd->flags |= XIVE_IRQ_NO_EOI; + } + return 0; error: irq_dispose_mapping(xc->esc_virq[prio]); From c0b4bd219124fdb7306fb67b571aff4c56a5e8f9 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Wed, 13 Dec 2017 13:53:22 +0100 Subject: [PATCH 147/197] s390/mm: Remove superfluous parameter It seems it hasn't even been used before the last cleanup and was overlooked. Signed-off-by: Janosch Frank Message-Id: <1513169613-13509-12-git-send-email-frankja@linux.vnet.ibm.com> Reviewed-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/mm/gmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 54cfd51a5a27e4..4e9d12af95cc9a 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -1998,7 +1998,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_page); * Called with sg->parent->shadow_lock. */ static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, - unsigned long gaddr, pte_t *pte) + unsigned long gaddr) { struct gmap_rmap *rmap, *rnext, *head; unsigned long start, end, bits, raddr; @@ -2083,7 +2083,7 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, spin_lock(&gmap->shadow_lock); list_for_each_entry_safe(sg, next, &gmap->children, list) - gmap_shadow_notify(sg, vmaddr, gaddr, pte); + gmap_shadow_notify(sg, vmaddr, gaddr); spin_unlock(&gmap->shadow_lock); } if (bits & PGSTE_IN_BIT) From 58d6b15e9da5042a99c9c30ad725792e4569150e Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 22 Jan 2018 18:19:06 +0000 Subject: [PATCH 148/197] KVM: arm/arm64: Handle CPU_PM_ENTER_FAILED cpu_pm_enter() calls the pm notifier chain with CPU_PM_ENTER, then if there is a failure: CPU_PM_ENTER_FAILED. When KVM receives CPU_PM_ENTER it calls cpu_hyp_reset() which will return us to the hyp-stub. If we subsequently get a CPU_PM_ENTER_FAILED, KVM does nothing, leaving the CPU running with the hyp-stub, at odds with kvm_arm_hardware_enabled. Add CPU_PM_ENTER_FAILED as a fallthrough for CPU_PM_EXIT, this reloads KVM based on kvm_arm_hardware_enabled. This is safe even if CPU_PM_ENTER never gets as far as KVM, as cpu_hyp_reinit() calls cpu_hyp_reset() to make sure the hyp-stub is loaded before reloading KVM. Fixes: 67f691976662 ("arm64: kvm: allows kvm cpu hotplug") Cc: # v4.7+ CC: Lorenzo Pieralisi Reviewed-by: Christoffer Dall Signed-off-by: James Morse Signed-off-by: Christoffer Dall --- virt/kvm/arm/arm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 59d8e04c19fa28..639dca0c056090 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -1262,6 +1262,7 @@ static int hyp_init_cpu_pm_notifier(struct notifier_block *self, cpu_hyp_reset(); return NOTIFY_OK; + case CPU_PM_ENTER_FAILED: case CPU_PM_EXIT: if (__this_cpu_read(kvm_arm_hardware_enabled)) /* The hardware was enabled before suspend. */ From b276f1b3b1eb52697f6645bb98f7d32ffb89df69 Mon Sep 17 00:00:00 2001 From: Luis de Bethencourt Date: Tue, 23 Jan 2018 15:11:14 +0000 Subject: [PATCH 149/197] KVM: arm/arm64: Fix trailing semicolon The trailing semicolon is an empty statement that does no operation. Removing it since it doesn't do anything. Signed-off-by: Luis de Bethencourt Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_emulate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h index 3d22eb87f919a8..9003bd19cb7018 100644 --- a/arch/arm/include/asm/kvm_emulate.h +++ b/arch/arm/include/asm/kvm_emulate.h @@ -131,7 +131,7 @@ static inline bool mode_has_spsr(struct kvm_vcpu *vcpu) static inline bool vcpu_mode_priv(struct kvm_vcpu *vcpu) { unsigned long cpsr_mode = vcpu->arch.ctxt.gp_regs.usr_regs.ARM_cpsr & MODE_MASK; - return cpsr_mode > USR_MODE;; + return cpsr_mode > USR_MODE; } static inline u32 kvm_vcpu_get_hsr(const struct kvm_vcpu *vcpu) From b3ecd4aa8632a86428605ab73393d14779019d82 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 16 Jan 2018 18:15:25 +0100 Subject: [PATCH 150/197] KVM: s390: vsie: use READ_ONCE to access some SCB fields Another VCPU might try to modify the SCB while we are creating the shadow SCB. In general this is no problem - unless the compiler decides to not load values once, but e.g. twice. For us, this is only relevant when checking/working with such values. E.g. the prefix value, the mso, state of transactional execution and addresses of satellite blocks. E.g. if we blindly forward values (e.g. general purpose registers or execution controls after masking), we don't care. Leaving unpin_blocks() untouched for now, will handle it separately. The worst thing right now that I can see would be a missed prefix un/remap (mso, prefix, tx) or using wrong guest addresses. Nothing critical, but let's try to avoid unpredictable behavior. Signed-off-by: David Hildenbrand Message-Id: <20180116171526.12343-2-david@redhat.com> Reviewed-by: Christian Borntraeger Acked-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/vsie.c | 50 +++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 5d6ae0326d9e8f..b2066fbb6341fc 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -28,7 +28,11 @@ struct vsie_page { * the same offset as that in struct sie_page! */ struct mcck_volatile_info mcck_info; /* 0x0200 */ - /* the pinned originial scb */ + /* + * The pinned original scb. Be aware that other VCPUs can modify + * it while we read from it. Values that are used for conditions or + * are reused conditionally, should be accessed via READ_ONCE. + */ struct kvm_s390_sie_block *scb_o; /* 0x0218 */ /* the shadow gmap in use by the vsie_page */ struct gmap *gmap; /* 0x0220 */ @@ -140,12 +144,13 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; - u32 crycb_addr = scb_o->crycbd & 0x7ffffff8U; + const uint32_t crycbd_o = READ_ONCE(scb_o->crycbd); + const u32 crycb_addr = crycbd_o & 0x7ffffff8U; unsigned long *b1, *b2; u8 ecb3_flags; scb_s->crycbd = 0; - if (!(scb_o->crycbd & vcpu->arch.sie_block->crycbd & CRYCB_FORMAT1)) + if (!(crycbd_o & vcpu->arch.sie_block->crycbd & CRYCB_FORMAT1)) return 0; /* format-1 is supported with message-security-assist extension 3 */ if (!test_kvm_facility(vcpu->kvm, 76)) @@ -183,12 +188,15 @@ static void prepare_ibc(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + /* READ_ONCE does not work on bitfields - use a temporary variable */ + const uint32_t __new_ibc = scb_o->ibc; + const uint32_t new_ibc = READ_ONCE(__new_ibc) & 0x0fffU; __u64 min_ibc = (sclp.ibc >> 16) & 0x0fffU; scb_s->ibc = 0; /* ibc installed in g2 and requested for g3 */ - if (vcpu->kvm->arch.model.ibc && (scb_o->ibc & 0x0fffU)) { - scb_s->ibc = scb_o->ibc & 0x0fffU; + if (vcpu->kvm->arch.model.ibc && new_ibc) { + scb_s->ibc = new_ibc; /* takte care of the minimum ibc level of the machine */ if (scb_s->ibc < min_ibc) scb_s->ibc = min_ibc; @@ -253,6 +261,10 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + /* READ_ONCE does not work on bitfields - use a temporary variable */ + const uint32_t __new_prefix = scb_o->prefix; + const uint32_t new_prefix = READ_ONCE(__new_prefix); + const bool wants_tx = READ_ONCE(scb_o->ecb) & ECB_TE; bool had_tx = scb_s->ecb & ECB_TE; unsigned long new_mso = 0; int rc; @@ -299,14 +311,14 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->icpua = scb_o->icpua; if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_SM)) - new_mso = scb_o->mso & 0xfffffffffff00000UL; + new_mso = READ_ONCE(scb_o->mso) & 0xfffffffffff00000UL; /* if the hva of the prefix changes, we have to remap the prefix */ - if (scb_s->mso != new_mso || scb_s->prefix != scb_o->prefix) + if (scb_s->mso != new_mso || scb_s->prefix != new_prefix) prefix_unmapped(vsie_page); /* SIE will do mso/msl validity and exception checks for us */ scb_s->msl = scb_o->msl & 0xfffffffffff00000UL; scb_s->mso = new_mso; - scb_s->prefix = scb_o->prefix; + scb_s->prefix = new_prefix; /* We have to definetly flush the tlb if this scb never ran */ if (scb_s->ihcpu != 0xffffU) @@ -318,11 +330,11 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP)) scb_s->ecb |= scb_o->ecb & ECB_HOSTPROTINT; /* transactional execution */ - if (test_kvm_facility(vcpu->kvm, 73)) { + if (test_kvm_facility(vcpu->kvm, 73) && wants_tx) { /* remap the prefix is tx is toggled on */ - if ((scb_o->ecb & ECB_TE) && !had_tx) + if (!had_tx) prefix_unmapped(vsie_page); - scb_s->ecb |= scb_o->ecb & ECB_TE; + scb_s->ecb |= ECB_TE; } /* SIMD */ if (test_kvm_facility(vcpu->kvm, 129)) { @@ -529,9 +541,9 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) gpa_t gpa; int rc = 0; - gpa = scb_o->scaol & ~0xfUL; + gpa = READ_ONCE(scb_o->scaol) & ~0xfUL; if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO)) - gpa |= (u64) scb_o->scaoh << 32; + gpa |= (u64) READ_ONCE(scb_o->scaoh) << 32; if (gpa) { if (!(gpa & ~0x1fffUL)) rc = set_validity_icpt(scb_s, 0x0038U); @@ -551,7 +563,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->scaol = (u32)(u64)hpa; } - gpa = scb_o->itdba & ~0xffUL; + gpa = READ_ONCE(scb_o->itdba) & ~0xffUL; if (gpa && (scb_s->ecb & ECB_TE)) { if (!(gpa & ~0x1fffU)) { rc = set_validity_icpt(scb_s, 0x0080U); @@ -566,7 +578,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->itdba = hpa; } - gpa = scb_o->gvrd & ~0x1ffUL; + gpa = READ_ONCE(scb_o->gvrd) & ~0x1ffUL; if (gpa && (scb_s->eca & ECA_VX) && !(scb_s->ecd & ECD_HOSTREGMGMT)) { if (!(gpa & ~0x1fffUL)) { rc = set_validity_icpt(scb_s, 0x1310U); @@ -584,7 +596,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->gvrd = hpa; } - gpa = scb_o->riccbd & ~0x3fUL; + gpa = READ_ONCE(scb_o->riccbd) & ~0x3fUL; if (gpa && (scb_s->ecb3 & ECB3_RI)) { if (!(gpa & ~0x1fffUL)) { rc = set_validity_icpt(scb_s, 0x0043U); @@ -602,8 +614,8 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) if ((scb_s->ecb & ECB_GS) && !(scb_s->ecd & ECD_HOSTREGMGMT)) { unsigned long sdnxc; - gpa = scb_o->sdnxo & ~0xfUL; - sdnxc = scb_o->sdnxo & 0xfUL; + gpa = READ_ONCE(scb_o->sdnxo) & ~0xfUL; + sdnxc = READ_ONCE(scb_o->sdnxo) & 0xfUL; if (!gpa || !(gpa & ~0x1fffUL)) { rc = set_validity_icpt(scb_s, 0x10b0U); goto unpin; @@ -768,7 +780,7 @@ static void retry_vsie_icpt(struct vsie_page *vsie_page) static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; - __u32 fac = vsie_page->scb_o->fac & 0x7ffffff8U; + __u32 fac = READ_ONCE(vsie_page->scb_o->fac) & 0x7ffffff8U; if (fac && test_kvm_facility(vcpu->kvm, 7)) { retry_vsie_icpt(vsie_page); From 15e5020e575d0c1a4eddd99bf7ffdc1f34a3b17d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 16 Jan 2018 18:15:26 +0100 Subject: [PATCH 151/197] KVM: s390: vsie: store guest addresses of satellite blocks in vsie_page This way, the values cannot change, even if another VCPU might try to mess with the nested SCB currently getting executed by another VCPU. We now always use the same gpa for pinning and unpinning a page (for unpinning, it is only relevant to mark the guest page dirty for migration). Signed-off-by: David Hildenbrand Message-Id: <20180116171526.12343-3-david@redhat.com> Reviewed-by: Christian Borntraeger Acked-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/vsie.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index b2066fbb6341fc..6090f4235cfc0d 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -38,7 +38,13 @@ struct vsie_page { struct gmap *gmap; /* 0x0220 */ /* address of the last reported fault to guest2 */ unsigned long fault_addr; /* 0x0228 */ - __u8 reserved[0x0700 - 0x0230]; /* 0x0230 */ + /* calculated guest addresses of satellite control blocks */ + gpa_t sca_gpa; /* 0x0230 */ + gpa_t itdba_gpa; /* 0x0238 */ + gpa_t gvrd_gpa; /* 0x0240 */ + gpa_t riccbd_gpa; /* 0x0248 */ + gpa_t sdnx_gpa; /* 0x0250 */ + __u8 reserved[0x0700 - 0x0258]; /* 0x0258 */ struct kvm_s390_crypto_cb crycb; /* 0x0700 */ __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */ }; @@ -475,46 +481,42 @@ static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa) /* unpin all blocks previously pinned by pin_blocks(), marking them dirty */ static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { - struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; hpa_t hpa; - gpa_t gpa; hpa = (u64) scb_s->scaoh << 32 | scb_s->scaol; if (hpa) { - gpa = scb_o->scaol & ~0xfUL; - if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO)) - gpa |= (u64) scb_o->scaoh << 32; - unpin_guest_page(vcpu->kvm, gpa, hpa); + unpin_guest_page(vcpu->kvm, vsie_page->sca_gpa, hpa); + vsie_page->sca_gpa = 0; scb_s->scaol = 0; scb_s->scaoh = 0; } hpa = scb_s->itdba; if (hpa) { - gpa = scb_o->itdba & ~0xffUL; - unpin_guest_page(vcpu->kvm, gpa, hpa); + unpin_guest_page(vcpu->kvm, vsie_page->itdba_gpa, hpa); + vsie_page->itdba_gpa = 0; scb_s->itdba = 0; } hpa = scb_s->gvrd; if (hpa) { - gpa = scb_o->gvrd & ~0x1ffUL; - unpin_guest_page(vcpu->kvm, gpa, hpa); + unpin_guest_page(vcpu->kvm, vsie_page->gvrd_gpa, hpa); + vsie_page->gvrd_gpa = 0; scb_s->gvrd = 0; } hpa = scb_s->riccbd; if (hpa) { - gpa = scb_o->riccbd & ~0x3fUL; - unpin_guest_page(vcpu->kvm, gpa, hpa); + unpin_guest_page(vcpu->kvm, vsie_page->riccbd_gpa, hpa); + vsie_page->riccbd_gpa = 0; scb_s->riccbd = 0; } hpa = scb_s->sdnxo; if (hpa) { - gpa = scb_o->sdnxo; - unpin_guest_page(vcpu->kvm, gpa, hpa); + unpin_guest_page(vcpu->kvm, vsie_page->sdnx_gpa, hpa); + vsie_page->sdnx_gpa = 0; scb_s->sdnxo = 0; } } @@ -559,6 +561,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) } if (rc) goto unpin; + vsie_page->sca_gpa = gpa; scb_s->scaoh = (u32)((u64)hpa >> 32); scb_s->scaol = (u32)(u64)hpa; } @@ -575,6 +578,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) rc = set_validity_icpt(scb_s, 0x0080U); goto unpin; } + vsie_page->itdba_gpa = gpa; scb_s->itdba = hpa; } @@ -593,6 +597,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) rc = set_validity_icpt(scb_s, 0x1310U); goto unpin; } + vsie_page->gvrd_gpa = gpa; scb_s->gvrd = hpa; } @@ -609,6 +614,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) goto unpin; } /* Validity 0x0044 will be checked by SIE */ + vsie_page->riccbd_gpa = gpa; scb_s->riccbd = hpa; } if ((scb_s->ecb & ECB_GS) && !(scb_s->ecd & ECD_HOSTREGMGMT)) { @@ -636,6 +642,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) rc = set_validity_icpt(scb_s, 0x10b0U); goto unpin; } + vsie_page->sdnx_gpa = gpa; scb_s->sdnxo = hpa | sdnxc; } return 0; From 5c528db0df6999cc7bf2151b0f9fb6446109ec54 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 23 Jan 2018 22:26:18 +0100 Subject: [PATCH 152/197] s390x/mm: simplify gmap_protect_rmap() We never call it with anything but PROT_READ. This is a left over from an old prototype. For creation of shadow page tables, we always only have to protect the original table in guest memory from write accesses, so we can properly invalidate the shadow on writes. Other protections are not needed. Signed-off-by: David Hildenbrand Message-Id: <20180123212618.32611-1-david@redhat.com> Reviewed-by: Janosch Frank Signed-off-by: Christian Borntraeger --- arch/s390/mm/gmap.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 4e9d12af95cc9a..2c55a2b9d6c65b 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -1021,18 +1021,17 @@ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, } /** - * gmap_protect_rmap - modify access rights to memory and create an rmap + * gmap_protect_rmap - restrict access rights to memory (RO) and create an rmap * @sg: pointer to the shadow guest address space structure * @raddr: rmap address in the shadow gmap * @paddr: address in the parent guest address space * @len: length of the memory area to protect - * @prot: indicates access rights: none, read-only or read-write * * Returns 0 if successfully protected and the rmap was created, -ENOMEM * if out of memory and -EFAULT if paddr is invalid. */ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, - unsigned long paddr, unsigned long len, int prot) + unsigned long paddr, unsigned long len) { struct gmap *parent; struct gmap_rmap *rmap; @@ -1060,7 +1059,7 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, ptep = gmap_pte_op_walk(parent, paddr, &ptl); if (ptep) { spin_lock(&sg->guest_table_lock); - rc = ptep_force_prot(parent->mm, paddr, ptep, prot, + rc = ptep_force_prot(parent->mm, paddr, ptep, PROT_READ, PGSTE_VSIE_BIT); if (!rc) gmap_insert_rmap(sg, vmaddr, rmap); @@ -1070,7 +1069,7 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, radix_tree_preload_end(); if (rc) { kfree(rmap); - rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot); + rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ); if (rc) return rc; continue; @@ -1609,7 +1608,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, origin = r2t & _REGION_ENTRY_ORIGIN; offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; - rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); + rc = gmap_protect_rmap(sg, raddr, origin + offset, len); spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 4); @@ -1692,7 +1691,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, origin = r3t & _REGION_ENTRY_ORIGIN; offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; - rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); + rc = gmap_protect_rmap(sg, raddr, origin + offset, len); spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 3); @@ -1776,7 +1775,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, origin = sgt & _REGION_ENTRY_ORIGIN; offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; - rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); + rc = gmap_protect_rmap(sg, raddr, origin + offset, len); spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 2); @@ -1895,7 +1894,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, /* Make pgt read-only in parent gmap page table (not the pgste) */ raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT; origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK; - rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE, PROT_READ); + rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE); spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 1); From 866c138c32547b690e4cab3cd209c763508a95ab Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 24 Jan 2018 12:27:01 +0100 Subject: [PATCH 153/197] KVM: s390: diagnoses are instructions as well Make the diagnose counters also appear as instruction counters. Signed-off-by: Christian Borntraeger Reviewed-by: Janosch Frank Reviewed-by: David Hildenbrand Reviewed-by: Cornelia Huck --- arch/s390/kvm/kvm-s390.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index cc5a1b4d74b7da..ce8ea8b94478e2 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -118,12 +118,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "instruction_sigp_cpu_reset", VCPU_STAT(instruction_sigp_cpu_reset) }, { "instruction_sigp_init_cpu_reset", VCPU_STAT(instruction_sigp_init_cpu_reset) }, { "instruction_sigp_unknown", VCPU_STAT(instruction_sigp_unknown) }, - { "diagnose_10", VCPU_STAT(diagnose_10) }, - { "diagnose_44", VCPU_STAT(diagnose_44) }, - { "diagnose_9c", VCPU_STAT(diagnose_9c) }, - { "diagnose_258", VCPU_STAT(diagnose_258) }, - { "diagnose_308", VCPU_STAT(diagnose_308) }, - { "diagnose_500", VCPU_STAT(diagnose_500) }, + { "instruction_diag_10", VCPU_STAT(diagnose_10) }, + { "instruction_diag_44", VCPU_STAT(diagnose_44) }, + { "instruction_diag_9c", VCPU_STAT(diagnose_9c) }, + { "instruction_diag_258", VCPU_STAT(diagnose_258) }, + { "instruction_diag_308", VCPU_STAT(diagnose_308) }, + { "instruction_diag_500", VCPU_STAT(diagnose_500) }, { NULL } }; From a37cb07a30f0a181bc45c6970e486ac2992e9cde Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 23 Jan 2018 13:28:40 +0100 Subject: [PATCH 154/197] KVM: s390: add vcpu stat counters for many instruction The overall instruction counter is larger than the sum of the single counters. We should try to catch all instruction handlers to make this match the summary counter. Let us add sck,tb,sske,iske,rrbe,tb,tpi,tsch,lpsw,pswe.... and remove other unused ones. Signed-off-by: Christian Borntraeger Acked-by: Janosch Frank Reviewed-by: David Hildenbrand --- arch/s390/include/asm/kvm_host.h | 21 +++++++++++++++++---- arch/s390/kvm/diag.c | 1 + arch/s390/kvm/kvm-s390.c | 21 +++++++++++++++++---- arch/s390/kvm/priv.c | 32 ++++++++++++++++++++++++++++++-- 4 files changed, 65 insertions(+), 10 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index e16a9f2a44ad69..029e8144c6ecc8 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -2,7 +2,7 @@ /* * definition for kernel virtual machines on s390 * - * Copyright IBM Corp. 2008, 2009 + * Copyright IBM Corp. 2008, 2018 * * Author(s): Carsten Otte */ @@ -316,18 +316,30 @@ struct kvm_vcpu_stat { u64 deliver_program_int; u64 deliver_io_int; u64 exit_wait_state; + u64 instruction_epsw; + u64 instruction_gs; + u64 instruction_io_other; + u64 instruction_lpsw; + u64 instruction_lpswe; u64 instruction_pfmf; + u64 instruction_ptff; + u64 instruction_sck; + u64 instruction_sckpf; u64 instruction_stidp; u64 instruction_spx; u64 instruction_stpx; u64 instruction_stap; - u64 instruction_storage_key; + u64 instruction_iske; + u64 instruction_ri; + u64 instruction_rrbe; + u64 instruction_sske; u64 instruction_ipte_interlock; - u64 instruction_stsch; - u64 instruction_chsc; u64 instruction_stsi; u64 instruction_stfl; + u64 instruction_tb; + u64 instruction_tpi; u64 instruction_tprot; + u64 instruction_tsch; u64 instruction_sie; u64 instruction_essa; u64 instruction_sthyi; @@ -353,6 +365,7 @@ struct kvm_vcpu_stat { u64 diagnose_258; u64 diagnose_308; u64 diagnose_500; + u64 diagnose_other; }; #define PGM_OPERATION 0x01 diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 89aa114a2cbada..45634b3d2e0aed 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -257,6 +257,7 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu) case 0x500: return __diag_virtio_hypercall(vcpu); default: + vcpu->stat.diagnose_other++; return -EOPNOTSUPP; } } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ce8ea8b94478e2..8a3a1d50ef99dc 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2,7 +2,7 @@ /* * hosting IBM Z kernel virtual machines (s390x) * - * Copyright IBM Corp. 2008, 2017 + * Copyright IBM Corp. 2008, 2018 * * Author(s): Carsten Otte * Christian Borntraeger @@ -87,19 +87,31 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "deliver_restart_signal", VCPU_STAT(deliver_restart_signal) }, { "deliver_program_interruption", VCPU_STAT(deliver_program_int) }, { "exit_wait_state", VCPU_STAT(exit_wait_state) }, + { "instruction_epsw", VCPU_STAT(instruction_epsw) }, + { "instruction_gs", VCPU_STAT(instruction_gs) }, + { "instruction_io_other", VCPU_STAT(instruction_io_other) }, + { "instruction_lpsw", VCPU_STAT(instruction_lpsw) }, + { "instruction_lpswe", VCPU_STAT(instruction_lpswe) }, { "instruction_pfmf", VCPU_STAT(instruction_pfmf) }, + { "instruction_ptff", VCPU_STAT(instruction_ptff) }, { "instruction_stidp", VCPU_STAT(instruction_stidp) }, + { "instruction_sck", VCPU_STAT(instruction_sck) }, + { "instruction_sckpf", VCPU_STAT(instruction_sckpf) }, { "instruction_spx", VCPU_STAT(instruction_spx) }, { "instruction_stpx", VCPU_STAT(instruction_stpx) }, { "instruction_stap", VCPU_STAT(instruction_stap) }, - { "instruction_storage_key", VCPU_STAT(instruction_storage_key) }, + { "instruction_iske", VCPU_STAT(instruction_iske) }, + { "instruction_ri", VCPU_STAT(instruction_ri) }, + { "instruction_rrbe", VCPU_STAT(instruction_rrbe) }, + { "instruction_sske", VCPU_STAT(instruction_sske) }, { "instruction_ipte_interlock", VCPU_STAT(instruction_ipte_interlock) }, - { "instruction_stsch", VCPU_STAT(instruction_stsch) }, - { "instruction_chsc", VCPU_STAT(instruction_chsc) }, { "instruction_essa", VCPU_STAT(instruction_essa) }, { "instruction_stsi", VCPU_STAT(instruction_stsi) }, { "instruction_stfl", VCPU_STAT(instruction_stfl) }, + { "instruction_tb", VCPU_STAT(instruction_tb) }, + { "instruction_tpi", VCPU_STAT(instruction_tpi) }, { "instruction_tprot", VCPU_STAT(instruction_tprot) }, + { "instruction_tsch", VCPU_STAT(instruction_tsch) }, { "instruction_sthyi", VCPU_STAT(instruction_sthyi) }, { "instruction_sie", VCPU_STAT(instruction_sie) }, { "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) }, @@ -124,6 +136,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "instruction_diag_258", VCPU_STAT(diagnose_258) }, { "instruction_diag_308", VCPU_STAT(diagnose_308) }, { "instruction_diag_500", VCPU_STAT(diagnose_500) }, + { "instruction_diag_other", VCPU_STAT(diagnose_other) }, { NULL } }; diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 572496c688cc0c..dea0217922d6f8 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -2,7 +2,7 @@ /* * handling privileged instructions * - * Copyright IBM Corp. 2008, 2013 + * Copyright IBM Corp. 2008, 2018 * * Author(s): Carsten Otte * Christian Borntraeger @@ -34,6 +34,8 @@ static int handle_ri(struct kvm_vcpu *vcpu) { + vcpu->stat.instruction_ri++; + if (test_kvm_facility(vcpu->kvm, 64)) { VCPU_EVENT(vcpu, 3, "%s", "ENABLE: RI (lazy)"); vcpu->arch.sie_block->ecb3 |= ECB3_RI; @@ -53,6 +55,8 @@ int kvm_s390_handle_aa(struct kvm_vcpu *vcpu) static int handle_gs(struct kvm_vcpu *vcpu) { + vcpu->stat.instruction_gs++; + if (test_kvm_facility(vcpu->kvm, 133)) { VCPU_EVENT(vcpu, 3, "%s", "ENABLE: GS (lazy)"); preempt_disable(); @@ -85,6 +89,8 @@ static int handle_set_clock(struct kvm_vcpu *vcpu) u8 ar; u64 op2, val; + vcpu->stat.instruction_sck++; + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); @@ -222,7 +228,6 @@ static int try_handle_skey(struct kvm_vcpu *vcpu) { int rc; - vcpu->stat.instruction_storage_key++; rc = kvm_s390_skey_check_enable(vcpu); if (rc) return rc; @@ -242,6 +247,8 @@ static int handle_iske(struct kvm_vcpu *vcpu) int reg1, reg2; int rc; + vcpu->stat.instruction_iske++; + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); @@ -274,6 +281,8 @@ static int handle_rrbe(struct kvm_vcpu *vcpu) int reg1, reg2; int rc; + vcpu->stat.instruction_rrbe++; + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); @@ -312,6 +321,8 @@ static int handle_sske(struct kvm_vcpu *vcpu) int reg1, reg2; int rc; + vcpu->stat.instruction_sske++; + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); @@ -392,6 +403,8 @@ static int handle_test_block(struct kvm_vcpu *vcpu) gpa_t addr; int reg2; + vcpu->stat.instruction_tb++; + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); @@ -424,6 +437,8 @@ static int handle_tpi(struct kvm_vcpu *vcpu) u64 addr; u8 ar; + vcpu->stat.instruction_tpi++; + addr = kvm_s390_get_base_disp_s(vcpu, &ar); if (addr & 3) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); @@ -484,6 +499,8 @@ static int handle_tsch(struct kvm_vcpu *vcpu) struct kvm_s390_interrupt_info *inti = NULL; const u64 isc_mask = 0xffUL << 24; /* all iscs set */ + vcpu->stat.instruction_tsch++; + /* a valid schid has at least one bit set */ if (vcpu->run->s.regs.gprs[1]) inti = kvm_s390_get_io_int(vcpu->kvm, isc_mask, @@ -527,6 +544,7 @@ static int handle_io_inst(struct kvm_vcpu *vcpu) if (vcpu->arch.sie_block->ipa == 0xb235) return handle_tsch(vcpu); /* Handle in userspace. */ + vcpu->stat.instruction_io_other++; return -EOPNOTSUPP; } else { /* @@ -592,6 +610,8 @@ int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu) int rc; u8 ar; + vcpu->stat.instruction_lpsw++; + if (gpsw->mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); @@ -619,6 +639,8 @@ static int handle_lpswe(struct kvm_vcpu *vcpu) int rc; u8 ar; + vcpu->stat.instruction_lpswe++; + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); @@ -828,6 +850,8 @@ static int handle_epsw(struct kvm_vcpu *vcpu) { int reg1, reg2; + vcpu->stat.instruction_epsw++; + kvm_s390_get_regs_rre(vcpu, ®1, ®2); /* This basically extracts the mask half of the psw. */ @@ -1332,6 +1356,8 @@ static int handle_sckpf(struct kvm_vcpu *vcpu) { u32 value; + vcpu->stat.instruction_sckpf++; + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); @@ -1347,6 +1373,8 @@ static int handle_sckpf(struct kvm_vcpu *vcpu) static int handle_ptff(struct kvm_vcpu *vcpu) { + vcpu->stat.instruction_ptff++; + /* we don't emulate any control instructions yet */ kvm_s390_set_psw_cc(vcpu, 3); return 0; From 2018224df3174763d4ece26c103c99438e0c42fc Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 23 Jan 2018 18:05:28 +0100 Subject: [PATCH 155/197] KVM: s390: rename __set_cpuflag() to kvm_s390_set_cpuflags() No need to make this function special. Move it to a header right away. Signed-off-by: David Hildenbrand Message-Id: <20180123170531.13687-2-david@redhat.com> Reviewed-by: Thomas Huth Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 29 ++++++++++++----------------- arch/s390/kvm/kvm-s390.h | 5 +++++ 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index f8eb2cfa763acd..96ea3b80b67a4a 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -301,17 +301,12 @@ static void __reset_intercept_indicators(struct kvm_vcpu *vcpu) } } -static void __set_cpuflag(struct kvm_vcpu *vcpu, u32 flag) -{ - atomic_or(flag, &vcpu->arch.sie_block->cpuflags); -} - static void set_intercept_indicators_io(struct kvm_vcpu *vcpu) { if (!(pending_irqs(vcpu) & IRQ_PEND_IO_MASK)) return; else if (psw_ioint_disabled(vcpu)) - __set_cpuflag(vcpu, CPUSTAT_IO_INT); + kvm_s390_set_cpuflags(vcpu, CPUSTAT_IO_INT); else vcpu->arch.sie_block->lctl |= LCTL_CR6; } @@ -321,7 +316,7 @@ static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu) if (!(pending_irqs(vcpu) & IRQ_PEND_EXT_MASK)) return; if (psw_extint_disabled(vcpu)) - __set_cpuflag(vcpu, CPUSTAT_EXT_INT); + kvm_s390_set_cpuflags(vcpu, CPUSTAT_EXT_INT); else vcpu->arch.sie_block->lctl |= LCTL_CR0; } @@ -339,7 +334,7 @@ static void set_intercept_indicators_mchk(struct kvm_vcpu *vcpu) static void set_intercept_indicators_stop(struct kvm_vcpu *vcpu) { if (kvm_s390_is_stop_irq_pending(vcpu)) - __set_cpuflag(vcpu, CPUSTAT_STOP_INT); + kvm_s390_set_cpuflags(vcpu, CPUSTAT_STOP_INT); } /* Set interception request for non-deliverable interrupts */ @@ -1227,7 +1222,7 @@ static int __inject_pfault_init(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) li->irq.ext = irq->u.ext; set_bit(IRQ_PEND_PFAULT_INIT, &li->pending_irqs); - __set_cpuflag(vcpu, CPUSTAT_EXT_INT); + kvm_s390_set_cpuflags(vcpu, CPUSTAT_EXT_INT); return 0; } @@ -1252,7 +1247,7 @@ static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) if (test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs)) return -EBUSY; *extcall = irq->u.extcall; - __set_cpuflag(vcpu, CPUSTAT_EXT_INT); + kvm_s390_set_cpuflags(vcpu, CPUSTAT_EXT_INT); return 0; } @@ -1296,7 +1291,7 @@ static int __inject_sigp_stop(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) if (test_and_set_bit(IRQ_PEND_SIGP_STOP, &li->pending_irqs)) return -EBUSY; stop->flags = irq->u.stop.flags; - __set_cpuflag(vcpu, CPUSTAT_STOP_INT); + kvm_s390_set_cpuflags(vcpu, CPUSTAT_STOP_INT); return 0; } @@ -1328,7 +1323,7 @@ static int __inject_sigp_emergency(struct kvm_vcpu *vcpu, set_bit(irq->u.emerg.code, li->sigp_emerg_pending); set_bit(IRQ_PEND_EXT_EMERGENCY, &li->pending_irqs); - __set_cpuflag(vcpu, CPUSTAT_EXT_INT); + kvm_s390_set_cpuflags(vcpu, CPUSTAT_EXT_INT); return 0; } @@ -1372,7 +1367,7 @@ static int __inject_ckc(struct kvm_vcpu *vcpu) 0, 0); set_bit(IRQ_PEND_EXT_CLOCK_COMP, &li->pending_irqs); - __set_cpuflag(vcpu, CPUSTAT_EXT_INT); + kvm_s390_set_cpuflags(vcpu, CPUSTAT_EXT_INT); return 0; } @@ -1385,7 +1380,7 @@ static int __inject_cpu_timer(struct kvm_vcpu *vcpu) 0, 0); set_bit(IRQ_PEND_EXT_CPU_TIMER, &li->pending_irqs); - __set_cpuflag(vcpu, CPUSTAT_EXT_INT); + kvm_s390_set_cpuflags(vcpu, CPUSTAT_EXT_INT); return 0; } @@ -1568,13 +1563,13 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type) /* make the VCPU drop out of the SIE, or wake it up if sleeping */ switch (type) { case KVM_S390_MCHK: - __set_cpuflag(dst_vcpu, CPUSTAT_STOP_INT); + kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_STOP_INT); break; case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: - __set_cpuflag(dst_vcpu, CPUSTAT_IO_INT); + kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_IO_INT); break; default: - __set_cpuflag(dst_vcpu, CPUSTAT_EXT_INT); + kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_EXT_INT); break; } kvm_s390_vcpu_wakeup(dst_vcpu); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 8877116f015955..a9f2c8d1833dd8 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -47,6 +47,11 @@ do { \ d_args); \ } while (0) +static inline void kvm_s390_set_cpuflags(struct kvm_vcpu *vcpu, u32 flags) +{ + atomic_or(flags, &vcpu->arch.sie_block->cpuflags); +} + static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu) { return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_STOPPED; From ef8f4f49fcfa56f7399db9886fcbb89f9f92a340 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 23 Jan 2018 18:05:29 +0100 Subject: [PATCH 156/197] KVM: s390: reuse kvm_s390_set_cpuflags() Use it in all places where we set cpuflags. Signed-off-by: David Hildenbrand Message-Id: <20180123170531.13687-3-david@redhat.com> Reviewed-by: Thomas Huth Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 4 ++-- arch/s390/kvm/kvm-s390.c | 17 ++++++++--------- arch/s390/kvm/vsie.c | 2 +- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 96ea3b80b67a4a..404a127b1921a0 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -101,7 +101,7 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id) /* another external call is pending */ return -EBUSY; } - atomic_or(CPUSTAT_ECALL_PEND, &vcpu->arch.sie_block->cpuflags); + kvm_s390_set_cpuflags(vcpu, CPUSTAT_ECALL_PEND); return 0; } @@ -277,7 +277,7 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu) static void __set_cpu_idle(struct kvm_vcpu *vcpu) { - atomic_or(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags); + kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT); set_bit(vcpu->vcpu_id, vcpu->kvm->arch.float_int.idle_mask); } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 8a3a1d50ef99dc..9dc996acb52cf5 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2329,7 +2329,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { gmap_enable(vcpu->arch.enabled_gmap); - atomic_or(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags); + kvm_s390_set_cpuflags(vcpu, CPUSTAT_RUNNING); if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu)) __start_cpu_timer_accounting(vcpu); vcpu->cpu = cpu; @@ -2436,9 +2436,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) CPUSTAT_STOPPED); if (test_kvm_facility(vcpu->kvm, 78)) - atomic_or(CPUSTAT_GED2, &vcpu->arch.sie_block->cpuflags); + kvm_s390_set_cpuflags(vcpu, CPUSTAT_GED2); else if (test_kvm_facility(vcpu->kvm, 8)) - atomic_or(CPUSTAT_GED, &vcpu->arch.sie_block->cpuflags); + kvm_s390_set_cpuflags(vcpu, CPUSTAT_GED); kvm_s390_vcpu_setup_model(vcpu); @@ -2475,7 +2475,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb; if (sclp.has_kss) - atomic_or(CPUSTAT_KSS, &vcpu->arch.sie_block->cpuflags); + kvm_s390_set_cpuflags(vcpu, CPUSTAT_KSS); else vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; @@ -2578,7 +2578,7 @@ static void kvm_s390_vcpu_request_handled(struct kvm_vcpu *vcpu) * return immediately. */ void exit_sie(struct kvm_vcpu *vcpu) { - atomic_or(CPUSTAT_STOP_INT, &vcpu->arch.sie_block->cpuflags); + kvm_s390_set_cpuflags(vcpu, CPUSTAT_STOP_INT); while (vcpu->arch.sie_block->prog0c & PROG_IN_SIE) cpu_relax(); } @@ -2822,7 +2822,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, if (dbg->control & KVM_GUESTDBG_ENABLE) { vcpu->guest_debug = dbg->control; /* enforce guest PER */ - atomic_or(CPUSTAT_P, &vcpu->arch.sie_block->cpuflags); + kvm_s390_set_cpuflags(vcpu, CPUSTAT_P); if (dbg->control & KVM_GUESTDBG_USE_HW_BP) rc = kvm_s390_import_bp_data(vcpu, dbg); @@ -2911,8 +2911,7 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu)) { if (!ibs_enabled(vcpu)) { trace_kvm_s390_enable_disable_ibs(vcpu->vcpu_id, 1); - atomic_or(CPUSTAT_IBS, - &vcpu->arch.sie_block->cpuflags); + kvm_s390_set_cpuflags(vcpu, CPUSTAT_IBS); } goto retry; } @@ -3591,7 +3590,7 @@ void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu) /* SIGP STOP and SIGP STOP AND STORE STATUS has been fully processed */ kvm_s390_clear_stop_irq(vcpu); - atomic_or(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); + kvm_s390_set_cpuflags(vcpu, CPUSTAT_STOPPED); __disable_ibs_on_vcpu(vcpu); for (i = 0; i < online_vcpus; i++) { diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 6090f4235cfc0d..87b3416390636a 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -913,7 +913,7 @@ static void register_shadow_scb(struct kvm_vcpu *vcpu, * External calls have to lead to a kick of the vcpu and * therefore the vsie -> Simulate Wait state. */ - atomic_or(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags); + kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT); /* * We have to adjust the g3 epoch by the g2 epoch. The epoch will * automatically be adjusted on tod clock changes via kvm_sync_clock. From 9daecfc66015530ee5d2d84cce5d341f0fffd0ab Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 23 Jan 2018 18:05:30 +0100 Subject: [PATCH 157/197] KVM: s390: introduce and use kvm_s390_clear_cpuflags() Use it just like kvm_s390_set_cpuflags(). Suggested-by: Cornelia Huck Signed-off-by: David Hildenbrand Message-Id: <20180123170531.13687-4-david@redhat.com> Reviewed-by: Thomas Huth Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 8 ++++---- arch/s390/kvm/kvm-s390.c | 11 +++++------ arch/s390/kvm/kvm-s390.h | 5 +++++ arch/s390/kvm/priv.c | 2 +- arch/s390/kvm/vsie.c | 2 +- 5 files changed, 16 insertions(+), 12 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 404a127b1921a0..8687aed9a268f8 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -111,7 +111,7 @@ static void sca_clear_ext_call(struct kvm_vcpu *vcpu) if (!kvm_s390_use_sca_entries()) return; - atomic_andnot(CPUSTAT_ECALL_PEND, &vcpu->arch.sie_block->cpuflags); + kvm_s390_clear_cpuflags(vcpu, CPUSTAT_ECALL_PEND); read_lock(&vcpu->kvm->arch.sca_lock); if (vcpu->kvm->arch.use_esca) { struct esca_block *sca = vcpu->kvm->arch.sca; @@ -283,14 +283,14 @@ static void __set_cpu_idle(struct kvm_vcpu *vcpu) static void __unset_cpu_idle(struct kvm_vcpu *vcpu) { - atomic_andnot(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags); + kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT); clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.float_int.idle_mask); } static void __reset_intercept_indicators(struct kvm_vcpu *vcpu) { - atomic_andnot(CPUSTAT_IO_INT | CPUSTAT_EXT_INT | CPUSTAT_STOP_INT, - &vcpu->arch.sie_block->cpuflags); + kvm_s390_clear_cpuflags(vcpu, CPUSTAT_IO_INT | CPUSTAT_EXT_INT | + CPUSTAT_STOP_INT); vcpu->arch.sie_block->lctl = 0x0000; vcpu->arch.sie_block->ictl &= ~(ICTL_LPSW | ICTL_STCTL | ICTL_PINT); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 9dc996acb52cf5..3eb97c4212cf6b 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2340,7 +2340,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) vcpu->cpu = -1; if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu)) __stop_cpu_timer_accounting(vcpu); - atomic_andnot(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags); + kvm_s390_clear_cpuflags(vcpu, CPUSTAT_RUNNING); vcpu->arch.enabled_gmap = gmap_get_enabled(); gmap_disable(vcpu->arch.enabled_gmap); @@ -2827,14 +2827,14 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, if (dbg->control & KVM_GUESTDBG_USE_HW_BP) rc = kvm_s390_import_bp_data(vcpu, dbg); } else { - atomic_andnot(CPUSTAT_P, &vcpu->arch.sie_block->cpuflags); + kvm_s390_clear_cpuflags(vcpu, CPUSTAT_P); vcpu->arch.guestdbg.last_bp = 0; } if (rc) { vcpu->guest_debug = 0; kvm_s390_clear_bp_data(vcpu); - atomic_andnot(CPUSTAT_P, &vcpu->arch.sie_block->cpuflags); + kvm_s390_clear_cpuflags(vcpu, CPUSTAT_P); } return rc; @@ -2919,8 +2919,7 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu)) { if (ibs_enabled(vcpu)) { trace_kvm_s390_enable_disable_ibs(vcpu->vcpu_id, 0); - atomic_andnot(CPUSTAT_IBS, - &vcpu->arch.sie_block->cpuflags); + kvm_s390_clear_cpuflags(vcpu, CPUSTAT_IBS); } goto retry; } @@ -3564,7 +3563,7 @@ void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu) __disable_ibs_on_all_vcpus(vcpu->kvm); } - atomic_andnot(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); + kvm_s390_clear_cpuflags(vcpu, CPUSTAT_STOPPED); /* * Another VCPU might have used IBS while we were offline. * Let's play safe and flush the VCPU at startup. diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index a9f2c8d1833dd8..f727313a706f9d 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -52,6 +52,11 @@ static inline void kvm_s390_set_cpuflags(struct kvm_vcpu *vcpu, u32 flags) atomic_or(flags, &vcpu->arch.sie_block->cpuflags); } +static inline void kvm_s390_clear_cpuflags(struct kvm_vcpu *vcpu, u32 flags) +{ + atomic_andnot(flags, &vcpu->arch.sie_block->cpuflags); +} + static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu) { return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_STOPPED; diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index dea0217922d6f8..8205223f73322c 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -216,7 +216,7 @@ int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu) VCPU_EVENT(vcpu, 3, "enabling storage keys for guest: %d", rc); if (!rc) { if (atomic_read(&sie_block->cpuflags) & CPUSTAT_KSS) - atomic_andnot(CPUSTAT_KSS, &sie_block->cpuflags); + kvm_s390_clear_cpuflags(vcpu, CPUSTAT_KSS); else sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE); diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 87b3416390636a..6d494ed5907ee1 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -935,7 +935,7 @@ static void register_shadow_scb(struct kvm_vcpu *vcpu, */ static void unregister_shadow_scb(struct kvm_vcpu *vcpu) { - atomic_andnot(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags); + kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT); WRITE_ONCE(vcpu->arch.vsie_block, NULL); } From 8d5fb0dc4ec069ea02395593e9b6b2b39a92457e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 23 Jan 2018 18:05:31 +0100 Subject: [PATCH 158/197] KVM: s390: introduce and use kvm_s390_test_cpuflags() Use it just like kvm_s390_set_cpuflags() and kvm_s390_clear_cpuflags(). Signed-off-by: David Hildenbrand Message-Id: <20180123170531.13687-5-david@redhat.com> Reviewed-by: Thomas Huth Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 2 +- arch/s390/kvm/kvm-s390.c | 2 +- arch/s390/kvm/kvm-s390.h | 7 ++++++- arch/s390/kvm/priv.c | 4 ++-- arch/s390/kvm/sigp.c | 14 +++++--------- 5 files changed, 15 insertions(+), 14 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 8687aed9a268f8..4b483b48436a94 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -36,7 +36,7 @@ static int sca_ext_call_pending(struct kvm_vcpu *vcpu, int *src_id) { int c, scn; - if (!(atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_ECALL_PEND)) + if (!kvm_s390_test_cpuflags(vcpu, CPUSTAT_ECALL_PEND)) return 0; BUG_ON(!kvm_s390_use_sca_entries()); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 3eb97c4212cf6b..3dd209de2e0ca6 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2875,7 +2875,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, static bool ibs_enabled(struct kvm_vcpu *vcpu) { - return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_IBS; + return kvm_s390_test_cpuflags(vcpu, CPUSTAT_IBS); } static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index f727313a706f9d..f110fa96807e81 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -57,9 +57,14 @@ static inline void kvm_s390_clear_cpuflags(struct kvm_vcpu *vcpu, u32 flags) atomic_andnot(flags, &vcpu->arch.sie_block->cpuflags); } +static inline bool kvm_s390_test_cpuflags(struct kvm_vcpu *vcpu, u32 flags) +{ + return (atomic_read(&vcpu->arch.sie_block->cpuflags) & flags) == flags; +} + static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu) { - return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_STOPPED; + return kvm_s390_test_cpuflags(vcpu, CPUSTAT_STOPPED); } static inline int is_vcpu_idle(struct kvm_vcpu *vcpu) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 8205223f73322c..125a7ff98e2ad8 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -209,13 +209,13 @@ int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu) trace_kvm_s390_skey_related_inst(vcpu); if (!(sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE)) && - !(atomic_read(&sie_block->cpuflags) & CPUSTAT_KSS)) + !kvm_s390_test_cpuflags(vcpu, CPUSTAT_KSS)) return rc; rc = s390_enable_skey(); VCPU_EVENT(vcpu, 3, "enabling storage keys for guest: %d", rc); if (!rc) { - if (atomic_read(&sie_block->cpuflags) & CPUSTAT_KSS) + if (kvm_s390_test_cpuflags(vcpu, CPUSTAT_KSS)) kvm_s390_clear_cpuflags(vcpu, CPUSTAT_KSS); else sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 5cafd1e2651bc3..683036c1c92a8f 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -20,19 +20,18 @@ static int __sigp_sense(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu, u64 *reg) { - int cpuflags; + const bool stopped = kvm_s390_test_cpuflags(dst_vcpu, CPUSTAT_STOPPED); int rc; int ext_call_pending; - cpuflags = atomic_read(&dst_vcpu->arch.sie_block->cpuflags); ext_call_pending = kvm_s390_ext_call_pending(dst_vcpu); - if (!(cpuflags & CPUSTAT_STOPPED) && !ext_call_pending) + if (!stopped && !ext_call_pending) rc = SIGP_CC_ORDER_CODE_ACCEPTED; else { *reg &= 0xffffffff00000000UL; if (ext_call_pending) *reg |= SIGP_STATUS_EXT_CALL_PENDING; - if (cpuflags & CPUSTAT_STOPPED) + if (stopped) *reg |= SIGP_STATUS_STOPPED; rc = SIGP_CC_STATUS_STORED; } @@ -205,11 +204,9 @@ static int __sigp_store_status_at_addr(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu, u32 addr, u64 *reg) { - int flags; int rc; - flags = atomic_read(&dst_vcpu->arch.sie_block->cpuflags); - if (!(flags & CPUSTAT_STOPPED)) { + if (!kvm_s390_test_cpuflags(dst_vcpu, CPUSTAT_STOPPED)) { *reg &= 0xffffffff00000000UL; *reg |= SIGP_STATUS_INCORRECT_STATE; return SIGP_CC_STATUS_STORED; @@ -236,8 +233,7 @@ static int __sigp_sense_running(struct kvm_vcpu *vcpu, return SIGP_CC_STATUS_STORED; } - if (atomic_read(&dst_vcpu->arch.sie_block->cpuflags) & - CPUSTAT_RUNNING) { + if (kvm_s390_test_cpuflags(dst_vcpu, CPUSTAT_RUNNING)) { /* running */ rc = SIGP_CC_ORDER_CODE_ACCEPTED; } else { From c7901a6ebee4b624971361bbd93f21ab0b359786 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 29 Jun 2017 18:39:27 +0200 Subject: [PATCH 159/197] KVM: s390: reverse bit ordering of irqs in pending mask This patch prepares a simplification of bit operations between the irq pending mask for emulated interrupts and the Interruption Pending Mask (IPM) which is part of the Guest Interruption State Area (GISA), a feature that allows interrupt delivery to guests by means of the SIE instruction. Without that change, a bit-wise *or* operation on parts of these two masks would either require a look-up table of size 256 bytes to map the IPM to the emulated irq pending mask bit orientation (all bits mirrored at half byte) or a sequence of up to 8 condidional branches to perform tests of single bit positions. Both options are to be rejected either by performance or space utilization reasons. Beyond that this change will be transparent. Signed-off-by: Michael Mueller Reviewed-by: Halil Pasic Reviewed-by: Pierre Morel Reviewed-by: Christian Borntraeger Reviewed-by: Cornelia Huck Reviewed-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 54 ++++++++++++++++---------------- arch/s390/kvm/interrupt.c | 12 +++---- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 029e8144c6ecc8..d2351d63afa3b5 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -422,35 +422,35 @@ struct kvm_vcpu_stat { #define PGM_PER 0x80 #define PGM_CRYPTO_OPERATION 0x119 -/* irq types in order of priority */ +/* irq types in ascend order of priorities */ enum irq_types { - IRQ_PEND_MCHK_EX = 0, - IRQ_PEND_SVC, - IRQ_PEND_PROG, - IRQ_PEND_MCHK_REP, - IRQ_PEND_EXT_IRQ_KEY, - IRQ_PEND_EXT_MALFUNC, - IRQ_PEND_EXT_EMERGENCY, - IRQ_PEND_EXT_EXTERNAL, - IRQ_PEND_EXT_CLOCK_COMP, - IRQ_PEND_EXT_CPU_TIMER, - IRQ_PEND_EXT_TIMING, - IRQ_PEND_EXT_SERVICE, - IRQ_PEND_EXT_HOST, - IRQ_PEND_PFAULT_INIT, - IRQ_PEND_PFAULT_DONE, - IRQ_PEND_VIRTIO, - IRQ_PEND_IO_ISC_0, - IRQ_PEND_IO_ISC_1, - IRQ_PEND_IO_ISC_2, - IRQ_PEND_IO_ISC_3, - IRQ_PEND_IO_ISC_4, - IRQ_PEND_IO_ISC_5, - IRQ_PEND_IO_ISC_6, - IRQ_PEND_IO_ISC_7, - IRQ_PEND_SIGP_STOP, + IRQ_PEND_SET_PREFIX = 0, IRQ_PEND_RESTART, - IRQ_PEND_SET_PREFIX, + IRQ_PEND_SIGP_STOP, + IRQ_PEND_IO_ISC_7, + IRQ_PEND_IO_ISC_6, + IRQ_PEND_IO_ISC_5, + IRQ_PEND_IO_ISC_4, + IRQ_PEND_IO_ISC_3, + IRQ_PEND_IO_ISC_2, + IRQ_PEND_IO_ISC_1, + IRQ_PEND_IO_ISC_0, + IRQ_PEND_VIRTIO, + IRQ_PEND_PFAULT_DONE, + IRQ_PEND_PFAULT_INIT, + IRQ_PEND_EXT_HOST, + IRQ_PEND_EXT_SERVICE, + IRQ_PEND_EXT_TIMING, + IRQ_PEND_EXT_CPU_TIMER, + IRQ_PEND_EXT_CLOCK_COMP, + IRQ_PEND_EXT_EXTERNAL, + IRQ_PEND_EXT_EMERGENCY, + IRQ_PEND_EXT_MALFUNC, + IRQ_PEND_EXT_IRQ_KEY, + IRQ_PEND_MCHK_REP, + IRQ_PEND_PROG, + IRQ_PEND_SVC, + IRQ_PEND_MCHK_EX, IRQ_PEND_COUNT }; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 4b483b48436a94..b731d4fede837f 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -189,8 +189,8 @@ static int cpu_timer_irq_pending(struct kvm_vcpu *vcpu) static inline int is_ioirq(unsigned long irq_type) { - return ((irq_type >= IRQ_PEND_IO_ISC_0) && - (irq_type <= IRQ_PEND_IO_ISC_7)); + return ((irq_type >= IRQ_PEND_IO_ISC_7) && + (irq_type <= IRQ_PEND_IO_ISC_0)); } static uint64_t isc_to_isc_bits(int isc) @@ -211,12 +211,12 @@ static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu) static inline int isc_to_irq_type(unsigned long isc) { - return IRQ_PEND_IO_ISC_0 + isc; + return IRQ_PEND_IO_ISC_0 - isc; } static inline int irq_type_to_isc(unsigned long irq_type) { - return irq_type - IRQ_PEND_IO_ISC_0; + return IRQ_PEND_IO_ISC_0 - irq_type; } static unsigned long disable_iscs(struct kvm_vcpu *vcpu, @@ -1149,8 +1149,8 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) set_bit(IRQ_PEND_EXT_CPU_TIMER, &li->pending_irqs); while ((irqs = deliverable_irqs(vcpu)) && !rc) { - /* bits are in the order of interrupt priority */ - irq_type = find_first_bit(&irqs, IRQ_PEND_COUNT); + /* bits are in the reverse order of interrupt priority */ + irq_type = find_last_bit(&irqs, IRQ_PEND_COUNT); if (is_ioirq(irq_type)) { rc = __deliver_io(vcpu, irq_type); } else { From 19114beb73f774e466d9e39b8e8b961812c9f881 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Tue, 30 May 2017 14:26:02 +0200 Subject: [PATCH 160/197] KVM: s390: define GISA format-0 data structure In preperation to support pass-through adapter interrupts, the Guest Interruption State Area (GISA) and the Adapter Interruption Virtualization (AIV) features will be introduced here. This patch introduces format-0 GISA (that is defines the struct describing the GISA, allocates storage for it, and introduces fields for the GISA address in kvm_s390_sie_block and kvm_s390_vsie). As the GISA requires storage below 2GB, it is put in sie_page2, which is already allocated in ZONE_DMA. In addition, The GISA requires alignment to its integral boundary. This is already naturally aligned via the padding in the sie_page2. Signed-off-by: Michael Mueller Reviewed-by: Pierre Morel Reviewed-by: Halil Pasic Reviewed-by: Christian Borntraeger Reviewed-by: David Hildenbrand Acked-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 23 +++++++++++++++++++---- arch/s390/kvm/kvm-s390.c | 1 + 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index d2351d63afa3b5..4f89b2783512e6 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -183,6 +183,7 @@ struct kvm_s390_sie_block { #define ECA_IB 0x40000000 #define ECA_SIGPI 0x10000000 #define ECA_MVPGI 0x01000000 +#define ECA_AIV 0x00200000 #define ECA_VX 0x00020000 #define ECA_PROTEXCI 0x00002000 #define ECA_SII 0x00000001 @@ -227,7 +228,8 @@ struct kvm_s390_sie_block { __u8 epdx; /* 0x0069 */ __u8 reserved6a[2]; /* 0x006a */ __u32 todpr; /* 0x006c */ - __u8 reserved70[16]; /* 0x0070 */ + __u32 gd; /* 0x0070 */ + __u8 reserved74[12]; /* 0x0074 */ __u64 mso; /* 0x0080 */ __u64 msl; /* 0x0088 */ psw_t gpsw; /* 0x0090 */ @@ -716,14 +718,27 @@ struct kvm_s390_crypto_cb { struct kvm_s390_apcb1 apcb1; /* 0x0080 */ }; +struct kvm_s390_gisa { + u32 next_alert; + u8 ipm; + u8 reserved01; + u8 : 6; + u8 g : 1; + u8 c : 1; + u8 iam; + u8 reserved02[4]; + u32 airq_count; +}; + /* - * sie_page2 has to be allocated as DMA because fac_list and crycb need - * 31bit addresses in the sie control block. + * sie_page2 has to be allocated as DMA because fac_list, crycb and + * gisa need 31bit addresses in the sie control block. */ struct sie_page2 { __u64 fac_list[S390_ARCH_FAC_LIST_SIZE_U64]; /* 0x0000 */ struct kvm_s390_crypto_cb crycb; /* 0x0800 */ - u8 reserved900[0x1000 - 0x900]; /* 0x0900 */ + struct kvm_s390_gisa gisa; /* 0x0900 */ + u8 reserved910[0x1000 - 0x910]; /* 0x0910 */ }; struct kvm_s390_vsie { diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 3dd209de2e0ca6..2fbdb160108944 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1928,6 +1928,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (!kvm->arch.dbf) goto out_err; + BUILD_BUG_ON(sizeof(struct sie_page2) != 4096); kvm->arch.sie_page2 = (struct sie_page2 *) get_zeroed_page(GFP_KERNEL | GFP_DMA); if (!kvm->arch.sie_page2) From f3ec471a9832f8365836d70db4d52db9cc4dd6b1 Mon Sep 17 00:00:00 2001 From: Jens Freimann Date: Thu, 16 Mar 2017 17:45:40 +0100 Subject: [PATCH 161/197] s390/bitops: add test_and_clear_bit_inv() This patch adds a MSB0 bit numbering version of test_and_clear_bit(). Signed-off-by: Jens Freimann Signed-off-by: Michael Mueller Reviewed-by: Pierre Morel Reviewed-by: Halil Pasic Reviewed-by: Christian Borntraeger Reviewed-by: David Hildenbrand Reviewed-by: Cornelia Huck Acked-by: Martin Schwidefsky Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/bitops.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index 31e400c1a1f354..86e5b2fdee3c8e 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -261,6 +261,11 @@ static inline void clear_bit_inv(unsigned long nr, volatile unsigned long *ptr) return clear_bit(nr ^ (BITS_PER_LONG - 1), ptr); } +static inline int test_and_clear_bit_inv(unsigned long nr, volatile unsigned long *ptr) +{ + return test_and_clear_bit(nr ^ (BITS_PER_LONG - 1), ptr); +} + static inline void __set_bit_inv(unsigned long nr, volatile unsigned long *ptr) { return __set_bit(nr ^ (BITS_PER_LONG - 1), ptr); From d77e64141e322a3202de71a4afa7956c98d2a302 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Mon, 12 Jun 2017 12:37:57 +0200 Subject: [PATCH 162/197] KVM: s390: implement GISA IPM related primitives The patch implements routines to access the GISA to test and modify its Interruption Pending Mask (IPM) from the host side. Signed-off-by: Michael Mueller Reviewed-by: Pierre Morel Reviewed-by: Halil Pasic Reviewed-by: Christian Borntraeger Reviewed-by: David Hildenbrand Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index b731d4fede837f..8985ce51f687a3 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -203,6 +203,34 @@ static inline u8 int_word_to_isc(u32 int_word) return (int_word & 0x38000000) >> 27; } +/* + * To use atomic bitmap functions, we have to provide a bitmap address + * that is u64 aligned. However, the ipm might be u32 aligned. + * Therefore, we logically start the bitmap at the very beginning of the + * struct and fixup the bit number. + */ +#define IPM_BIT_OFFSET (offsetof(struct kvm_s390_gisa, ipm) * BITS_PER_BYTE) + +static inline void kvm_s390_gisa_set_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) +{ + set_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); +} + +static inline u8 kvm_s390_gisa_get_ipm(struct kvm_s390_gisa *gisa) +{ + return READ_ONCE(gisa->ipm); +} + +static inline void kvm_s390_gisa_clear_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) +{ + clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); +} + +static inline int kvm_s390_gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) +{ + return test_and_clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); +} + static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu) { return vcpu->kvm->arch.float_int.pending_irqs | From 72b523a30d9b9c7f749c3209e13de983aeb919c1 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Mon, 10 Jul 2017 11:03:42 +0200 Subject: [PATCH 163/197] s390/css: indicate the availability of the AIV facility The patch adds an indication for the presence Adapter Interruption Virtualization facility (AIV) of the general channel subsystem characteristics. Signed-off-by: Michael Mueller Reviewed-by: Halil Pasic Reviewed-by: Christian Borntraeger Acked-by: Cornelia Huck Acked-by: Martin Schwidefsky Reviewed-by: David Hildenbrand Signed-off-by: Christian Borntraeger [change wording] --- arch/s390/include/asm/css_chars.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/css_chars.h b/arch/s390/include/asm/css_chars.h index a478eb61aaf7f3..fb56fa3283a2c3 100644 --- a/arch/s390/include/asm/css_chars.h +++ b/arch/s390/include/asm/css_chars.h @@ -20,7 +20,9 @@ struct css_general_char { u32 aif_tdd : 1; /* bit 56 */ u32 : 1; u32 qebsm : 1; /* bit 58 */ - u32 : 8; + u32 : 2; + u32 aiv : 1; /* bit 61 */ + u32 : 5; u32 aif_osa : 1; /* bit 67 */ u32 : 12; u32 eadm_rf : 1; /* bit 80 */ From d7c5cb0105ddeff56694f4c6222ee7221824bad3 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Mon, 12 Jun 2017 14:15:19 +0200 Subject: [PATCH 164/197] KVM: s390: exploit GISA and AIV for emulated interrupts The adapter interruption virtualization (AIV) facility is an optional facility that comes with functionality expected to increase the performance of adapter interrupt handling for both emulated and passed-through adapter interrupts. With AIV, adapter interrupts can be delivered to the guest without exiting SIE. This patch provides some preparations for using AIV for emulated adapter interrupts (including virtio) if it's available. When using AIV, the interrupts are delivered at the so called GISA by setting the bit corresponding to its Interruption Subclass (ISC) in the Interruption Pending Mask (IPM) instead of inserting a node into the floating interrupt list. To keep the change reasonably small, the handling of this new state is deferred in get_all_floating_irqs and handle_tpi. This patch concentrates on the code handling enqueuement of emulated adapter interrupts, and their delivery to the guest. Note that care is still required for adapter interrupts using AIV, because there is no guarantee that AIV is going to deliver the adapter interrupts pending at the GISA (consider all vcpus idle). When delivering GISA adapter interrupts by the host (usual mechanism) special attention is required to honor interrupt priorities. Empirical results show that the time window between making an interrupt pending at the GISA and doing kvm_s390_deliver_pending_interrupts is sufficient for a guest with at least moderate cpu activity to get adapter interrupts delivered within the SIE, and potentially save some SIE exits (if not other deliverable interrupts). The code will be activated with a follow-up patch. Signed-off-by: Michael Mueller Acked-by: Christian Borntraeger Reviewed-by: David Hildenbrand Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 1 + arch/s390/kvm/interrupt.c | 93 +++++++++++++++++++++++++------- arch/s390/kvm/kvm-s390.c | 8 +++ arch/s390/kvm/kvm-s390.h | 3 ++ 4 files changed, 87 insertions(+), 18 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 4f89b2783512e6..76d6f0ef3dbdc1 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -785,6 +785,7 @@ struct kvm_arch{ struct kvm_s390_migration_state *migration_state; /* subset of available cpu features enabled by user space */ DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); + struct kvm_s390_gisa *gisa; }; #define KVM_HVA_ERR_BAD (-1UL) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 8985ce51f687a3..f293c956e6dbb8 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -234,7 +234,8 @@ static inline int kvm_s390_gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gis static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu) { return vcpu->kvm->arch.float_int.pending_irqs | - vcpu->arch.local_int.pending_irqs; + vcpu->arch.local_int.pending_irqs | + kvm_s390_gisa_get_ipm(vcpu->kvm->arch.gisa) << IRQ_PEND_IO_ISC_7; } static inline int isc_to_irq_type(unsigned long isc) @@ -919,18 +920,38 @@ static int __must_check __deliver_virtio(struct kvm_vcpu *vcpu) return rc ? -EFAULT : 0; } +static int __do_deliver_io(struct kvm_vcpu *vcpu, struct kvm_s390_io_info *io) +{ + int rc; + + rc = put_guest_lc(vcpu, io->subchannel_id, (u16 *)__LC_SUBCHANNEL_ID); + rc |= put_guest_lc(vcpu, io->subchannel_nr, (u16 *)__LC_SUBCHANNEL_NR); + rc |= put_guest_lc(vcpu, io->io_int_parm, (u32 *)__LC_IO_INT_PARM); + rc |= put_guest_lc(vcpu, io->io_int_word, (u32 *)__LC_IO_INT_WORD); + rc |= write_guest_lc(vcpu, __LC_IO_OLD_PSW, + &vcpu->arch.sie_block->gpsw, + sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_IO_NEW_PSW, + &vcpu->arch.sie_block->gpsw, + sizeof(psw_t)); + return rc ? -EFAULT : 0; +} + static int __must_check __deliver_io(struct kvm_vcpu *vcpu, unsigned long irq_type) { struct list_head *isc_list; struct kvm_s390_float_interrupt *fi; struct kvm_s390_interrupt_info *inti = NULL; + struct kvm_s390_io_info io; + u32 isc; int rc = 0; fi = &vcpu->kvm->arch.float_int; spin_lock(&fi->lock); - isc_list = &fi->lists[irq_type_to_isc(irq_type)]; + isc = irq_type_to_isc(irq_type); + isc_list = &fi->lists[isc]; inti = list_first_entry_or_null(isc_list, struct kvm_s390_interrupt_info, list); @@ -958,24 +979,31 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu, spin_unlock(&fi->lock); if (inti) { - rc = put_guest_lc(vcpu, inti->io.subchannel_id, - (u16 *)__LC_SUBCHANNEL_ID); - rc |= put_guest_lc(vcpu, inti->io.subchannel_nr, - (u16 *)__LC_SUBCHANNEL_NR); - rc |= put_guest_lc(vcpu, inti->io.io_int_parm, - (u32 *)__LC_IO_INT_PARM); - rc |= put_guest_lc(vcpu, inti->io.io_int_word, - (u32 *)__LC_IO_INT_WORD); - rc |= write_guest_lc(vcpu, __LC_IO_OLD_PSW, - &vcpu->arch.sie_block->gpsw, - sizeof(psw_t)); - rc |= read_guest_lc(vcpu, __LC_IO_NEW_PSW, - &vcpu->arch.sie_block->gpsw, - sizeof(psw_t)); + rc = __do_deliver_io(vcpu, &(inti->io)); kfree(inti); + goto out; } - return rc ? -EFAULT : 0; + if (vcpu->kvm->arch.gisa && + kvm_s390_gisa_tac_ipm_gisc(vcpu->kvm->arch.gisa, isc)) { + /* + * in case an adapter interrupt was not delivered + * in SIE context KVM will handle the delivery + */ + VCPU_EVENT(vcpu, 4, "%s isc %u", "deliver: I/O (AI/gisa)", isc); + memset(&io, 0, sizeof(io)); + io.io_int_word = (isc << 27) | 0x80000000; + vcpu->stat.deliver_io_int++; + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, + KVM_S390_INT_IO(1, 0, 0, 0), + ((__u32)io.subchannel_id << 16) | + io.subchannel_nr, + ((__u64)io.io_int_parm << 32) | + io.io_int_word); + rc = __do_deliver_io(vcpu, &io); + } +out: + return rc; } typedef int (*deliver_irq_t)(struct kvm_vcpu *vcpu); @@ -1539,6 +1567,15 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) struct list_head *list; int isc; + isc = int_word_to_isc(inti->io.io_int_word); + + if (kvm->arch.gisa && inti->type & KVM_S390_INT_IO_AI_MASK) { + VM_EVENT(kvm, 4, "%s isc %1u", "inject: I/O (AI/gisa)", isc); + kvm_s390_gisa_set_ipm_gisc(kvm->arch.gisa, isc); + kfree(inti); + return 0; + } + fi = &kvm->arch.float_int; spin_lock(&fi->lock); if (fi->counters[FIRQ_CNTR_IO] >= KVM_S390_MAX_FLOAT_IRQS) { @@ -1554,7 +1591,6 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) inti->io.subchannel_id >> 8, inti->io.subchannel_id >> 1 & 0x3, inti->io.subchannel_nr); - isc = int_word_to_isc(inti->io.io_int_word); list = &fi->lists[FIRQ_LIST_IO_ISC_0 + isc]; list_add_tail(&inti->list, list); set_bit(isc_to_irq_type(isc), &fi->pending_irqs); @@ -2705,3 +2741,24 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len) return n; } + +void kvm_s390_gisa_clear(struct kvm *kvm) +{ + if (kvm->arch.gisa) { + memset(kvm->arch.gisa, 0, sizeof(struct kvm_s390_gisa)); + kvm->arch.gisa->next_alert = (u32)(u64)kvm->arch.gisa; + VM_EVENT(kvm, 3, "gisa 0x%pK cleared", kvm->arch.gisa); + } +} + +void kvm_s390_gisa_init(struct kvm *kvm) +{ + /* not implemented yet */ +} + +void kvm_s390_gisa_destroy(struct kvm *kvm) +{ + if (!kvm->arch.gisa) + return; + kvm->arch.gisa = NULL; +} diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 2fbdb160108944..2c5e25b394352b 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1999,6 +1999,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) spin_lock_init(&kvm->arch.start_stop_lock); kvm_s390_vsie_init(kvm); + kvm_s390_gisa_init(kvm); KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid); return 0; @@ -2061,6 +2062,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_free_vcpus(kvm); sca_dispose(kvm); debug_unregister(kvm->arch.dbf); + kvm_s390_gisa_destroy(kvm); free_page((unsigned long)kvm->arch.sie_page2); if (!kvm_is_ucontrol(kvm)) gmap_remove(kvm->arch.gmap); @@ -2471,6 +2473,11 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) if (test_kvm_facility(vcpu->kvm, 139)) vcpu->arch.sie_block->ecd |= ECD_MEF; + if (vcpu->arch.sie_block->gd) { + vcpu->arch.sie_block->eca |= ECA_AIV; + VCPU_EVENT(vcpu, 3, "AIV gisa format-%u enabled for cpu %03u", + vcpu->arch.sie_block->gd & 0x3, vcpu->vcpu_id); + } vcpu->arch.sie_block->sdnxo = ((unsigned long) &vcpu->run->s.regs.sdnx) | SDNXC; vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb; @@ -2523,6 +2530,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, vcpu->arch.sie_block->icpua = id; spin_lock_init(&vcpu->arch.local_int.lock); + vcpu->arch.sie_block->gd = (u32)(u64)kvm->arch.gisa; seqcount_init(&vcpu->arch.cputm_seqcount); rc = kvm_vcpu_init(vcpu, kvm, id); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index f110fa96807e81..bd31b37b0e6f83 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -382,6 +382,9 @@ int kvm_s390_set_irq_state(struct kvm_vcpu *vcpu, void __user *buf, int len); int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len); +void kvm_s390_gisa_init(struct kvm *kvm); +void kvm_s390_gisa_clear(struct kvm *kvm); +void kvm_s390_gisa_destroy(struct kvm *kvm); /* implemented in guestdbg.c */ void kvm_s390_backup_guest_per_regs(struct kvm_vcpu *vcpu); From 2496c8e7fe9270bde5e125729c193120e7fa8c67 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Aug 2017 11:10:28 +0200 Subject: [PATCH 165/197] KVM: s390: abstract adapter interruption word generation from ISC The function isc_to_int_word() allows the generation of interruption words for adapter interrupts. Signed-off-by: Michael Mueller Reviewed-by: Christian Borntraeger Reviewed-by: Cornelia Huck Reviewed-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index f293c956e6dbb8..a0ded3a23a5e0c 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -198,6 +198,11 @@ static uint64_t isc_to_isc_bits(int isc) return (0x80 >> isc) << 24; } +static inline u32 isc_to_int_word(u8 isc) +{ + return ((u32)isc << 27) | 0x80000000; +} + static inline u8 int_word_to_isc(u32 int_word) { return (int_word & 0x38000000) >> 27; @@ -992,7 +997,7 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu, */ VCPU_EVENT(vcpu, 4, "%s isc %u", "deliver: I/O (AI/gisa)", isc); memset(&io, 0, sizeof(io)); - io.io_int_word = (isc << 27) | 0x80000000; + io.io_int_word = isc_to_int_word(isc); vcpu->stat.deliver_io_int++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_IO(1, 0, 0, 0), @@ -2299,7 +2304,7 @@ static int kvm_s390_inject_airq(struct kvm *kvm, struct kvm_s390_interrupt s390int = { .type = KVM_S390_INT_IO(1, 0, 0, 0), .parm = 0, - .parm64 = (adapter->isc << 27) | 0x80000000, + .parm64 = isc_to_int_word(adapter->isc), }; int ret = 0; From 24160af6cb289ace9bde980b33d11713c8fc8192 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Wed, 14 Jun 2017 13:21:32 +0200 Subject: [PATCH 166/197] KVM: s390: add GISA interrupts to FLIC ioctl interface Pending interrupts marked in the GISA IPM are required to become part of the answer of ioctl KVM_DEV_FLIC_GET_ALL_IRQS. The ioctl KVM_DEV_FLIC_ENQUEUE is already capable to enqueue adapter interrupts when a GISA is present. With ioctl KVM_DEV_FLIC_CLEAR_IRQS the GISA IPM wil be cleared now as well. Signed-off-by: Michael Mueller Reviewed-by: Halil Pasic Reviewed-by: Pierre Morel Reviewed-by: Christian Borntraeger Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index a0ded3a23a5e0c..dd4c50b82a5320 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -1879,6 +1879,7 @@ void kvm_s390_clear_float_irqs(struct kvm *kvm) for (i = 0; i < FIRQ_MAX_COUNT; i++) fi->counters[i] = 0; spin_unlock(&fi->lock); + kvm_s390_gisa_clear(kvm); }; static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len) @@ -1906,6 +1907,22 @@ static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len) max_irqs = len / sizeof(struct kvm_s390_irq); + if (kvm->arch.gisa && + kvm_s390_gisa_get_ipm(kvm->arch.gisa)) { + for (i = 0; i <= MAX_ISC; i++) { + if (n == max_irqs) { + /* signal userspace to try again */ + ret = -ENOMEM; + goto out_nolock; + } + if (kvm_s390_gisa_tac_ipm_gisc(kvm->arch.gisa, i)) { + irq = (struct kvm_s390_irq *) &buf[n]; + irq->type = KVM_S390_INT_IO(1, 0, 0, 0); + irq->u.io.io_int_word = isc_to_int_word(i); + n++; + } + } + } fi = &kvm->arch.float_int; spin_lock(&fi->lock); for (i = 0; i < FIRQ_LIST_COUNT; i++) { @@ -1944,6 +1961,7 @@ static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len) out: spin_unlock(&fi->lock); +out_nolock: if (!ret && n > 0) { if (copy_to_user(usrbuf, buf, sizeof(struct kvm_s390_irq) * n)) ret = -EFAULT; From 4b35f65e67ee0e5bc4f394efa14a9fa3917cd2c1 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Fri, 7 Jul 2017 15:27:31 +0200 Subject: [PATCH 167/197] KVM: s390: make kvm_s390_get_io_int() aware of GISA The function returns a pending I/O interrupt with the highest priority defined by its ISC. Together with AIV activation, pending adapter interrupts are managed by the GISA IPM. Thus kvm_s390_get_io_int() needs to inspect the IPM as well when the interrupt with the highest priority has to be identified. In case classic and adapter interrupts with the same ISC are pending, the classic interrupt will be returned first. Signed-off-by: Michael Mueller Reviewed-by: Halil Pasic Reviewed-by: Christian Borntraeger Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 74 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 70 insertions(+), 4 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index dd4c50b82a5320..f2dc86884ea429 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -1471,20 +1471,86 @@ static struct kvm_s390_interrupt_info *get_io_int(struct kvm *kvm, return NULL; } +static struct kvm_s390_interrupt_info *get_top_io_int(struct kvm *kvm, + u64 isc_mask, u32 schid) +{ + struct kvm_s390_interrupt_info *inti = NULL; + int isc; + + for (isc = 0; isc <= MAX_ISC && !inti; isc++) { + if (isc_mask & isc_to_isc_bits(isc)) + inti = get_io_int(kvm, isc, schid); + } + return inti; +} + +static int get_top_gisa_isc(struct kvm *kvm, u64 isc_mask, u32 schid) +{ + unsigned long active_mask; + int isc; + + if (schid) + goto out; + if (!kvm->arch.gisa) + goto out; + + active_mask = (isc_mask & kvm_s390_gisa_get_ipm(kvm->arch.gisa) << 24) << 32; + while (active_mask) { + isc = __fls(active_mask) ^ (BITS_PER_LONG - 1); + if (kvm_s390_gisa_tac_ipm_gisc(kvm->arch.gisa, isc)) + return isc; + clear_bit_inv(isc, &active_mask); + } +out: + return -EINVAL; +} + /* * Dequeue and return an I/O interrupt matching any of the interruption * subclasses as designated by the isc mask in cr6 and the schid (if != 0). + * Take into account the interrupts pending in the interrupt list and in GISA. + * + * Note that for a guest that does not enable I/O interrupts + * but relies on TPI, a flood of classic interrupts may starve + * out adapter interrupts on the same isc. Linux does not do + * that, and it is possible to work around the issue by configuring + * different iscs for classic and adapter interrupts in the guest, + * but we may want to revisit this in the future. */ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, u64 isc_mask, u32 schid) { - struct kvm_s390_interrupt_info *inti = NULL; + struct kvm_s390_interrupt_info *inti, *tmp_inti; int isc; - for (isc = 0; isc <= MAX_ISC && !inti; isc++) { - if (isc_mask & isc_to_isc_bits(isc)) - inti = get_io_int(kvm, isc, schid); + inti = get_top_io_int(kvm, isc_mask, schid); + + isc = get_top_gisa_isc(kvm, isc_mask, schid); + if (isc < 0) + /* no AI in GISA */ + goto out; + + if (!inti) + /* AI in GISA but no classical IO int */ + goto gisa_out; + + /* both types of interrupts present */ + if (int_word_to_isc(inti->io.io_int_word) <= isc) { + /* classical IO int with higher priority */ + kvm_s390_gisa_set_ipm_gisc(kvm->arch.gisa, isc); + goto out; } +gisa_out: + tmp_inti = kzalloc(sizeof(*inti), GFP_KERNEL); + if (tmp_inti) { + tmp_inti->type = KVM_S390_INT_IO(1, 0, 0, 0); + tmp_inti->io.io_int_word = isc_to_int_word(isc); + if (inti) + kvm_s390_reinject_io_int(kvm, inti); + inti = tmp_inti; + } else + kvm_s390_gisa_set_ipm_gisc(kvm->arch.gisa, isc); +out: return inti; } From f180bfdae024b34e71e89dcc82b037dd97f74c3a Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Fri, 23 Jun 2017 14:46:21 +0200 Subject: [PATCH 168/197] KVM: s390: activate GISA for emulated interrupts If the AIV facility is available, a GISA will be used to manage emulated adapter interrupts. Signed-off-by: Michael Mueller Reviewed-by: Halil Pasic Reviewed-by: Christian Borntraeger Reviewed-by: David Hildenbrand Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index f2dc86884ea429..488ecc7ea2a18e 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2842,7 +2842,13 @@ void kvm_s390_gisa_clear(struct kvm *kvm) void kvm_s390_gisa_init(struct kvm *kvm) { - /* not implemented yet */ + if (!css_general_characteristics.aiv) + kvm->arch.gisa = NULL; + else { + kvm->arch.gisa = &kvm->arch.sie_page2->gisa; + VM_EVENT(kvm, 3, "gisa 0x%pK initialized", kvm->arch.gisa); + kvm_s390_gisa_clear(kvm); + } } void kvm_s390_gisa_destroy(struct kvm *kvm) From 9e73ea7056bd5f7e1b6e66e3d503478bb5160f07 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Mon, 12 Jun 2017 13:49:28 +0200 Subject: [PATCH 169/197] s390/sclp: expose the GISA format facility The GISA format facility is required by the host to be able to process a format-1 GISA. If not available, the used GISA format will be format-0. All format-1 related extension will not be available in this case. Signed-off-by: Michael Mueller Reviewed-by: Pierre Morel Reviewed-by: Halil Pasic Reviewed-by: Christian Borntraeger Acked-by: David Hildenbrand Acked-by: Martin Schwidefsky Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/sclp.h | 1 + drivers/s390/char/sclp_early.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index d3c1a8a2e3ad4b..3cae9168f63c4f 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -77,6 +77,7 @@ struct sclp_info { unsigned char has_ibs : 1; unsigned char has_skey : 1; unsigned char has_kss : 1; + unsigned char has_gisaf : 1; unsigned int ibc; unsigned int mtid; unsigned int mtid_cp; diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c index d06bc5674e5f95..6b1891539c840b 100644 --- a/drivers/s390/char/sclp_early.c +++ b/drivers/s390/char/sclp_early.c @@ -49,7 +49,7 @@ struct read_info_sccb { u8 _pad_112[116 - 112]; /* 112-115 */ u8 fac116; /* 116 */ u8 fac117; /* 117 */ - u8 _pad_118; /* 118 */ + u8 fac118; /* 118 */ u8 fac119; /* 119 */ u16 hcpua; /* 120-121 */ u8 _pad_122[124 - 122]; /* 122-123 */ @@ -100,6 +100,7 @@ static void __init sclp_early_facilities_detect(struct read_info_sccb *sccb) sclp.has_esca = !!(sccb->fac116 & 0x08); sclp.has_pfmfi = !!(sccb->fac117 & 0x40); sclp.has_ibs = !!(sccb->fac117 & 0x20); + sclp.has_gisaf = !!(sccb->fac118 & 0x08); sclp.has_hvs = !!(sccb->fac119 & 0x80); sclp.has_kss = !!(sccb->fac98 & 0x01); if (sccb->fac85 & 0x02) From 4b9f952577fb40875a2a163d80515a8daa0d6bef Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Fri, 23 Jun 2017 13:51:25 +0200 Subject: [PATCH 170/197] KVM: s390: introduce the format-1 GISA The patch modifies the previously defined GISA data structure to be able to store two GISA formats, format-0 and format-1. Additionally, it verifies the availability of the GISA format facility and enables the use of a format-1 GISA in the SIE control block accordingly. A format-1 can do everything that format-0 can and we will need it for real HW passthrough. As there are systems with only format-0 we keep both variants. Signed-off-by: Michael Mueller Reviewed-by: Pierre Morel Reviewed-by: Christian Borntraeger Reviewed-by: David Hildenbrand Acked-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 44 ++++++++++++++++++++++++-------- arch/s390/kvm/interrupt.c | 4 +-- arch/s390/kvm/kvm-s390.c | 2 ++ 3 files changed, 37 insertions(+), 13 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 76d6f0ef3dbdc1..59dd46adf0e862 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -228,6 +228,7 @@ struct kvm_s390_sie_block { __u8 epdx; /* 0x0069 */ __u8 reserved6a[2]; /* 0x006a */ __u32 todpr; /* 0x006c */ +#define GISA_FORMAT1 0x00000001 __u32 gd; /* 0x0070 */ __u8 reserved74[12]; /* 0x0074 */ __u64 mso; /* 0x0080 */ @@ -719,15 +720,38 @@ struct kvm_s390_crypto_cb { }; struct kvm_s390_gisa { - u32 next_alert; - u8 ipm; - u8 reserved01; - u8 : 6; - u8 g : 1; - u8 c : 1; - u8 iam; - u8 reserved02[4]; - u32 airq_count; + union { + struct { /* common to all formats */ + u32 next_alert; + u8 ipm; + u8 reserved01[2]; + u8 iam; + }; + struct { /* format 0 */ + u32 next_alert; + u8 ipm; + u8 reserved01; + u8 : 6; + u8 g : 1; + u8 c : 1; + u8 iam; + u8 reserved02[4]; + u32 airq_count; + } g0; + struct { /* format 1 */ + u32 next_alert; + u8 ipm; + u8 simm; + u8 nimm; + u8 iam; + u8 aism[8]; + u8 : 6; + u8 g : 1; + u8 c : 1; + u8 reserved03[11]; + u32 airq_count; + } g1; + }; }; /* @@ -738,7 +762,7 @@ struct sie_page2 { __u64 fac_list[S390_ARCH_FAC_LIST_SIZE_U64]; /* 0x0000 */ struct kvm_s390_crypto_cb crycb; /* 0x0800 */ struct kvm_s390_gisa gisa; /* 0x0900 */ - u8 reserved910[0x1000 - 0x910]; /* 0x0910 */ + u8 reserved920[0x1000 - 0x920]; /* 0x0920 */ }; struct kvm_s390_vsie { diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 488ecc7ea2a18e..aabf46f5f883d4 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2842,9 +2842,7 @@ void kvm_s390_gisa_clear(struct kvm *kvm) void kvm_s390_gisa_init(struct kvm *kvm) { - if (!css_general_characteristics.aiv) - kvm->arch.gisa = NULL; - else { + if (css_general_characteristics.aiv) { kvm->arch.gisa = &kvm->arch.sie_page2->gisa; VM_EVENT(kvm, 3, "gisa 0x%pK initialized", kvm->arch.gisa); kvm_s390_gisa_clear(kvm); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 2c5e25b394352b..f85a405e2595a7 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2531,6 +2531,8 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, vcpu->arch.sie_block->icpua = id; spin_lock_init(&vcpu->arch.local_int.lock); vcpu->arch.sie_block->gd = (u32)(u64)kvm->arch.gisa; + if (vcpu->arch.sie_block->gd && sclp.has_gisaf) + vcpu->arch.sie_block->gd |= GISA_FORMAT1; seqcount_init(&vcpu->arch.cputm_seqcount); rc = kvm_vcpu_init(vcpu, kvm, id); From 89a8f6d4904c8cf3ff8fee9fdaff392a6bbb8bf6 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 24 Jan 2018 14:23:31 +0100 Subject: [PATCH 171/197] x86/hyperv: Check for required priviliges in hyperv_init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In hyperv_init() its presumed that it always has access to VP index and hypercall MSRs while according to the specification it should be checked if it's allowed to access the corresponding MSRs before accessing them. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Cc: Stephen Hemminger Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Haiyang Zhang Cc: "Michael Kelley (EOSG)" Cc: Roman Kagan Cc: Andy Lutomirski Cc: devel@linuxdriverproject.org Cc: Paolo Bonzini Cc: "K. Y. Srinivasan" Cc: Cathy Avery Cc: Mohammed Gamal Link: https://lkml.kernel.org/r/20180124132337.30138-2-vkuznets@redhat.com --- arch/x86/hyperv/hv_init.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 189a398290dbb2..21f9d53d9f0043 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -110,12 +110,19 @@ static int hv_cpu_init(unsigned int cpu) */ void hyperv_init(void) { - u64 guest_id; + u64 guest_id, required_msrs; union hv_x64_msr_hypercall_contents hypercall_msr; if (x86_hyper_type != X86_HYPER_MS_HYPERV) return; + /* Absolutely required MSRs */ + required_msrs = HV_X64_MSR_HYPERCALL_AVAILABLE | + HV_X64_MSR_VP_INDEX_AVAILABLE; + + if ((ms_hyperv.features & required_msrs) != required_msrs) + return; + /* Allocate percpu VP index */ hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index), GFP_KERNEL); From e2768eaa1ca4fbb7b778da5615cce3dd310352e6 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 24 Jan 2018 14:23:32 +0100 Subject: [PATCH 172/197] x86/hyperv: Add a function to read both TSC and TSC page value simulateneously MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is going to be used from KVM code where both TSC and TSC page value are needed. Nothing is supposed to use the function when Hyper-V code is compiled out, just BUG(). Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Cc: Stephen Hemminger Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Haiyang Zhang Cc: "Michael Kelley (EOSG)" Cc: Roman Kagan Cc: Andy Lutomirski Cc: devel@linuxdriverproject.org Cc: Paolo Bonzini Cc: "K. Y. Srinivasan" Cc: Cathy Avery Cc: Mohammed Gamal Link: https://lkml.kernel.org/r/20180124132337.30138-3-vkuznets@redhat.com --- arch/x86/hyperv/hv_init.c | 1 + arch/x86/include/asm/mshyperv.h | 23 +++++++++++++++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 21f9d53d9f0043..1a6c63f721bc01 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -37,6 +37,7 @@ struct ms_hyperv_tsc_page *hv_get_tsc_page(void) { return tsc_pg; } +EXPORT_SYMBOL_GPL(hv_get_tsc_page); static u64 read_hv_clock_tsc(struct clocksource *arg) { diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 8bf450b13d9f53..6b1d4ea78270fb 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -325,9 +325,10 @@ static inline void hyperv_setup_mmu_ops(void) {} #ifdef CONFIG_HYPERV_TSCPAGE struct ms_hyperv_tsc_page *hv_get_tsc_page(void); -static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg) +static inline u64 hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, + u64 *cur_tsc) { - u64 scale, offset, cur_tsc; + u64 scale, offset; u32 sequence; /* @@ -358,7 +359,7 @@ static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg) scale = READ_ONCE(tsc_pg->tsc_scale); offset = READ_ONCE(tsc_pg->tsc_offset); - cur_tsc = rdtsc_ordered(); + *cur_tsc = rdtsc_ordered(); /* * Make sure we read sequence after we read all other values @@ -368,7 +369,14 @@ static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg) } while (READ_ONCE(tsc_pg->tsc_sequence) != sequence); - return mul_u64_u64_shr(cur_tsc, scale, 64) + offset; + return mul_u64_u64_shr(*cur_tsc, scale, 64) + offset; +} + +static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg) +{ + u64 cur_tsc; + + return hv_read_tsc_page_tsc(tsc_pg, &cur_tsc); } #else @@ -376,5 +384,12 @@ static inline struct ms_hyperv_tsc_page *hv_get_tsc_page(void) { return NULL; } + +static inline u64 hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, + u64 *cur_tsc) +{ + BUG(); + return U64_MAX; +} #endif #endif From 93286261de1b46339aa27cd4c639b21778f6cade Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 24 Jan 2018 14:23:33 +0100 Subject: [PATCH 173/197] x86/hyperv: Reenlightenment notifications support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hyper-V supports Live Migration notification. This is supposed to be used in conjunction with TSC emulation: when a VM is migrated to a host with different TSC frequency for some short period the host emulates the accesses to TSC and sends an interrupt to notify about the event. When the guest is done updating everything it can disable TSC emulation and everything will start working fast again. These notifications weren't required until now as Hyper-V guests are not supposed to use TSC as a clocksource: in Linux the TSC is even marked as unstable on boot. Guests normally use 'tsc page' clocksource and host updates its values on migrations automatically. Things change when with nested virtualization: even when the PV clocksources (kvm-clock or tsc page) are passed through to the nested guests the TSC frequency and frequency changes need to be know.. Hyper-V Top Level Functional Specification (as of v5.0b) wrongly specifies EAX:BIT(12) of CPUID:0x40000009 as the feature identification bit. The right one to check is EAX:BIT(13) of CPUID:0x40000003. I was assured that the fix in on the way. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Cc: Stephen Hemminger Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Haiyang Zhang Cc: "Michael Kelley (EOSG)" Cc: Roman Kagan Cc: Andy Lutomirski Cc: devel@linuxdriverproject.org Cc: Paolo Bonzini Cc: "K. Y. Srinivasan" Cc: Cathy Avery Cc: Mohammed Gamal Link: https://lkml.kernel.org/r/20180124132337.30138-4-vkuznets@redhat.com --- arch/x86/entry/entry_32.S | 3 + arch/x86/entry/entry_64.S | 3 + arch/x86/hyperv/hv_init.c | 89 ++++++++++++++++++++++++++++++ arch/x86/include/asm/irq_vectors.h | 7 ++- arch/x86/include/asm/mshyperv.h | 9 +++ arch/x86/include/uapi/asm/hyperv.h | 27 +++++++++ arch/x86/kernel/cpu/mshyperv.c | 6 ++ 7 files changed, 143 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 2a35b1e0fb902a..7a796eeddf99c8 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -895,6 +895,9 @@ BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, hyperv_vector_handler) +BUILD_INTERRUPT3(hyperv_reenlightenment_vector, HYPERV_REENLIGHTENMENT_VECTOR, + hyperv_reenlightenment_intr) + #endif /* CONFIG_HYPERV */ ENTRY(page_fault) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index a835704951629f..553aa49909ce9b 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1245,6 +1245,9 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ #if IS_ENABLED(CONFIG_HYPERV) apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ hyperv_callback_vector hyperv_vector_handler + +apicinterrupt3 HYPERV_REENLIGHTENMENT_VECTOR \ + hyperv_reenlightenment_vector hyperv_reenlightenment_intr #endif /* CONFIG_HYPERV */ idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 1a6c63f721bc01..712ac40081f7e3 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -18,6 +18,8 @@ */ #include +#include +#include #include #include #include @@ -102,6 +104,93 @@ static int hv_cpu_init(unsigned int cpu) return 0; } +static void (*hv_reenlightenment_cb)(void); + +static void hv_reenlightenment_notify(struct work_struct *dummy) +{ + struct hv_tsc_emulation_status emu_status; + + rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); + + /* Don't issue the callback if TSC accesses are not emulated */ + if (hv_reenlightenment_cb && emu_status.inprogress) + hv_reenlightenment_cb(); +} +static DECLARE_DELAYED_WORK(hv_reenlightenment_work, hv_reenlightenment_notify); + +void hyperv_stop_tsc_emulation(void) +{ + u64 freq; + struct hv_tsc_emulation_status emu_status; + + rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); + emu_status.inprogress = 0; + wrmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); + + rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq); + tsc_khz = div64_u64(freq, 1000); +} +EXPORT_SYMBOL_GPL(hyperv_stop_tsc_emulation); + +static inline bool hv_reenlightenment_available(void) +{ + /* + * Check for required features and priviliges to make TSC frequency + * change notifications work. + */ + return ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS && + ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE && + ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT; +} + +__visible void __irq_entry hyperv_reenlightenment_intr(struct pt_regs *regs) +{ + entering_ack_irq(); + + schedule_delayed_work(&hv_reenlightenment_work, HZ/10); + + exiting_irq(); +} + +void set_hv_tscchange_cb(void (*cb)(void)) +{ + struct hv_reenlightenment_control re_ctrl = { + .vector = HYPERV_REENLIGHTENMENT_VECTOR, + .enabled = 1, + .target_vp = hv_vp_index[smp_processor_id()] + }; + struct hv_tsc_emulation_control emu_ctrl = {.enabled = 1}; + + if (!hv_reenlightenment_available()) { + pr_warn("Hyper-V: reenlightenment support is unavailable\n"); + return; + } + + hv_reenlightenment_cb = cb; + + /* Make sure callback is registered before we write to MSRs */ + wmb(); + + wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); + wrmsrl(HV_X64_MSR_TSC_EMULATION_CONTROL, *((u64 *)&emu_ctrl)); +} +EXPORT_SYMBOL_GPL(set_hv_tscchange_cb); + +void clear_hv_tscchange_cb(void) +{ + struct hv_reenlightenment_control re_ctrl; + + if (!hv_reenlightenment_available()) + return; + + rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); + re_ctrl.enabled = 0; + wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); + + hv_reenlightenment_cb = NULL; +} +EXPORT_SYMBOL_GPL(clear_hv_tscchange_cb); + /* * This function is to be invoked early in the boot sequence after the * hypervisor has been detected. diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 67421f649cfa18..e71c1120426bbc 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -103,7 +103,12 @@ #endif #define MANAGED_IRQ_SHUTDOWN_VECTOR 0xef -#define LOCAL_TIMER_VECTOR 0xee + +#if IS_ENABLED(CONFIG_HYPERV) +#define HYPERV_REENLIGHTENMENT_VECTOR 0xee +#endif + +#define LOCAL_TIMER_VECTOR 0xed #define NR_VECTORS 256 diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 6b1d4ea78270fb..1790002a2052d9 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -160,6 +160,7 @@ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) #define hv_set_synint_state(int_num, val) wrmsrl(int_num, val) void hyperv_callback_vector(void); +void hyperv_reenlightenment_vector(void); #ifdef CONFIG_TRACING #define trace_hyperv_callback_vector hyperv_callback_vector #endif @@ -316,11 +317,19 @@ void hyper_alloc_mmu(void); void hyperv_report_panic(struct pt_regs *regs, long err); bool hv_is_hypercall_page_setup(void); void hyperv_cleanup(void); + +void hyperv_reenlightenment_intr(struct pt_regs *regs); +void set_hv_tscchange_cb(void (*cb)(void)); +void clear_hv_tscchange_cb(void); +void hyperv_stop_tsc_emulation(void); #else /* CONFIG_HYPERV */ static inline void hyperv_init(void) {} static inline bool hv_is_hypercall_page_setup(void) { return false; } static inline void hyperv_cleanup(void) {} static inline void hyperv_setup_mmu_ops(void) {} +static inline void set_hv_tscchange_cb(void (*cb)(void)) {} +static inline void clear_hv_tscchange_cb(void) {} +static inline void hyperv_stop_tsc_emulation(void) {}; #endif /* CONFIG_HYPERV */ #ifdef CONFIG_HYPERV_TSCPAGE diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index 1a5bfead93b410..197c2e6c73765c 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -40,6 +40,9 @@ */ #define HV_X64_ACCESS_FREQUENCY_MSRS (1 << 11) +/* AccessReenlightenmentControls privilege */ +#define HV_X64_ACCESS_REENLIGHTENMENT BIT(13) + /* * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available @@ -234,6 +237,30 @@ #define HV_X64_MSR_CRASH_PARAMS \ (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0)) +/* TSC emulation after migration */ +#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 + +struct hv_reenlightenment_control { + u64 vector:8; + u64 reserved1:8; + u64 enabled:1; + u64 reserved2:15; + u64 target_vp:32; +}; + +#define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107 +#define HV_X64_MSR_TSC_EMULATION_STATUS 0x40000108 + +struct hv_tsc_emulation_control { + u64 enabled:1; + u64 reserved:63; +}; + +struct hv_tsc_emulation_status { + u64 inprogress:1; + u64 reserved:63; +}; + #define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001 #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12 #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \ diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 85eb5fc180c819..9340f41ce8d3d3 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -251,6 +251,12 @@ static void __init ms_hyperv_init_platform(void) hyperv_setup_mmu_ops(); /* Setup the IDT for hypervisor callback */ alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector); + + /* Setup the IDT for reenlightenment notifications */ + if (ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT) + alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR, + hyperv_reenlightenment_vector); + #endif } From e7c4e36c447daca2b7df49024f6bf230871cb155 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 24 Jan 2018 14:23:34 +0100 Subject: [PATCH 174/197] x86/hyperv: Redirect reenlightment notifications on CPU offlining MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is very unlikely for CPUs to get offlined when running on Hyper-V as there is a protection in the vmbus module which prevents it when the guest has any VMBus devices assigned. This, however, may change in future if an option to reassign an already active channel will be added. It is also possible to run without any Hyper-V devices or to have a CPU with no assigned channels. Reassign reenlightenment notifications to some other active CPU when the CPU which is assigned to them goes offline. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Cc: Stephen Hemminger Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Haiyang Zhang Cc: "Michael Kelley (EOSG)" Cc: Roman Kagan Cc: Andy Lutomirski Cc: devel@linuxdriverproject.org Cc: Paolo Bonzini Cc: "K. Y. Srinivasan" Cc: Cathy Avery Cc: Mohammed Gamal Link: https://lkml.kernel.org/r/20180124132337.30138-5-vkuznets@redhat.com --- arch/x86/hyperv/hv_init.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 712ac40081f7e3..e4377e2f2a10b0 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -191,6 +191,26 @@ void clear_hv_tscchange_cb(void) } EXPORT_SYMBOL_GPL(clear_hv_tscchange_cb); +static int hv_cpu_die(unsigned int cpu) +{ + struct hv_reenlightenment_control re_ctrl; + unsigned int new_cpu; + + if (hv_reenlightenment_cb == NULL) + return 0; + + rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); + if (re_ctrl.target_vp == hv_vp_index[cpu]) { + /* Reassign to some other online CPU */ + new_cpu = cpumask_any_but(cpu_online_mask, cpu); + + re_ctrl.target_vp = hv_vp_index[new_cpu]; + wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); + } + + return 0; +} + /* * This function is to be invoked early in the boot sequence after the * hypervisor has been detected. @@ -220,7 +240,7 @@ void hyperv_init(void) return; if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online", - hv_cpu_init, NULL) < 0) + hv_cpu_init, hv_cpu_die) < 0) goto free_vp_index; /* From 51d4e5daa32808df4d50db511d167fde19fa114e Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 24 Jan 2018 14:23:35 +0100 Subject: [PATCH 175/197] x86/irq: Count Hyper-V reenlightenment interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hyper-V reenlightenment interrupts arrive when the VM is migrated, While they are not interesting in general it's important when L2 nested guests are running. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Cc: Stephen Hemminger Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Haiyang Zhang Cc: "Michael Kelley (EOSG)" Cc: Roman Kagan Cc: Andy Lutomirski Cc: devel@linuxdriverproject.org Cc: Paolo Bonzini Cc: "K. Y. Srinivasan" Cc: Cathy Avery Cc: Mohammed Gamal Link: https://lkml.kernel.org/r/20180124132337.30138-6-vkuznets@redhat.com --- arch/x86/hyperv/hv_init.c | 2 ++ arch/x86/include/asm/hardirq.h | 3 +++ arch/x86/kernel/irq.c | 9 +++++++++ 3 files changed, 14 insertions(+) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index e4377e2f2a10b0..a3adece392f14a 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -147,6 +147,8 @@ __visible void __irq_entry hyperv_reenlightenment_intr(struct pt_regs *regs) { entering_ack_irq(); + inc_irq_stat(irq_hv_reenlightenment_count); + schedule_delayed_work(&hv_reenlightenment_work, HZ/10); exiting_irq(); diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 51cc979dd3642f..7c341a74ec8c49 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -38,6 +38,9 @@ typedef struct { #if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) unsigned int irq_hv_callback_count; #endif +#if IS_ENABLED(CONFIG_HYPERV) + unsigned int irq_hv_reenlightenment_count; +#endif } ____cacheline_aligned irq_cpustat_t; DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 68e1867cca8045..45fb4d2565f80a 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -141,6 +141,15 @@ int arch_show_interrupts(struct seq_file *p, int prec) irq_stats(j)->irq_hv_callback_count); seq_puts(p, " Hypervisor callback interrupts\n"); } +#endif +#if IS_ENABLED(CONFIG_HYPERV) + if (test_bit(HYPERV_REENLIGHTENMENT_VECTOR, system_vectors)) { + seq_printf(p, "%*s: ", prec, "HRE"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->irq_hv_reenlightenment_count); + seq_puts(p, " Hyper-V reenlightenment interrupts\n"); + } #endif seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) From b0c39dc68e3b3d22bf9d2984f62f6c86788a49e7 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 24 Jan 2018 14:23:36 +0100 Subject: [PATCH 176/197] x86/kvm: Pass stable clocksource to guests when running nested on Hyper-V MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, KVM is able to work in 'masterclock' mode passing PVCLOCK_TSC_STABLE_BIT to guests when the clocksource which is used on the host is TSC. When running nested on Hyper-V the guest normally uses a different one: TSC page which is resistant to TSC frequency changes on events like L1 migration. Add support for it in KVM. The only non-trivial change is in vgettsc(): when updating the gtod copy both the clock readout and tsc value have to be updated now. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Acked-by: Paolo Bonzini Cc: Stephen Hemminger Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Haiyang Zhang Cc: "Michael Kelley (EOSG)" Cc: Roman Kagan Cc: Andy Lutomirski Cc: devel@linuxdriverproject.org Cc: "K. Y. Srinivasan" Cc: Cathy Avery Cc: Mohammed Gamal Link: https://lkml.kernel.org/r/20180124132337.30138-7-vkuznets@redhat.com --- arch/x86/kvm/x86.c | 93 +++++++++++++++++++++++++++++++++------------- 1 file changed, 68 insertions(+), 25 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c53298dfbf50a7..b1ce368a07af31 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -67,6 +67,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include "trace.h" @@ -1377,6 +1378,11 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) return tsc; } +static inline int gtod_is_based_on_tsc(int mode) +{ + return mode == VCLOCK_TSC || mode == VCLOCK_HVCLOCK; +} + static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 @@ -1396,7 +1402,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) * perform request to enable masterclock. */ if (ka->use_master_clock || - (gtod->clock.vclock_mode == VCLOCK_TSC && vcpus_matched)) + (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched)) kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc, @@ -1459,6 +1465,19 @@ static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) vcpu->arch.tsc_offset = offset; } +static inline bool kvm_check_tsc_unstable(void) +{ +#ifdef CONFIG_X86_64 + /* + * TSC is marked unstable when we're running on Hyper-V, + * 'TSC page' clocksource is good. + */ + if (pvclock_gtod_data.clock.vclock_mode == VCLOCK_HVCLOCK) + return false; +#endif + return check_tsc_unstable(); +} + void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) { struct kvm *kvm = vcpu->kvm; @@ -1504,7 +1523,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) */ if (synchronizing && vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { - if (!check_tsc_unstable()) { + if (!kvm_check_tsc_unstable()) { offset = kvm->arch.cur_tsc_offset; pr_debug("kvm: matched tsc offset for %llu\n", data); } else { @@ -1604,18 +1623,43 @@ static u64 read_tsc(void) return last; } -static inline u64 vgettsc(u64 *cycle_now) +static inline u64 vgettsc(u64 *tsc_timestamp, int *mode) { long v; struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + u64 tsc_pg_val; + + switch (gtod->clock.vclock_mode) { + case VCLOCK_HVCLOCK: + tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(), + tsc_timestamp); + if (tsc_pg_val != U64_MAX) { + /* TSC page valid */ + *mode = VCLOCK_HVCLOCK; + v = (tsc_pg_val - gtod->clock.cycle_last) & + gtod->clock.mask; + } else { + /* TSC page invalid */ + *mode = VCLOCK_NONE; + } + break; + case VCLOCK_TSC: + *mode = VCLOCK_TSC; + *tsc_timestamp = read_tsc(); + v = (*tsc_timestamp - gtod->clock.cycle_last) & + gtod->clock.mask; + break; + default: + *mode = VCLOCK_NONE; + } - *cycle_now = read_tsc(); + if (*mode == VCLOCK_NONE) + *tsc_timestamp = v = 0; - v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask; return v * gtod->clock.mult; } -static int do_monotonic_boot(s64 *t, u64 *cycle_now) +static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp) { struct pvclock_gtod_data *gtod = &pvclock_gtod_data; unsigned long seq; @@ -1624,9 +1668,8 @@ static int do_monotonic_boot(s64 *t, u64 *cycle_now) do { seq = read_seqcount_begin(>od->seq); - mode = gtod->clock.vclock_mode; ns = gtod->nsec_base; - ns += vgettsc(cycle_now); + ns += vgettsc(tsc_timestamp, &mode); ns >>= gtod->clock.shift; ns += gtod->boot_ns; } while (unlikely(read_seqcount_retry(>od->seq, seq))); @@ -1635,7 +1678,7 @@ static int do_monotonic_boot(s64 *t, u64 *cycle_now) return mode; } -static int do_realtime(struct timespec *ts, u64 *cycle_now) +static int do_realtime(struct timespec *ts, u64 *tsc_timestamp) { struct pvclock_gtod_data *gtod = &pvclock_gtod_data; unsigned long seq; @@ -1644,10 +1687,9 @@ static int do_realtime(struct timespec *ts, u64 *cycle_now) do { seq = read_seqcount_begin(>od->seq); - mode = gtod->clock.vclock_mode; ts->tv_sec = gtod->wall_time_sec; ns = gtod->nsec_base; - ns += vgettsc(cycle_now); + ns += vgettsc(tsc_timestamp, &mode); ns >>= gtod->clock.shift; } while (unlikely(read_seqcount_retry(>od->seq, seq))); @@ -1657,25 +1699,26 @@ static int do_realtime(struct timespec *ts, u64 *cycle_now) return mode; } -/* returns true if host is using tsc clocksource */ -static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now) +/* returns true if host is using TSC based clocksource */ +static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp) { /* checked again under seqlock below */ - if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) + if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode)) return false; - return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC; + return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns, + tsc_timestamp)); } -/* returns true if host is using tsc clocksource */ +/* returns true if host is using TSC based clocksource */ static bool kvm_get_walltime_and_clockread(struct timespec *ts, - u64 *cycle_now) + u64 *tsc_timestamp) { /* checked again under seqlock below */ - if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) + if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode)) return false; - return do_realtime(ts, cycle_now) == VCLOCK_TSC; + return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp)); } #endif @@ -2869,13 +2912,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); } - if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { + if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) { s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : rdtsc() - vcpu->arch.last_host_tsc; if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); - if (check_tsc_unstable()) { + if (kvm_check_tsc_unstable()) { u64 offset = kvm_compute_tsc_offset(vcpu, vcpu->arch.last_guest_tsc); kvm_vcpu_write_tsc_offset(vcpu, offset); @@ -6110,9 +6153,9 @@ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused, update_pvclock_gtod(tk); /* disable master clock if host does not trust, or does not - * use, TSC clocksource + * use, TSC based clocksource. */ - if (gtod->clock.vclock_mode != VCLOCK_TSC && + if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) && atomic_read(&kvm_guest_has_master_clock) != 0) queue_work(system_long_wq, &pvclock_gtod_work); @@ -7767,7 +7810,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, { struct kvm_vcpu *vcpu; - if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) + if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) printk_once(KERN_WARNING "kvm: SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); @@ -7924,7 +7967,7 @@ int kvm_arch_hardware_enable(void) return ret; local_tsc = rdtsc(); - stable = !check_tsc_unstable(); + stable = !kvm_check_tsc_unstable(); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { if (!stable && vcpu->cpu == smp_processor_id()) From 0092e4346f49558e5fe5a927c6d78d401dc4ed73 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 24 Jan 2018 14:23:37 +0100 Subject: [PATCH 177/197] x86/kvm: Support Hyper-V reenlightenment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When running nested KVM on Hyper-V guests its required to update masterclocks for all guests when L1 migrates to a host with different TSC frequency. Implement the procedure in the following way: - Pause all guests. - Tell the host (Hyper-V) to stop emulating TSC accesses. - Update the gtod copy, recompute clocks. - Unpause all guests. This is somewhat similar to cpufreq but there are two important differences: - TSC emulation can only be disabled globally (on all CPUs) - The new TSC frequency is not known until emulation is turned off so there is no way to 'prepare' for the event upfront. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Acked-by: Paolo Bonzini Cc: Stephen Hemminger Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Haiyang Zhang Cc: "Michael Kelley (EOSG)" Cc: Roman Kagan Cc: Andy Lutomirski Cc: devel@linuxdriverproject.org Cc: "K. Y. Srinivasan" Cc: Cathy Avery Cc: Mohammed Gamal Link: https://lkml.kernel.org/r/20180124132337.30138-8-vkuznets@redhat.com --- arch/x86/kvm/x86.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b1ce368a07af31..879a9998740166 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -68,6 +68,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include "trace.h" @@ -5932,6 +5933,43 @@ static void tsc_khz_changed(void *data) __this_cpu_write(cpu_tsc_khz, khz); } +static void kvm_hyperv_tsc_notifier(void) +{ +#ifdef CONFIG_X86_64 + struct kvm *kvm; + struct kvm_vcpu *vcpu; + int cpu; + + spin_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) + kvm_make_mclock_inprogress_request(kvm); + + hyperv_stop_tsc_emulation(); + + /* TSC frequency always matches when on Hyper-V */ + for_each_present_cpu(cpu) + per_cpu(cpu_tsc_khz, cpu) = tsc_khz; + kvm_max_guest_tsc_khz = tsc_khz; + + list_for_each_entry(kvm, &vm_list, vm_list) { + struct kvm_arch *ka = &kvm->arch; + + spin_lock(&ka->pvclock_gtod_sync_lock); + + pvclock_update_vm_gtod_copy(kvm); + + kvm_for_each_vcpu(cpu, vcpu, kvm) + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + + kvm_for_each_vcpu(cpu, vcpu, kvm) + kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); + + spin_unlock(&ka->pvclock_gtod_sync_lock); + } + spin_unlock(&kvm_lock); +#endif +} + static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { @@ -6217,6 +6255,9 @@ int kvm_arch_init(void *opaque) kvm_lapic_init(); #ifdef CONFIG_X86_64 pvclock_gtod_register_notifier(&pvclock_gtod_notifier); + + if (x86_hyper_type == X86_HYPER_MS_HYPERV) + set_hv_tscchange_cb(kvm_hyperv_tsc_notifier); #endif return 0; @@ -6229,6 +6270,10 @@ int kvm_arch_init(void *opaque) void kvm_arch_exit(void) { +#ifdef CONFIG_X86_64 + if (x86_hyper_type == X86_HYPER_MS_HYPERV) + clear_hv_tscchange_cb(); +#endif kvm_lapic_exit(); perf_unregister_guest_info_callbacks(&kvm_guest_cbs); From 2fc616c06e125baed9e8d0cd166a4e58468939c8 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Fri, 12 Jan 2018 15:53:34 +0100 Subject: [PATCH 178/197] MAINTAINERS: add David as a reviewer for KVM/s390 Acked-by: Janosch Frank Acked-by: Christian Borntraeger Acked-by: David Hildenbrand Signed-off-by: Cornelia Huck --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 82ad0eabce4f3e..cd7a416c516b2c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7693,6 +7693,7 @@ F: arch/powerpc/kernel/kvm* KERNEL VIRTUAL MACHINE for s390 (KVM/s390) M: Christian Borntraeger M: Cornelia Huck +R: David Hildenbrand L: linux-s390@vger.kernel.org W: http://www.ibm.com/developerworks/linux/linux390/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git From 78269f3418e5c25e1426a21378952f6f7b78ad1f Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Fri, 12 Jan 2018 15:53:34 +0100 Subject: [PATCH 179/197] MAINTAINERS: add Halil as additional vfio-ccw maintainer Acked-by: Janosch Frank Acked-by: Christian Borntraeger Acked-by: Halil Pasic Acked-by: Dong Jia Shi Signed-off-by: Cornelia Huck --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index cd7a416c516b2c..990ac01c319dab 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11863,6 +11863,7 @@ F: drivers/pci/hotplug/s390_pci_hpc.c S390 VFIO-CCW DRIVER M: Cornelia Huck M: Dong Jia Shi +M: Halil Pasic L: linux-s390@vger.kernel.org L: kvm@vger.kernel.org S: Supported From cd74ff94524e138e72e9feb5bf5b30ce28ba2311 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Fri, 12 Jan 2018 15:53:34 +0100 Subject: [PATCH 180/197] MAINTAINERS: update KVM/s390 maintainers As I have neither too much time nor access to the architecture documentation anymore, let's switch my status from maintainer to reviewer. Janosch will step in as second maintainer. Acked-by: Janosch Frank Acked-by: Christian Borntraeger Signed-off-by: Cornelia Huck --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 990ac01c319dab..71781ad422f0a7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7692,8 +7692,9 @@ F: arch/powerpc/kernel/kvm* KERNEL VIRTUAL MACHINE for s390 (KVM/s390) M: Christian Borntraeger -M: Cornelia Huck +M: Janosch Frank R: David Hildenbrand +R: Cornelia Huck L: linux-s390@vger.kernel.org W: http://www.ibm.com/developerworks/linux/linux390/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git From 13e59ece5b30f39e4e1e1fac2b2ddc7ed527f3cc Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Thu, 25 Jan 2018 14:20:19 +0100 Subject: [PATCH 181/197] KVM: arm/arm64: Fix incorrect timer_is_pending logic After the recently introduced support for level-triggered mapped interrupt, I accidentally left the VCPU thread busily going back and forward between the guest and the hypervisor whenever the guest was blocking, because I would always incorrectly report that a timer interrupt was pending. This is because the timer->irq.level field is not valid for mapped interrupts, where we offload the level state to the hardware, and as a result this field is always true. Luckily the problem can be relatively easily solved by not checking the cached signal state of either timer in kvm_timer_should_fire() but instead compute the timer state on the fly, which we do already if the cached signal state wasn't high. In fact, the only reason for checking the cached signal state was a tiny optimization which would only be potentially faster when the polling loop detects a pending timer interrupt, which is quite unlikely. Instead of duplicating the logic from kvm_arch_timer_handler(), we enlighten kvm_timer_should_fire() to report something valid when the timer state is loaded onto the hardware. We can then call this from kvm_arch_timer_handler() as well and avoid the call to __timer_snapshot_state() in kvm_arch_timer_get_input_level(). Reported-by: Tomasz Nowicki Tested-by: Tomasz Nowicki Reviewed-by: Marc Zyngier Signed-off-by: Christoffer Dall --- virt/kvm/arm/arch_timer.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index cfcd0323deabed..63cf828f3c4f83 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -97,10 +97,9 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) pr_warn_once("Spurious arch timer IRQ on non-VCPU thread\n"); return IRQ_NONE; } - vtimer = vcpu_vtimer(vcpu); - vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl); - if (kvm_timer_irq_can_fire(vtimer)) + vtimer = vcpu_vtimer(vcpu); + if (kvm_timer_should_fire(vtimer)) kvm_timer_update_irq(vcpu, true, vtimer); if (static_branch_unlikely(&userspace_irqchip_in_use) && @@ -230,6 +229,16 @@ static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) { u64 cval, now; + if (timer_ctx->loaded) { + u32 cnt_ctl; + + /* Only the virtual timer can be loaded so far */ + cnt_ctl = read_sysreg_el0(cntv_ctl); + return (cnt_ctl & ARCH_TIMER_CTRL_ENABLE) && + (cnt_ctl & ARCH_TIMER_CTRL_IT_STAT) && + !(cnt_ctl & ARCH_TIMER_CTRL_IT_MASK); + } + if (!kvm_timer_irq_can_fire(timer_ctx)) return false; @@ -244,15 +253,7 @@ bool kvm_timer_is_pending(struct kvm_vcpu *vcpu) struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - if (vtimer->irq.level || ptimer->irq.level) - return true; - - /* - * When this is called from withing the wait loop of kvm_vcpu_block(), - * the software view of the timer state is up to date (timer->loaded - * is false), and so we can simply check if the timer should fire now. - */ - if (!vtimer->loaded && kvm_timer_should_fire(vtimer)) + if (kvm_timer_should_fire(vtimer)) return true; return kvm_timer_should_fire(ptimer); @@ -270,9 +271,9 @@ void kvm_timer_update_run(struct kvm_vcpu *vcpu) /* Populate the device bitmap with the timer states */ regs->device_irq_level &= ~(KVM_ARM_DEV_EL1_VTIMER | KVM_ARM_DEV_EL1_PTIMER); - if (vtimer->irq.level) + if (kvm_timer_should_fire(vtimer)) regs->device_irq_level |= KVM_ARM_DEV_EL1_VTIMER; - if (ptimer->irq.level) + if (kvm_timer_should_fire(ptimer)) regs->device_irq_level |= KVM_ARM_DEV_EL1_PTIMER; } @@ -507,8 +508,8 @@ bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) vlevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_VTIMER; plevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_PTIMER; - return vtimer->irq.level != vlevel || - ptimer->irq.level != plevel; + return kvm_timer_should_fire(vtimer) != vlevel || + kvm_timer_should_fire(ptimer) != plevel; } void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) @@ -801,9 +802,6 @@ bool kvm_arch_timer_get_input_level(int vintid) else BUG(); /* We only map the vtimer so far */ - if (timer->loaded) - __timer_snapshot_state(timer); - return kvm_timer_should_fire(timer); } From f1d7231cede93a42d75b6d3c5ca599e94a273e89 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Thu, 25 Jan 2018 18:32:29 +0100 Subject: [PATCH 182/197] KVM: arm/arm64: Fix userspace_irqchip_in_use counting We were not decrementing the static key count in the right location. kvm_arch_vcpu_destroy() is only called to clean up after a failed VCPU create attempt, whereas kvm_arch_vcpu_free() is called on teardown of the VM as well. Move the static key decrement call to kvm_arch_vcpu_free(). Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- virt/kvm/arm/arm.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 639dca0c056090..04ee7a3278706a 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -295,6 +295,9 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { + if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm))) + static_branch_dec(&userspace_irqchip_in_use); + kvm_mmu_free_memory_caches(vcpu); kvm_timer_vcpu_terminate(vcpu); kvm_pmu_vcpu_destroy(vcpu); @@ -304,8 +307,6 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm))) - static_branch_dec(&userspace_irqchip_in_use); kvm_arch_vcpu_free(vcpu); } From cd15d2050c044ca9525ba165e9073ac8e036b8d0 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Fri, 26 Jan 2018 16:20:22 +0100 Subject: [PATCH 183/197] KVM: arm/arm64: Fixup userspace irqchip static key optimization When I introduced a static key to avoid work in the critical path for userspace irqchips which is very rarely used, I accidentally messed up my logic and used && where I should have used ||, because the point was to short-circuit the evaluation in case userspace irqchips weren't even in use. This fixes an issue when running in-kernel irqchip VMs alongside userspace irqchip VMs. Acked-by: Marc Zyngier Fixes: c44c232ee2d3 ("KVM: arm/arm64: Avoid work when userspace iqchips are not used") Signed-off-by: Christoffer Dall --- virt/kvm/arm/arch_timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 63cf828f3c4f83..fb6bd9b9845ea7 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -286,7 +286,7 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq, timer_ctx->irq.level); - if (!static_branch_unlikely(&userspace_irqchip_in_use) && + if (!static_branch_unlikely(&userspace_irqchip_in_use) || likely(irqchip_in_kernel(vcpu->kvm))) { ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, timer_ctx->irq.irq, From 5fa4ec9cb2e6679e2f828033726f758ea314b9c5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 31 Jan 2018 09:41:40 +0100 Subject: [PATCH 184/197] x86/kvm: Make it compile on 32bit and with HYPYERVISOR_GUEST=n MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reenlightment support for hyperv slapped a direct reference to x86_hyper_type into the kvm code which results in the following build failure when CONFIG_HYPERVISOR_GUEST=n: arch/x86/kvm/x86.c:6259:6: error: ‘x86_hyper_type’ undeclared (first use in this function) arch/x86/kvm/x86.c:6259:6: note: each undeclared identifier is reported only once for each function it appears in Use the proper helper function to cure that. The 32bit compile fails because of: arch/x86/kvm/x86.c:5936:13: warning: ‘kvm_hyperv_tsc_notifier’ defined but not used [-Wunused-function] which is a real trainwreck engineering artwork. The callsite is wrapped into #ifdef CONFIG_X86_64, but the function itself has the #ifdef inside the function body. Make the function itself wrapped into the ifdef to cure that. Qualiteee.... Fixes: 0092e4346f49 ("x86/kvm: Support Hyper-V reenlightenment") Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner Cc: Vitaly Kuznetsov Cc: Paolo Bonzini Cc: Stephen Hemminger Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Haiyang Zhang Cc: "Michael Kelley (EOSG)" Cc: Roman Kagan Cc: Andy Lutomirski Cc: devel@linuxdriverproject.org Cc: "K. Y. Srinivasan" Cc: Cathy Avery Cc: Mohammed Gamal --- arch/x86/kvm/x86.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 879a9998740166..cd3b3bc67c5a68 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5933,9 +5933,9 @@ static void tsc_khz_changed(void *data) __this_cpu_write(cpu_tsc_khz, khz); } +#ifdef CONFIG_X86_64 static void kvm_hyperv_tsc_notifier(void) { -#ifdef CONFIG_X86_64 struct kvm *kvm; struct kvm_vcpu *vcpu; int cpu; @@ -5967,8 +5967,8 @@ static void kvm_hyperv_tsc_notifier(void) spin_unlock(&ka->pvclock_gtod_sync_lock); } spin_unlock(&kvm_lock); -#endif } +#endif static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) @@ -6256,7 +6256,7 @@ int kvm_arch_init(void *opaque) #ifdef CONFIG_X86_64 pvclock_gtod_register_notifier(&pvclock_gtod_notifier); - if (x86_hyper_type == X86_HYPER_MS_HYPERV) + if (hypervisor_is_type(X86_HYPER_MS_HYPERV)) set_hv_tscchange_cb(kvm_hyperv_tsc_notifier); #endif @@ -6271,7 +6271,7 @@ int kvm_arch_init(void *opaque) void kvm_arch_exit(void) { #ifdef CONFIG_X86_64 - if (x86_hyper_type == X86_HYPER_MS_HYPERV) + if (hypervisor_is_type(X86_HYPER_MS_HYPERV)) clear_hv_tscchange_cb(); #endif kvm_lapic_exit(); From a340b3e229b24a56f1c7f5826b15a3af0f4b13e5 Mon Sep 17 00:00:00 2001 From: KarimAllah Ahmed Date: Wed, 17 Jan 2018 19:18:56 +0100 Subject: [PATCH 185/197] kvm: Map PFN-type memory regions as writable (if possible) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For EPT-violations that are triggered by a read, the pages are also mapped with write permissions (if their memory region is also writable). That would avoid getting yet another fault on the same page when a write occurs. This optimization only happens when you have a "struct page" backing the memory region. So also enable it for memory regions that do not have a "struct page". Cc: Paolo Bonzini Cc: Radim Krčmář Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: KarimAllah Ahmed Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- virt/kvm/kvm_main.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b4414842b02392..8af42eab126dc6 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1428,7 +1428,8 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) static int hva_to_pfn_remapped(struct vm_area_struct *vma, unsigned long addr, bool *async, - bool write_fault, kvm_pfn_t *p_pfn) + bool write_fault, bool *writable, + kvm_pfn_t *p_pfn) { unsigned long pfn; int r; @@ -1454,6 +1455,8 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, } + if (writable) + *writable = true; /* * Get a reference here because callers of *hva_to_pfn* and @@ -1519,7 +1522,7 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, if (vma == NULL) pfn = KVM_PFN_ERR_FAULT; else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) { - r = hva_to_pfn_remapped(vma, addr, async, write_fault, &pfn); + r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn); if (r == -EAGAIN) goto retry; if (r < 0) From e46b469278a59781f9b25ff608af84892963821b Mon Sep 17 00:00:00 2001 From: Masatake YAMATO Date: Sat, 20 Jan 2018 04:04:22 +0900 Subject: [PATCH 186/197] kvm: embed vcpu id to dentry of vcpu anon inode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All d-entries for vcpu have the same, "anon_inode:kvm-vcpu". That means it is impossible to know the mapping between fds for vcpu and vcpu from userland. # LC_ALL=C ls -l /proc/617/fd | grep vcpu lrwx------. 1 qemu qemu 64 Jan 7 16:50 18 -> anon_inode:kvm-vcpu lrwx------. 1 qemu qemu 64 Jan 7 16:50 19 -> anon_inode:kvm-vcpu It is also impossible to know the mapping between vma for kvm_run structure and vcpu from userland. # LC_ALL=C grep vcpu /proc/617/maps 7f9d842d0000-7f9d842d3000 rw-s 00000000 00:0d 20393 anon_inode:kvm-vcpu 7f9d842d3000-7f9d842d6000 rw-s 00000000 00:0d 20393 anon_inode:kvm-vcpu This change adds vcpu id to d-entries for vcpu. With this change you can get the following output: # LC_ALL=C ls -l /proc/617/fd | grep vcpu lrwx------. 1 qemu qemu 64 Jan 7 16:50 18 -> anon_inode:kvm-vcpu:0 lrwx------. 1 qemu qemu 64 Jan 7 16:50 19 -> anon_inode:kvm-vcpu:1 # LC_ALL=C grep vcpu /proc/617/maps 7f9d842d0000-7f9d842d3000 rw-s 00000000 00:0d 20393 anon_inode:kvm-vcpu:0 7f9d842d3000-7f9d842d6000 rw-s 00000000 00:0d 20393 anon_inode:kvm-vcpu:1 With the mappings known from the output, a tool like strace can report more details of qemu-kvm process activities. Here is the strace output of my local prototype: # ./strace -KK -f -p 617 2>&1 | grep 'KVM_RUN\| K' ... [pid 664] ioctl(18, KVM_RUN, 0) = 0 (KVM_EXIT_MMIO) K ready_for_interrupt_injection=1, if_flag=0, flags=0, cr8=0000000000000000, apic_base=0x000000fee00d00 K phys_addr=0, len=1634035803, [33, 0, 0, 0, 0, 0, 0, 0], is_write=112 [pid 664] ioctl(18, KVM_RUN, 0) = 0 (KVM_EXIT_MMIO) K ready_for_interrupt_injection=1, if_flag=1, flags=0, cr8=0000000000000000, apic_base=0x000000fee00d00 K phys_addr=0, len=1634035803, [33, 0, 0, 0, 0, 0, 0, 0], is_write=112 ... Signed-off-by: Masatake YAMATO Acked-by: Christian Borntraeger Signed-off-by: Radim Krčmář --- virt/kvm/kvm_main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8af42eab126dc6..8a937b7cde3575 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2415,7 +2415,10 @@ static struct file_operations kvm_vcpu_fops = { */ static int create_vcpu_fd(struct kvm_vcpu *vcpu) { - return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC); + char name[8 + 1 + ITOA_MAX_LEN + 1]; + + snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id); + return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC); } static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) From d391f1207067268261add0485f0f34503539c5b0 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 25 Jan 2018 16:37:07 +0100 Subject: [PATCH 187/197] x86/kvm/vmx: do not use vm-exit instruction length for fast MMIO when running nested MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I was investigating an issue with seabios >= 1.10 which stopped working for nested KVM on Hyper-V. The problem appears to be in handle_ept_violation() function: when we do fast mmio we need to skip the instruction so we do kvm_skip_emulated_instruction(). This, however, depends on VM_EXIT_INSTRUCTION_LEN field being set correctly in VMCS. However, this is not the case. Intel's manual doesn't mandate VM_EXIT_INSTRUCTION_LEN to be set when EPT MISCONFIG occurs. While on real hardware it was observed to be set, some hypervisors follow the spec and don't set it; we end up advancing IP with some random value. I checked with Microsoft and they confirmed they don't fill VM_EXIT_INSTRUCTION_LEN on EPT MISCONFIG. Fix the issue by doing instruction skip through emulator when running nested. Fixes: 68c3b4d1676d870f0453c31d5a52e7e65c7448ae Suggested-by: Radim Krčmář Suggested-by: Paolo Bonzini Signed-off-by: Vitaly Kuznetsov Acked-by: Michael S. Tsirkin Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 16 +++++++++++++++- arch/x86/kvm/x86.c | 3 ++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1e2ca9e8662ff2..438802d0b01d8c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6577,7 +6577,21 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) if (!is_guest_mode(vcpu) && !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { trace_kvm_fast_mmio(gpa); - return kvm_skip_emulated_instruction(vcpu); + /* + * Doing kvm_skip_emulated_instruction() depends on undefined + * behavior: Intel's manual doesn't mandate + * VM_EXIT_INSTRUCTION_LEN to be set in VMCS when EPT MISCONFIG + * occurs and while on real hardware it was observed to be set, + * other hypervisors (namely Hyper-V) don't set it, we end up + * advancing IP with some random value. Disable fast mmio when + * running nested and keep it for real hardware in hope that + * VM_EXIT_INSTRUCTION_LEN will always be set correctly. + */ + if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) + return kvm_skip_emulated_instruction(vcpu); + else + return x86_emulate_instruction(vcpu, gpa, EMULTYPE_SKIP, + NULL, 0) == EMULATE_DONE; } ret = kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0204b2b8a293f1..01571102b50c95 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5773,7 +5773,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, * handle watchpoints yet, those would be handled in * the emulate_ops. */ - if (kvm_vcpu_check_breakpoint(vcpu, &r)) + if (!(emulation_type & EMULTYPE_SKIP) && + kvm_vcpu_check_breakpoint(vcpu, &r)) return r; ctxt->interruptibility = 0; From 806793f5f76eb9c000ed05e307a8f7e9450b3291 Mon Sep 17 00:00:00 2001 From: Stanislav Lanci Date: Mon, 29 Jan 2018 11:39:44 -0500 Subject: [PATCH 188/197] KVM: x86: AMD Processor Topology Information MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch allow to enable x86 feature TOPOEXT. This is needed to provide information about SMT on AMD Zen CPUs to the guest. Signed-off-by: Stanislav Lanci Tested-by: Nick Sarnie Reviewed-by: Paolo Bonzini Signed-off-by: Babu Moger Signed-off-by: Radim Krčmář --- arch/x86/kvm/cpuid.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ac0041c2f5afe7..20e491b94f44e4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -371,7 +371,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | - 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); + 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) | + F(TOPOEXT); /* cpuid 0xC0000001.edx */ const u32 kvm_cpuid_C000_0001_edx_x86_features = From 87cedc6be55954c6efd6eca2e694132513f65a2a Mon Sep 17 00:00:00 2001 From: "Longpeng(Mike)" Date: Fri, 26 Jan 2018 17:34:08 +0800 Subject: [PATCH 189/197] kvm: x86: remove efer_reload entry in kvm_vcpu_stat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The efer_reload is never used since commit 26bb0981b3ff ("KVM: VMX: Use shared msr infrastructure"), so remove it. Signed-off-by: Longpeng(Mike) Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/x86.c | 1 - 2 files changed, 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ea7e40e9c1f0f4..dd6f57a54a2626 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -895,7 +895,6 @@ struct kvm_vcpu_stat { u64 request_irq_exits; u64 irq_exits; u64 host_state_reload; - u64 efer_reload; u64 fpu_reload; u64 insn_emulation; u64 insn_emulation_fail; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 01571102b50c95..c13cd14c478093 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -177,7 +177,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "request_irq", VCPU_STAT(request_irq_exits) }, { "irq_exits", VCPU_STAT(irq_exits) }, { "host_state_reload", VCPU_STAT(host_state_reload) }, - { "efer_reload", VCPU_STAT(efer_reload) }, { "fpu_reload", VCPU_STAT(fpu_reload) }, { "insn_emulation", VCPU_STAT(insn_emulation) }, { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, From 36ee41d161c67a6fcf696d4817a0da31f778938c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 30 Jan 2018 10:51:32 +1100 Subject: [PATCH 190/197] KVM: PPC: Book3S HV: Drop locks before reading guest memory Running with CONFIG_DEBUG_ATOMIC_SLEEP reveals that HV KVM tries to read guest memory, in order to emulate guest instructions, while preempt is disabled and a vcore lock is held. This occurs in kvmppc_handle_exit_hv(), called from post_guest_process(), when emulating guest doorbell instructions on POWER9 systems, and also when checking whether we have hit a hypervisor breakpoint. Reading guest memory can cause a page fault and thus cause the task to sleep, so we need to avoid reading guest memory while holding a spinlock or when preempt is disabled. To fix this, we move the preempt_enable() in kvmppc_run_core() to before the loop that calls post_guest_process() for each vcore that has just run, and we drop and re-take the vcore lock around the calls to kvmppc_emulate_debug_inst() and kvmppc_emulate_doorbell_instr(). Dropping the lock is safe with respect to the iteration over the runnable vcpus in post_guest_process(); for_each_runnable_thread is actually safe to use locklessly. It is possible for a vcpu to become runnable and add itself to the runnable_threads array (code near the beginning of kvmppc_run_vcpu()) and then get included in the iteration in post_guest_process despite the fact that it has not just run. This is benign because vcpu->arch.trap and vcpu->arch.ceded will be zero. Cc: stable@vger.kernel.org # v4.13+ Fixes: 579006944e0d ("KVM: PPC: Book3S HV: Virtualize doorbell facility on POWER9") Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index e5f81fc108e094..aa6130b56b5ee3 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1008,8 +1008,6 @@ static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu) struct kvm *kvm = vcpu->kvm; struct kvm_vcpu *tvcpu; - if (!cpu_has_feature(CPU_FTR_ARCH_300)) - return EMULATE_FAIL; if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &inst) != EMULATE_DONE) return RESUME_GUEST; if (get_op(inst) != 31) @@ -1059,6 +1057,7 @@ static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu) return RESUME_GUEST; } +/* Called with vcpu->arch.vcore->lock held */ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, struct task_struct *tsk) { @@ -1179,7 +1178,10 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, swab32(vcpu->arch.emul_inst) : vcpu->arch.emul_inst; if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) { + /* Need vcore unlocked to call kvmppc_get_last_inst */ + spin_unlock(&vcpu->arch.vcore->lock); r = kvmppc_emulate_debug_inst(run, vcpu); + spin_lock(&vcpu->arch.vcore->lock); } else { kvmppc_core_queue_program(vcpu, SRR1_PROGILL); r = RESUME_GUEST; @@ -1194,8 +1196,13 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, */ case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: r = EMULATE_FAIL; - if ((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG) + if (((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG) && + cpu_has_feature(CPU_FTR_ARCH_300)) { + /* Need vcore unlocked to call kvmppc_get_last_inst */ + spin_unlock(&vcpu->arch.vcore->lock); r = kvmppc_emulate_doorbell_instr(vcpu); + spin_lock(&vcpu->arch.vcore->lock); + } if (r == EMULATE_FAIL) { kvmppc_core_queue_program(vcpu, SRR1_PROGILL); r = RESUME_GUEST; @@ -2946,13 +2953,14 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) /* make sure updates to secondary vcpu structs are visible now */ smp_mb(); + preempt_enable(); + for (sub = 0; sub < core_info.n_subcores; ++sub) { pvc = core_info.vc[sub]; post_guest_process(pvc, pvc == vc); } spin_lock(&vc->lock); - preempt_enable(); out: vc->vcore_state = VCORE_INACTIVE; From 07ae5389e98c53bb9e9f308fce9c903bc3ee7720 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Wed, 31 Jan 2018 22:24:58 +0100 Subject: [PATCH 191/197] KVM: PPC: Book3S PR: Fix svcpu copying with preemption enabled When copying between the vcpu and svcpu, we may get scheduled away onto a different host CPU which in turn means our svcpu pointer may change. That means we need to atomically copy to and from the svcpu with preemption disabled, so that all code around it always sees a coherent state. Reported-by: Simon Guo Fixes: 3d3319b45eea ("KVM: PPC: Book3S: PR: Enable interrupts earlier") Signed-off-by: Alexander Graf Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_book3s.h | 6 ++---- arch/powerpc/kvm/book3s_interrupts.S | 4 +--- arch/powerpc/kvm/book3s_pr.c | 20 +++++++++----------- 3 files changed, 12 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 9a667007bff81c..376ae803b69c60 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -249,10 +249,8 @@ extern int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd); extern void kvmppc_pr_init_default_hcalls(struct kvm *kvm); extern int kvmppc_hcall_impl_pr(unsigned long cmd); extern int kvmppc_hcall_impl_hv_realmode(unsigned long cmd); -extern void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu *svcpu, - struct kvm_vcpu *vcpu); -extern void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu, - struct kvmppc_book3s_shadow_vcpu *svcpu); +extern void kvmppc_copy_to_svcpu(struct kvm_vcpu *vcpu); +extern void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu); extern int kvm_irq_bypass; static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S index 901e6fe00c39cd..c18e845019ec52 100644 --- a/arch/powerpc/kvm/book3s_interrupts.S +++ b/arch/powerpc/kvm/book3s_interrupts.S @@ -96,7 +96,7 @@ kvm_start_entry: kvm_start_lightweight: /* Copy registers into shadow vcpu so we can access them in real mode */ - GET_SHADOW_VCPU(r3) + mr r3, r4 bl FUNC(kvmppc_copy_to_svcpu) nop REST_GPR(4, r1) @@ -165,9 +165,7 @@ after_sprg3_load: stw r12, VCPU_TRAP(r3) /* Transfer reg values from shadow vcpu back to vcpu struct */ - /* On 64-bit, interrupts are still off at this point */ - GET_SHADOW_VCPU(r4) bl FUNC(kvmppc_copy_from_svcpu) nop diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index d0dc8624198f8e..d6df74a2f7449a 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -120,7 +120,7 @@ static void kvmppc_core_vcpu_put_pr(struct kvm_vcpu *vcpu) #ifdef CONFIG_PPC_BOOK3S_64 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); if (svcpu->in_use) { - kvmppc_copy_from_svcpu(vcpu, svcpu); + kvmppc_copy_from_svcpu(vcpu); } memcpy(to_book3s(vcpu)->slb_shadow, svcpu->slb, sizeof(svcpu->slb)); to_book3s(vcpu)->slb_shadow_max = svcpu->slb_max; @@ -142,9 +142,10 @@ static void kvmppc_core_vcpu_put_pr(struct kvm_vcpu *vcpu) } /* Copy data needed by real-mode code from vcpu to shadow vcpu */ -void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu *svcpu, - struct kvm_vcpu *vcpu) +void kvmppc_copy_to_svcpu(struct kvm_vcpu *vcpu) { + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + svcpu->gpr[0] = vcpu->arch.gpr[0]; svcpu->gpr[1] = vcpu->arch.gpr[1]; svcpu->gpr[2] = vcpu->arch.gpr[2]; @@ -176,17 +177,14 @@ void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu *svcpu, if (cpu_has_feature(CPU_FTR_ARCH_207S)) vcpu->arch.entry_ic = mfspr(SPRN_IC); svcpu->in_use = true; + + svcpu_put(svcpu); } /* Copy data touched by real-mode code from shadow vcpu back to vcpu */ -void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu, - struct kvmppc_book3s_shadow_vcpu *svcpu) +void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu) { - /* - * vcpu_put would just call us again because in_use hasn't - * been updated yet. - */ - preempt_disable(); + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); /* * Maybe we were already preempted and synced the svcpu from @@ -232,7 +230,7 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu, svcpu->in_use = false; out: - preempt_enable(); + svcpu_put(svcpu); } static int kvmppc_core_check_requests_pr(struct kvm_vcpu *vcpu) From 8dbfb2bf1bb3848a8069164e205635b2675c24fe Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 20 Dec 2017 16:24:27 -0800 Subject: [PATCH 192/197] KVM: x86: don't forget vcpu_put() in kvm_arch_vcpu_ioctl_set_sregs() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Due to a bad merge resolution between commit f29810335965 ("KVM/x86: Check input paging mode when cs.l is set") and commit b4ef9d4e8cb8 ("KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl_set_sregs"), there is a case in kvm_arch_vcpu_ioctl_set_sregs() where vcpu_put() is not called after vcpu_get(). Fix it. Reported-by: syzbot Signed-off-by: Eric Biggers Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0e27ee573bd52d..07d1c7f66d977a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7706,7 +7706,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, goto out; if (kvm_valid_sregs(vcpu, sregs)) - return -EINVAL; + goto out; apic_base_msr.data = sregs->apic_base; apic_base_msr.host_initiated = true; From 57ea5f161a7de5b1913c212d04f57a175b159fdf Mon Sep 17 00:00:00 2001 From: Ulf Magnusson Date: Mon, 5 Feb 2018 02:21:14 +0100 Subject: [PATCH 193/197] KVM: PPC: Book3S PR: Fix broken select due to misspelling Commit 76d837a4c0f9 ("KVM: PPC: Book3S PR: Don't include SPAPR TCE code on non-pseries platforms") added a reference to the globally undefined symbol PPC_SERIES. Looking at the rest of the commit, PPC_PSERIES was probably intended. Change PPC_SERIES to PPC_PSERIES. Discovered with the https://github.com/ulfalizer/Kconfiglib/blob/master/examples/list_undefined.py script. Fixes: 76d837a4c0f9 ("KVM: PPC: Book3S PR: Don't include SPAPR TCE code on non-pseries platforms") Cc: stable@vger.kernel.org # v4.12+ Signed-off-by: Ulf Magnusson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index b12b8eb39c2978..648160334abf5c 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -68,7 +68,7 @@ config KVM_BOOK3S_64 select KVM_BOOK3S_64_HANDLER select KVM select KVM_BOOK3S_PR_POSSIBLE if !KVM_BOOK3S_HV_POSSIBLE - select SPAPR_TCE_IOMMU if IOMMU_SUPPORT && (PPC_SERIES || PPC_POWERNV) + select SPAPR_TCE_IOMMU if IOMMU_SUPPORT && (PPC_PSERIES || PPC_POWERNV) ---help--- Support running unmodified book3s_64 and book3s_32 guest kernels in virtual machines on book3s_64 host processors. From 05f2bb0313a2855e491dadfc8319b7da261d7074 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 7 Feb 2018 19:49:54 +1100 Subject: [PATCH 194/197] KVM: PPC: Book3S HV: Fix handling of secondary HPTEG in HPT resizing code This fixes the computation of the HPTE index to use when the HPT resizing code encounters a bolted HPTE which is stored in its secondary HPTE group. The code inverts the HPTE group number, which is correct, but doesn't then mask it with new_hash_mask. As a result, new_pteg will be effectively negative, resulting in new_hptep pointing before the new HPT, which will corrupt memory. In addition, this removes two BUG_ON statements. The condition that the BUG_ONs were testing -- that we have computed the hash value incorrectly -- has never been observed in testing, and if it did occur, would only affect the guest, not the host. Given that BUG_ON should only be used in conditions where the kernel (i.e. the host kernel, in this case) can't possibly continue execution, it is not appropriate here. Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 966097232d2147..d19649960bbf92 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -1329,12 +1329,8 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize, } new_pteg = hash & new_hash_mask; - if (vpte & HPTE_V_SECONDARY) { - BUG_ON(~pteg != (hash & old_hash_mask)); - new_pteg = ~new_pteg; - } else { - BUG_ON(pteg != (hash & old_hash_mask)); - } + if (vpte & HPTE_V_SECONDARY) + new_pteg = ~hash & new_hash_mask; new_idx = new_pteg * HPTES_PER_GROUP + (idx % HPTES_PER_GROUP); new_hptep = (__be64 *)(new->virt + (new_idx << 4)); From 790a9df5fbef982f2a6992194fe497dd2b794a3d Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 2 Feb 2018 14:29:08 +1100 Subject: [PATCH 195/197] KVM: PPC: Book3S HV: Make HPT resizing work on POWER9 This adds code to enable the HPT resizing code to work on POWER9, which uses a slightly modified HPT entry format compared to POWER8. On POWER9, we convert HPTEs read from the HPT from the new format to the old format so that the rest of the HPT resizing code can work as before. HPTEs written to the new HPT are converted to the new format as the last step before writing them into the new HPT. This takes out the checks added by commit bcd3bb63dbc8 ("KVM: PPC: Book3S HV: Disable HPT resizing on POWER9 for now", 2017-02-18), now that HPT resizing works on POWER9. On POWER9, when we pivot to the new HPT, we now call kvmppc_setup_partition_table() to update the partition table in order to make the hardware use the new HPT. [paulus@ozlabs.org - added kvmppc_setup_partition_table() call, wrote commit message.] Tested-by: Laurent Vivier Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 30 ++++++++++++++++++++++------- arch/powerpc/kvm/powerpc.c | 3 +-- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index d19649960bbf92..cb34be7d1a49f3 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -1261,6 +1261,11 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize, /* Nothing to do */ goto out; + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + rpte = be64_to_cpu(hptep[1]); + vpte = hpte_new_to_old_v(vpte, rpte); + } + /* Unmap */ rev = &old->rev[idx]; guest_rpte = rev->guest_rpte; @@ -1290,7 +1295,6 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize, /* Reload PTE after unmap */ vpte = be64_to_cpu(hptep[0]); - BUG_ON(vpte & HPTE_V_VALID); BUG_ON(!(vpte & HPTE_V_ABSENT)); @@ -1299,6 +1303,12 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize, goto out; rpte = be64_to_cpu(hptep[1]); + + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + vpte = hpte_new_to_old_v(vpte, rpte); + rpte = hpte_new_to_old_r(rpte); + } + pshift = kvmppc_hpte_base_page_shift(vpte, rpte); avpn = HPTE_V_AVPN_VAL(vpte) & ~(((1ul << pshift) - 1) >> 23); pteg = idx / HPTES_PER_GROUP; @@ -1336,6 +1346,10 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize, new_hptep = (__be64 *)(new->virt + (new_idx << 4)); replace_vpte = be64_to_cpu(new_hptep[0]); + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + unsigned long replace_rpte = be64_to_cpu(new_hptep[1]); + replace_vpte = hpte_new_to_old_v(replace_vpte, replace_rpte); + } if (replace_vpte & (HPTE_V_VALID | HPTE_V_ABSENT)) { BUG_ON(new->order >= old->order); @@ -1351,6 +1365,11 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize, /* Discard the previous HPTE */ } + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + rpte = hpte_old_to_new_r(vpte, rpte); + vpte = hpte_old_to_new_v(vpte); + } + new_hptep[1] = cpu_to_be64(rpte); new->rev[new_idx].guest_rpte = guest_rpte; /* No need for a barrier, since new HPT isn't active */ @@ -1368,12 +1387,6 @@ static int resize_hpt_rehash(struct kvm_resize_hpt *resize) unsigned long i; int rc; - /* - * resize_hpt_rehash_hpte() doesn't handle the new-format HPTEs - * that POWER9 uses, and could well hit a BUG_ON on POWER9. - */ - if (cpu_has_feature(CPU_FTR_ARCH_300)) - return -EIO; for (i = 0; i < kvmppc_hpt_npte(&kvm->arch.hpt); i++) { rc = resize_hpt_rehash_hpte(resize, i); if (rc != 0) @@ -1404,6 +1417,9 @@ static void resize_hpt_pivot(struct kvm_resize_hpt *resize) synchronize_srcu_expedited(&kvm->srcu); + if (cpu_has_feature(CPU_FTR_ARCH_300)) + kvmppc_setup_partition_table(kvm); + resize_hpt_debug(resize, "resize_hpt_pivot() done\n"); } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 77eb25abc60160..cf86aeb43fcfbc 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -633,8 +633,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = 1; break; case KVM_CAP_SPAPR_RESIZE_HPT: - /* Disable this on POWER9 until code handles new HPTE format */ - r = !!hv_enabled && !cpu_has_feature(CPU_FTR_ARCH_300); + r = !!hv_enabled; break; #endif #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE From d20fe50a7b3c8f936f7347e9cab2e9dd89c1199d Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Thu, 8 Feb 2018 18:38:53 +0100 Subject: [PATCH 196/197] KVM: PPC: Book3S HV: Branch inside feature section We ended up with code that did a conditional branch inside a feature section to code outside of the feature section. Depending on how the object file gets organized, that might mean we exceed the 14bit relocation limit for conditional branches: arch/powerpc/kvm/built-in.o:arch/powerpc/kvm/book3s_hv_rmhandlers.S:416:(__ftr_alt_97+0x8): relocation truncated to fit: R_PPC64_REL14 against `.text'+1ca4 So instead of doing a conditional branch outside of the feature section, let's just jump at the end of the same, making the branch very short. Signed-off-by: Alexander Graf Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index a7a20b85d8eb8d..7d1459e77de2fe 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -413,10 +413,11 @@ FTR_SECTION_ELSE /* On P9 we use the split_info for coordinating LPCR changes */ lwz r4, KVM_SPLIT_DO_SET(r6) cmpwi r4, 0 - beq 63f + beq 1f mr r3, r6 bl kvmhv_p9_set_lpcr nop +1: ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) 63: /* Order load of vcpu after load of vcore */ From 09f984961c137c4b252c368adab7e1c9f035fa59 Mon Sep 17 00:00:00 2001 From: Jose Ricardo Ziviani Date: Sat, 3 Feb 2018 18:24:26 -0200 Subject: [PATCH 197/197] KVM: PPC: Book3S: Add MMIO emulation for VMX instructions This patch provides the MMIO load/store vector indexed X-Form emulation. Instructions implemented: lvx: the quadword in storage addressed by the result of EA & 0xffff_ffff_ffff_fff0 is loaded into VRT. stvx: the contents of VRS are stored into the quadword in storage addressed by the result of EA & 0xffff_ffff_ffff_fff0. Reported-by: Gopesh Kumar Chaudhary Reported-by: Balamuruhan S Signed-off-by: Jose Ricardo Ziviani Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 2 + arch/powerpc/include/asm/kvm_ppc.h | 4 + arch/powerpc/include/asm/ppc-opcode.h | 6 ++ arch/powerpc/kvm/emulate_loadstore.c | 36 +++++++ arch/powerpc/kvm/powerpc.c | 150 ++++++++++++++++++++++++++ 5 files changed, 198 insertions(+) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index fef8133becc85c..1f53b562726fd9 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -690,6 +690,7 @@ struct kvm_vcpu_arch { u8 mmio_vsx_offset; u8 mmio_vsx_copy_type; u8 mmio_vsx_tx_sx_enabled; + u8 mmio_vmx_copy_nums; u8 osi_needed; u8 osi_enabled; u8 papr_enabled; @@ -804,6 +805,7 @@ struct kvm_vcpu_arch { #define KVM_MMIO_REG_QPR 0x0040 #define KVM_MMIO_REG_FQPR 0x0060 #define KVM_MMIO_REG_VSX 0x0080 +#define KVM_MMIO_REG_VMX 0x00c0 #define __KVM_HAVE_ARCH_WQP #define __KVM_HAVE_CREATE_DEVICE diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 941c2a3f231b90..28c2030035193f 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -81,6 +81,10 @@ extern int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu, extern int kvmppc_handle_vsx_load(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int rt, unsigned int bytes, int is_default_endian, int mmio_sign_extend); +extern int kvmppc_handle_load128_by2x64(struct kvm_run *run, + struct kvm_vcpu *vcpu, unsigned int rt, int is_default_endian); +extern int kvmppc_handle_store128_by2x64(struct kvm_run *run, + struct kvm_vcpu *vcpu, unsigned int rs, int is_default_endian); extern int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, u64 val, unsigned int bytes, int is_default_endian); diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index ce0930d68857c6..a51febca08c529 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -156,6 +156,12 @@ #define OP_31_XOP_LFDX 599 #define OP_31_XOP_LFDUX 631 +/* VMX Vector Load Instructions */ +#define OP_31_XOP_LVX 103 + +/* VMX Vector Store Instructions */ +#define OP_31_XOP_STVX 231 + #define OP_LWZ 32 #define OP_STFS 52 #define OP_STFSU 53 diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c index af833531af3191..a382e15135e6d3 100644 --- a/arch/powerpc/kvm/emulate_loadstore.c +++ b/arch/powerpc/kvm/emulate_loadstore.c @@ -58,6 +58,18 @@ static bool kvmppc_check_vsx_disabled(struct kvm_vcpu *vcpu) } #endif /* CONFIG_VSX */ +#ifdef CONFIG_ALTIVEC +static bool kvmppc_check_altivec_disabled(struct kvm_vcpu *vcpu) +{ + if (!(kvmppc_get_msr(vcpu) & MSR_VEC)) { + kvmppc_core_queue_vec_unavail(vcpu); + return true; + } + + return false; +} +#endif /* CONFIG_ALTIVEC */ + /* * XXX to do: * lfiwax, lfiwzx @@ -98,6 +110,7 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_NONE; vcpu->arch.mmio_sp64_extend = 0; vcpu->arch.mmio_sign_extend = 0; + vcpu->arch.mmio_vmx_copy_nums = 0; switch (get_op(inst)) { case 31: @@ -459,6 +472,29 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) rs, 4, 1); break; #endif /* CONFIG_VSX */ + +#ifdef CONFIG_ALTIVEC + case OP_31_XOP_LVX: + if (kvmppc_check_altivec_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.vaddr_accessed &= ~0xFULL; + vcpu->arch.paddr_accessed &= ~0xFULL; + vcpu->arch.mmio_vmx_copy_nums = 2; + emulated = kvmppc_handle_load128_by2x64(run, vcpu, + KVM_MMIO_REG_VMX|rt, 1); + break; + + case OP_31_XOP_STVX: + if (kvmppc_check_altivec_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.vaddr_accessed &= ~0xFULL; + vcpu->arch.paddr_accessed &= ~0xFULL; + vcpu->arch.mmio_vmx_copy_nums = 2; + emulated = kvmppc_handle_store128_by2x64(run, vcpu, + rs, 1); + break; +#endif /* CONFIG_ALTIVEC */ + default: emulated = EMULATE_FAIL; break; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index cf86aeb43fcfbc..47c7a302fd0319 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -924,6 +924,34 @@ static inline void kvmppc_set_vsr_word(struct kvm_vcpu *vcpu, } #endif /* CONFIG_VSX */ +#ifdef CONFIG_ALTIVEC +static inline void kvmppc_set_vmx_dword(struct kvm_vcpu *vcpu, + u64 gpr) +{ + int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK; + u32 hi, lo; + u32 di; + +#ifdef __BIG_ENDIAN + hi = gpr >> 32; + lo = gpr & 0xffffffff; +#else + lo = gpr >> 32; + hi = gpr & 0xffffffff; +#endif + + di = 2 - vcpu->arch.mmio_vmx_copy_nums; /* doubleword index */ + if (di > 1) + return; + + if (vcpu->arch.mmio_host_swabbed) + di = 1 - di; + + VCPU_VSX_VR(vcpu, index).u[di * 2] = hi; + VCPU_VSX_VR(vcpu, index).u[di * 2 + 1] = lo; +} +#endif /* CONFIG_ALTIVEC */ + #ifdef CONFIG_PPC_FPU static inline u64 sp_to_dp(u32 fprs) { @@ -1026,6 +1054,11 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, KVMPPC_VSX_COPY_DWORD_LOAD_DUMP) kvmppc_set_vsr_dword_dump(vcpu, gpr); break; +#endif +#ifdef CONFIG_ALTIVEC + case KVM_MMIO_REG_VMX: + kvmppc_set_vmx_dword(vcpu, gpr); + break; #endif default: BUG(); @@ -1302,6 +1335,111 @@ static int kvmppc_emulate_mmio_vsx_loadstore(struct kvm_vcpu *vcpu, } #endif /* CONFIG_VSX */ +#ifdef CONFIG_ALTIVEC +/* handle quadword load access in two halves */ +int kvmppc_handle_load128_by2x64(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int rt, int is_default_endian) +{ + enum emulation_result emulated; + + while (vcpu->arch.mmio_vmx_copy_nums) { + emulated = __kvmppc_handle_load(run, vcpu, rt, 8, + is_default_endian, 0); + + if (emulated != EMULATE_DONE) + break; + + vcpu->arch.paddr_accessed += run->mmio.len; + vcpu->arch.mmio_vmx_copy_nums--; + } + + return emulated; +} + +static inline int kvmppc_get_vmx_data(struct kvm_vcpu *vcpu, int rs, u64 *val) +{ + vector128 vrs = VCPU_VSX_VR(vcpu, rs); + u32 di; + u64 w0, w1; + + di = 2 - vcpu->arch.mmio_vmx_copy_nums; /* doubleword index */ + if (di > 1) + return -1; + + if (vcpu->arch.mmio_host_swabbed) + di = 1 - di; + + w0 = vrs.u[di * 2]; + w1 = vrs.u[di * 2 + 1]; + +#ifdef __BIG_ENDIAN + *val = (w0 << 32) | w1; +#else + *val = (w1 << 32) | w0; +#endif + return 0; +} + +/* handle quadword store in two halves */ +int kvmppc_handle_store128_by2x64(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int rs, int is_default_endian) +{ + u64 val = 0; + enum emulation_result emulated = EMULATE_DONE; + + vcpu->arch.io_gpr = rs; + + while (vcpu->arch.mmio_vmx_copy_nums) { + if (kvmppc_get_vmx_data(vcpu, rs, &val) == -1) + return EMULATE_FAIL; + + emulated = kvmppc_handle_store(run, vcpu, val, 8, + is_default_endian); + if (emulated != EMULATE_DONE) + break; + + vcpu->arch.paddr_accessed += run->mmio.len; + vcpu->arch.mmio_vmx_copy_nums--; + } + + return emulated; +} + +static int kvmppc_emulate_mmio_vmx_loadstore(struct kvm_vcpu *vcpu, + struct kvm_run *run) +{ + enum emulation_result emulated = EMULATE_FAIL; + int r; + + vcpu->arch.paddr_accessed += run->mmio.len; + + if (!vcpu->mmio_is_write) { + emulated = kvmppc_handle_load128_by2x64(run, vcpu, + vcpu->arch.io_gpr, 1); + } else { + emulated = kvmppc_handle_store128_by2x64(run, vcpu, + vcpu->arch.io_gpr, 1); + } + + switch (emulated) { + case EMULATE_DO_MMIO: + run->exit_reason = KVM_EXIT_MMIO; + r = RESUME_HOST; + break; + case EMULATE_FAIL: + pr_info("KVM: MMIO emulation failed (VMX repeat)\n"); + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + r = RESUME_HOST; + break; + default: + r = RESUME_GUEST; + break; + } + return r; +} +#endif /* CONFIG_ALTIVEC */ + int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) { int r = 0; @@ -1420,6 +1558,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) return r; } } +#endif +#ifdef CONFIG_ALTIVEC + if (vcpu->arch.mmio_vmx_copy_nums > 0) + vcpu->arch.mmio_vmx_copy_nums--; + + if (vcpu->arch.mmio_vmx_copy_nums > 0) { + r = kvmppc_emulate_mmio_vmx_loadstore(vcpu, run); + if (r == RESUME_HOST) { + vcpu->mmio_needed = 1; + return r; + } + } #endif } else if (vcpu->arch.osi_needed) { u64 *gprs = run->osi.gprs;