In practice dat_crstep_xchg() is racy and hard to use correctly. Simply remove it and replace its uses with dat_crstep_xchg_atomic(). This solves some actual races that lead to system hangs / crashes. Signed-off-by: Claudio Imbrenda Fixes: 589071eaaa8f ("KVM: s390: KVM page table management functions: clear and replace") Fixes: 94fd9b16cc67 ("KVM: s390: KVM page table management functions: lifecycle management") --- arch/s390/kvm/dat.c | 53 ++++++++++++----------------------------- arch/s390/kvm/dat.h | 9 ++++--- arch/s390/kvm/gaccess.c | 4 +++- arch/s390/kvm/gmap.c | 32 +++++++++++++++---------- arch/s390/kvm/gmap.h | 32 ++++++++++++++----------- 5 files changed, 61 insertions(+), 69 deletions(-) diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c index 670404d4fa44..b673e86c8ae5 100644 --- a/arch/s390/kvm/dat.c +++ b/arch/s390/kvm/dat.c @@ -134,32 +134,6 @@ int dat_set_asce_limit(struct kvm_s390_mmu_cache *mc, union asce *asce, int newt return 0; } -/** - * dat_crstep_xchg() - Exchange a gmap CRSTE with another. - * @crstep: Pointer to the CRST entry - * @new: Replacement entry. - * @gfn: The affected guest address. - * @asce: The ASCE of the address space. - * - * Context: This function is assumed to be called with kvm->mmu_lock held. - */ -void dat_crstep_xchg(union crste *crstep, union crste new, gfn_t gfn, union asce asce) -{ - if (crstep->h.i) { - WRITE_ONCE(*crstep, new); - return; - } else if (cpu_has_edat2()) { - crdte_crste(crstep, *crstep, new, gfn, asce); - return; - } - - if (machine_has_tlb_guest()) - idte_crste(crstep, gfn, IDTE_GUEST_ASCE, asce, IDTE_GLOBAL); - else - idte_crste(crstep, gfn, 0, NULL_ASCE, IDTE_GLOBAL); - WRITE_ONCE(*crstep, new); -} - /** * dat_crstep_xchg_atomic() - Atomically exchange a gmap CRSTE with another. * @crstep: Pointer to the CRST entry. @@ -175,8 +149,8 @@ void dat_crstep_xchg(union crste *crstep, union crste new, gfn_t gfn, union asce * * Return: %true if the exchange was successful. 
*/ -bool dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new, gfn_t gfn, - union asce asce) +bool __must_check dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new, + gfn_t gfn, union asce asce) { if (old.h.i) return arch_try_cmpxchg((long *)crstep, &old.val, new.val); @@ -893,7 +867,8 @@ static long _dat_slot_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct d /* This table entry needs to be updated. */ if (walk->start <= gfn && walk->end >= next) { - dat_crstep_xchg_atomic(crstep, crste, new_crste, gfn, walk->asce); + if (!dat_crstep_xchg_atomic(crstep, crste, new_crste, gfn, walk->asce)) + return -EINVAL; /* A lower level table was present, needs to be freed. */ if (!crste.h.fc && !crste.h.i) { if (is_pmd(crste)) @@ -1071,17 +1046,19 @@ int dat_link(struct kvm_s390_mmu_cache *mc, union asce asce, int level, static long dat_set_pn_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) { - union crste crste = READ_ONCE(*crstep); + union crste newcrste, oldcrste; int *n = walk->priv; - if (!crste.h.fc || crste.h.i || crste.h.p) - return 0; - - *n = 2; - if (crste.s.fc1.prefix_notif) - return 0; - crste.s.fc1.prefix_notif = 1; - dat_crstep_xchg(crstep, crste, gfn, walk->asce); + do { + oldcrste = READ_ONCE(*crstep); + if (!oldcrste.h.fc || oldcrste.h.i || oldcrste.h.p) + return 0; + *n = 2; + if (oldcrste.s.fc1.prefix_notif) + return 0; + newcrste = oldcrste; + newcrste.s.fc1.prefix_notif = 1; + } while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, gfn, walk->asce)); return 0; } diff --git a/arch/s390/kvm/dat.h b/arch/s390/kvm/dat.h index 123e11dcd70d..22dafc775335 100644 --- a/arch/s390/kvm/dat.h +++ b/arch/s390/kvm/dat.h @@ -938,11 +938,14 @@ static inline bool dat_pudp_xchg_atomic(union pud *pudp, union pud old, union pu return dat_crstep_xchg_atomic(_CRSTEP(pudp), _CRSTE(old), _CRSTE(new), gfn, asce); } -static inline void dat_crstep_clear(union crste *crstep, gfn_t gfn, union asce 
asce) +static inline union crste dat_crstep_clear_atomic(union crste *crstep, gfn_t gfn, union asce asce) { - union crste newcrste = _CRSTE_EMPTY(crstep->h.tt); + union crste oldcrste, empty = _CRSTE_EMPTY(crstep->h.tt); - dat_crstep_xchg(crstep, newcrste, gfn, asce); + do { + oldcrste = READ_ONCE(*crstep); + } while (!dat_crstep_xchg_atomic(crstep, oldcrste, empty, gfn, asce)); + return oldcrste; } static inline int get_level(union crste *crstep, union pte *ptep) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index a9da9390867d..e490ae87db44 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -1478,7 +1478,9 @@ static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, uni _gmap_crstep_xchg(sg->parent, host, newcrste, f->gfn, false); newcrste = _crste_fc1(f->pfn, host->h.tt, 0, !p); - dat_crstep_xchg(table, newcrste, gpa_to_gfn(raddr), sg->asce); + gfn = gpa_to_gfn(raddr); + while (!dat_crstep_xchg_atomic(table, READ_ONCE(*table), newcrste, gfn, sg->asce)) + ; return 0; } diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c index ef0c6ebfdde2..3ae746fada36 100644 --- a/arch/s390/kvm/gmap.c +++ b/arch/s390/kvm/gmap.c @@ -313,13 +313,16 @@ static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, st struct clear_young_pte_priv *priv = walk->priv; union crste crste, new; - crste = READ_ONCE(*crstep); + do { + crste = READ_ONCE(*crstep); + + if (!crste.h.fc) + return 0; + if (!crste.s.fc1.y && crste.h.i) + return 0; + if (crste_prefix(crste) && !gmap_mkold_prefix(priv->gmap, gfn, end)) + break; - if (!crste.h.fc) - return 0; - if (!crste.s.fc1.y && crste.h.i) - return 0; - if (!crste_prefix(crste) || gmap_mkold_prefix(priv->gmap, gfn, end)) { new = crste; new.h.i = 1; new.s.fc1.y = 0; @@ -328,8 +331,8 @@ static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, st folio_set_dirty(phys_to_folio(crste_origin_large(crste))); new.s.fc1.d = 0; new.h.p = 1; - 
dat_crstep_xchg(crstep, new, gfn, walk->asce); - } + } while (!dat_crstep_xchg_atomic(crstep, crste, new, gfn, walk->asce)); + priv->young = 1; return 0; } @@ -673,7 +676,8 @@ static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, &crstep, &ptep); if (rc) return rc; - dat_crstep_xchg(crstep, newcrste, c_gfn, gmap->asce); + while (!dat_crstep_xchg_atomic(crstep, READ_ONCE(*crstep), newcrste, c_gfn, gmap->asce)) + ; return 0; } @@ -777,8 +781,10 @@ static void gmap_ucas_unmap_one(struct gmap *gmap, gfn_t c_gfn) int rc; rc = dat_entry_walk(NULL, c_gfn, gmap->asce, 0, TABLE_TYPE_SEGMENT, &crstep, &ptep); - if (!rc) - dat_crstep_xchg(crstep, _PMD_EMPTY, c_gfn, gmap->asce); + if (rc) + return; + while (!dat_crstep_xchg_atomic(crstep, READ_ONCE(*crstep), _PMD_EMPTY, c_gfn, gmap->asce)) + ; } void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count) @@ -1017,8 +1023,8 @@ static void gmap_unshadow_level(struct gmap *sg, gfn_t r_gfn, int level) dat_ptep_xchg(ptep, _PTE_EMPTY, r_gfn, sg->asce, uses_skeys(sg)); return; } - crste = READ_ONCE(*crstep); - dat_crstep_clear(crstep, r_gfn, sg->asce); + + crste = dat_crstep_clear_atomic(crstep, r_gfn, sg->asce); if (crste_leaf(crste) || crste.h.i) return; if (is_pmd(crste)) diff --git a/arch/s390/kvm/gmap.h b/arch/s390/kvm/gmap.h index ccb5cd751e31..3ef426abdc65 100644 --- a/arch/s390/kvm/gmap.h +++ b/arch/s390/kvm/gmap.h @@ -198,25 +198,29 @@ static inline void _gmap_crstep_xchg(struct gmap *gmap, union crste *crstep, uni gfn_t gfn, bool needs_lock) { unsigned long align = 8 + (is_pmd(*crstep) ? 
0 : 11); + union crste oldcrste; lockdep_assert_held(&gmap->kvm->mmu_lock); if (!needs_lock) lockdep_assert_held(&gmap->children_lock); - gfn = ALIGN_DOWN(gfn, align); - if (crste_prefix(*crstep) && (ne.h.p || ne.h.i || !crste_prefix(ne))) { - ne.s.fc1.prefix_notif = 0; - gmap_unmap_prefix(gmap, gfn, gfn + align); - } - if (crste_leaf(*crstep) && crstep->s.fc1.vsie_notif && - (ne.h.p || ne.h.i || !ne.s.fc1.vsie_notif)) { - ne.s.fc1.vsie_notif = 0; - if (needs_lock) - gmap_handle_vsie_unshadow_event(gmap, gfn); - else - _gmap_handle_vsie_unshadow_event(gmap, gfn); - } - dat_crstep_xchg(crstep, ne, gfn, gmap->asce); + do { + oldcrste = READ_ONCE(*crstep); + + gfn = ALIGN_DOWN(gfn, align); + if (crste_prefix(oldcrste) && (ne.h.p || ne.h.i || !crste_prefix(ne))) { + ne.s.fc1.prefix_notif = 0; + gmap_unmap_prefix(gmap, gfn, gfn + align); + } + if (crste_leaf(oldcrste) && oldcrste.s.fc1.vsie_notif && + (ne.h.p || ne.h.i || !ne.s.fc1.vsie_notif)) { + ne.s.fc1.vsie_notif = 0; + if (needs_lock) + gmap_handle_vsie_unshadow_event(gmap, gfn); + else + _gmap_handle_vsie_unshadow_event(gmap, gfn); + } + } while (!dat_crstep_xchg_atomic(crstep, oldcrste, ne, gfn, gmap->asce)); } static inline void gmap_crstep_xchg(struct gmap *gmap, union crste *crstep, union crste ne, -- 2.53.0 When shadowing a nested guest, a check is performed and no shadowing is attempted if the nested guest is already shadowed. The existing check was incomplete; fix it by also checking whether the leaf DAT table entry in the existing shadow gmap has the same protection as the one specified in the guest DAT entry. 
Signed-off-by: Claudio Imbrenda Fixes: e38c884df921 ("KVM: s390: Switch to new gmap") --- arch/s390/kvm/gaccess.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index e490ae87db44..f5ffb11c8ef9 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -1505,7 +1505,8 @@ static int _gaccess_do_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *sg, return rc; /* A race occourred. The shadow mapping is already valid, nothing to do */ - if ((ptep && !ptep->h.i) || (!ptep && crste_leaf(*table))) + if ((ptep && !ptep->h.i && ptep->h.p == w->p) || + (!ptep && crste_leaf(*table) && !table->h.i && table->h.p == w->p)) return 0; gl = get_level(table, ptep); -- 2.53.0 The slow path of the fault handler ultimately called gmap_link(), which assumed the fault was a major fault, and blindly called dat_link(). In case of minor faults, things were not always handled properly; in particular the prefix and vsie marker bits were ignored. Move dat_link() into gmap.c, renaming it accordingly. Once moved, the new _gmap_link() function will be able to correctly honour the prefix and vsie markers. 
Signed-off-by: Claudio Imbrenda Fixes: 94fd9b16cc67 ("KVM: s390: KVM page table management functions: lifecycle management") Fixes: a2c17f9270cc ("KVM: s390: New gmap code") --- arch/s390/kvm/dat.c | 48 -------------------------------------- arch/s390/kvm/dat.h | 2 -- arch/s390/kvm/gmap.c | 55 ++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 51 insertions(+), 54 deletions(-) diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c index b673e86c8ae5..bfa84074f433 100644 --- a/arch/s390/kvm/dat.c +++ b/arch/s390/kvm/dat.c @@ -996,54 +996,6 @@ bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end) return _dat_walk_gfn_range(start, end, asce, &test_age_ops, 0, NULL) > 0; } -int dat_link(struct kvm_s390_mmu_cache *mc, union asce asce, int level, - bool uses_skeys, struct guest_fault *f) -{ - union crste oldval, newval; - union pte newpte, oldpte; - union pgste pgste; - int rc = 0; - - rc = dat_entry_walk(mc, f->gfn, asce, DAT_WALK_ALLOC_CONTINUE, level, &f->crstep, &f->ptep); - if (rc == -EINVAL || rc == -ENOMEM) - return rc; - if (rc) - return -EAGAIN; - - if (WARN_ON_ONCE(unlikely(get_level(f->crstep, f->ptep) > level))) - return -EINVAL; - - if (f->ptep) { - pgste = pgste_get_lock(f->ptep); - oldpte = *f->ptep; - newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page); - newpte.s.sd = oldpte.s.sd; - oldpte.s.sd = 0; - if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) { - pgste = __dat_ptep_xchg(f->ptep, pgste, newpte, f->gfn, asce, uses_skeys); - if (f->callback) - f->callback(f); - } else { - rc = -EAGAIN; - } - pgste_set_unlock(f->ptep, pgste); - } else { - oldval = READ_ONCE(*f->crstep); - newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable, - f->write_attempt | oldval.s.fc1.d); - newval.s.fc1.sd = oldval.s.fc1.sd; - if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val && - crste_origin_large(oldval) != crste_origin_large(newval)) - return -EAGAIN; - if (!dat_crstep_xchg_atomic(f->crstep, oldval, newval, f->gfn, 
asce)) - return -EAGAIN; - if (f->callback) - f->callback(f); - } - - return rc; -} - static long dat_set_pn_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) { union crste newcrste, oldcrste; diff --git a/arch/s390/kvm/dat.h b/arch/s390/kvm/dat.h index 22dafc775335..efedcf96110c 100644 --- a/arch/s390/kvm/dat.h +++ b/arch/s390/kvm/dat.h @@ -540,8 +540,6 @@ int dat_set_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start, gf u16 type, u16 param); int dat_set_prefix_notif_bit(union asce asce, gfn_t gfn); bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end); -int dat_link(struct kvm_s390_mmu_cache *mc, union asce asce, int level, - bool uses_skeys, struct guest_fault *f); int dat_perform_essa(union asce asce, gfn_t gfn, int orc, union essa_state *state, bool *dirty); long dat_reset_cmma(union asce asce, gfn_t start_gfn); diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c index 3ae746fada36..759a2ed17038 100644 --- a/arch/s390/kvm/gmap.c +++ b/arch/s390/kvm/gmap.c @@ -626,10 +626,59 @@ static inline bool gmap_1m_allowed(struct gmap *gmap, gfn_t gfn) return test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &gmap->flags); } +static int _gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, int level, + struct guest_fault *f) +{ + union crste oldval, newval; + union pte newpte, oldpte; + union pgste pgste; + int rc = 0; + + rc = dat_entry_walk(mc, f->gfn, gmap->asce, DAT_WALK_ALLOC_CONTINUE, level, + &f->crstep, &f->ptep); + if (rc == -ENOMEM) + return rc; + if (KVM_BUG_ON(rc == -EINVAL, gmap->kvm)) + return rc; + if (rc) + return -EAGAIN; + if (KVM_BUG_ON(get_level(f->crstep, f->ptep) > level, gmap->kvm)) + return -EINVAL; + + if (f->ptep) { + pgste = pgste_get_lock(f->ptep); + oldpte = *f->ptep; + newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page); + newpte.s.sd = oldpte.s.sd; + oldpte.s.sd = 0; + if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) { + pgste = gmap_ptep_xchg(gmap, f->ptep, 
newpte, pgste, f->gfn); + if (f->callback) + f->callback(f); + } else { + rc = -EAGAIN; + } + pgste_set_unlock(f->ptep, pgste); + } else { + oldval = READ_ONCE(*f->crstep); + newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable, + f->write_attempt | oldval.s.fc1.d); + newval.s.fc1.sd = oldval.s.fc1.sd; + if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val && + crste_origin_large(oldval) != crste_origin_large(newval)) + return -EAGAIN; + gmap_crstep_xchg(gmap, f->crstep, newval, f->gfn); + if (f->callback) + f->callback(f); + } + + return rc; +} + int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fault *f) { unsigned int order; - int rc, level; + int level; lockdep_assert_held(&gmap->kvm->mmu_lock); @@ -641,9 +690,7 @@ int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fau else if (order >= get_order(_SEGMENT_SIZE) && gmap_1m_allowed(gmap, f->gfn)) level = TABLE_TYPE_SEGMENT; } - rc = dat_link(mc, gmap->asce, level, uses_skeys(gmap), f); - KVM_BUG_ON(rc == -EINVAL, gmap->kvm); - return rc; + return _gmap_link(mc, gmap, level, f); } static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, -- 2.53.0 In most cases gmap_put() was not called when it should have been. Add the missing gmap_put() in vsie_run(). 
Signed-off-by: Claudio Imbrenda Fixes: e38c884df921 ("KVM: s390: Switch to new gmap") --- arch/s390/kvm/vsie.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 0330829b4046..72895dddc39a 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -1328,7 +1328,7 @@ static void unregister_shadow_scb(struct kvm_vcpu *vcpu) static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; - struct gmap *sg; + struct gmap *sg = NULL; int rc = 0; while (1) { @@ -1368,6 +1368,8 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) sg = gmap_put(sg); cond_resched(); } + if (sg) + sg = gmap_put(sg); if (rc == -EFAULT) { /* -- 2.53.0 If shadowing causes the shadow gmap to get unshadowed, exit early to prevent an attempt to dereference the parent pointer, which at this point is NULL. Opportunistically add some more checks to prevent NULL parents. 
Signed-off-by: Claudio Imbrenda Fixes: a2c17f9270cc ("KVM: s390: New gmap code") Fixes: e5f98a6899bd ("KVM: s390: Add some helper functions needed for vSIE") Fixes: e38c884df921 ("KVM: s390: Switch to new gmap") --- arch/s390/kvm/gaccess.c | 2 ++ arch/s390/kvm/gmap.c | 11 ++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index f5ffb11c8ef9..3bcf988d6faa 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -1520,6 +1520,8 @@ static int _gaccess_do_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *sg, entries[i - 1].pfn, i, entries[i - 1].writable); if (rc) return rc; + if (!sg->parent) + return -EAGAIN; } rc = dat_entry_walk(NULL, entries[LEVEL_MEM].gfn, sg->parent->asce, DAT_WALK_LEAF, diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c index 759a2ed17038..ba921da48019 100644 --- a/arch/s390/kvm/gmap.c +++ b/arch/s390/kvm/gmap.c @@ -1154,6 +1154,7 @@ struct gmap_protect_asce_top_level { static inline int __gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg, struct gmap_protect_asce_top_level *context) { + struct gmap *parent; int rc, i; guard(write_lock)(&sg->kvm->mmu_lock); @@ -1161,7 +1162,12 @@ static inline int __gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, s if (kvm_s390_array_needs_retry_safe(sg->kvm, context->seq, context->f)) return -EAGAIN; - scoped_guard(spinlock, &sg->parent->children_lock) { + parent = READ_ONCE(sg->parent); + if (!parent) + return -EAGAIN; + scoped_guard(spinlock, &parent->children_lock) { + if (READ_ONCE(sg->parent) != parent) + return -EAGAIN; for (i = 0; i < CRST_TABLE_PAGES; i++) { if (!context->f[i].valid) continue; @@ -1244,6 +1250,9 @@ struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *pare struct gmap *sg, *new; int rc; + if (WARN_ON(!parent)) + return ERR_PTR(-EINVAL); + scoped_guard(spinlock, &parent->children_lock) { sg = gmap_find_shadow(parent, asce, edat_level); if 
(sg) { -- 2.53.0 When shadowing, the guest page tables are write-protected, in order to trap changes and properly unshadow the shadow mapping for the nested guest. Already shadowed levels are skipped, so that only the needed levels are write protected. Currently the levels that get write protected are exactly one level too deep: the last level (nested guest memory) gets protected in the wrong way, and will be protected again correctly a few lines afterwards; most importantly, the highest non-shadowed level does *not* get write protected. This leads to all sorts of races and other issues. Write protect the correct levels, so that all the levels that need to be protected are protected, and avoid double protecting the last level. Signed-off-by: Claudio Imbrenda Fixes: e38c884df921 ("KVM: s390: Switch to new gmap") --- arch/s390/kvm/gaccess.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 3bcf988d6faa..8b287fcf611d 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -1516,8 +1516,8 @@ static int _gaccess_do_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *sg, * only the page containing the entry, not the whole table. */ for (i = gl ; i >= w->level; i--) { - rc = gmap_protect_rmap(mc, sg, entries[i - 1].gfn, gpa_to_gfn(saddr), - entries[i - 1].pfn, i, entries[i - 1].writable); + rc = gmap_protect_rmap(mc, sg, entries[i].gfn, gpa_to_gfn(saddr), + entries[i].pfn, i + 1, entries[i].writable); if (rc) return rc; if (!sg->parent) -- 2.53.0 A previous commit changed the behaviour of the KVM_S390_VCPU_FAULT ioctl. The current (wrong) implementation will trigger a guest addressing exception if the requested address lies outside of a memslot, unless the VM is UCONTROL. Restore the previous behaviour by open coding the fault-in logic. 
Fixes: 3762e905ec2e ("KVM: s390: use __kvm_faultin_pfn()") Signed-off-by: Claudio Imbrenda --- arch/s390/kvm/kvm-s390.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ebcb0ef8835e..aebc74974ddf 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -5520,9 +5520,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } #endif case KVM_S390_VCPU_FAULT: { - idx = srcu_read_lock(&vcpu->kvm->srcu); - r = vcpu_dat_fault_handler(vcpu, arg, 0); - srcu_read_unlock(&vcpu->kvm->srcu, idx); + gpa_t gaddr = arg; + + scoped_guard(srcu, &vcpu->kvm->srcu) { + r = -EREMOTE; + if (vcpu_ucontrol_translate(vcpu, &gaddr)) + break; + + r = kvm_s390_faultin_gfn_simple(vcpu, NULL, gpa_to_gfn(gaddr), false); + if (r <= 0) + break; + if (r == PGM_ADDRESSING) + r = -ENOENT; + else + KVM_BUG_ON(r, vcpu->kvm); + } break; } case KVM_ENABLE_CAP: -- 2.53.0