It is possible that some guest memory areas have not been touched yet when starting migration mode, and thus have no PTEs allocated. Only existing and allocated PTEs should count toward the total of dirty CMMA entries. When starting migration mode, enable the migration_mode flag immediately, so that any subsequent ESSA will trap in the host and cause cmma_dirty_pages to be incremented as needed. Subsequently, set the cmma_d bit on all existing cmma-clean PGSTEs, incrementing cmma_dirty_pages for each one. PGSTEs that are already cmma-dirty are skipped, which prevents double counting. Conversely, when disabling migration mode, set cmma_dirty_pages to 0 and clear the cmma_d bit in all existing PGSTEs. The invariant is that when migration mode is off, no PGSTE has its cmma_d bit set, and cmma_dirty_pages is 0. kvm->slots_lock protects kvm_s390_vm_start_migration() and kvm_s390_vm_stop_migration() from each other and from kvm_s390_get_cmma_bits(). Also fix dat_get_cmma() to properly wrap around to the beginning of guest memory if the first walk reached the end of guest memory without finding any cmma-dirty pages. 
Fixes: e38c884df921 ("KVM: s390: Switch to new gmap") Signed-off-by: Claudio Imbrenda --- arch/s390/kvm/dat.c | 3 +++ arch/s390/kvm/gmap.c | 31 ++++++++++++++++++++++++---- arch/s390/kvm/gmap.h | 12 ++++++++++- arch/s390/kvm/kvm-s390.c | 44 ++++++++++++++++++++++++++++++++-------- arch/s390/kvm/priv.c | 2 +- 5 files changed, 77 insertions(+), 15 deletions(-) diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c index cffac7782c4b..0ad4ebc80eba 100644 --- a/arch/s390/kvm/dat.c +++ b/arch/s390/kvm/dat.c @@ -1253,6 +1253,9 @@ int dat_get_cmma(union asce asce, gfn_t *start, unsigned int *count, u8 *values, }; _dat_walk_gfn_range(*start, asce_end(asce), asce, &ops, DAT_WALK_IGN_HOLES, &state); + /* If no dirty pages were found, wrap around and continue searching */ + if (*start && state.start == -1) + _dat_walk_gfn_range(0, *start, asce, &ops, DAT_WALK_IGN_HOLES, &state); if (state.start == -1) { *count = 0; diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c index e6e786811db8..0f944944badf 100644 --- a/arch/s390/kvm/gmap.c +++ b/arch/s390/kvm/gmap.c @@ -1073,23 +1073,46 @@ int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gf return 0; } +static long __set_cmma_clean_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + union pgste pgste; + + pgste = pgste_get_lock(ptep); + pgste.cmma_d = 0; + pgste_set_unlock(ptep, pgste); + + if (need_resched()) + return next; + return 0; +} + static long __set_cmma_dirty_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) { - __atomic64_or(PGSTE_CMMA_D_BIT, &pgste_of(ptep)->val); + union pgste pgste; + + pgste = pgste_get_lock(ptep); + if (!pgste.cmma_d) + atomic64_inc(walk->priv); + pgste.cmma_d = 1; + pgste_set_unlock(ptep, pgste); + if (need_resched()) return next; return 0; } -void gmap_set_cmma_all_dirty(struct gmap *gmap) +void _gmap_set_cmma_all(struct gmap *gmap, bool dirty) { - const struct dat_walk_ops ops = { .pte_entry = __set_cmma_dirty_pte, }; + const 
struct dat_walk_ops ops = { + .pte_entry = dirty ? __set_cmma_dirty_pte : __set_cmma_clean_pte, + }; gfn_t gfn = 0; do { scoped_guard(read_lock, &gmap->kvm->mmu_lock) gfn = _dat_walk_gfn_range(gfn, asce_end(gmap->asce), gmap->asce, &ops, - DAT_WALK_IGN_HOLES, NULL); + DAT_WALK_IGN_HOLES, + &gmap->kvm->arch.cmma_dirty_pages); cond_resched(); } while (gfn); } diff --git a/arch/s390/kvm/gmap.h b/arch/s390/kvm/gmap.h index 5374f21aaf8d..39cb2ee1eede 100644 --- a/arch/s390/kvm/gmap.h +++ b/arch/s390/kvm/gmap.h @@ -103,7 +103,7 @@ int gmap_pv_destroy_range(struct gmap *gmap, gfn_t start, gfn_t end, bool interr int gmap_insert_rmap(struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, int level); int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, kvm_pfn_t pfn, int level, bool wr); -void gmap_set_cmma_all_dirty(struct gmap *gmap); +void _gmap_set_cmma_all(struct gmap *gmap, bool dirty); void _gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn); struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, union asce asce, int edat_level); @@ -167,6 +167,16 @@ static inline bool gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end) return _gmap_unmap_prefix(gmap, gfn, end, false); } +static inline void gmap_set_cmma_all_dirty(struct gmap *gmap) +{ + _gmap_set_cmma_all(gmap, true); +} + +static inline void gmap_set_cmma_all_clean(struct gmap *gmap) +{ + _gmap_set_cmma_all(gmap, false); +} + /** * pte_needs_unshadow() -- Check if the pte operations triggers unshadowing. * @oldpte: the previous value for the guest pte. 
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index fe0ae15e5ad0..91cf7a3c55c7 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1187,13 +1187,13 @@ static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req) /* * Must be called with kvm->srcu held to avoid races on memslots, and with - * kvm->slots_lock to avoid races with ourselves and kvm_s390_vm_stop_migration. + * kvm->slots_lock to avoid races with ourselves, kvm_s390_vm_stop_migration(), + * and kvm_s390_get_cmma_bits(). */ static int kvm_s390_vm_start_migration(struct kvm *kvm) { struct kvm_memory_slot *ms; struct kvm_memslots *slots; - unsigned long ram_pages = 0; int bkt; /* migration mode already enabled */ @@ -1210,28 +1210,54 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm) kvm_for_each_memslot(ms, bkt, slots) { if (!ms->dirty_bitmap) return -EINVAL; - ram_pages += ms->npages; } - /* mark all the pages as dirty */ + /* + * Set the flag and let KVM handle ESSA manually, potentially setting + * the cmma_d bit in some PGSTEs and increasing cmma_dirty_pages. + * At this point cmma_dirty_pages is still 0, and all existing PGSTEs + * have their cmma_d bit set to 0. + * Any newly allocated page table has its entries marked as cmma-clean, + * which is fine because the CMMA values are not dirty. + */ + WRITE_ONCE(kvm->arch.migration_mode, 1); + /* + * Mark all PGSTEs as cmma-dirty, increasing cmma_dirty_pages as needed, + * but without double-counting pages that have become dirty on their own + * in the meantime. + * At this point some pages might have become dirty on their own already + * and cmma_dirty_pages might therefore be non-zero. 
+ */ gmap_set_cmma_all_dirty(kvm->arch.gmap); - atomic64_set(&kvm->arch.cmma_dirty_pages, ram_pages); - kvm->arch.migration_mode = 1; kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION); return 0; } /* - * Must be called with kvm->slots_lock to avoid races with ourselves and - * kvm_s390_vm_start_migration. + * Must be called with kvm->slots_lock to avoid races with ourselves, + * kvm_s390_vm_start_migration() and kvm_s390_get_cmma_bits(). */ static int kvm_s390_vm_stop_migration(struct kvm *kvm) { /* migration mode already disabled */ if (!kvm->arch.migration_mode) return 0; - kvm->arch.migration_mode = 0; + /* + * Unset the flag and propagate to all vCPUs. From now on the cmma_d + * bit will not be touched on any PGSTE. + * At this point cmma_dirty_pages is possibly non-zero, and thus some + * PGSTEs might have cmma_d set. + */ + WRITE_ONCE(kvm->arch.migration_mode, 0); if (kvm->arch.use_cmma) kvm_s390_sync_request_broadcast(kvm, KVM_REQ_STOP_MIGRATION); + /* Clear cmma_d on all existing PGSTEs and set cmma_dirty_pages to 0. */ + gmap_set_cmma_all_clean(kvm->arch.gmap); + atomic64_set(&kvm->arch.cmma_dirty_pages, 0); + /* + * At this point the system has the expected state: migration_mode is 0, + * cmma_dirty_pages is 0, and all existing PGSTEs have their cmma_d bit + * set to 0. + */ return 0; } diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 9bc6fd02ff77..ad0ddc433a73 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -1236,7 +1236,7 @@ static int handle_essa(struct kvm_vcpu *vcpu) : ESSA_SET_STABLE_IF_RESIDENT)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - if (!vcpu->kvm->arch.migration_mode) { + if (!READ_ONCE(vcpu->kvm->arch.migration_mode)) { /* * CMMA is enabled in the KVM settings, but is disabled in * the SIE block and in the mm_context, and we are not doing -- 2.54.0