Add page table management functions to be used for KVM guest (gmap)
page tables.

This patch adds functions to handle CMMA and the ESSA instruction.

Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
---
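Notes (review context only, none of this is part of the patch):

    A few usage sketches follow. Every name that does not appear in the
    diff below (handle_essa_gfn(), vcpu->arch.cbrl[], cbrl_count,
    kvm->arch.cmma_dirty_pages, get_cmma_bits_sketch()) is made up for
    illustration.

    First, how an ESSA intercept handler might consume dat_perform_essa()
    and its 1/0/-1 return convention:

	/*
	 * Sketch only: the CBRL buffer and the dirty-page counter are
	 * assumptions; only dat_perform_essa() and union essa_state
	 * come from this patch.
	 */
	static unsigned long handle_essa_gfn(struct kvm_vcpu *vcpu, union asce asce,
					     gfn_t gfn, int orc)
	{
		union essa_state state;
		bool dirty = false;
		int rc;

		/* dat_perform_essa() wants kvm->mmu_lock (a spinlock on s390) held */
		spin_lock(&vcpu->kvm->mmu_lock);
		rc = dat_perform_essa(asce, gfn, orc, &state, &dirty);
		spin_unlock(&vcpu->kvm->mmu_lock);

		if (dirty)	/* a previously clean PGSTE was dirtied */
			atomic64_inc(&vcpu->kvm->arch.cmma_dirty_pages);
		if (rc == 1)	/* page state altered, report the page in the CBRL */
			vcpu->arch.cbrl[vcpu->arch.cbrl_count++] = gfn;

		/* the guest receives the previous page state in its result register */
		return state.val;
	}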
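    dat_reset_cmma() returns the gfn to restart from when it wants the
    caller to reschedule, and 0 once the whole address space has been
    processed, so a caller would loop roughly like this (again a sketch,
    assuming mmu_lock is a spinlock):

	long gfn = 0;

	do {
		spin_lock(&kvm->mmu_lock);
		gfn = dat_reset_cmma(asce, gfn);
		spin_unlock(&kvm->mmu_lock);
		cond_resched();
	} while (gfn > 0);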
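    Finally, how a KVM_S390_GET_CMMA_BITS-style handler could sit on top
    of dat_peek_cmma()/dat_get_cmma(). struct kvm_s390_cmma_log and
    KVM_S390_CMMA_PEEK are the existing UAPI; where the guest ASCE and
    the remaining-dirty counter live is an assumption here:

	static int get_cmma_bits_sketch(struct kvm *kvm, struct kvm_s390_cmma_log *args)
	{
		union asce asce = kvm->arch.gmap->asce;		/* assumed location */
		unsigned int count = min_t(u32, args->count, KVM_S390_CMMA_SIZE_MAX);
		bool peek = args->flags & KVM_S390_CMMA_PEEK;
		gfn_t start = args->start_gfn;
		u8 *values;
		int rc;

		if (!count)
			return 0;
		values = vmalloc(count);
		if (!values)
			return -ENOMEM;

		spin_lock(&kvm->mmu_lock);
		if (peek)	/* report values without consuming dirty state */
			rc = dat_peek_cmma(start, asce, &count, values);
		else		/* consume dirty state at or after start */
			rc = dat_get_cmma(asce, &start, &count, values,
					  &kvm->arch.cmma_dirty_pages);	/* assumed counter */
		spin_unlock(&kvm->mmu_lock);

		args->start_gfn = start;
		args->count = count;
		if (!rc && count && copy_to_user((void __user *)args->values, values, count))
			rc = -EFAULT;

		vfree(values);
		return rc;
	}

    The restore side would feed the same values back through
    dat_set_cmma_bits(), with the mask already in PGSTE bit positions
    (the values themselves are shifted by 24 inside __dat_set_cmma_pte()).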
 arch/s390/kvm/dat.c | 259 ++++++++++++++++++++++++++++++++++++++++++++
 arch/s390/kvm/dat.h |  27 +++++
 2 files changed, 286 insertions(+)

diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c
index 4249400a9d21..bf9c8af1d74a 100644
--- a/arch/s390/kvm/dat.c
+++ b/arch/s390/kvm/dat.c
@@ -999,3 +999,262 @@ int dat_set_prefix_notif_bit(union asce asce, gfn_t gfn)
 		return -EAGAIN;
 	return 0;
 }
+
+/**
+ * dat_perform_essa() - perform ESSA actions on the PGSTE.
+ * @asce: the asce to operate on.
+ * @gfn: the guest page frame to operate on.
+ * @orc: the specific action to perform, see the ESSA_SET_* macros.
+ * @state: the storage attributes to be returned to the guest.
+ * @dirty: returns whether the function dirtied a previously clean entry.
+ *
+ * Context: Called with kvm->mmu_lock held.
+ *
+ * Return:
+ * * 1 if the page state has been altered and the page is to be added to the CBRL
+ * * 0 if the page state has been altered, but the page is not to be added to the CBRL
+ * * -1 if the page state has not been altered and the page is not to be added to the CBRL
+ */
+int dat_perform_essa(union asce asce, gfn_t gfn, int orc, union essa_state *state, bool *dirty)
+{
+	union crste *crstep;
+	union pgste pgste;
+	union pte *ptep;
+	int res = 0;
+
+	if (dat_entry_walk(gfn, asce, 0, LEVEL_PTE, &crstep, &ptep)) {
+		*state = (union essa_state) { .exception = 1 };
+		return -1;
+	}
+
+	pgste = pgste_get_lock(ptep);
+
+	*state = (union essa_state) {
+		.content = (ptep->h.i << 1) + (ptep->h.i && pgste.zero),
+		.nodat = pgste.nodat,
+		.usage = pgste.usage,
+	};
+
+	switch (orc) {
+	case ESSA_GET_STATE:
+		res = -1;
+		break;
+	case ESSA_SET_STABLE:
+		pgste.usage = PGSTE_GPS_USAGE_STABLE;
+		pgste.nodat = 0;
+		break;
+	case ESSA_SET_UNUSED:
+		pgste.usage = PGSTE_GPS_USAGE_UNUSED;
+		if (ptep->h.i)
+			res = 1;
+		break;
+	case ESSA_SET_VOLATILE:
+		pgste.usage = PGSTE_GPS_USAGE_VOLATILE;
+		if (ptep->h.i)
+			res = 1;
+		break;
+	case ESSA_SET_POT_VOLATILE:
+		if (!ptep->h.i) {
+			pgste.usage = PGSTE_GPS_USAGE_POT_VOLATILE;
+		} else if (pgste.zero) {
+			pgste.usage = PGSTE_GPS_USAGE_VOLATILE;
+		} else if (!pgste.gc) {
+			pgste.usage = PGSTE_GPS_USAGE_VOLATILE;
+			res = 1;
+		}
+		break;
+	case ESSA_SET_STABLE_RESIDENT:
+		pgste.usage = PGSTE_GPS_USAGE_STABLE;
+		/*
+		 * Since the resident state can go away any time after this
+		 * call, we will not make this page resident. We can revisit
+		 * this decision if a guest will ever start using this.
+		 */
+		break;
+	case ESSA_SET_STABLE_IF_RESIDENT:
+		if (!ptep->h.i)
+			pgste.usage = PGSTE_GPS_USAGE_STABLE;
+		break;
+	case ESSA_SET_STABLE_NODAT:
+		pgste.usage = PGSTE_GPS_USAGE_STABLE;
+		pgste.nodat = 1;
+		break;
+	default:
+		WARN_ONCE(1, "Invalid ORC!");
+		res = -1;
+		break;
+	}
+	/* If we are discarding a page, set it to logical zero */
+	pgste.zero = res == 1;
+	if (orc > 0) {
+		*dirty = !pgste.cmma_d;
+		pgste.cmma_d = 1;
+	}
+
+	pgste_set_unlock(ptep, pgste);
+
+	return res;
+}
+
+static long dat_reset_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+	union pgste pgste;
+
+	pgste = pgste_get_lock(ptep);
+	pgste.usage = 0;
+	pgste.nodat = 0;
+	pgste.cmma_d = 0;
+	pgste_set_unlock(ptep, pgste);
+	if (need_resched())
+		return next;
+	return 0;
+}
+
+long dat_reset_cmma(union asce asce, gfn_t start)
+{
+	const struct dat_walk_ops dat_reset_cmma_ops = {
+		.pte_entry = dat_reset_cmma_pte,
+	};
+
+	return _dat_walk_gfn_range(start, asce_end(asce), asce, &dat_reset_cmma_ops,
+				   DAT_WALK_IGN_HOLES, NULL);
+}
+
+struct dat_get_cmma_state {
+	gfn_t start;
+	gfn_t end;
+	unsigned int count;
+	u8 *values;
+	atomic64_t *remaining;
+};
+
+static long __dat_peek_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+	struct dat_get_cmma_state *state = walk->priv;
+	union pgste pgste;
+
+	pgste = pgste_get_lock(ptep);
+	state->values[gfn - walk->start] = pgste.usage | (pgste.nodat << 6);
+	pgste_set_unlock(ptep, pgste);
+	state->end = next;
+
+	return 0;
+}
+
+static long __dat_peek_cmma_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+	struct dat_get_cmma_state *state = walk->priv;
+
+	if (crstep->h.i)
+		state->end = min(walk->end, next);
+	return 0;
+}
+
+int dat_peek_cmma(gfn_t start, union asce asce, unsigned int *count, u8 *values)
+{
+	const struct dat_walk_ops ops = {
+		.pte_entry = __dat_peek_cmma_pte,
+		.pmd_entry = __dat_peek_cmma_crste,
+		.pud_entry = __dat_peek_cmma_crste,
+		.p4d_entry = __dat_peek_cmma_crste,
+		.pgd_entry = __dat_peek_cmma_crste,
+	};
+	struct dat_get_cmma_state state = { .values = values, };
+	int rc;
+
+	rc = _dat_walk_gfn_range(start, start + *count, asce, &ops, DAT_WALK_DEFAULT, &state);
+	*count = state.end - start;
+	/* Return success if at least one value was saved, otherwise an error. */
+	return (rc == -EFAULT && *count > 0) ? 0 : rc;
+}
+
+static long __dat_get_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+	struct dat_get_cmma_state *state = walk->priv;
+	union pgste pgste;
+
+	if (state->start != -1) {
+		if ((gfn - state->end) > KVM_S390_MAX_BIT_DISTANCE)
+			return 1;
+		if (gfn - state->start >= state->count)
+			return 1;
+	}
+
+	if (!READ_ONCE(*pgste_of(ptep)).cmma_d)
+		return 0;
+
+	pgste = pgste_get_lock(ptep);
+	if (pgste.cmma_d) {
+		if (state->start == -1)
+			state->start = gfn;
+		pgste.cmma_d = 0;
+		atomic64_dec(state->remaining);
+		state->values[gfn - state->start] = pgste.usage | pgste.nodat << 6;
+		state->end = next;
+	}
+	pgste_set_unlock(ptep, pgste);
+	return 0;
+}
+
+int dat_get_cmma(union asce asce, gfn_t *start, unsigned int *count, u8 *values, atomic64_t *rem)
+{
+	const struct dat_walk_ops ops = { .pte_entry = __dat_get_cmma_pte, };
+	struct dat_get_cmma_state state = {
+		.remaining = rem,
+		.values = values,
+		.count = *count,
+		.start = -1,
+	};
+
+	_dat_walk_gfn_range(*start, asce_end(asce), asce, &ops, DAT_WALK_IGN_HOLES, &state);
+
+	if (state.start == -1) {
+		*count = 0;
+	} else {
+		*count = state.end - state.start;
+		*start = state.start;
+	}
+
+	return 0;
+}
+
+struct dat_set_cmma_state {
+	unsigned long mask;
+	const u8 *bits;
+};
+
+static long __dat_set_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+	struct dat_set_cmma_state *state = walk->priv;
+	union pgste pgste, tmp;
+
+	tmp.val = (state->bits[gfn - walk->start] << 24) & state->mask;
+
+	pgste = pgste_get_lock(ptep);
+	pgste.usage = tmp.usage;
+	pgste.nodat = tmp.nodat;
+	pgste_set_unlock(ptep, pgste);
+
+	return 0;
+}
+
+/*
+ * This function sets the CMMA attributes for the given pages. If the input
+ * buffer has zero length, no action is taken, otherwise the attributes are
+ * set and the mm->context.uses_cmm flag is set.
+ */
+int dat_set_cmma_bits(union asce asce, gfn_t gfn, unsigned long count,
+		      unsigned long mask, const uint8_t *bits)
+{
+	const struct dat_walk_ops ops = { .pte_entry = __dat_set_cmma_pte, };
+	struct dat_set_cmma_state state = { .mask = mask, .bits = bits, };
+	union crste *crstep;
+	union pte *ptep;
+	gfn_t cur;
+	int rc;
+
+	for (cur = ALIGN_DOWN(gfn, _PAGE_ENTRIES); cur < gfn + count; cur += _PAGE_ENTRIES)
+		dat_entry_walk(cur, asce, DAT_WALK_ALLOC, LEVEL_PTE, &crstep, &ptep);
+	rc = _dat_walk_gfn_range(gfn, gfn + count, asce, &ops, DAT_WALK_IGN_HOLES, &state);
+	return rc;
+}
diff --git a/arch/s390/kvm/dat.h b/arch/s390/kvm/dat.h
index b695eae5d763..4d0ceeada40f 100644
--- a/arch/s390/kvm/dat.h
+++ b/arch/s390/kvm/dat.h
@@ -18,6 +18,15 @@
 #include
 #include
 
+/*
+ * Base address and length must be sent at the start of each block, therefore
+ * it's cheaper to send some clean data, as long as it's less than the size of
+ * two longs.
+ */
+#define KVM_S390_MAX_BIT_DISTANCE	(2 * sizeof(void *))
+/* for consistency */
+#define KVM_S390_CMMA_SIZE_MAX		((u32)KVM_S390_SKEYS_MAX)
+
 #define _ASCE(x) ((union asce) { .val = (x), })
 #define NULL_ASCE _ASCE(0)
 
@@ -418,6 +427,17 @@ static inline union crste _crste_fc1(kvm_pfn_t pfn, int tt, bool w, bool d)
 	return res;
 }
 
+union essa_state {
+	unsigned char val;
+	struct {
+		unsigned char		: 2;
+		unsigned char nodat	: 1;
+		unsigned char exception	: 1;
+		unsigned char usage	: 2;
+		unsigned char content	: 2;
+	};
+};
+
 /**
  *       0       1       2       3       4       5       6       7
  * +-------+-------+-------+-------+-------+-------+-------+-------+
@@ -459,6 +479,13 @@ int dat_set_prefix_notif_bit(union asce asce, gfn_t gfn);
 bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end);
 int dat_link(kvm_pfn_t pfn, gfn_t gfn, union asce asce, int level, bool w, bool d, bool s, bool sk);
+int dat_perform_essa(union asce asce, gfn_t gfn, int orc, union essa_state *state, bool *dirty);
+long dat_reset_cmma(union asce asce, gfn_t start_gfn);
+int dat_peek_cmma(gfn_t start, union asce asce, unsigned int *count, u8 *values);
+int dat_get_cmma(union asce asce, gfn_t *start, unsigned int *count, u8 *values, atomic64_t *rem);
+int dat_set_cmma_bits(union asce asce, gfn_t gfn, unsigned long count, unsigned long mask,
+		      const uint8_t *bits);
+
 static inline struct crst_table *crste_table_start(union crste *crstep)
 {
 	return (struct crst_table *)ALIGN_DOWN((unsigned long)crstep, _CRST_TABLE_SIZE);
 }
-- 
2.51.0